In [3]:
import sqlite3

print(sqlite3.version)
print(sqlite3.sqlite_version)

import pandas as pd

2.6.0
3.39.3


In [4]:
conn = sqlite3.connect('aw.db') #permanent database

In [5]:
conn.execute('DROP TABLE IF EXISTS customer')

<sqlite3.Cursor at 0x170d409dc70>

In [7]:
conn.execute('''
CREATE TABLE customer (
    Customerkey   INTEGER PRIMARY KEY NOT NULL,
    LastName      TEXT,
    BirthDate     TEXT,
    MaritalStatus TEXT,
    YearlyIncome  REAL,
    ModifiedDAte  TEXT,
    ETLLastUpdate DATETIME DEFAULT current_timestamp
    );
    ''')

<sqlite3.Cursor at 0x170d40df9d0>

In [8]:
import pandas as pd
pd.read_sql_query("select name from sqlite_master where type = 'table'", conn)

Unnamed: 0,name
0,customer


### Load the master table

In [9]:
custdf = pd.read_csv("data/dimcustomer.csv", index_col = False)

In [10]:
custdf

Unnamed: 0,CustomerKey,LastName,BirthDate,MaritalStatus,YearlyIncome,ModifiedDate
0,11000,Yang,1971-10-06,M,90000.0,2019-01-01
1,11001,Huang,1976-05-10,S,60000.0,2019-01-01
2,11002,Torres,1971-02-09,M,60000.0,2019-01-01
3,11003,Zhu,1973-08-14,S,70000.0,2019-01-01


### Save the dataframe to a table
Do not use the if_exists = 'replace' or you will losse the primary key

In [11]:
custdf.to_sql('customer', conn, if_exists='append', index=False)

4

In [12]:
pd.read_sql_query("select * from customer", conn)

Unnamed: 0,Customerkey,LastName,BirthDate,MaritalStatus,YearlyIncome,ModifiedDAte,ETLLastUpdate
0,11000,Yang,1971-10-06,M,90000.0,2019-01-01,2022-11-09 07:20:53
1,11001,Huang,1976-05-10,S,60000.0,2019-01-01,2022-11-09 07:20:53
2,11002,Torres,1971-02-09,M,60000.0,2019-01-01,2022-11-09 07:20:53
3,11003,Zhu,1973-08-14,S,70000.0,2019-01-01,2022-11-09 07:20:53


**Getting meta data**

In [13]:
custdf.columns

Index(['CustomerKey', 'LastName', 'BirthDate', 'MaritalStatus', 'YearlyIncome',
       'ModifiedDate'],
      dtype='object')

In [14]:
custdf.dtypes

CustomerKey        int64
LastName          object
BirthDate         object
MaritalStatus     object
YearlyIncome     float64
ModifiedDate      object
dtype: object

In [15]:
# Get table schema
import pandas as pd

pd.read_sql_query("""
PRAGMA table_info('customer');
""", conn)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,Customerkey,INTEGER,1,,1
1,1,LastName,TEXT,0,,0
2,2,BirthDate,TEXT,0,,0
3,3,MaritalStatus,TEXT,0,,0
4,4,YearlyIncome,REAL,0,,0
5,5,ModifiedDAte,TEXT,0,,0
6,6,ETLLastUpdate,DATETIME,0,current_timestamp,0


### Generic Inserting, Updating, Deleting

**Insert**

In [16]:
sql = '''
INSERT INTO customer (CustomerKey, LastName, BirthDate, MaritalStatus, YearlyIncome, ModifiedDate)
VALUES (999901, 'Zach', '2001-02-01', 'S', 32000, '2020-03-01')
'''
conn.execute(sql)

<sqlite3.Cursor at 0x170d413d180>

In [17]:
pd.read_sql_query("select * from customer", conn)

Unnamed: 0,Customerkey,LastName,BirthDate,MaritalStatus,YearlyIncome,ModifiedDAte,ETLLastUpdate
0,11000,Yang,1971-10-06,M,90000.0,2019-01-01,2022-11-09 07:20:53
1,11001,Huang,1976-05-10,S,60000.0,2019-01-01,2022-11-09 07:20:53
2,11002,Torres,1971-02-09,M,60000.0,2019-01-01,2022-11-09 07:20:53
3,11003,Zhu,1973-08-14,S,70000.0,2019-01-01,2022-11-09 07:20:53
4,999901,Zach,2001-02-01,S,32000.0,2020-03-01,2022-11-09 07:41:43


**Update**

In [18]:
conn.execute('''UPDATE customer 
                 SET LastName = 'Jones', MaritalStatus = 'M' 
                WHERE CustomerKey = 11001''')

pd.read_sql_query("select * from customer", conn)

Unnamed: 0,Customerkey,LastName,BirthDate,MaritalStatus,YearlyIncome,ModifiedDAte,ETLLastUpdate
0,11000,Yang,1971-10-06,M,90000.0,2019-01-01,2022-11-09 07:20:53
1,11001,Jones,1976-05-10,M,60000.0,2019-01-01,2022-11-09 07:20:53
2,11002,Torres,1971-02-09,M,60000.0,2019-01-01,2022-11-09 07:20:53
3,11003,Zhu,1973-08-14,S,70000.0,2019-01-01,2022-11-09 07:20:53
4,999901,Zach,2001-02-01,S,32000.0,2020-03-01,2022-11-09 07:41:43


**Delete**

In [19]:
conn.execute("DELETE FROM customer where CustomerKey = 999901")
pd.read_sql_query("select * from customer", conn)

Unnamed: 0,Customerkey,LastName,BirthDate,MaritalStatus,YearlyIncome,ModifiedDAte,ETLLastUpdate
0,11000,Yang,1971-10-06,M,90000.0,2019-01-01,2022-11-09 07:20:53
1,11001,Jones,1976-05-10,M,60000.0,2019-01-01,2022-11-09 07:20:53
2,11002,Torres,1971-02-09,M,60000.0,2019-01-01,2022-11-09 07:20:53
3,11003,Zhu,1973-08-14,S,70000.0,2019-01-01,2022-11-09 07:20:53


### Load the transaction file

In [20]:
transdf = pd.read_csv("data/dimcustomertransactions.csv", index_col = None)
transdf

Unnamed: 0,CustomerKey,LastName,BirthDate,MaritalStatus,YearlyIncome,ActionInd,ModifiedDate
0,11000,Yang,1971-10-06,M,250000.0,U,2020-01-01
1,11001,Jones,1976-05-10,S,360000.0,U,2019-02-01
2,333301,Murhpy,1975-02-09,M,33000.0,A,2018-01-01
3,333302,Jain,1980-01-09,M,28000.0,A,2020-02-01
4,11002,Torres,1971-02-09,M,60000.0,D,2020-02-01


In [21]:
transdf.to_sql('cust_trans', conn, if_exists='replace', index = False)
pd.read_sql_query("select * from cust_trans", conn)

Unnamed: 0,CustomerKey,LastName,BirthDate,MaritalStatus,YearlyIncome,ActionInd,ModifiedDate
0,11000,Yang,1971-10-06,M,250000.0,U,2020-01-01
1,11001,Jones,1976-05-10,S,360000.0,U,2019-02-01
2,333301,Murhpy,1975-02-09,M,33000.0,A,2018-01-01
3,333302,Jain,1980-01-09,M,28000.0,A,2020-02-01
4,11002,Torres,1971-02-09,M,60000.0,D,2020-02-01


In [22]:
#  Get the customer data back to the start position...
conn.execute('DELETE FROM customer')
custdf.to_sql('customer', conn, if_exists='append', index = False)
pd.read_sql_query("select * from customer", conn)

Unnamed: 0,Customerkey,LastName,BirthDate,MaritalStatus,YearlyIncome,ModifiedDAte,ETLLastUpdate
0,11000,Yang,1971-10-06,M,90000.0,2019-01-01,2022-11-09 07:50:33
1,11001,Huang,1976-05-10,S,60000.0,2019-01-01,2022-11-09 07:50:33
2,11002,Torres,1971-02-09,M,60000.0,2019-01-01,2022-11-09 07:50:33
3,11003,Zhu,1973-08-14,S,70000.0,2019-01-01,2022-11-09 07:50:33


In [23]:
pd.read_sql_query("select * from cust_trans", conn)

Unnamed: 0,CustomerKey,LastName,BirthDate,MaritalStatus,YearlyIncome,ActionInd,ModifiedDate
0,11000,Yang,1971-10-06,M,250000.0,U,2020-01-01
1,11001,Jones,1976-05-10,S,360000.0,U,2019-02-01
2,333301,Murhpy,1975-02-09,M,33000.0,A,2018-01-01
3,333302,Jain,1980-01-09,M,28000.0,A,2020-02-01
4,11002,Torres,1971-02-09,M,60000.0,D,2020-02-01


### Let's update the customer table with the transactions.
- Add    - If the customer is not found, insert transaction as new customer.
- Change - If the customer is found, update the existing customer with the transaction data.
- Delete - If the DropInd = 'Y', remove the customer row. 

In [24]:
# Insert
conn.execute('''INSERT INTO customer 
                   (CustomerKey, LastName, BirthDate, MaritalStatus, YearlyIncome, ModifiedDate)     
                SELECT CustomerKey, LastName, BirthDate, MaritalStatus, YearlyIncome, ModifiedDate 
                FROM cust_trans 
                where ActionInd = 'A';''')

pd.read_sql_query("select * from customer", conn)

Unnamed: 0,Customerkey,LastName,BirthDate,MaritalStatus,YearlyIncome,ModifiedDAte,ETLLastUpdate
0,11000,Yang,1971-10-06,M,90000.0,2019-01-01,2022-11-09 07:50:33
1,11001,Huang,1976-05-10,S,60000.0,2019-01-01,2022-11-09 07:50:33
2,11002,Torres,1971-02-09,M,60000.0,2019-01-01,2022-11-09 07:50:33
3,11003,Zhu,1973-08-14,S,70000.0,2019-01-01,2022-11-09 07:50:33
4,333301,Murhpy,1975-02-09,M,33000.0,2018-01-01,2022-11-09 07:57:39
5,333302,Jain,1980-01-09,M,28000.0,2020-02-01,2022-11-09 07:57:39


###  SQLite Update statement is virtually useless

In [None]:
#  This example is to show the syntax only.  It cannot be executed.

UPDATE orders 
   SET   item_name=(
            SELECT prod_mast.prod_name 
            FROM prod_mast 
            WHERE orders.item_id=prod_mast.prod_id), 
         cost=(
            SELECT prod_mast.prod_rate*orders.ord_qty 
            FROM prod_mast 
            WHERE orders.item_id=prod_mast.prod_id);

In [26]:
pd.read_sql_query("select * from cust_trans where ActionInd = 'U'", conn)

Unnamed: 0,CustomerKey,LastName,BirthDate,MaritalStatus,YearlyIncome,ActionInd,ModifiedDate
0,11000,Yang,1971-10-06,M,250000.0,U,2020-01-01
1,11001,Jones,1976-05-10,S,360000.0,U,2019-02-01


#### We'll Use REPLACE instead of Update but remember, this command deletes and re-inserts the row!!!

In [27]:
# Replace 
conn.execute('''
             INSERT OR REPLACE INTO customer 
                (CustomerKey, LastName, BirthDate, MaritalStatus, YearlyIncome, ModifiedDate)     
             SELECT CustomerKey, LastName, BirthDate, MaritalStatus, YearlyIncome, ModifiedDate 
             FROM cust_trans where ActionInd = 'U';''')

pd.read_sql_query("select * from customer", conn)

Unnamed: 0,Customerkey,LastName,BirthDate,MaritalStatus,YearlyIncome,ModifiedDAte,ETLLastUpdate
0,11000,Yang,1971-10-06,M,250000.0,2020-01-01,2022-11-09 07:59:27
1,11001,Jones,1976-05-10,S,360000.0,2019-02-01,2022-11-09 07:59:27
2,11002,Torres,1971-02-09,M,60000.0,2019-01-01,2022-11-09 07:50:33
3,11003,Zhu,1973-08-14,S,70000.0,2019-01-01,2022-11-09 07:50:33
4,333301,Murhpy,1975-02-09,M,33000.0,2018-01-01,2022-11-09 07:57:39
5,333302,Jain,1980-01-09,M,28000.0,2020-02-01,2022-11-09 07:57:39


### Delete does not support joining to other tables...

In [28]:
pd.read_sql_query("select * from cust_trans where ActionInd = 'D'", conn)

Unnamed: 0,CustomerKey,LastName,BirthDate,MaritalStatus,YearlyIncome,ActionInd,ModifiedDate
0,11002,Torres,1971-02-09,M,60000.0,D,2020-02-01


In [29]:
# Delete
conn.execute('''DELETE FROM customer  
                WHERE CustomerKey IN (
                SELECT CustomerKey 
                FROM cust_trans where ActionInd = 'D');''')

pd.read_sql_query("select * from customer", conn)

Unnamed: 0,Customerkey,LastName,BirthDate,MaritalStatus,YearlyIncome,ModifiedDAte,ETLLastUpdate
0,11000,Yang,1971-10-06,M,250000.0,2020-01-01,2022-11-09 07:59:27
1,11001,Jones,1976-05-10,S,360000.0,2019-02-01,2022-11-09 07:59:27
2,11003,Zhu,1973-08-14,S,70000.0,2019-01-01,2022-11-09 07:50:33
3,333301,Murhpy,1975-02-09,M,33000.0,2018-01-01,2022-11-09 07:57:39
4,333302,Jain,1980-01-09,M,28000.0,2020-02-01,2022-11-09 07:57:39


In [30]:
#  Get the customer data back to the start position...
conn.execute('DELETE FROM customer')
custdf.to_sql('customer', conn, if_exists='append', index = False)
pd.read_sql_query("select * from customer", conn)

Unnamed: 0,Customerkey,LastName,BirthDate,MaritalStatus,YearlyIncome,ModifiedDAte,ETLLastUpdate
0,11000,Yang,1971-10-06,M,90000.0,2019-01-01,2022-11-09 08:00:17
1,11001,Huang,1976-05-10,S,60000.0,2019-01-01,2022-11-09 08:00:17
2,11002,Torres,1971-02-09,M,60000.0,2019-01-01,2022-11-09 08:00:17
3,11003,Zhu,1973-08-14,S,70000.0,2019-01-01,2022-11-09 08:00:17
