## Import Basics


In [None]:
import pandas as pd
from sqlalchemy import create_engine, text
import time
from tqdm.notebook import tqdm

from etl_functions import (
    get_watermark,
    update_watermark,
    test_select_query,
    _initialise_etl_state,
    _clear_table_data,
    run_incremental_load,
    upsert_data,
    src_engine,
    tgt_engine
)

# Play

<br>

Data: extract/load SQL and upsert configuration for `dim_staff`

In [58]:
# Extract query for the dim_staff dimension.
#
# Selects every staff row from the Sakila source schema and flattens the
# related address, city and country values onto the same record, so each
# result row is one fully denormalised staff member.
#
# Bind parameters:
#   :watermark - exclusive lower bound on staff.last_update; only staff rows
#                modified after this timestamp are returned.
#
# staff.last_update is exposed as `src_last_update`.
#
# NOTE(review): the filter only inspects staff.last_update, so a change made
# solely to the joined address / city / country rows would presumably not be
# picked up by an incremental run - confirm this is acceptable.
# NOTE(review): `password` and `picture` are selected as-is; confirm the
# target dimension really needs credential and binary image data.
dim_staff_extract_sql = text("""
    SELECT 
        st.staff_id,
        st.first_name,
        st.last_name,
        st.email,
        st.active,
        st.username,
        st.password,
        st.picture,
        a.address,
        a.address2,
        a.district,
        ci.city,
        co.country,
        a.postal_code,
        a.phone,
        st.last_update as src_last_update
    FROM sakila.staff AS st
    JOIN sakila.address AS a ON st.address_id = a.address_id
    JOIN sakila.city AS ci ON a.city_id = ci.city_id
    JOIN sakila.country AS co ON ci.country_id = co.country_id
    WHERE st.last_update > :watermark
""")

# Upsert statement for the dim_staff dimension.
#
# Inserts one row per extracted staff record; every column is supplied through
# a bind parameter of the same name (:staff_id, :first_name, ...,
# :src_last_update).
#
# When the insert collides with an existing key, the ON DUPLICATE KEY UPDATE
# clause overwrites the descriptive columns with the incoming values.
# `src_last_update` is refreshed as well, so an updated row carries the
# timestamp of the source change that produced it rather than the timestamp
# of its first insert. `staff_id` is left out of the update list.
#
# NOTE(review): presumably the duplicate key is on staff_id - verify against
# the dim_staff table definition.
dim_staff_load_sql = text("""
    INSERT INTO dim_staff (
        staff_id, 
        first_name,
        last_name,
        email,
        active,
        username,
        password,
        picture,
        address, 
        address2, 
        district, 
        city, 
        country, 
        postal_code, 
        phone, 
        src_last_update
    ) VALUES (
        :staff_id,
        :first_name,
        :last_name,
        :email,
        :active,
        :username,
        :password,
        :picture,
        :address, 
        :address2, 
        :district, 
        :city, 
        :country, 
        :postal_code, 
        :phone, 
        :src_last_update
    )
    ON DUPLICATE KEY UPDATE
        first_name      = VALUES(first_name),
        last_name       = VALUES(last_name),      
        email           = VALUES(email),      
        active          = VALUES(active),      
        username        = VALUES(username),      
        password        = VALUES(password),      
        picture         = VALUES(picture),      
        address         = VALUES(address),      
        address2        = VALUES(address2),      
        district        = VALUES(district),      
        city            = VALUES(city),      
        country         = VALUES(country),      
        postal_code     = VALUES(postal_code),      
        phone           = VALUES(phone),
        src_last_update = VALUES(src_last_update)
""")

# Upsert job for dim_staff: the target table name plus the extract / load
# statements that feed it.
dim_staff_upsert_job = dict(
    table_name="dim_staff",
    extract_sql=dim_staff_extract_sql,
    load_sql=dim_staff_load_sql,
)

# Jobs handed to upsert_data(); one entry per target table.
upsert_list = [dim_staff_upsert_job]

<br>

Function runs:

In [57]:
# Preview the dim_staff extract query. The query reads sakila.* tables, so it
# is run against the source engine, matching the other source-side checks in
# this notebook.
# NOTE(review): the query contains a :watermark bind parameter; presumably
# test_select_query supplies a value for it - confirm.
test_select_query(dim_staff_extract_sql, src_engine)

Unnamed: 0,staff_id,first_name,last_name,email,active,username,password,picture,address,address2,district,city,country,postal_code,phone,src_last_update
0,1,Mike,Hillyer,Mike.Hillyer@sakilastaff.com,1,Mike,8cb2237d0679ca88db6464eac60da96345513964,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,23 Workhaven Lane,,Alberta,Lethbridge,Canada,,14033335568,2006-02-15 03:57:16
1,2,Jon,Stephens,Jon.Stephens@sakilastaff.com,1,Jon,,,1411 Lillydale Drive,,QLD,Woodridge,Australia,,6172235589,2006-02-15 03:57:16


In [56]:
# Show the column definitions of the source `staff` table.
describe_staff_sql = "DESCRIBE staff;"
test_select_query(describe_staff_sql, src_engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,staff_id,tinyint unsigned,NO,PRI,,auto_increment
1,first_name,varchar(45),NO,,,
2,last_name,varchar(45),NO,,,
3,address_id,smallint unsigned,NO,MUL,,
4,picture,blob,YES,,,
5,email,varchar(50),YES,,,
6,store_id,tinyint unsigned,NO,MUL,,
7,active,tinyint(1),NO,,1,
8,username,varchar(16),NO,,,
9,password,varchar(40),YES,,,


In [None]:
# Show the column definitions of the source `address` table.
describe_address_sql = "DESCRIBE address;"
test_select_query(describe_address_sql, src_engine)

In [60]:
# ===== PERFORM INCREMENTAL LOAD ===== #

# Target tables whose rows are removed before reloading.
tables_to_clear = ["dim_staff"]

# The recorded output of this cell shows this call rewriting the watermark of
# every table in 'etl_state' to '1970-01-01 00:00:00', not only dim_staff.
_initialise_etl_state()
_clear_table_data(tables_to_clear, tgt_engine, force=False)

# The load step itself is currently disabled; uncomment to run the upsert.
# upsert_data(upsert_list, src_engine, tgt_engine)





 >> INITIALISING ETL STATE ...

[fact_rental] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_film] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_customer] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_staff] Updated watermark from '2006-02-15 03:57:16' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_actor] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[bridge_actor] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_store] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table


 >> CLEARING DATA FROM TABLES ['dim_staff'] ...

Success: Deleted 2 rows from dim_staff.
