## Import Basics


In [96]:
import pandas as pd
from sqlalchemy import create_engine, text
import time
from tqdm.notebook import tqdm

from etl_functions import (
    get_watermark,
    update_watermark,
    test_select_query,
    _initialise_etl_state,
    _clear_table_data,
    run_incremental_load,
    upsert_data,
    src_engine,
    tgt_engine
)

# Play & Test

<br>

Data: the extract and load SQL for `dim_category`, plus the job list passed to `upsert_data`.

In [97]:
# Extract: read category rows from the source whose last_update is strictly
# newer than the :watermark bind parameter. last_update is aliased to
# src_last_update so the result columns match the load statement's parameters.
dim_category_extract_sql = text("""
    SELECT 
        c.category_id,
        c.name,
        c.last_update as src_last_update
    FROM sakila.category AS c
    WHERE c.last_update > :watermark
""")

# Load: MySQL upsert into dim_category. A new category_id is inserted; when the
# key already exists, name and src_last_update are overwritten with the
# incoming values. etl_loaded_at is not listed, so it is left to the column
# default on insert.
dim_category_load_sql = text("""
    INSERT INTO dim_category (
        category_id, 
        name,
        src_last_update
    ) VALUES (
        :category_id,
        :name,
        :src_last_update
    )
    ON DUPLICATE KEY UPDATE
        name          = VALUES(name),             
        src_last_update  = VALUES(src_last_update)       
""")

# Job list handed to upsert_data: one entry per target table, pairing the
# table name with its extract and load statements.
upsert_list = [
    dict(
        table_name="dim_category",
        extract_sql=dim_category_extract_sql,
        load_sql=dim_category_load_sql,
    ),
]

<br>

Function runs: schema checks on the target and source tables, followed by the incremental load.

In [92]:
# Earlier spot checks, kept for reference. The extract queries they name are
# not defined in the cells shown here, so they stay commented out.
# test_select_query(dim_customer_extract_sql, src_engine)
# test_select_query(dim_staff_extract_sql, tgt_engine)

# Show the column layout of dim_category on the target connection.
describe_dim_category_sql = "DESCRIBE dim_category"
test_select_query(describe_dim_category_sql, tgt_engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,category_id,tinyint unsigned,NO,PRI,,
1,name,varchar(25),YES,,,
2,src_last_update,timestamp,NO,,,
3,etl_loaded_at,datetime,NO,,CURRENT_TIMESTAMP,DEFAULT_GENERATED


In [95]:
# Show the column layout of the category table on the source connection.
describe_category_sql = "DESCRIBE category;"
test_select_query(describe_category_sql, src_engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,category_id,tinyint unsigned,NO,PRI,,auto_increment
1,name,varchar(25),NO,,,
2,last_update,timestamp,NO,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP


In [98]:
# ===== Reset ETL state, clear the target table, then upsert ===== #

# Step 1: reset the stored watermarks. The run log shows every tracked table
# being set back to '1970-01-01 00:00:00' in the etl_state table.
_initialise_etl_state()

# Step 2: delete existing rows from dim_category on the target. force=False is
# passed explicitly; its exact effect is defined in etl_functions.
tables_to_clear = ["dim_category"]
_clear_table_data(tables_to_clear, tgt_engine, force=False)

# Step 3: for each job in upsert_list, pull rows from the source and
# insert/update them in the target.
upsert_data(upsert_list, src_engine, tgt_engine)





 >> INITIALISING ETL STATE ...

[fact_rental] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_film] Updated watermark from '2006-02-15 05:03:42' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_customer] Updated watermark from '2006-02-15 04:57:20' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_staff] Updated watermark from '2006-02-15 03:57:16' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_actor] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[bridge_actor] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_store] Updated watermark from '2006-02-15 04:57:12' to '1970-01-01 00:00:00' in 'etl_state' table


 >> CLEARING DATA FROM TABLES ['dim_category'] ...

Success: Deleted 0 rows from dim_category.


 >> UPDATING / INSERTING DATA ...

[dim_category] Checking for updates since 1900-01-01 00:00:00...
[dim_category] Found 16 rows. Loading..