## Import Basics


In [None]:
import pandas as pd
from sqlalchemy import create_engine, text
import time
from tqdm.notebook import tqdm

from etl_functions import (
    get_watermark,
    update_watermark,
    test_select_query,
    _initialise_etl_state,
    _clear_table_data,
    run_incremental_load,
    upsert_data,
    src_engine,
    tgt_engine
)

# Playground: incremental load of `dim_category`

<br>

Data:

In [83]:
# --- dim_category: extract / load definitions --------------------------------
#
# The previous version of this cell was a copy of the dim_film job: the extract
# selected `f.*` columns and joined on `f.language_id` although no `f` alias was
# defined (the FROM clause only exposes `sakila.category AS c`), and the load
# wrote film columns into `dim_film`. Both statements now use the columns that
# `DESCRIBE category` (source) and `DESCRIBE dim_category` (target) report:
# category_id, name and last_update / src_last_update.

# Extract: source categories modified after the stored watermark.
# `:watermark` is a bound parameter supplied by the caller at execution time.
dim_category_extract_sql = text("""
    SELECT
        c.category_id,
        c.name,
        c.last_update AS src_last_update
    FROM sakila.category AS c
    WHERE c.last_update > :watermark
""")

# Load: upsert one category row into the target dimension.
# The bound parameter names match the column aliases produced by the extract.
# `etl_loaded_at` is intentionally not listed; the target column has a
# CURRENT_TIMESTAMP default. The primary key (category_id) is not re-assigned
# in the UPDATE clause because it is the key that triggered the duplicate.
dim_category_load_sql = text("""
    INSERT INTO dim_category (
        category_id,
        name,
        src_last_update
    ) VALUES (
        :category_id,
        :name,
        :src_last_update
    )
    ON DUPLICATE KEY UPDATE
        name            = VALUES(name),
        src_last_update = VALUES(src_last_update)
""")

# Job list passed to `upsert_data`: one entry per target table, pairing the
# table name with its extract and load statements.
upsert_list = [
    {
        "table_name": "dim_category",
        "extract_sql": dim_category_extract_sql,
        "load_sql": dim_category_load_sql,
    },
]

<br>

Run the functions:

In [86]:
# Inspect the schema of the target dimension table so the load statement's
# column list can be checked against it (recorded output lists Field, Type,
# Null, Key, Default and Extra for each column).
test_select_query("DESCRIBE dim_category", tgt_engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,category_id,tinyint unsigned,NO,PRI,,
1,name,varchar(25),YES,,,
2,src_last_update,timestamp,NO,,,
3,etl_loaded_at,datetime,NO,,CURRENT_TIMESTAMP,DEFAULT_GENERATED


In [85]:
# Inspect the schema of the source `category` table so the extract query's
# column list can be checked against it.
# NOTE(review): the table name is unqualified here, whereas the extract query
# uses `sakila.category` — presumably src_engine defaults to the sakila schema; confirm.
test_select_query("DESCRIBE category;", src_engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,category_id,tinyint unsigned,NO,PRI,,auto_increment
1,name,varchar(25),NO,,,
2,last_update,timestamp,NO,,CURRENT_TIMESTAMP,DEFAULT_GENERATED on update CURRENT_TIMESTAMP


In [84]:
# ===== PERFORM INCREMENTAL LOAD ===== #
# Steps, in order: initialise ETL state, clear the target table, then upsert.
# NOTE(review): the output recorded below this cell mentions `dim_film`, not
# `dim_category`, so it comes from an earlier version of the notebook —
# restart the kernel and re-run all cells before trusting it.

# NOTE(review): the recorded output of this call lists watermarks for
# fact_rental, dim_film, dim_customer, dim_staff, dim_actor, bridge_actor and
# dim_store only; `dim_category` is not among them — confirm that the helper
# registers a watermark for it before the upsert runs.
_initialise_etl_state()
# NOTE(review): called with force=False; the exact effect of `force` is defined
# in etl_functions and is not visible here — confirm it is the intended mode.
_clear_table_data(["dim_category"], tgt_engine, force=False)


        
# Runs every job in `upsert_list` from the source engine into the target engine.
upsert_data(upsert_list, src_engine, tgt_engine)





 >> INITIALISING ETL STATE ...

[fact_rental] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_film] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_customer] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_staff] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_actor] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[bridge_actor] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table
[dim_store] Updated watermark from '1970-01-01 00:00:00' to '1970-01-01 00:00:00' in 'etl_state' table


 >> CLEARING DATA FROM TABLES ['dim_film'] ...

Success: Deleted 0 rows from dim_film.


 >> UPDATING / INSERTING DATA ...

[dim_film] Checking for updates since 1970-01-01 00:00:00...
[dim_film] Found 1000 rows. Loading...
[dim_film] U