# Incremental ETL Pipeline

This notebook orchestrates the flow of data from the transactional source database (sakila) to the analytical data warehouse (sakila_star).

Key Features:
- Incremental Loading: Uses a "Watermark" strategy to fetch only records created or modified since the last successful run.
- Star Schema Transformation: Denormalizes data (e.g., joining address, city, and country into dimension tables).
- Upsert Logic: Uses ON DUPLICATE KEY UPDATE to handle modifications to existing records.
- Deletion Sync: Identifies and removes records in the warehouse that have been hard-deleted from the source. \[WIP\]

<br>

In [None]:
import pandas as pd
from sqlalchemy import create_engine, text
import time
from tqdm.notebook import tqdm

import pandas as pd
from sqlalchemy import create_engine, text
import time
from tqdm.notebook import tqdm

from etl_functions import (
    get_watermark,
    update_watermark,
    test_select_query,
    _initialise_etl_state,
    _clear_table_data,
    run_incremental_load,
    upsert_data,
    src_engine,
    tgt_engine
)


<br>

### Check schemas (Optional)

Check if you can read from both databases.


In [None]:
# Smoke test: list the tables/views reachable through `src_engine`.
pd.read_sql(sql="SHOW FULL TABLES;", con=src_engine)

In [None]:
# Smoke test: list the tables/views reachable through `tgt_engine`.
pd.read_sql(sql="SHOW FULL TABLES;", con=tgt_engine)

<br>

## SQL Tables

The following extract / load SQL statements implement the logic to extract data from the source database (sakila) and to load the data into the new analytics database (sakila_star).


#### dim_store

In [None]:
# ========= DIM_STORE ========= #
#
# Extract / upsert definition for the `dim_store` dimension table.


# Extract: one flattened row per store with its address, city and country
# joined in. Only stores whose own `last_update` is newer than the bound
# `:watermark` parameter are returned.
# NOTE(review): the filter inspects `store.last_update` only; an edit that
# touches just the address/city/country row would presumably be skipped --
# confirm whether that is acceptable.
dim_store_extract_sql = text("""
    SELECT 
        s.store_id, 
        a.address, 
        a.address2, 
        a.district, 
        c.city, 
        co.country, 
        a.postal_code, 
        a.phone, 
        s.last_update as src_last_update
    FROM sakila.store s
    JOIN sakila.address a ON s.address_id = a.address_id
    JOIN sakila.city c ON a.city_id = c.city_id
    JOIN sakila.country co ON c.country_id = co.country_id
    WHERE s.last_update > :watermark
""")

# Load: insert the extracted row; if the insert collides with an existing
# key, overwrite the descriptive columns and the source timestamp with the
# incoming values instead.
dim_store_load_sql = text("""
    INSERT INTO dim_store (
        store_id, 
        address, 
        address2, 
        district, 
        city, 
        country, 
        postal_code, 
        phone, 
        src_last_update
    ) VALUES (
        :store_id, 
        :address, 
        :address2, 
        :district, 
        :city, 
        :country, 
        :postal_code, 
        :phone, 
        :src_last_update
    )
    ON DUPLICATE KEY UPDATE
        address         = VALUES(address),
        address2        = VALUES(address2),      -- Added
        district        = VALUES(district),      -- Added
        city            = VALUES(city),
        country         = VALUES(country),       -- Added
        postal_code     = VALUES(postal_code),   -- Added
        phone           = VALUES(phone),
        src_last_update = VALUES(src_last_update);
""")

# Bundle of target table name plus its extract and load statements.
dim_store_config = dict(
    table_name="dim_store",
    extract_sql=dim_store_extract_sql,
    load_sql=dim_store_load_sql,
)

#### dim_customer

In [None]:
# ========= DIM_CUSTOMER ========= #
#
# Extract / upsert definition for the `dim_customer` dimension table.


# Extract: one flattened row per customer with address, city and country
# joined in. The source `active` column is selected twice, once aliased to
# `activebool` and once under its own name, so both bind parameters of the
# load statement can be filled. Only customers whose own `last_update` is
# newer than the bound `:watermark` parameter are returned.
# NOTE(review): the filter inspects `customer.last_update` only; an edit
# that touches just the address/city/country row would presumably be
# skipped -- confirm whether that is acceptable.
dim_customer_extract_sql = text("""
    SELECT 
        c.customer_id, 
        c.store_id, 
        c.first_name, 
        c.last_name, 
        c.email, 
        c.active as activebool,  -- Map source 'active' to target 'activebool'
        c.active,                -- Also keep 'active' for the second column
        c.create_date, 
        a.address, 
        a.address2,              
        a.district,              
        ci.city, 
        co.country, 
        a.postal_code,          
        a.phone,                 
        c.last_update as src_last_update
    FROM sakila.customer c
    JOIN sakila.address a ON c.address_id = a.address_id
    JOIN sakila.city ci ON a.city_id = ci.city_id
    JOIN sakila.country co ON ci.country_id = co.country_id
    WHERE c.last_update > :watermark;
""")

# Load: insert the extracted row; on a key collision overwrite every column
# listed in the UPDATE clause. `create_date` is inserted but not part of the
# UPDATE clause, so an existing row keeps its original value.
dim_customer_load_sql = text("""
    INSERT INTO dim_customer (
        customer_id, 
        store_id, 
        first_name, 
        last_name, 
        email, 
        activebool, 
        active, 
        create_date, 
        address, 
        address2, 
        district, 
        city, 
        country, 
        postal_code, 
        phone, 
        src_last_update
    ) VALUES (
        :customer_id, 
        :store_id, 
        :first_name, 
        :last_name, 
        :email, 
        :activebool, 
        :active, 
        :create_date, 
        :address, 
        :address2, 
        :district, 
        :city, 
        :country, 
        :postal_code, 
        :phone, 
        :src_last_update
    )
    ON DUPLICATE KEY UPDATE
        store_id        = VALUES(store_id),
        first_name      = VALUES(first_name),
        last_name       = VALUES(last_name),
        email           = VALUES(email),
        activebool      = VALUES(activebool),
        active          = VALUES(active),
        address         = VALUES(address),
        address2        = VALUES(address2),
        district        = VALUES(district),
        city            = VALUES(city),
        country         = VALUES(country),
        postal_code     = VALUES(postal_code),
        phone           = VALUES(phone),
        src_last_update = VALUES(src_last_update);
""")

# Bundle of target table name plus its extract and load statements.
dim_customer_config = dict(
    table_name="dim_customer",
    extract_sql=dim_customer_extract_sql,
    load_sql=dim_customer_load_sql,
)

#### dim_staff

In [None]:
# ========= DIM_STAFF ========= #
#
# Extract / upsert definition for the `dim_staff` dimension table.


# Extract: one flattened row per staff member with address, city and country
# joined in. Only staff rows whose own `last_update` is newer than the bound
# `:watermark` parameter are returned.
# NOTE(review): `st.password` and `st.picture` are copied into the warehouse
# as-is; confirm that credentials are really wanted in the analytics database.
dim_staff_extract_sql = text("""
    SELECT 
        st.staff_id,
        st.first_name,
        st.last_name,
        st.email,
        st.active,
        st.username,
        st.password,
        st.picture,
        a.address,
        a.address2,
        a.district,
        ci.city,
        co.country,
        a.postal_code,
        a.phone,
        st.last_update as src_last_update
    FROM sakila.staff AS st
    JOIN sakila.address AS a ON st.address_id = a.address_id
    JOIN sakila.city AS ci ON a.city_id = ci.city_id
    JOIN sakila.country AS co ON ci.country_id = co.country_id
    WHERE st.last_update > :watermark
""")

# Load: insert the extracted row; on a key collision overwrite the existing
# row's attributes. `src_last_update` is refreshed as well, matching the
# other dimension loaders -- without it an updated staff row would keep the
# timestamp of its first load.
dim_staff_load_sql = text("""
    INSERT INTO dim_staff (
        staff_id, 
        first_name,
        last_name,
        email,
        active,
        username,
        password,
        picture,
        address, 
        address2, 
        district, 
        city, 
        country, 
        postal_code, 
        phone, 
        src_last_update
    ) VALUES (
        :staff_id,
        :first_name,
        :last_name,
        :email,
        :active,
        :username,
        :password,
        :picture,
        :address, 
        :address2, 
        :district, 
        :city, 
        :country, 
        :postal_code, 
        :phone, 
        :src_last_update
    )
    ON DUPLICATE KEY UPDATE
        first_name      = VALUES(first_name),
        last_name       = VALUES(last_name),      
        email           = VALUES(email),      
        active          = VALUES(active),      
        username        = VALUES(username),      
        password        = VALUES(password),      
        picture         = VALUES(picture),      
        address         = VALUES(address),      
        address2        = VALUES(address2),      
        district        = VALUES(district),      
        city            = VALUES(city),      
        country         = VALUES(country),      
        postal_code     = VALUES(postal_code),      
        phone           = VALUES(phone),
        src_last_update = VALUES(src_last_update)
""")

# Bundle of target table name plus its extract and load statements.
dim_staff_config = {
    "table_name": "dim_staff",
    "extract_sql": dim_staff_extract_sql,
    "load_sql": dim_staff_load_sql,
}

#### dim_film

In [None]:
# ========= DIM_FILM ========= #
#
# Extract / upsert definition for the `dim_film` dimension table.


# Extract: one row per film with the language name joined in. Only films
# whose own `last_update` is newer than the bound `:watermark` parameter
# are returned.
dim_film_extract_sql = text("""
    SELECT 
        f.film_id,
        f.title,
        f.description,
        f.release_year,
        l.name AS language_name,
        f.rental_duration,
        f.rental_rate,
        f.length,
        f.replacement_cost,
        f.rating,
        f.last_update as src_last_update
    FROM sakila.film AS f
    JOIN sakila.language AS l ON l.language_id = f.language_id
    WHERE f.last_update > :watermark
""")

# Load: insert the extracted row; on a key collision overwrite the existing
# row's attributes and source timestamp. `film_id` is deliberately absent
# from the UPDATE clause: re-assigning the identifying column to itself is
# redundant, and the other dimension loaders do not do it either.
dim_film_load_sql = text("""
    INSERT INTO dim_film (
        film_id, 
        title,
        description,
        release_year,
        language_name,
        rental_duration,
        rental_rate,
        length,
        replacement_cost, 
        rating, 
        src_last_update
    ) VALUES (
        :film_id,
        :title,
        :description,
        :release_year,
        :language_name,
        :rental_duration,
        :rental_rate,
        :length,
        :replacement_cost,
        :rating,
        :src_last_update
    )
    ON DUPLICATE KEY UPDATE
        title            = VALUES(title),       
        description      = VALUES(description),       
        release_year     = VALUES(release_year),       
        language_name    = VALUES(language_name),       
        rental_duration  = VALUES(rental_duration),       
        rental_rate      = VALUES(rental_rate),       
        length           = VALUES(length),       
        replacement_cost = VALUES(replacement_cost),       
        rating           = VALUES(rating),       
        src_last_update  = VALUES(src_last_update)       
""")

# Bundle of target table name plus its extract and load statements.
dim_film_config = {
    "table_name": "dim_film",
    "extract_sql": dim_film_extract_sql,
    "load_sql": dim_film_load_sql,
}

### Merge Configs

The above configs need to be merged for the "upsert_data" function to be able to run everything at once.

_IMPORTANT! Order matters: parent tables must be loaded first, followed by the child tables that depend on them._

In [None]:
# === MERGE OF CONFIGS === #

# Configs in load order: tables without dependencies first, dependants after.

# Level 1: no dependencies.
upsert_list = [dim_store_config, dim_film_config]

# Level 2: customer depends on store; staff is independent of customer but
# usually depends on address.
upsert_list += [dim_customer_config, dim_staff_config]

<br>

## Run Incremental Load for all data

In [None]:
# ===== PERFORM INCREMENTAL LOAD ===== #

# __DEBUGGING ONLY - DELETES DATABASE DATA__
# NOTE(review): the two reset calls below are live, not commented out, so
# they run on every execution of this cell. What `_initialise_etl_state`
# and `force=False` actually do is defined in `etl_functions` -- confirm
# this is intended before running against real data.

# Tables to clear, listed from the lower levels up.
tables_to_clear = [
    "dim_staff",
    "dim_customer",
    "dim_film",
    "dim_store",
]

_initialise_etl_state()
_clear_table_data(tables_to_clear, tgt_engine, force=False)


# Run every configured extract/upsert pair in the order given by upsert_list.
print("\n\n\n====<< STARTING UPSERTING OF DATA >>====")
upsert_data(upsert_list, src_engine, tgt_engine)

