# Import data from Sakila

This notebook checks the `sakila` database for new or modified data that the `sakila_star` database needs, and updates `sakila_star` to reflect those changes.



In [None]:
import os
import time
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, text

# Connection settings for the MySQL server. The defaults are the values that
# were previously hard-coded here (docker-compose development stack); set the
# SAKILA_DB_* environment variables to override them so real credentials never
# have to be written into the notebook.
def _mysql_url(database):
    """Return the SQLAlchemy URL (pymysql driver) for `database`.

    Args:
        database (str): Schema name placed at the end of the URL.

    Returns:
        str: A URL of the form mysql+pymysql://user:password@host:port/database.
        User and password are percent-encoded so characters such as '@' or '/'
        cannot corrupt the URL.
    """
    user = quote_plus(os.environ.get("SAKILA_DB_USER", "app"))
    password = quote_plus(os.environ.get("SAKILA_DB_PASSWORD", "app_password"))
    host = os.environ.get("SAKILA_DB_HOST", "db")
    port = os.environ.get("SAKILA_DB_PORT", "3306")
    return f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"


# Source: Transactional Database
src_engine = create_engine(_mysql_url("sakila"))

# Target: Data Warehouse
tgt_engine = create_engine(_mysql_url("sakila_star"))




## Check schemas:


In [None]:
# List the tables of the source database (src_engine is bound to `sakila`).
pd.read_sql("SHOW FULL TABLES;", src_engine)

In [None]:
# List the tables of the target database (tgt_engine is bound to `sakila_star`).
pd.read_sql("SHOW FULL TABLES;", tgt_engine)

## Watermark functions

In [119]:
def get_watermark(pipeline_name, conn):
    """Look up the last successful run timestamp recorded for a pipeline.

    Args:
        pipeline_name (str): Value matched against etl_state.pipeline_name.
        conn: Open database connection used to run the lookup.

    Returns:
        The last_success_ts value of the matching etl_state row, or None when
        no row is found. No default timestamp is substituted here; callers
        choose their own fallback for the None case.
    """
    lookup = text(
        "SELECT last_success_ts FROM etl_state WHERE pipeline_name = :p_name"
    )
    row = conn.execute(lookup, {"p_name": pipeline_name}).fetchone()

    # Pipeline has no recorded run yet.
    if not row:
        return None
    return row[0]

# Smoke test: read and print the stored watermark for the fact_rental pipeline
# (prints None when etl_state has no row for it).
with tgt_engine.connect() as conn:
    watermark_value = get_watermark("fact_rental", conn)
    print(watermark_value)



def update_watermark(pipeline_names, new_ts, conn):
    """
    Insert or update the watermark for one or more pipelines.

    For each name a row (pipeline_name, last_success_ts) is inserted into
    etl_state; if the insert hits a duplicate key, the existing row's
    last_success_ts is updated instead. Nothing is committed here, so the
    caller controls the transaction on `conn`.

    Args:
        pipeline_names (list | str): Pipeline names (e.g. ['dim_customer']).
            A single name may also be passed as a plain string.
        new_ts (datetime/str): The timestamp to set (e.g. '2025-01-01 12:00:00')
        conn: The active database connection
    """
    # A bare string iterates character by character; without this guard
    # "dim_film" would be written as eight one-letter pipelines.
    if isinstance(pipeline_names, str):
        pipeline_names = [pipeline_names]

    query = text("""
        INSERT INTO etl_state (pipeline_name, last_success_ts)
        VALUES (:p_name, :ts)
        ON DUPLICATE KEY UPDATE last_success_ts = VALUES(last_success_ts)
    """)

    for name in pipeline_names:
        conn.execute(query, {"p_name": name, "ts": new_ts})
        print(f"Updated watermark for: {name}")
    


def _initialise_etl_state(ts='1970-01-01 00:00:00'):
    """Set the watermark of the six pipelines listed below to `ts` and commit.

    Opens its own connection on tgt_engine, writes the watermarks through
    update_watermark, then commits on that connection.

    Args:
        ts (datetime/str): Timestamp to store; defaults to '1970-01-01 00:00:00'.
    """
    all_pipelines = [
        "fact_rental",
        "dim_film",
        "dim_customer",
        "dim_staff",
        "dim_actor",
        "bridge_actor",
    ]
    with tgt_engine.connect() as conn:
        update_watermark(all_pipelines, ts, conn)
        conn.commit()


1970-01-01 00:00:00


In [118]:
# Preview the incremental extract for the customer dimension: customers whose
# source row changed after the stored watermark.
with tgt_engine.connect() as conn:
    # Use the customer pipeline's own watermark; fact_rental is tracked as a
    # separate row in etl_state and describes a different load.
    watermark_value = get_watermark("dim_customer", conn)
    print(f"\n\nWatermark Value: {watermark_value}\n\n")

# No recorded run yet: fall back to the epoch so every customer qualifies.
if not watermark_value:
    watermark_value = '1970-01-01 00:00:00'

# NOTE(review): only customer.last_update is compared, so a change that touches
# just address / city / country would not be picked up — confirm this is intended.
# ORDER BY keeps the two-row preview deterministic; LIMIT alone returns
# rows in an unspecified order.
query = text(
"""
SELECT 
    c.customer_id, 
    c.store_id, 
    c.first_name, 
    c.last_name, 
    c.email, 
    c.active,
    c.create_date,
    a.address, 
    ci.city, 
    co.country, 
    c.last_update as src_last_update
FROM sakila.customer c
JOIN sakila.address a ON c.address_id = a.address_id
JOIN sakila.city ci ON a.city_id = ci.city_id
JOIN sakila.country co ON ci.country_id = co.country_id
WHERE c.last_update > :watermark
ORDER BY c.customer_id
LIMIT 2;
"""
)
pd.read_sql(
    query,
    src_engine,
    params={"watermark": watermark_value}
)





Watermark Value: 1970-01-01 00:00:00




Unnamed: 0,customer_id,store_id,first_name,last_name,email,active,create_date,address,city,country,src_last_update
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,1,2006-02-14 22:04:36,1913 Hanoi Way,Sasebo,Japan,2006-02-15 04:57:20
1,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,1,2006-02-14 22:04:36,1121 Loja Avenue,San Bernardino,United States,2006-02-15 04:57:20
