In [0]:
# Location of the Delta table that the later cells read from and write to.
delta_path = "/Volumes/workspace/ecommerce/ecommerce_data/events_delta"

# Notebook banner.
print("=== DAY 5: DELTA LAKE ADVANCED ===\n")

=== DAY 5: DELTA LAKE ADVANCED ===



## CHECKING EXISTING DATA BEFORE MERGE

In [0]:
print("=== CHECKING EXISTING DATA BEFORE MERGE ===")

# 1. Rank users by the number of events they have in the current Delta table
#    and keep the ten most active ones.
print("\n1. Top users in current Delta table:")
top_users_query = f"""
    SELECT user_id, COUNT(*) as event_count
    FROM delta.`{delta_path}`
    GROUP BY user_id
    ORDER BY event_count DESC
    LIMIT 10
"""
existing_users = spark.sql(top_users_query)
display(existing_users)

=== CHECKING EXISTING DATA BEFORE MERGE ===

1. Top users in current Delta table:


user_id,event_count
512475445,7436
512365995,4013
526731152,2912
512505687,2894
513021392,2862
546159478,2433
546270188,2426
514649263,2390
516308435,2316
512401084,2232


In [0]:
# Inspect the most recent events of one user before the MERGE modifies them.
# The user id is defined once so that the heading, the query and the summary
# line cannot drift apart (the heading previously named a different user than
# the one actually queried).
target_user_id = 512475445
recent_limit = 5

print(f"\n2. Checking user {target_user_id}:")
user_512475445 = spark.sql(f"""
    SELECT user_id, product_id, event_time, event_type, price
    FROM delta.`{delta_path}`
    WHERE user_id = {target_user_id}
    ORDER BY event_time DESC
    LIMIT {recent_limit}
""")
# The query is capped by LIMIT, so this count is the number of rows shown,
# not the user's total number of events.
print(f"Showing {user_512475445.count()} most recent events for user {target_user_id}")
display(user_512475445)


2. Checking user 514148024:
Found 5 events for  user_512475445


user_id,product_id,event_time,event_type,price
512475445,4700590,2019-10-31T18:53:19.000Z,view,140.28
512475445,4700478,2019-10-31T18:53:13.000Z,view,141.86
512475445,5701197,2019-10-31T18:53:08.000Z,view,78.38
512475445,5700793,2019-10-31T18:53:03.000Z,view,182.55
512475445,5701127,2019-10-31T18:52:57.000Z,view,215.86


## 1: MERGE for incremental updates

In [0]:
# Inspect the Delta table's structure before merging into it.
print("Checking Delta table schema:")
delta_reader = spark.read.format("delta")
delta_df = delta_reader.load(delta_path)
delta_df.printSchema()

# Column names only, as a plain Python list
print("\nColumns in your Delta table:")
column_names = delta_df.columns
print(column_names)

# Render a handful of rows to see real values
print("\nSample data from Delta table:")
sample_rows = delta_df.limit(3)
display(sample_rows)

Checking Delta table schema:
root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)


Columns in your Delta table:
['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id', 'user_session']

Sample data from Delta table:


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-03T12:26:50.000Z,view,17900196,2053013560178508465,construction.tools.generator,firman,1377.25,514465215,a91dceb0-3439-4774-ac99-6bb441a392f5
2019-10-03T12:26:50.000Z,view,1002524,2053013555631882655,electronics.smartphone,apple,513.4,503347063,971c9398-ff5e-4cbb-8f20-7c007c696875
2019-10-03T12:26:50.000Z,view,1004856,2053013555631882655,electronics.smartphone,samsung,132.17,523588323,54bf6c25-54db-49d2-bfa1-e3a0124e9354


## Merge

In [0]:
print("--- TASK 1: MERGE for incremental updates ---")

from datetime import datetime

from delta.tables import DeltaTable
from pyspark.sql.types import StructType

target_user_id = 512475445

# Event times of the two synthetic rows inserted below. They are datetime
# objects (not strings) so that every event_time value in the update batch has
# the same Python type and matches the table's timestamp column.
# NOTE(review): naive datetimes are interpreted in the driver's local time
# zone; this matches the previous string literals when driver and Spark
# session both run in UTC - confirm for your workspace.
new_event_time_same_user = datetime(2026, 1, 13, 10, 0, 0)
new_event_time_new_user = datetime(2026, 1, 13, 10, 5, 0)

deltaTable = DeltaTable.forPath(spark, delta_path)

# Pick the existing row that will be updated: the user's latest event that
# predates the synthetic rows. Without the event_time filter, a second run of
# this cell would pick the synthetic row inserted by the first run, Row 1 and
# Row 2 would then share the same (user_id, event_time) key, and the MERGE
# would be rejected because several source rows match one target row.
user_check_df = spark.sql(f"""
    SELECT *
    FROM delta.`{delta_path}`
    WHERE user_id = {target_user_id}
      AND event_time < TIMESTAMP '{new_event_time_same_user:%Y-%m-%d %H:%M:%S}'
    ORDER BY event_time DESC
    LIMIT 1
""")

print("Full row from existing data:")
display(user_check_df)

user_check = user_check_df.first()
if user_check is None:
    raise ValueError(
        f"No existing event found for user {target_user_id}; nothing to update"
    )

# Column order of the tuples below.
update_columns = ["event_time", "event_type", "product_id", "category_id",
                  "category_code", "brand", "price", "user_id", "user_session"]

# Create updates with ALL 9 columns from the table schema
updates_data = [
    # Row 1: Update existing (same user_id + event_time)
    (
        user_check['event_time'],       # event_time (same)
        "purchase",                     # event_type (changed from 'view' to 'purchase')
        user_check['product_id'],       # product_id (same)
        user_check['category_id'],      # category_id (same)
        user_check['category_code'],    # category_code (same)
        user_check['brand'],            # brand (same)
        200.00,                         # price (changed to 200.00)
        user_check['user_id'],          # user_id (same)
        user_check['user_session']      # user_session (same)
    ),

    # Row 2: Insert new (same user, new time)
    (
        new_event_time_same_user,       # new event_time
        "view",                         # event_type
        9999999,                        # product_id
        12345,                          # category_id (new)
        "electronics.phone",            # category_code
        "Samsung",                      # brand
        899.99,                         # price
        target_user_id,                 # user_id (same user)
        "session_999"                   # user_session (new)
    ),

    # Row 3: Insert new (new user)
    (
        new_event_time_new_user,        # event_time
        "purchase",                     # event_type
        8888888,                        # product_id
        67890,                          # category_id
        "electronics.tablet",           # category_code
        "Apple",                        # brand
        1299.99,                        # price
        999888777,                      # user_id (new user)
        "session_888"                   # user_session
    )
]

# Build the update batch with the target table's own field types instead of
# letting Spark infer them from the Python values. Inference would see mixed
# datetime/str values for event_time and would type the ids as long although
# the table stores integers. Fields are looked up by name, so this does not
# depend on the physical column order of the table.
target_schema = deltaTable.toDF().schema
update_schema = StructType([target_schema[name] for name in update_columns])
updates = spark.createDataFrame(updates_data, schema=update_schema)

print("\nData to merge:")
print("1. Row 1: Will UPDATE (same user_id + event_time)")
print("2. Row 2: Will INSERT (same user, new time)")
print("3. Row 3: Will INSERT (new user)")
display(updates)

# Execute MERGE: rows whose (user_id, event_time) already exist are
# overwritten, all other rows are inserted.
# NOTE(review): (user_id, event_time) is assumed to identify a single event;
# if a user can have several events in the same second, all of them would be
# overwritten by the matching source row - confirm against the data.
(
    deltaTable.alias("t")
    .merge(
        updates.alias("s"),
        "t.user_id = s.user_id AND t.event_time = s.event_time",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

print("✓ MERGE completed successfully!")

# Verify
print(f"\nAfter MERGE - User {target_user_id} events:")
result = spark.sql(f"""
    SELECT user_id, event_time, event_type, product_id, price, user_session
    FROM delta.`{delta_path}`
    WHERE user_id = {target_user_id}
    ORDER BY event_time DESC
    LIMIT 3
""")
display(result)

--- TASK 1: MERGE for incremental updates ---
Full row from existing data:


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-31T18:53:19.000Z,view,4700590,2053013560899928785,auto.accessories.videoregister,sho-me,140.28,512475445,df4794db-b6db-4ac3-89d6-fd50265257a4



Data to merge:
1. Row 1: Will UPDATE (same user_id + event_time)
2. Row 2: Will INSERT (same user, new time)
3. Row 3: Will INSERT (new user)


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-31 18:53:19,purchase,4700590,2053013560899928785,auto.accessories.videoregister,sho-me,200.0,512475445,df4794db-b6db-4ac3-89d6-fd50265257a4
2026-01-13 10:00:00,view,9999999,12345,electronics.phone,Samsung,899.99,512475445,session_999
2026-01-13 10:05:00,purchase,8888888,67890,electronics.tablet,Apple,1299.99,999888777,session_888


✓ MERGE completed successfully!

After MERGE - User 512475445 events:


user_id,event_time,event_type,product_id,price,user_session
512475445,2026-01-13T10:00:00.000Z,view,9999999,899.99,session_999
512475445,2019-10-31T18:53:19.000Z,purchase,4700590,200.0,df4794db-b6db-4ac3-89d6-fd50265257a4
512475445,2019-10-31T18:53:13.000Z,view,4700478,141.86,c5da95a8-20be-40fe-af2f-189e4047b0f5


## 2: Query historical versions

In [0]:

# 2: Query historical versions

print("\n--- TASK 2: Time travel ---")

# Commit log of the table, reduced to the columns of interest
history = spark.sql(f"DESCRIBE HISTORY delta.`{delta_path}`")
print("Table history:")
history_summary = history.select("version", "timestamp", "operation")
display(history_summary)

# Read two snapshots of the table by version number and report their sizes
versioned_reader = spark.read.format("delta")

v0 = versioned_reader.option("versionAsOf", 0).load(delta_path)
v0_rows = v0.count()
print(f"\n✓ Version 0 rows: {v0_rows:,}")

v1 = versioned_reader.option("versionAsOf", 1).load(delta_path)
v1_rows = v1.count()
print(f"✓ Version 1 rows: {v1_rows:,}")






--- TASK 2: Time travel ---
Table history:


version,timestamp,operation
2,2026-01-13T12:11:41.000Z,OPTIMIZE
1,2026-01-13T12:09:48.000Z,MERGE
0,2026-01-12T12:57:52.000Z,WRITE



✓ Version 0 rows: 42,448,764
✓ Version 1 rows: 42,448,766


## 3: Optimize

In [0]:

# TASK 3: Optimize

print("\n--- TASK 3: OPTIMIZE ---")
# Run OPTIMIZE on the table and Z-order it by event_type and user_id.
optimize_statement = f"OPTIMIZE delta.`{delta_path}` ZORDER BY (event_type, user_id)"
spark.sql(optimize_statement)
print("✓ OPTIMIZE completed")


--- TASK 3: OPTIMIZE ---
✓ OPTIMIZE completed


## 4: VACUUM

In [0]:

# 4: VACUUM

print("\n--- TASK 4: VACUUM ---")
# Run VACUUM with a retention window of 168 hours (7 days).
vacuum_statement = f"VACUUM delta.`{delta_path}` RETAIN 168 HOURS"
spark.sql(vacuum_statement)
print("✓ VACUUM completed")

print("\n✅ All 4 Day 5 tasks completed!")


--- TASK 4: VACUUM ---
✓ VACUUM completed

✅ All 4 Day 5 tasks completed!
