# Day 05 of 14 Days Databricks Challenge


In [0]:
%python
from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(
    spark,
    "/Volumes/workspace/ecommerce/ecommerce_data/events_delta"
)

updates = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")


In [0]:
%python
# Preview the first 10 rows of the CSV-backed `updates` DataFrame.
updates.show(10)

In [0]:
%python
deltaTable.alias("t").merge(
    updates.alias("s"),
    "t.user_session = s.user_session AND t.event_time = s.event_time"
).whenMatchedUpdateAll() \
.whenNotMatchedInsertAll() \
.execute()



In [0]:
%python
v0 = spark.read.format("delta") \
    .option("versionAsOf", 0) \
    .load("/Volumes/workspace/ecommerce/ecommerce_data/events_delta")


In [0]:
%python
# Row count of the version-0 snapshot held in `v0`; the bare expression is the cell result.
v0.count()

In [0]:
%python
v1 = spark.read.format("delta") \
    .option("versionAsOf", 1) \
    .load("/Volumes/workspace/ecommerce/ecommerce_data/events_delta")

v1.count()


In [0]:
%python
v2 = spark.read.format("delta") \
    .option("versionAsOf", 2) \
    .load("/Volumes/workspace/ecommerce/ecommerce_data/events_delta")
v2.count()

In [0]:
%python
yesterday = spark.read.format("delta") \
    .option("timestampAsOf", "2026-01-13") \
    .load("/Volumes/workspace/ecommerce/ecommerce_data/events_delta")


In [0]:
%python
# Row count of the timestamp-based snapshot held in `yesterday`; the bare expression is the cell result.
yesterday.count()

In [0]:
%python
spark.sql("""
OPTIMIZE events_delta
ZORDER BY (event_type, user_id)
""")

In [0]:
%python
print("ðŸ”¹ Checking number of data files after OPTIMIZE...")
spark.sql("""
DESCRIBE DETAIL events_delta
""").show(truncate=False)


In [0]:
%python
from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(
    spark,
    "/Volumes/workspace/ecommerce/ecommerce_data/events_delta"
)

deltaTable.history().select(
    "version",
    "operation",
    "operationMetrics"
).show(truncate=False)


In [0]:
%sql
-- Table-level metadata for the path-based Delta table used by the Python cells of
-- this notebook. Addressing it by path avoids depending on how the unqualified name
-- `events_delta` resolves in the current catalog and schema.
DESCRIBE DETAIL delta.`/Volumes/workspace/ecommerce/ecommerce_data/events_delta`

In [0]:
# Run OPTIMIZE with Z-ordering on (event_type, user_id), with status messages.
# The table is addressed by path so the command acts on the same Delta table whose
# history is inspected via DeltaTable.forPath elsewhere in this notebook. The
# unqualified name `events_delta` is resolved against the current catalog and schema.
# NOTE(review): confirm that no separately registered `events_delta` table was intended.
# The emoji in the status messages were mis-encoded (UTF-8 read as cp1252) and are restored.
print("🔹 Optimizing Delta table with ZORDER...")
spark.sql("""
OPTIMIZE delta.`/Volumes/workspace/ecommerce/ecommerce_data/events_delta`
ZORDER BY (event_type, user_id)
""")
print("✅ OPTIMIZE + ZORDER completed")


In [0]:
%sql
-- Table-level metadata for the path-based Delta table used by the Python cells of
-- this notebook. Addressing it by path avoids depending on how the unqualified name
-- `events_delta` resolves in the current catalog and schema.
DESCRIBE DETAIL delta.`/Volumes/workspace/ecommerce/ecommerce_data/events_delta`

In [0]:
from delta.tables import DeltaTable

# Re-bind the handle to the path-based Delta table.
deltaTable = DeltaTable.forPath(spark, "/Volumes/workspace/ecommerce/ecommerce_data/events_delta")

# Print the commit log restricted to the columns that show which operation
# ran at each version and its metrics.
history_columns = ["version", "operation", "operationMetrics"]
deltaTable.history().select(*history_columns).show(truncate=False)

### OPTIMIZE did not rewrite any files because the existing files were already larger than the small-file threshold

Very small files (1–10 MB) → strong candidate

Medium files (~30–50 MB) → usually skipped

Large files (100+ MB) → not touched


In [0]:
# Run VACUUM with a 168-hour (7-day) retention window.
# The table is addressed by path so the cleanup acts on the same Delta table that
# `deltaTable` (created with DeltaTable.forPath) refers to. The unqualified name
# `events_delta` is resolved against the current catalog and schema.
# NOTE(review): confirm that no separately registered `events_delta` table was intended.
# The emoji in the status messages were mis-encoded (UTF-8 read as cp1252) and are restored.
print("🔹 Running VACUUM to clean old files...")
spark.sql("""
VACUUM delta.`/Volumes/workspace/ecommerce/ecommerce_data/events_delta` RETAIN 168 HOURS
""")

print("🧹 VACUUM completed successfully")


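### To force cleanup immediately:

Caution: a retention of 0 hours permanently deletes the data files that older table versions rely on, so time travel to those versions stops working. Use it only when the table's history is disposable.

```sql
SET spark.databricks.delta.retentionDurationCheck.enabled = false;
VACUUM events_delta RETAIN 0 HOURS;
```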

In [0]:
# Print the full commit log of the path-based table after the VACUUM cell.
print("Delta history after VACUUM:")
history_after_vacuum = deltaTable.history()
history_after_vacuum.show(truncate=False)
