### Analyze the query plan

In [0]:
# Inspect how Spark plans the purchase-only filter over the events table.
# mode="extended" prints the logical plans as well as the physical plan.
purchase_query = """
SELECT *
FROM workspace.ecommerce.ecommerce_events_delta
WHERE event_type = 'purchase'
"""
purchases_df = spark.sql(purchase_query)
purchases_df.explain(mode="extended")

### Collect table statistics

In [0]:
%sql
-- Refresh optimizer statistics for the events table.
-- No FOR COLUMNS / FOR ALL COLUMNS clause is given, so only table-level
-- statistics are requested here.
ANALYZE TABLE workspace.ecommerce.ecommerce_events_delta COMPUTE STATISTICS;


### Partitioned Delta table

In [0]:
%sql
-- Build a copy of the events table partitioned by calendar day.
-- event_date is derived from event_time and used as the partition column.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TABLE fails with
-- "table already exists" the second time the notebook is executed.
CREATE OR REPLACE TABLE workspace.ecommerce.ecommerce_events_delta_part
USING DELTA
PARTITIONED BY (event_date)
AS
SELECT *, DATE(event_time) AS event_date
FROM workspace.ecommerce.ecommerce_events_delta;

In [0]:
%sql
-- List the partitions of the partitioned copy of the events table
-- (partition column: event_date).
SHOW PARTITIONS
  workspace.ecommerce.ecommerce_events_delta_part;


### Optimize file layout + apply ZORDER

In [0]:
%sql
-- Compact the (unpartitioned) events table and co-locate rows by
-- user_id and product_id so filters on those columns can skip files.
OPTIMIZE workspace.ecommerce.ecommerce_events_delta
  ZORDER BY (user_id, product_id);

### Benchmark performance properly (before vs after)

In [0]:
import time

# Time a selective point lookup on user_id and report it as the baseline.
#
# NOTE(review): in this notebook's cell order the OPTIMIZE ... ZORDER cell
# appears before this one. For a true "before" number, run this cell before
# the OPTIMIZE cell (or against an un-optimized copy of the table).
#
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
spark.sql("""
SELECT COUNT(*)
FROM workspace.ecommerce.ecommerce_events_delta
WHERE user_id = 12345
""").collect()  # collect() forces execution so the query is actually timed
elapsed = time.perf_counter() - start

print(f"Baseline time: {elapsed:.2f}s")

In [0]:
# Re-run the same point lookup and report it as the optimized timing.
# Relies on `time` having been imported by an earlier cell.
#
# NOTE(review): this is the same query as the baseline cell, so a faster
# result here may partly reflect warm caches rather than the file layout;
# confirm by repeating each measurement several times.
#
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
spark.sql("""
SELECT COUNT(*)
FROM workspace.ecommerce.ecommerce_events_delta
WHERE user_id = 12345
""").collect()  # collect() forces execution so the query is actually timed
elapsed = time.perf_counter() - start

print(f"Optimized time: {elapsed:.2f}s")


### Cache

In [0]:
# Mark the events table for caching; cache() is lazy, so nothing is stored yet.
_events_df = spark.table("workspace.ecommerce.ecommerce_events_delta")
events_cached = _events_df.cache()

# Run an action so the cache is actually populated (the row count is the
# cell's output).
events_cached.count()