In [0]:
spark.sql("""
SELECT *
FROM workspace.default.events_delta
WHERE event_type = 'purchase'
""").explain(mode="extended")

== Parsed Logical Plan ==
'Project [*]
+- 'Filter ('event_type = purchase)
   +- 'UnresolvedRelation [workspace, default, events_delta], [], false

== Analyzed Logical Plan ==
event_time: timestamp, event_type: string, product_id: bigint, category_id: bigint, category_code: string, brand: string, price: double, user_id: bigint, user_session: string
Project [event_time#13194, event_type#13195, product_id#13196L, category_id#13197L, category_code#13198, brand#13199, price#13200, user_id#13201L, user_session#13202]
+- Filter (event_type#13195 = purchase)
   +- SubqueryAlias workspace.default.events_delta
      +- Relation workspace.default.events_delta[event_time#13194,event_type#13195,product_id#13196L,category_id#13197L,category_code#13198,brand#13199,price#13200,user_id#13201L,user_session#13202] parquet

== Optimized Logical Plan ==
Filter (isnotnull(event_type#13195) AND (event_type#13195 = purchase))
+- Relation workspace.default.events_delta[event_time#13194,event_type#13195,produc

In [0]:
%sql
ANALYZE TABLE workspace.default.events_delta
COMPUTE STATISTICS;

In [0]:
%sql
CREATE TABLE IF NOT EXISTS workspace.default.events_delta
USING DELTA
PARTITIONED BY (event_date)
AS
SELECT *, DATE(event_time) AS event_date
FROM workspace.default.events_delta;

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- SHOW PARTITIONS workspace.default.events_delta;
-- The table is not partitioned, so this statement is invalid and has been commented out.

In [0]:
import time

start = time.time()
spark.sql("""
SELECT COUNT(*)
FROM workspace.default.events_delta
WHERE user_id = 12345
""").collect()

print(f"Baseline time: {time.time() - start:.2f}s")

Baseline time: 1.06s


In [0]:
start = time.time()
spark.sql("""
SELECT COUNT(*)
FROM workspace.default.events_delta
WHERE user_id = 12345
""").collect()

print(f"Optimized time: {time.time() - start:.2f}s")

Optimized time: 0.73s


In [0]:
events_cached = spark.table("workspace.default.events_delta")

# Materialize
events_cached.count()

42448764