In [0]:
%sql
-- Create the `events` Delta table, liquid-clustered on (eventType, eventId).
-- IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE fails on a
-- second run of the notebook because the table already exists.
CREATE TABLE IF NOT EXISTS events (
  eventId BIGINT,
  eventType STRING,
  eventTime TIMESTAMP
)
USING DELTA
CLUSTER BY (eventType, eventId);


In [0]:
%sql
-- Change the clustering columns of `events`: from here on the table is
-- clustered by eventType only.
ALTER TABLE events
  CLUSTER BY (eventType);


In [0]:
# Stream rows from `source_table` into the liquid-clustered `events` table.
#
# A streaming write to a table needs a checkpoint location to record its
# progress. Without one (and with no session-wide default configured in
# spark.sql.streaming.checkpointLocation) the query cannot start.
# NOTE(review): example path only — point this at durable storage that is
# appropriate for your workspace before running.
events_checkpoint_path = "/tmp/delta/events/_checkpoints"

(spark.readStream.table("source_table")
  .writeStream
  .clusterBy("eventType")  # same clustering column the ALTER TABLE cell sets
  .option("checkpointLocation", events_checkpoint_path)
  .toTable("events"))


In [0]:
%sql
-- Optimize the entire table
OPTIMIZE events;

-- Optimize a specific partition (traditional partitioning only).
-- Left disabled on purpose: `events` is created with CLUSTER BY (not
-- PARTITIONED BY) and has no `date` column, so this partition predicate
-- cannot be applied to it and the statement would abort the cell.
-- OPTIMIZE events WHERE date >= '2026-01-01';

-- Perform a full rewrite and recompression (Databricks Runtime 16.0+)
OPTIMIZE events FULL;


In [0]:
from delta.tables import DeltaTable

# Run compaction on the `events` table through the Delta Python API:
# resolve the table by name, obtain its optimize builder, then execute.
deltaTable = DeltaTable.forName(spark, "events")
compaction_builder = deltaTable.optimize()
compaction_builder.executeCompaction()


In [0]:
%sql
-- Dry run: preview the files that would be deleted, without deleting them.
VACUUM events DRY RUN;

-- Delete files older than the default retention period of 7 days.
VACUUM events;

-- Delete files older than 24 hours. This only works after setting
-- spark.databricks.delta.retentionDurationCheck.enabled = false.
VACUUM events RETAIN 24 HOURS;

-- LITE mode (Databricks Runtime 16.1+): fast cleanup that relies only on the
-- transaction log.
VACUUM events LITE;


In [0]:
# Handle to the `events` Delta table used by the vacuum calls below.
deltaTable = DeltaTable.forName(spark, "events")

# No argument: use the default retention (168 hours).
deltaTable.vacuum()

# Explicit retention window, in hours.
# NOTE(review): 100 is below the 168-hour default; the retention-duration
# check mentioned for `VACUUM ... RETAIN 24 HOURS` presumably applies here as
# well — confirm before running.
custom_retention_hours = 100
deltaTable.vacuum(custom_retention_hours)
