# Assignment: DataBricks Delta Lake

In [0]:
# task 1
sales_df = spark.read.option("header", "true").csv("file:/Workspace/Shared/sales_data2.csv")
sales_df.write.format("delta").mode("overwrite").save("/delta/sales_data2")


In [0]:
customer_df = spark.read.option("multiline", "true").json("file:/Workspace/Shared/customer_data.json")
customer_df.write.format("delta").mode("overwrite").save("/delta/customer_data")


In [0]:
# task 2

new_sales_df = spark.read.option("header", "true").csv("file:/Workspace/Shared/new_sales_data.csv")
new_sales_df.write.format("delta").mode("overwrite").save("/delta/new_sales_data")

In [0]:
# create delta tables
spark.sql("CREATE TABLE IF NOT EXISTS sales_delta_table USING DELTA LOCATION '/delta/sales_data2'")
spark.sql("CREATE TABLE IF NOT EXISTS new_sales_delta_table USING DELTA LOCATION '/delta/new_sales_data'")

DataFrame[]

In [0]:
# merge
spark.sql('''
MERGE INTO sales_delta_table AS TARGET
USING new_sales_delta_table AS SOURCE
ON TARGET.OrderID = SOURCE.OrderID
WHEN MATCHED THEN UPDATE SET TARGET.Quantity=SOURCE.Quantity, TARGET.Price=SOURCE.Price
WHEN NOT MATCHED THEN INSERT (OrderID, OrderDate, CustomerID, Product, Quantity, Price) 
VALUES (SOURCE.OrderID, SOURCE.OrderDate, SOURCE.CustomerID, SOURCE.Product, SOURCE.Quantity, SOURCE.Price)
''')

spark.sql("SELECT * FROM sales_delta_table").show()

+-------+----------+----------+--------+--------+-----+
|OrderID| OrderDate|CustomerID| Product|Quantity|Price|
+-------+----------+----------+--------+--------+-----+
|   1001|2024-01-15|      C001|Widget A|      10|25.50|
|   1003|2024-01-16|      C001|Widget C|       8|22.50|
|   1004|2024-01-17|      C003|Widget A|      15|25.50|
|   1005|2024-01-18|      C004|Widget D|       7|30.00|
|   1006|2024-01-19|      C002|Widget B|       9|15.75|
|   1007|2024-01-20|      C005|Widget C|      12|22.50|
|   1008|2024-01-21|      C003|Widget A|      10|25.50|
|   1009|2024-01-22|      C006|Widget E|      14|20.00|
|   1010|2024-01-23|      C007|Widget F|       6|35.00|
|   1002|2024-01-16|      C002|Widget B|      10|15.75|
+-------+----------+----------+--------+--------+-----+



In [0]:
# task 3
spark.sql("OPTIMIZE sales_delta_table ZORDER BY (OrderDate)")


DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
# task 4
spark.sql("DESCRIBE HISTORY sales_delta_table")

DataFrame[version: bigint, timestamp: timestamp, userId: string, userName: string, operation: string, operationParameters: map<string,string>, job: struct<jobId:string,jobName:string,jobRunId:string,runId:string,jobOwnerId:string,triggerType:string>, notebook: struct<notebookId:string>, clusterId: string, readVersion: bigint, isolationLevel: string, isBlindAppend: boolean, operationMetrics: map<string,string>, userMetadata: string, engineInfo: string]

In [0]:
spark.sql("VACUUM sales_delta_table RETAIN 168 HOURS")

DataFrame[path: string]

In [0]:
# task 5
version_number = 1 
sales_version_df = spark.read.format("delta").option("versionAsOf", version_number).load("/delta/sales_data2")
sales_version_df.show()

+-------+----------+----------+--------+--------+-----+
|OrderID| OrderDate|CustomerID| Product|Quantity|Price|
+-------+----------+----------+--------+--------+-----+
|   1001|2024-01-15|      C001|Widget A|      10|25.50|
|   1003|2024-01-16|      C001|Widget C|       8|22.50|
|   1004|2024-01-17|      C003|Widget A|      15|25.50|
|   1005|2024-01-18|      C004|Widget D|       7|30.00|
|   1006|2024-01-19|      C002|Widget B|       9|15.75|
|   1007|2024-01-20|      C005|Widget C|      12|22.50|
|   1008|2024-01-21|      C003|Widget A|      10|25.50|
|   1002|2024-01-16|      C002|Widget B|      10|15.75|
|   1009|2024-01-22|      C006|Widget E|      14|20.00|
|   1010|2024-01-23|      C007|Widget F|       6|35.00|
+-------+----------+----------+--------+--------+-----+



In [0]:
# Enforce schema while writing
schema = sales_df.schema
new_sales_df.write.format("delta").mode("append").option("mergeSchema", "true").save("/delta/sales_data_enforce")

In [0]:
# Perform vacuum to remove old data files
spark.sql("VACUUM delta.`/delta/sales_data_enforce` RETAIN 300 HOURS")


DataFrame[path: string]

In [0]:
df = spark.read.format("delta").load("/delta/sales_data_enforce")
display(df)

OrderID,OrderDate,CustomerID,Product,Quantity,Price
1009,2024-01-22,C006,Widget E,14,20.0
1010,2024-01-23,C007,Widget F,6,35.0
1002,2024-01-16,C002,Widget B,10,15.75
