# Azure Delta Lake Demo

In [None]:
%%pyspark
# Location of the source CSV file in the data lake
csvFilePath = 'abfss://root@adlesilabs.dfs.core.windows.net/demofiles/csv/PurchaseOrderDetail.csv'

# Read the CSV: the first row supplies the column names and Spark infers the column types
csv_reader = spark.read.format('csv').options(header=True, inferschema=True)
DeltaDF = csv_reader.load(csvFilePath)

# Preview the first three rows, then print the DataFrame's text representation
preview_rows = DeltaDF.limit(3)
display(preview_rows)
print(DeltaDF)

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import DecimalType,datetime

# One output row per purchase order / due date / modified date combination
group_cols = ["PurchaseOrderID", "DueDate", "ModifiedDate"]

# Step 1: add up LineTotal within each group
order_totals = DeltaDF.groupBy(*group_cols).agg(sum("LineTotal").alias("SubTotal"))

# Step 2: store the total as a fixed-precision decimal
order_totals = order_totals.withColumn(
    "SubTotal", order_totals["SubTotal"].cast(DecimalType(18, 2))
)

# Step 3: render both date columns as MM-dd-yyyy strings and fix the column order
DeltaDF = order_totals.select(
    "PurchaseOrderID",
    date_format("DueDate", "MM-dd-yyyy").alias("DueDate"),
    date_format("ModifiedDate", "MM-dd-yyyy").alias("ModifiedDate"),
    "SubTotal",
)

# Show the aggregated rows, then the DataFrame's text representation
DeltaDF.show()
print(DeltaDF)



In [None]:
%%pyspark

# Destination folder for the Delta table in the data lake
deltaFilePath = "abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail"

# Write the DataFrame in Delta format, replacing any existing data and its schema
delta_writer = DeltaDF.write.format("delta").mode("overwrite")
delta_writer = delta_writer.option('overwriteSchema', 'true')
delta_writer.save(deltaFilePath)

In [None]:
%%pyspark

# The Delta files can also be registered as a Spark table

# Load the Delta files written to the lake
FilePath = 'abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail'
df = spark.read.format('delta').load(FilePath)

# Save them as a table, replacing any previous contents
df.write.saveAsTable("default.PurchaseOrderDetails", mode="overwrite")

In [None]:
%%pyspark
# Query the saved table with Spark SQL: the five rows with the largest SubTotal
top5_query = "SELECT * FROM default.PurchaseOrderDetails ORDER BY SubTotal DESC LIMIT 5"
DailyTop5Sales = spark.sql(top5_query)
display(DailyTop5Sales)

In [None]:
%%sql
-- Same query as a native SQL cell: the five rows with the largest SubTotal
SELECT *
FROM default.PurchaseOrderDetails
ORDER BY SubTotal DESC
LIMIT 5

In [None]:
%%sql
-- List the commit history of the Delta table stored at this path
DESCRIBE HISTORY delta.`abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail/`

In [None]:
%%sql
-- Current state of purchase order 4012, the record that is about to be updated
SELECT * FROM default.PurchaseOrderDetails WHERE PurchaseOrderID = 4012

In [None]:
from delta.tables import *
from pyspark.sql.functions import *

# Open the Delta table stored at this path so it can be modified in place.
deltaTable = DeltaTable.forPath(spark, 'abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail/')

# Declare the predicate by using a SQL-formatted string.
# The values in `set` are SQL expressions too, which is why each date literal carries
# its own single quotes. The dates are zero-padded (MM-dd-yyyy) so they match the strings
# that date_format(..., "MM-dd-yyyy") produced for the rest of the table.
deltaTable.update(
  condition = "PurchaseOrderID = 4012",
  set = { 
          "ModifiedDate": "'08-12-2017'",
          "DueDate": "'08-12-2017'"
        }
)

In [None]:
%%sql
-- List the commit history again; the update on order 4012 should now appear as a new version
DESCRIBE HISTORY delta.`abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail/`

In [None]:
# Why is the updated value not showing? This read is pinned to version 0 of the table
# (presumably the initial write, if the folder was freshly created by this notebook),
# so it returns the snapshot from before later changes.
purchase_order_delta_path = "abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail/"
orders_v0 = spark.read.load(purchase_order_delta_path, format="delta", versionAsOf=0)
display(orders_v0.where("PurchaseOrderID = 4012"))

In [None]:
from delta.tables import *
from pyspark.sql.functions import *

# Open the Delta table stored at this path
purchase_order_delta_path = 'abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail/'
deltaTable = DeltaTable.forPath(spark, purchase_order_delta_path)

# Remove every row matching the SQL-formatted predicate
rows_to_delete = "PurchaseOrderID = 4012"
deltaTable.delete(rows_to_delete)

In [None]:
%%sql
-- List the commit history once more; the delete of order 4012 should now appear as a new version
DESCRIBE HISTORY delta.`abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail/`

In [None]:
# Read the table pinned to version 2 and look up order 4012 in that snapshot
purchase_order_delta_path = "abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail/"
orders_v2 = spark.read.load(purchase_order_delta_path, format="delta", versionAsOf=2)
display(orders_v2.where("PurchaseOrderID = 4012"))

# Delta File Table Maintenance

In [None]:
from delta.tables import *

# Folder that holds the Delta table to maintain
pathToTable = "abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail/"
deltaTable = DeltaTable.forPath(spark, pathToTable)

# No retention argument: vacuum files not required by versions older than
# the default retention period
deltaTable.vacuum()

In [None]:
# Delta rejects a retention period shorter than its default threshold (168 hours) unless the
# retention-duration safety check is disabled, so switch it off before asking for 100 hours.
# Without this, the cell fails when the notebook is run top to bottom on a fresh session.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

deltaTable.vacuum(100)     # vacuum files not required by versions more than 100 hours old

In [None]:
# Vacuum DRY RUN
# Disable Delta's retention-duration safety check so a retention period shorter than
# the default threshold is accepted.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", False)

# DRY RUN only lists the files that a vacuum with this retention would remove;
# nothing is deleted.
dry_run_sql = "VACUUM delta.`{}` RETAIN 100 HOURS DRY RUN".format(pathToTable)
display(spark.sql(dry_run_sql))

# We can leverage SQL as well

In [None]:
%%sql
-- Create the DP203 database unless it is already there
CREATE DATABASE IF NOT EXISTS
    DP203

In [None]:
%%sql
-- Register a SQL table on top of the Delta folder written earlier in this notebook.
-- The column list mirrors the DataFrame that was saved there (dates are kept as strings).
CREATE TABLE IF NOT EXISTS DP203.PurchaseOrderDetail_SQL (
    PurchaseOrderID INT,
    DueDate         STRING,
    ModifiedDate    STRING,
    SubTotal        DECIMAL(18,2)
)
USING DELTA
LOCATION 'abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail/'

In [None]:
%%sql
-- Commit history of the Delta table, this time addressed by its table name
DESCRIBE HISTORY DP203.PurchaseOrderDetail_SQL

In [None]:
%%sql
-- Add one new purchase order through SQL.
-- DueDate and ModifiedDate are string columns; the values use the same zero-padded
-- MM-dd-yyyy format as the rows already in the table so the column stays consistent.
INSERT INTO DP203.PurchaseOrderDetail_SQL
SELECT 4013,'07-24-2015','08-12-2015',41.57

In [None]:
%%sql
-- Commit history after the INSERT; the insert should appear as the newest version
DESCRIBE HISTORY DP203.PurchaseOrderDetail_SQL

# Clean Up

In [None]:
%%sql
-- Cleaning up: drop the demo database together with every table registered in it.
-- IF EXISTS keeps the cleanup re-runnable when the database is already gone.
DROP DATABASE IF EXISTS DP203 CASCADE

In [None]:
%%sql
-- Cleaning up: drop the table saved in the default database.
-- The name is fully qualified, as everywhere else in this notebook, so no USE statement
-- is needed; IF EXISTS keeps the cleanup re-runnable when the table is already gone.
DROP TABLE IF EXISTS default.PurchaseOrderDetails

In [None]:
# Delete Delta Table (folder)
delta_table_path = "abfss://root@adlesilabs.dfs.core.windows.net/delta/Table/PurchaseOrderDetail"
mssparkutils.fs.rm(delta_table_path, recurse=True)