## Delta Pipeline, with Azure Databricks

![stream](https://kpistoropen.blob.core.windows.net/collateral/delta/Delta.png)

In [2]:
#spark.sql("set spark.databricks.delta.preview.enabled=true")

In [3]:
# Clear out the current version of the workshop Delta directory so the
# tutorial can be walked through from the beginning.
dbutils.fs.rm("dbfs:/workshop/delta/", recurse=True)

In [4]:
from pyspark.sql.functions import expr

# Read the raw JSON events, drop the original "time" column, and add a couple
# of synthetic columns ("date" and "deviceId") for demo purposes.
# NOTE(review): both synthetic columns call rand(5) with the same seed, which
# may make date and deviceId correlated -- confirm this is acceptable here.
rawData = (
    spark.read
    .option("inferSchema", "true")
    .json("/mnt/databricks-workshop-datasets/Contoso-retail/structured-streaming/events/")
    .drop("time")
    # date: '2018-01-' followed by int(rand(5) * 30) + 1, cast to a date
    .withColumn("date", expr("cast(concat('2018-01-', cast(rand(5) * 30 as int) + 1) as date)"))
    # deviceId: rand(5) * 100 truncated to an int
    .withColumn("deviceId", expr("cast(rand(5) * 100 as int)"))
)

# Step 1: Write out raw data and create our table

In [6]:
# Where this notebook writes its output. By default, in this workshop, we
# write to the workspace filestore.
# Note: the %fs and %sql cells in this notebook hard-code the default path
# below, so edit them as well if you change writeBase.
writeBase = "dbfs:/workshop/delta/"
pipelineDir = "iotPipeline/"
writePath = writeBase + pipelineDir

# If multiple users are working on the same instance, use this writeBase
# instead, replacing $USERNAME with your own user name in the path and in any
# subsequent write/read.
#writeBase = "dbfs:/workshop/delta/$USERNAME/"
#writePath = writeBase + "iotPipeline/"

# As a backup, you can always write to this blob.
#writeBase = "dbfs:/mnt/databricks-workshop-exercises/Contoso-retail/delta/"
#writePath = writeBase + "iotPipeline/"

In [7]:
# Write the raw events out in Delta format at writePath, partitioned by "date".
(
    rawData.write
    .format("delta")
    .partitionBy("date")
    .save(writePath)
)

In [8]:
%fs ls dbfs:/workshop/delta/iotPipeline/

path,name,size
dbfs:/workshop/delta/iotPipeline/_delta_log/,_delta_log/,0
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/,date=2018-01-01/,0
dbfs:/workshop/delta/iotPipeline/date=2018-01-02/,date=2018-01-02/,0
dbfs:/workshop/delta/iotPipeline/date=2018-01-03/,date=2018-01-03/,0
dbfs:/workshop/delta/iotPipeline/date=2018-01-04/,date=2018-01-04/,0
dbfs:/workshop/delta/iotPipeline/date=2018-01-05/,date=2018-01-05/,0
dbfs:/workshop/delta/iotPipeline/date=2018-01-06/,date=2018-01-06/,0
dbfs:/workshop/delta/iotPipeline/date=2018-01-07/,date=2018-01-07/,0
dbfs:/workshop/delta/iotPipeline/date=2018-01-08/,date=2018-01-08/,0
dbfs:/workshop/delta/iotPipeline/date=2018-01-09/,date=2018-01-09/,0


In [9]:
%fs ls dbfs:/workshop/delta/iotPipeline/_delta_log/

path,name,size
dbfs:/workshop/delta/iotPipeline/_delta_log/00000000000000000000.crc,00000000000000000000.crc,93
dbfs:/workshop/delta/iotPipeline/_delta_log/00000000000000000000.json,00000000000000000000.json,150910


In [10]:
%fs ls dbfs:/workshop/delta/iotPipeline/date=2018-01-01/

path,name,size
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00000-a85ef268-552a-48b3-8954-7e5dee0ff792.c000.snappy.parquet,part-00000-a85ef268-552a-48b3-8954-7e5dee0ff792.c000.snappy.parquet,802
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00001-a9d6326b-dfe5-40b3-b7c3-516117963a89.c000.snappy.parquet,part-00001-a9d6326b-dfe5-40b3-b7c3-516117963a89.c000.snappy.parquet,790
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00002-1d0fbe27-d50d-473a-ab6a-42bdc22a2f22.c000.snappy.parquet,part-00002-1d0fbe27-d50d-473a-ab6a-42bdc22a2f22.c000.snappy.parquet,814
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00003-1794d020-28fa-4abc-ae87-8aa7c93fe857.c000.snappy.parquet,part-00003-1794d020-28fa-4abc-ae87-8aa7c93fe857.c000.snappy.parquet,805
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00004-3934bd6c-15a0-4668-80ce-1d2ff37823b9.c000.snappy.parquet,part-00004-3934bd6c-15a0-4668-80ce-1d2ff37823b9.c000.snappy.parquet,811
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00005-012e78f2-d0ed-443e-bcc1-1ec0282d6cd1.c000.snappy.parquet,part-00005-012e78f2-d0ed-443e-bcc1-1ec0282d6cd1.c000.snappy.parquet,804
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00006-80079b5b-5db3-4a1e-8a25-e9a06d42e417.c000.snappy.parquet,part-00006-80079b5b-5db3-4a1e-8a25-e9a06d42e417.c000.snappy.parquet,802
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00007-d32a10eb-88b8-4c5b-aed7-6bffdb397a34.c000.snappy.parquet,part-00007-d32a10eb-88b8-4c5b-aed7-6bffdb397a34.c000.snappy.parquet,800
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00008-3c84ab98-4f6e-4fca-92f9-9daf01121d3d.c000.snappy.parquet,part-00008-3c84ab98-4f6e-4fca-92f9-9daf01121d3d.c000.snappy.parquet,802
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00009-f1d3dd51-985d-41fc-9383-11ef49d09fe8.c000.snappy.parquet,part-00009-f1d3dd51-985d-41fc-9383-11ef49d09fe8.c000.snappy.parquet,801


In [11]:
%sql
-- Drop any demo_iot_data_delta table left over from a previous run, then
-- create the table over the Delta files at the location below.
-- NOTE: the LOCATION is hard-coded and equals the default writePath defined
-- earlier in this notebook; edit it by hand if you changed writeBase.
DROP TABLE IF EXISTS demo_iot_data_delta;
CREATE TABLE demo_iot_data_delta
USING DELTA
LOCATION "dbfs:/workshop/delta/iotPipeline/"

# Step 2: Query the data

In [13]:
%sql

-- Count the rows visible through the table created over the Delta location.
SELECT count(*) FROM demo_iot_data_delta

count(1)
100000


That worked without running any repair-table command, because Delta handles the table metadata automatically.

# Step 3: Adding new data

In [16]:
# Build 100,000 synthetic rows: action is the constant 'Open', date is the
# constant 2018-01-30, and deviceId is rand(5) * 500 truncated to an int.
new_data = (
    spark.range(100000)
    .selectExpr("'Open' as action", "cast('2018-01-30' as date) as date")
    .withColumn("deviceId", expr("cast(rand(5) * 500 as int)"))
)

In [17]:
# Preview the synthetic rows (action, date, deviceId) before writing them.
display(new_data)

action,date,deviceId
Open,2018-01-30,43
Open,2018-01-30,289
Open,2018-01-30,348
Open,2018-01-30,219
Open,2018-01-30,247
Open,2018-01-30,296
Open,2018-01-30,289
Open,2018-01-30,3
Open,2018-01-30,3
Open,2018-01-30,415


In [18]:
# Append the new rows to the Delta table at writePath, partitioned by "date".
(
    new_data.write
    .format("delta")
    .partitionBy("date")
    .mode("append")
    .save(writePath)
)

# Step 4: Query should show new results

In [20]:
%sql

-- Re-run the row count after the append; no table refresh is issued first.
SELECT count(*) FROM demo_iot_data_delta


count(1)
200000


Again, no update necessary.

# Step 5: Updating previous data

In [23]:
# Overwrite the Delta table at writePath with new_data, passing a replaceWhere
# predicate that restricts the overwrite to rows dated 2018-01-30.
(
    new_data.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", "date = cast('2018-01-30' as date)")
    .save(writePath)
)

# Step 6: Query should reflect new data

In [25]:
%sql

-- Re-run the row count after the replaceWhere overwrite.
SELECT count(*) FROM demo_iot_data_delta

count(1)
196674


# Step 7: Add historical data

In [27]:
from pyspark.sql.functions import expr
  
# Build 100,000 synthetic 'Open' rows, repartitioned into 200 partitions.
# date is '2018-01-' followed by int(rand(5) * 15) + 1, cast to a date;
# deviceId is rand(5) * 100 truncated to an int.
old_batch_data = (
    spark.range(100000)
    .repartition(200)
    .selectExpr("'Open' as action", "cast(concat('2018-01-', cast(rand(5) * 15 as int) + 1) as date) as date")
    .withColumn("deviceId", expr("cast(rand(5) * 100 as int)"))
)

# Append the historical batch to the Delta table at writePath.
(
    old_batch_data.write
    .format("delta")
    .partitionBy("date")
    .mode("append")
    .save(writePath)
)

In [28]:
%sql
-- Re-run the row count after appending the historical batch.
SELECT count(*) FROM demo_iot_data_delta

count(1)
296674


# Performance Improvements

Now we want to build other pipelines with this information: we want to write out to our data warehouse and allow data scientists to query it quickly. The above query took 7 seconds even though there is not much data there, so the data is probably just not laid out well on disk.

With Delta, fixing this is simple.

In [30]:
%fs ls dbfs:/workshop/delta/iotPipeline/date=2018-01-01/

path,name,size
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00000-1ae18fa7-7938-4da1-9a8d-7c2ebb27a0f6.c000.snappy.parquet,part-00000-1ae18fa7-7938-4da1-9a8d-7c2ebb27a0f6.c000.snappy.parquet,723
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00000-a85ef268-552a-48b3-8954-7e5dee0ff792.c000.snappy.parquet,part-00000-a85ef268-552a-48b3-8954-7e5dee0ff792.c000.snappy.parquet,802
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00001-40e3a11d-280a-47c3-ac5b-937024f28d6a.c000.snappy.parquet,part-00001-40e3a11d-280a-47c3-ac5b-937024f28d6a.c000.snappy.parquet,723
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00001-a9d6326b-dfe5-40b3-b7c3-516117963a89.c000.snappy.parquet,part-00001-a9d6326b-dfe5-40b3-b7c3-516117963a89.c000.snappy.parquet,790
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00002-1d0fbe27-d50d-473a-ab6a-42bdc22a2f22.c000.snappy.parquet,part-00002-1d0fbe27-d50d-473a-ab6a-42bdc22a2f22.c000.snappy.parquet,814
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00002-a1bdfd68-b710-4f94-b411-8474fd871a16.c000.snappy.parquet,part-00002-a1bdfd68-b710-4f94-b411-8474fd871a16.c000.snappy.parquet,720
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00003-1794d020-28fa-4abc-ae87-8aa7c93fe857.c000.snappy.parquet,part-00003-1794d020-28fa-4abc-ae87-8aa7c93fe857.c000.snappy.parquet,805
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00003-7cb40062-d820-4c51-8174-b720d5c6b59e.c000.snappy.parquet,part-00003-7cb40062-d820-4c51-8174-b720d5c6b59e.c000.snappy.parquet,720
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00004-3934bd6c-15a0-4668-80ce-1d2ff37823b9.c000.snappy.parquet,part-00004-3934bd6c-15a0-4668-80ce-1d2ff37823b9.c000.snappy.parquet,811
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00004-a55db4f6-03e2-4480-a890-c5c83d3029ed.c000.snappy.parquet,part-00004-a55db4f6-03e2-4480-a890-c5c83d3029ed.c000.snappy.parquet,720


In [31]:
%sql
-- Run OPTIMIZE against the Delta table stored at the pipeline path.
-- The directory listing in the next cell shows one additional, larger parquet
-- file in the date=2018-01-01 partition after this command has run.
OPTIMIZE "dbfs:/workshop/delta/iotPipeline/"

path
dbfs:/workshop/delta/iotPipeline/


Now the table is optimized for querying. This is going to be an order of magnitude faster.

In [33]:
%fs ls dbfs:/workshop/delta/iotPipeline/date=2018-01-01/

path,name,size
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00000-1ae18fa7-7938-4da1-9a8d-7c2ebb27a0f6.c000.snappy.parquet,part-00000-1ae18fa7-7938-4da1-9a8d-7c2ebb27a0f6.c000.snappy.parquet,723
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00000-a85ef268-552a-48b3-8954-7e5dee0ff792.c000.snappy.parquet,part-00000-a85ef268-552a-48b3-8954-7e5dee0ff792.c000.snappy.parquet,802
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00001-40e3a11d-280a-47c3-ac5b-937024f28d6a.c000.snappy.parquet,part-00001-40e3a11d-280a-47c3-ac5b-937024f28d6a.c000.snappy.parquet,723
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00001-a9d6326b-dfe5-40b3-b7c3-516117963a89.c000.snappy.parquet,part-00001-a9d6326b-dfe5-40b3-b7c3-516117963a89.c000.snappy.parquet,790
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00002-1d0fbe27-d50d-473a-ab6a-42bdc22a2f22.c000.snappy.parquet,part-00002-1d0fbe27-d50d-473a-ab6a-42bdc22a2f22.c000.snappy.parquet,814
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00002-a1bdfd68-b710-4f94-b411-8474fd871a16.c000.snappy.parquet,part-00002-a1bdfd68-b710-4f94-b411-8474fd871a16.c000.snappy.parquet,720
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00003-1794d020-28fa-4abc-ae87-8aa7c93fe857.c000.snappy.parquet,part-00003-1794d020-28fa-4abc-ae87-8aa7c93fe857.c000.snappy.parquet,805
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00003-4cd78b7c-170b-4823-bcc3-54e0bad8c3d4.c000.snappy.parquet,part-00003-4cd78b7c-170b-4823-bcc3-54e0bad8c3d4.c000.snappy.parquet,4971
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00003-7cb40062-d820-4c51-8174-b720d5c6b59e.c000.snappy.parquet,part-00003-7cb40062-d820-4c51-8174-b720d5c6b59e.c000.snappy.parquet,720
dbfs:/workshop/delta/iotPipeline/date=2018-01-01/part-00004-3934bd6c-15a0-4668-80ce-1d2ff37823b9.c000.snappy.parquet,part-00004-3934bd6c-15a0-4668-80ce-1d2ff37823b9.c000.snappy.parquet,811


### Use the cell below as the benchmark for a delta pipeline

In [35]:
%sql

-- Benchmark query: the same row count as before, run after OPTIMIZE.
SELECT count(*) FROM demo_iot_data_delta

count(1)
296674


&copy; 2018 Databricks, Inc. All rights reserved.<br/>
Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="http://www.apache.org/">Apache Software Foundation</a>.<br/>
<br/>
<a href="https://databricks.com/privacy-policy">Privacy Policy</a> | <a href="https://databricks.com/terms-of-use">Terms of Use</a> | <a href="http://help.databricks.com/">Support</a>