In [0]:
# when dealing with streaming data we need to explicitly specify types of data
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [0]:
#remove the location with possible content from previous course. Note that system-created folders remain
#dbutils.fs.rm("dbfs:/FileStore/datasets", True) # recursion = True
#dbutils.fs.rm("dbfs:/FileStore/output", True)
#dbutils.fs.rm("dbfs:/FileStore/tables", True)

In [0]:
#create new folder
dbutils.fs.mkdirs("dbfs:/FileStore/datasets")

In [0]:
# list the directory
dbutils.fs.ls("dbfs:/FileStore/datasets")

In [0]:
# it is required to define schema before streaming data. Here "False" stands for (is nullable)
schema = StructType([StructField("Id", IntegerType(), False), \
                    StructField("Company", StringType(), False), \
                    StructField("Product", StringType(), False), \
                    StructField("TypeName", StringType(), False), \
                    StructField("Price_euros", FloatType(), False)])

In [0]:
# read batch data from csv file located in local DBFS storage
laptop_data = spark.read.format("csv") \
                   .option("header", True) \
                   .schema(schema) \
                   .load("dbfs:/FileStore/datasets/laptops.csv")

laptop_data.display()

Id,Company,Product,TypeName,Price_euros
1,Apple,MacBook Pro,Ultrabook,1339.69
2,Apple,Macbook Air,Ultrabook,898.94
3,HP,250 G6,Notebook,575.0
4,Apple,MacBook Pro,Ultrabook,2537.45
5,Apple,MacBook Pro,Ultrabook,1803.6
6,Acer,Aspire 3,Notebook,400.0
7,Apple,MacBook Pro,Ultrabook,2139.97
8,Apple,Macbook Air,Ultrabook,1158.7
9,Asus,ZenBook UX430UN,Ultrabook,1495.0
10,Acer,Swift 3,Ultrabook,770.0


In [0]:
# check if the data is streaming
print("Is this streaming data? ", laptop_data.isStreaming)

In [0]:
# let's create a new directory we'll use in streaming process
dbutils.fs.mkdirs("dbfs:/FileStore/datasets/laptop_source_stream")

In [0]:
# now we'll create a new dataframe that will read streaming data from the target directory
laptop_stream_data = spark.readStream \
                          .format("csv") \
                          .option("header", True) \
                          .schema(schema) \
                          .load("dbfs:/FileStore/datasets/laptop_source_stream")
print("Is this streaming data? ", laptop_stream_data.isStreaming)

In [0]:
# once the command above is run, it will pick the new content of target folder, without need to rerun it. We just added 2 new files into folder
laptop_stream_data.display()

Id,Company,Product,TypeName,Price_euros
60,Asus,X541UA-DM1897 (i3-6006U/4GB/256GB/FHD/Linux),Notebook,415.0
61,Dell,Inspiron 5770,Notebook,1299.0
62,Dell,Vostro 5471,Ultrabook,879.0
63,Lenovo,IdeaPad 520S-14IKB,Notebook,599.0
64,Asus,UX410UA-GV350T (i5-8250U/8GB/256GB/FHD/W10),Notebook,941.0
66,HP,250 G6,Notebook,690.0
67,Asus,ZenBook Pro,Ultrabook,1983.0
68,HP,250 G6,Notebook,438.69
69,HP,Stream 14-AX040wm,Notebook,229.0
70,Lenovo,V310-15ISK (i5-7200U/4GB/1TB/FHD/W10),Notebook,549.0


In [0]:
# add new column to your streaming dataframe 
from pyspark.sql.functions import col, round

laptop_stream_data_updated = laptop_stream_data.withColumn("Price_usd", round(col("Price_euros") * 1.4389, 2))

laptop_stream_data_updated.display()

Id,Company,Product,TypeName,Price_euros,Price_usd
60,Asus,X541UA-DM1897 (i3-6006U/4GB/256GB/FHD/Linux),Notebook,415.0,597.14
61,Dell,Inspiron 5770,Notebook,1299.0,1869.13
62,Dell,Vostro 5471,Ultrabook,879.0,1264.79
63,Lenovo,IdeaPad 520S-14IKB,Notebook,599.0,861.9
64,Asus,UX410UA-GV350T (i5-8250U/8GB/256GB/FHD/W10),Notebook,941.0,1354.0
66,HP,250 G6,Notebook,690.0,992.84
67,Asus,ZenBook Pro,Ultrabook,1983.0,2853.34
68,HP,250 G6,Notebook,438.69,631.23
69,HP,Stream 14-AX040wm,Notebook,229.0,329.51
70,Lenovo,V310-15ISK (i5-7200U/4GB/1TB/FHD/W10),Notebook,549.0,789.96


In [0]:
#query streaming data. Take a look a visual representation available below. Graph will be updated with new records comming in automatically
laptop_stream_data_updated.select('Typename', 'Price_usd').display()

Typename,Price_usd
Notebook,597.14
Notebook,1869.13
Ultrabook,1264.79
Notebook,861.9
Notebook,1354.0
Notebook,992.84
Ultrabook,2853.34
Notebook,631.23
Notebook,329.51
Notebook,789.96


Output can only be rendered in Databricks

In [0]:
# note that those visualizations are available just on Spark on Databricks
laptop_stream_data_updated.select('Company', 'Price_euros').display()

Company,Price_euros
Asus,415.0
Dell,1299.0
Dell,879.0
Lenovo,599.0
Asus,941.0
HP,690.0
Asus,1983.0
HP,438.69
HP,229.0
Lenovo,549.0


Output can only be rendered in Databricks

In [0]:
premium_laptops = laptop_stream_data_updated.select("Id", "Company", "Price_usd") \
                                       .where("Price_usd > 2000")

premium_laptops.display()

Id,Company,Price_usd
67,Asus,2853.34
42,Dell,2156.91
46,Apple,2041.8
59,MSI,3523.87
34,Dell,2689.3
83,Apple,2172.74
90,Asus,2084.97
91,Dell,2372.75
4,Apple,3651.14
5,Apple,2595.2


#Triggers

In [0]:
# implement fixed-interval microbatch trigger (20 seconds)
# queryName("premium_laptops_20") is a batch data created by trigger every 20 seconds
premium_laptops.writeStream \
               .format("memory") \
               .queryName("premium_laptops_20") \
               .trigger(processingTime ="20 seconds") \
               .start()

In [0]:
# now we're quering batch data at current point of time, created by trigger, not the streaming
spark.sql("select Company, count(*) from premium_laptops_20 group by Company").display()

Company,count(1)
Dell,3
Asus,3
Apple,8
MSI,1


In [0]:
# let's rerun the query after the new file has being added to the source directory
spark.sql("select Company, count(*) from premium_laptops_20 group by Company").display()

Company,count(1)
Dell,4
Asus,3
Apple,8
MSI,1
HP,1
Lenovo,2


In [0]:
# you can perform usual queries and aggregations on this batch data, created by trigger
spark.sql("select Company, avg(Price_usd) from premium_laptops_20 group by Company").display()

Company,avg(Price_usd)
Dell,2390.7325
Asus,2363.1566666666663
Apple,2918.5462500000003
MSI,3523.87
HP,2013.01
Lenovo,2071.295


In [0]:
# one-time micro-batch trigger
premium_laptops.writeStream \
               .format("memory") \
               .queryName("premium_laptops_once") \
               .trigger(once=True) \
               .start()

In [0]:
# you can perform usual queries and aggregations on this batch data, created by trigger
spark.sql("select Company, avg(Price_usd) from premium_laptops_once group by Company").display()

Company,avg(Price_usd)
Dell,2390.7325
Asus,2363.1566666666663
Lenovo,2071.295
HP,2013.01
Apple,2918.54625
MSI,3523.87


In [0]:
#after new data is being added to source, rerun the query to make sure that nothing happend due to one-time trigger
spark.sql("select Company, avg(Price_usd) from premium_laptops_20 group by Company").display()

Company,avg(Price_usd)
Dell,2314.326
Asus,2363.1566666666663
Apple,2918.54625
MSI,3374.58
HP,2013.01
Lenovo,2071.295
