In [0]:
# Open a streaming read of the CSV files using the "cloudFiles" source.
# Note that the schema location is set to the same DBFS folder that is read.
car_stream_data = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "csv",
        "cloudFiles.schemaLocation": "dbfs:/FileStore/datasets/car_source_stream",
    })
    .load("dbfs:/FileStore/datasets/car_source_stream")
)
car_stream_data.display()

car,price,body,mileage,engV,engType,registration,year,model,drive,_rescued_data
Porsche,55000.0,crossover,105,3.0,Diesel,yes,2012,Cayenne,full,
Kia,18000.0,crossover,64,2.0,Diesel,yes,2011,Sportage,full,
Kia,21700.0,crossover,58,2.0,Diesel,yes,2012,Sportage,full,
Volkswagen,2400.0,vagon,320,1.9,Diesel,no,2000,Passat B5,front,
Volkswagen,2350.0,sedan,300,1.9,Diesel,no,1998,Passat B5,front,
Honda,18600.0,vagon,98,2.4,Gas,yes,2011,Accord,front,
Mercedes-Benz,104999.0,crossover,1,3.0,Diesel,yes,2016,GLE-Class,full,
Toyota,195000.0,crossover,0,4.5,Diesel,yes,2016,Land Cruiser 200,full,
Porsche,49900.0,crossover,73,3.0,Diesel,yes,2011,Cayenne,full,
Porsche,50900.0,crossover,53,3.6,Petrol,yes,2013,Cayenne,full,


In [0]:
# Project the columns that will be written to the memory sink and keep only
# the rows whose year is greater than 2014.
car_filtered_data = (
    car_stream_data
    .select("car", "model", "price", "mileage", "year")
    .where(car_stream_data["year"] > 2014)
)
car_filtered_data.display()

car,model,price,mileage,year
Mercedes-Benz,GLE-Class,104999.0,1,2016
Toyota,Land Cruiser 200,195000.0,0,2016
Porsche,Cayenne,99999.0,1,2016
Mercedes-Benz,GLE-Class,0.0,0,2016
Toyota,Land Cruiser 200,0.0,0,2016
Toyota,Land Cruiser 200,102999.0,0,2016
Toyota,Land Cruiser 200,103999.0,0,2016
BMW,520,39333.0,6,2016
Mercedes-Benz,GLE-Class,99999.0,0,2016
Mercedes-Benz,GLE-Class,70999.0,0,2016


### Append Mode

In [0]:
# format("memory") keeps the results in the cache of the current
# notebook/session instead of writing them to a physical location.
# outputMode("append") means only the new rows produced by each trigger are
# appended. Note that "append" mode rejects streaming aggregations that have
# no watermark.
query = (
    car_filtered_data.writeStream
    .format("memory")
    .queryName("carDetailsAfter2014")
    .outputMode("append")
    .start()
)

In [0]:
%sql
-- Read everything the "carDetailsAfter2014" streaming query has written so far.
SELECT * FROM carDetailsAfter2014

car,model,price,mileage,year
Mercedes-Benz,GLE-Class,104999.0,1,2016
Toyota,Land Cruiser 200,195000.0,0,2016
Porsche,Cayenne,99999.0,1,2016
Mercedes-Benz,GLE-Class,0.0,0,2016
Toyota,Land Cruiser 200,0.0,0,2016
Toyota,Land Cruiser 200,102999.0,0,2016
Toyota,Land Cruiser 200,103999.0,0,2016
BMW,520,39333.0,6,2016
Mercedes-Benz,GLE-Class,99999.0,0,2016
Mercedes-Benz,GLE-Class,70999.0,0,2016


In [0]:
# Average price and mileage per body type. Writing an aggregation like this
# to a sink requires the "complete" output mode.
car_grouped_data = (
    car_stream_data
    .select("body", "price", "mileage")
    .groupBy("body")
    .agg({"price": "avg", "mileage": "avg"})
)
car_grouped_data.display()

body,avg(price),avg(mileage)
van,27179.9,208.0
crossover,49249.607,48.97142857142857
other,25750.0,121.0
sedan,19661.404761904763,144.76190476190476
hatch,14791.0,26.2
vagon,7821.111111111111,217.66666666666663


In [0]:
# Deliberate failure demo: writing the aggregated stream to memory in "append"
# mode should not work. Spark rejects it with "AnalysisException: Append output
# mode not supported when there are streaming aggregations on streaming
# DataFrames/DataSets without watermark;". Watermarks are an advanced topic and
# are not covered here.
#
# The exception is the point of this cell, so it is caught and printed rather
# than left to abort a "Run All" of the notebook.
from pyspark.sql.utils import AnalysisException

try:
    query = (
        car_grouped_data.writeStream
        .queryName("averagePriceMileageByBody")
        .outputMode("append")
        .format("memory")
        .start()
    )
except AnalysisException as append_error:
    # `query` is left untouched here, exactly as when the original cell raised.
    print(f"Expected failure in append mode: {append_error}")

### Complete Mode

In [0]:
# Same write as the append attempt, but with the "complete" output mode:
# on every trigger the entire result set is written to the sink.
query = (
    car_grouped_data.writeStream
    .format("memory")
    .queryName("averagePriceMileageByBody")
    .outputMode("complete")
    .start()
)

In [0]:
%sql
-- Current contents of the table written by the "averagePriceMileageByBody" query.
SELECT * FROM averagePriceMileageByBody

body,avg(price),avg(mileage)
van,37399.857142857145,176.71428571428572
crossover,53163.15604166667,41.72916666666666
other,25750.0,121.0
sedan,21499.322580645163,148.7741935483871
hatch,15388.75,26.25
vagon,8740.0,217.16666666666663


Now we are going to add new files to our source folder and rerun the last query.

In [0]:
%sql
-- Rerun of the previous query after new files were added to the source folder.
SELECT * FROM averagePriceMileageByBody

body,avg(price),avg(mileage)
van,30199.88888888889,213.11111111111111
crossover,48783.12913793104,50.741379310344826
other,25750.0,121.0
sedan,20429.972222222223,148.22222222222223
hatch,14791.0,26.2
vagon,8355.0,216.0


### Update Mode

In [0]:
# NOTE: this rebinds car_grouped_data. From here on it holds the count of
# price values per car rather than the per-body averages computed earlier.
car_grouped_data = (
    car_stream_data
    .select("car", "price", "mileage")
    .groupBy("car")
    .agg({"price": "count"})
)
car_grouped_data.display()

car,count(price)
Volkswagen,15
Jaguar,1
Mitsubishi,3
Kia,3
Chevrolet,1
Hyundai,3
Honda,4
Audi,11
Land Rover,2
Mercedes-Benz,32


In [0]:
# Write the per-car counts to an in-memory table named "countByCar" using the
# "update" output mode.
query = (
    car_grouped_data.writeStream
    .format("memory")
    .queryName("countByCar")
    .outputMode("update")
    .start()
)

In [0]:
%sql
-- Current contents of the table written by the "countByCar" query.
SELECT * FROM countByCar

car,count(price)
Volkswagen,11
Jaguar,1
Mitsubishi,3
Kia,3
Hyundai,3
Honda,4
Audi,10
Land Rover,1
Mercedes-Benz,29
Renault,2


In [0]:
# Now we add new data to the source folder. Update mode only writes the records that have changed (it doesn't modify the original rows; it just adds new records with the new values!)

In [0]:
%sql
-- Rerun of the "countByCar" query after new data was added to the source folder.
SELECT * FROM countByCar

car,count(price)
Volkswagen,11
Jaguar,1
Mitsubishi,3
Kia,3
Hyundai,3
Honda,4
Audi,10
Land Rover,1
Mercedes-Benz,29
Renault,2
