In [0]:
# Start from scratch: create a dedicated DBFS directory for the new dataset.
dbutils.fs.mkdirs(
    "dbfs:/FileStore/datasets/car_source_stream"
)

In [0]:
# Read the CSV files as a stream with Auto Loader (the "cloudFiles" source).
# cloudFiles.schemaHints pins the data types of price, mileage, engV and year.
# The output shows that some price values are floats; because the hint says
# price must be an integer, such values are placed in the _rescued_data column
# instead of the price column.
# NOTE(review): the schema location is the same directory as the source data —
# confirm this is intended rather than a separate schema/checkpoint path.
car_stream_data = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option(
        "cloudFiles.schemaLocation",
        "dbfs:/FileStore/datasets/car_source_stream",
    )
    .option(
        "cloudFiles.schemaHints",
        "price int, mileage int, engV float, year int",
    )
    .load("dbfs:/FileStore/datasets/car_source_stream")
)
car_stream_data.display()

car,price,body,mileage,engV,engType,registration,year,model,drive,_rescued_data
Ford,15500.0,crossover,68,2.5,Gas,yes,2010,Kuga,full,
Mercedes-Benz,20500.0,sedan,173,1.8,Gas,yes,2011,E-Class,rear,
Mercedes-Benz,35000.0,other,135,5.5,Petrol,yes,2008,CL 550,rear,
Mercedes-Benz,17800.0,van,162,1.8,Diesel,yes,2012,B 180,front,
Nissan,16600.0,crossover,83,2.0,Petrol,yes,2013,X-Trail,full,
Honda,6500.0,sedan,199,2.0,Petrol,yes,2003,Accord,front,
Renault,10500.0,vagon,185,1.5,Diesel,yes,2011,Megane,front,
Mercedes-Benz,21500.0,sedan,146,1.8,Gas,yes,2012,E-Class,rear,
Mercedes-Benz,22700.0,sedan,125,2.2,Diesel,yes,2010,E-Class,rear,
Nissan,,crossover,0,1.2,Petrol,yes,2016,Qashqai,front,"{""price"":""20447.154"",""_file_path"":""dbfs:/FileStore/datasets/car_source_stream/car_ad_01.csv""}"


In [0]:
# Project the stream down to the year and price columns and render it.
(
    car_stream_data
    .select("year", "price")
    .display()
)

year,price
2010,15500.0
2011,20500.0
2008,35000.0
2012,17800.0
2013,16600.0
2003,6500.0
2011,10500.0
2012,21500.0
2010,22700.0
2016,


Output can only be rendered in Databricks

In [0]:
# Selection + filtering on the stream: keep five descriptive columns and only
# the rows that satisfy the SQL predicate "price > 10000".
car_stream_transformed_1 = (
    car_stream_data
    .select("car", "model", "year", "body", "price")
    .where("price > 10000")
)
car_stream_transformed_1.display()

car,model,year,body,price
Ford,Kuga,2010,crossover,15500
Mercedes-Benz,E-Class,2011,sedan,20500
Mercedes-Benz,CL 550,2008,other,35000
Mercedes-Benz,B 180,2012,van,17800
Nissan,X-Trail,2013,crossover,16600
Renault,Megane,2011,vagon,10500
Mercedes-Benz,E-Class,2012,sedan,21500
Mercedes-Benz,E-Class,2010,sedan,22700
Mercedes-Benz,E-Class,2011,sedan,20400
Mercedes-Benz,E-Class,2012,sedan,22500


In [0]:
# Narrow the previous result to Mercedes-Benz and BMW rows whose year is
# greater than 2010, using column expressions instead of a SQL string.
car_stream_transformed_2 = (
    car_stream_transformed_1
    .select("*")
    .filter(
        car_stream_transformed_1["car"].isin(["Mercedes-Benz", "BMW"])
        & (car_stream_transformed_1["year"] > 2010)
    )
)
car_stream_transformed_2.display()

car,model,year,body,price
Mercedes-Benz,E-Class,2011,sedan,20500
Mercedes-Benz,B 180,2012,van,17800
Mercedes-Benz,E-Class,2012,sedan,21500
Mercedes-Benz,E-Class,2011,sedan,20400
Mercedes-Benz,E-Class,2012,sedan,22500
Mercedes-Benz,E-Class,2012,sedan,21500
BMW,750,2016,sedan,129222
Mercedes-Benz,GLE-Class,2016,crossover,99999
Mercedes-Benz,GLE-Class,2016,crossover,104999
Mercedes-Benz,E-Class,2011,sedan,20400


In [0]:
# Aggregation on streaming data: count the filtered rows per year and expose
# the generated "count" column under the name "total".
car_stream_transformed_3 = (
    car_stream_transformed_2
    .select("*")
    .groupBy("year")
    .count()
    .withColumnRenamed("count", "total")
)
car_stream_transformed_3.display()

year,total
2015,1
2013,1
2014,1
2012,4
2016,13
2011,4


In [0]:
# Per body type, compute the average price and the count of year entries.
# The dict form of agg() yields columns named "avg(price)" and "count(year)",
# which are then renamed to friendlier names.
car_stream_transformed_4 = (
    car_stream_transformed_2
    .select("*")
    .groupBy("body")
    .agg({"price": "avg", "year": "count"})
    .withColumnRenamed("avg(price)", "average_price")
    .withColumnRenamed("count(year)", "count")
)
car_stream_transformed_4.display()

body,count,average_price
van,1,17800.0
crossover,13,80091.69230769231
sedan,10,40075.5
