In [0]:
# Optional cleanup: uncomment the lines below to remove data left in the DBFS datastore by previous tutorials

#dbutils.fs.rm("dbfs:/FileStore/datasets", True)# select True for recursive deletion of all files from "datasets" container
#dbutils.fs.rm("dbfs:/FileStore/tables", True)
#dbutils.fs.rm("dbfs:/FileStore/code", True)
#dbutils.fs.rm("dbfs:/FileStore/checkpoint_dir", True)

In [0]:
# Create the source directory that the streaming reader below watches for incoming airline CSV files
dbutils.fs.mkdirs("dbfs:/FileStore/datasets/airline_data")

In [0]:
# import libraries
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import current_timestamp, window

In [0]:
# Explicit schema for the streaming airline data; every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("id", IntegerType()),
        ("Gender", StringType()),
        ("Customer Type", StringType()),
        ("Age", IntegerType()),
        ("Type of Travel", StringType()),
        ("Class", StringType()),
        ("Flight Distance", IntegerType()),
        ("Inflight wifi service", IntegerType()),
        ("Departure/Arrival time convenient", IntegerType()),
        ("Ease of Online booking", IntegerType()),
        ("Gate location", IntegerType()),
        ("Food and drink", IntegerType()),
        ("Online boarding", IntegerType()),
        ("Seat comfort", IntegerType()),
        ("Inflight entertainment", IntegerType()),
        ("On-board service", IntegerType()),
        ("Leg room service", IntegerType()),
        ("Baggage handling", IntegerType()),
        ("Checkin service", IntegerType()),
        ("Inflight service", IntegerType()),
        ("Cleanliness", IntegerType()),
        ("Departure Delay in Minutes", IntegerType()),
        ("Arrival Delay in Minutes", IntegerType()),
        ("satisfaction", StringType()),
    ]
])

In [0]:
# Read the source directory as a stream into a Spark DataFrame. With the default trigger,
# the DataFrame updates automatically as new data arrives in the source directory.
airline_data_full = (
    spark.readStream
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("dbfs:/FileStore/datasets/airline_data")
)

airline_data_full.display()

id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,4,3,4,3,5,5,5,5,2,5,5,50,44,satisfied
90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,1,5,4,5,4,4,4,4,3,4,5,0,0,satisfied
12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,4,2,2,2,2,4,1,3,2,2,2,0,0,neutral or dissatisfied
77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,2,3,4,4,1,1,1,1,3,1,4,0,6,satisfied
36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,3,4,1,2,2,2,2,2,4,2,4,0,20,satisfied
39177,Male,Loyal Customer,16,Business travel,Eco,311,3,3,3,3,5,5,3,5,4,3,1,1,2,5,0,0,satisfied
79433,Female,Loyal Customer,77,Business travel,Business,3987,5,5,5,5,3,5,5,5,5,5,5,4,5,3,0,0,satisfied
97286,Female,Loyal Customer,43,Business travel,Business,2556,2,2,2,2,4,4,5,4,4,4,4,5,4,3,77,65,satisfied
27508,Male,Loyal Customer,47,Business travel,Eco,556,5,2,2,2,5,5,5,5,2,2,5,3,3,5,1,0,satisfied
62482,Female,Loyal Customer,46,Business travel,Business,1744,2,2,2,2,3,4,4,4,4,4,4,5,4,4,28,14,satisfied


In [0]:
# Narrow the stream down to the subset of columns this notebook works with.
# NOTE(review): the schema declares "Baggage handling" (lowercase "h"), while this select uses
# "Baggage Handling" — presumably this relies on Spark's case-insensitive column resolution; confirm.
airline_data = airline_data_full.select(
    [
        "Gender",
        "Age",
        "Type of Travel",
        "Class",
        "Baggage Handling",
        "Checkin service",
        "Cleanliness",
        "Departure Delay in Minutes",
        "Arrival Delay in Minutes",
    ]
)

airline_data.display()

Gender,Age,Type of Travel,Class,Baggage Handling,Checkin service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
Female,52,Business travel,Eco,5,2,5,50,44
Female,36,Business travel,Business,4,3,5,0,0
Male,20,Business travel,Eco,3,2,2,0,0
Male,44,Business travel,Business,1,3,4,0,6
Female,49,Business travel,Eco,2,4,4,0,20
Male,16,Business travel,Eco,1,1,5,0,0
Female,77,Business travel,Business,5,4,3,0,0
Female,43,Business travel,Business,4,5,3,77,65
Male,47,Business travel,Eco,5,3,5,1,0
Female,46,Business travel,Business,4,5,4,28,14


In [0]:
# The displayed data contains no event-time column,
# so we add the current processing timestamp as a "Timestamp" column instead.

airline_data = airline_data.withColumn(
    colName="Timestamp",
    col=current_timestamp(),
)

airline_data.display()

Gender,Age,Type of Travel,Class,Baggage Handling,Checkin service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Timestamp
Female,52,Business travel,Eco,5,2,5,50,44,2023-03-08T10:44:29.910+0000
Female,36,Business travel,Business,4,3,5,0,0,2023-03-08T10:44:29.910+0000
Male,20,Business travel,Eco,3,2,2,0,0,2023-03-08T10:44:29.910+0000
Male,44,Business travel,Business,1,3,4,0,6,2023-03-08T10:44:29.910+0000
Female,49,Business travel,Eco,2,4,4,0,20,2023-03-08T10:44:29.910+0000
Male,16,Business travel,Eco,1,1,5,0,0,2023-03-08T10:44:29.910+0000
Female,77,Business travel,Business,5,4,3,0,0,2023-03-08T10:44:29.910+0000
Female,43,Business travel,Business,4,5,3,77,65,2023-03-08T10:44:29.910+0000
Male,47,Business travel,Eco,5,3,5,1,0,2023-03-08T10:44:29.910+0000
Female,46,Business travel,Business,4,5,4,28,14,2023-03-08T10:44:29.910+0000


In [0]:
# Aggregate without specifying a window: the aggregation then runs over the global window,
# which does not require any timestamp column. All aggregations use the global window by default.
flight_class_age_df = (
    airline_data
    .groupBy(airline_data["Class"])
    .agg({"Age": "avg"})
)

flight_class_age_df.display()

Class,avg(Age)
Eco Plus,37.0
Business,41.75
Eco,40.28947368421053


In [0]:
# New files will be added to the source; open the graph view and note how it changes dynamically.
flight_class_baggage_df = (
    airline_data
    .groupBy(airline_data["Class"])
    .agg({"Baggage Handling": "avg"})
)

display(flight_class_baggage_df)

Class,avg(Baggage Handling)
Eco Plus,3.1818181818181817
Business,3.888888888888889
Eco,3.697368421052632


Output can only be rendered in Databricks

#### Tumbling window

In [0]:
# Run this cell, then after about 2 minutes upload new files into the source directory
# and observe how the output updates.
avg_dep_delay_window_df = (
    airline_data
    .groupBy(window(airline_data["Timestamp"], "2 minutes"))
    .agg({"Departure Delay in Minutes": "avg"})
)

display(avg_dep_delay_window_df)

window,avg(Departure Delay in Minutes)
"List(2023-03-08T11:16:00.000+0000, 2023-03-08T11:18:00.000+0000)",15.95
"List(2023-03-08T10:56:00.000+0000, 2023-03-08T10:58:00.000+0000)",11.2
"List(2023-03-08T10:54:00.000+0000, 2023-03-08T10:56:00.000+0000)",15.542372881355933
"List(2023-03-08T11:08:00.000+0000, 2023-03-08T11:10:00.000+0000)",13.05
"List(2023-03-08T10:58:00.000+0000, 2023-03-08T11:00:00.000+0000)",10.55
"List(2023-03-08T11:04:00.000+0000, 2023-03-08T11:06:00.000+0000)",11.8


Note that if you upload multiple files within a single tumbling-window interval, they will all be accounted for in the same window's calculation.

In [0]:
# Group by both a 2-minute tumbling window and the "Class" column.
avg_checkin_score_window_df = (
    airline_data
    .groupBy(
        window(airline_data["Timestamp"], "2 minutes"),
        "Class",
    )
    .agg({"Checkin service": "avg"})
)

display(avg_checkin_score_window_df)

window,Class,avg(Checkin service)
"List(2023-03-08T11:08:00.000+0000, 2023-03-08T11:10:00.000+0000)",Eco,2.857142857142857
"List(2023-03-08T11:16:00.000+0000, 2023-03-08T11:18:00.000+0000)",Business,3.8
"List(2023-03-08T11:04:00.000+0000, 2023-03-08T11:06:00.000+0000)",Business,3.571428571428572
"List(2023-03-08T11:08:00.000+0000, 2023-03-08T11:10:00.000+0000)",Business,2.6
"List(2023-03-08T11:02:00.000+0000, 2023-03-08T11:04:00.000+0000)",Eco,3.127659574468085
"List(2023-03-08T11:08:00.000+0000, 2023-03-08T11:10:00.000+0000)",Eco Plus,3.6666666666666665
"List(2023-03-08T11:04:00.000+0000, 2023-03-08T11:06:00.000+0000)",Eco Plus,4.0
"List(2023-03-08T11:04:00.000+0000, 2023-03-08T11:06:00.000+0000)",Eco,3.083333333333333
"List(2023-03-08T11:02:00.000+0000, 2023-03-08T11:04:00.000+0000)",Eco Plus,3.2857142857142856
"List(2023-03-08T11:02:00.000+0000, 2023-03-08T11:04:00.000+0000)",Business,3.6222222222222222


In [0]:
# Average check-in score per 30-second tumbling window and type of travel.
avg_checkin_score_df = (
    airline_data
    .groupBy(
        window(airline_data["Timestamp"], "30 seconds"),
        airline_data["Type of Travel"],
    )
    .agg({"Checkin service": "avg"})
)

display(avg_checkin_score_df)

window,Type of Travel,avg(Checkin service)
"List(2023-03-08T11:17:00.000+0000, 2023-03-08T11:17:30.000+0000)",Business travel,3.625
"List(2023-03-08T11:07:00.000+0000, 2023-03-08T11:07:30.000+0000)",Personal Travel,3.702702702702703
"List(2023-03-08T11:07:00.000+0000, 2023-03-08T11:07:30.000+0000)",Business travel,3.1951219512195124
"List(2023-03-08T11:17:00.000+0000, 2023-03-08T11:17:30.000+0000)",Personal Travel,1.25
"List(2023-03-08T11:09:00.000+0000, 2023-03-08T11:09:30.000+0000)",Personal Travel,3.25
"List(2023-03-08T11:09:00.000+0000, 2023-03-08T11:09:30.000+0000)",Business travel,2.583333333333333


Output can only be rendered in Databricks

In [0]:
# Expose the window bounds as separate "start" and "end" columns
# (flattening the window struct in the output) next to the renamed average.
avg_age_df = (
    airline_data
    .groupBy(
        window(airline_data["Timestamp"], "1 minute"),
        airline_data["Gender"],
    )
    .agg({"Age": "avg"})
    .withColumnRenamed("avg(Age)", "avg_age")
    .select("window.start", "window.end", "Gender", "avg_age")
)

display(avg_age_df)

start,end,Gender,avg_age
2023-03-08T11:17:00.000+0000,2023-03-08T11:18:00.000+0000,Male,32.333333333333336
2023-03-08T11:11:00.000+0000,2023-03-08T11:12:00.000+0000,Female,43.06493506493506
2023-03-08T11:11:00.000+0000,2023-03-08T11:12:00.000+0000,Male,38.016129032258064
2023-03-08T11:17:00.000+0000,2023-03-08T11:18:00.000+0000,Female,46.45454545454545


#### Sliding Window

In [0]:
# A sliding window needs both a window duration and a slide duration (the sliding step):
# here a 3-minute-wide window is created every 1 minute.
avg_clean_df = (
    airline_data
    .groupBy(
        window(airline_data["Timestamp"], "3 minutes", "1 minute"),
        airline_data["Gender"],
    )
    .agg({"Cleanliness": "avg"})
)

display(avg_clean_df)

window,Gender,avg(Cleanliness)
"List(2023-03-08T11:16:00.000+0000, 2023-03-08T11:19:00.000+0000)",Male,2.7777777777777777
"List(2023-03-08T11:13:00.000+0000, 2023-03-08T11:16:00.000+0000)",Female,3.636363636363636
"List(2023-03-08T11:15:00.000+0000, 2023-03-08T11:18:00.000+0000)",Female,3.522727272727273
"List(2023-03-08T11:14:00.000+0000, 2023-03-08T11:17:00.000+0000)",Female,3.636363636363636
"List(2023-03-08T11:16:00.000+0000, 2023-03-08T11:19:00.000+0000)",Female,2.727272727272727
"List(2023-03-08T11:14:00.000+0000, 2023-03-08T11:17:00.000+0000)",Male,3.193548387096774
"List(2023-03-08T11:17:00.000+0000, 2023-03-08T11:20:00.000+0000)",Female,2.727272727272727
"List(2023-03-08T11:13:00.000+0000, 2023-03-08T11:16:00.000+0000)",Male,3.193548387096774
"List(2023-03-08T11:17:00.000+0000, 2023-03-08T11:20:00.000+0000)",Male,2.7777777777777777
"List(2023-03-08T11:15:00.000+0000, 2023-03-08T11:18:00.000+0000)",Male,3.140845070422535


In [0]:
# Sliding window (2 minutes wide, sliding every 30 seconds) with the result
# additionally ordered by window.start.
avg_baggage_df = (
    airline_data
    .groupBy(
        window(airline_data["Timestamp"], "2 minutes", "30 seconds"),
        airline_data["Type of Travel"],
    )
    .agg({"Baggage Handling": "avg"})
    .orderBy("window.start")
)


display(avg_baggage_df)

window,Type of Travel,avg(Baggage Handling)
"List(2023-03-08T11:17:30.000+0000, 2023-03-08T11:19:30.000+0000)",Business travel,3.6818181818181817
"List(2023-03-08T11:17:30.000+0000, 2023-03-08T11:19:30.000+0000)",Personal Travel,3.8979591836734695
"List(2023-03-08T11:18:00.000+0000, 2023-03-08T11:20:00.000+0000)",Business travel,3.6818181818181817
"List(2023-03-08T11:18:00.000+0000, 2023-03-08T11:20:00.000+0000)",Personal Travel,3.8979591836734695
"List(2023-03-08T11:18:30.000+0000, 2023-03-08T11:20:30.000+0000)",Personal Travel,3.8979591836734695
"List(2023-03-08T11:18:30.000+0000, 2023-03-08T11:20:30.000+0000)",Business travel,3.6818181818181817
"List(2023-03-08T11:19:00.000+0000, 2023-03-08T11:21:00.000+0000)",Personal Travel,3.8979591836734695
"List(2023-03-08T11:19:00.000+0000, 2023-03-08T11:21:00.000+0000)",Business travel,3.6818181818181817
