### Predicting flight delay

We are going to make models that predict wheter a flight will be delayed:

We will first explore the flight data, then clean it and lastly build a model that predicts if a flight will be delayed



In [25]:
#First we initialize spark

from pyspark.sql import SparkSession

### Specify clusers. The name. Get or create will make sure that we do not initialize two times the same session 
spark = SparkSession.builder.master('local[*]').appName('flights delay').getOrCreate()

In [26]:
### Next we read the data for this we use spark.read.csv
flights = spark.read.csv('./data/flights.csv',
                         sep=',',
                         header=True,
                         inferSchema=True, ### slow -> must go true the entire data once. We can specify the schema.
                         nullValue="NA")


In [27]:
### We can view the first 10 entries (notis the null values):
flights.show(10)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
|  5|  2|  1|     UA|   704|SFO| 550|  7.98|     102|    2|
|  7|  2|  6|     AA|   380|ORD| 733| 10.83|     135|   54|
|  1| 16|  6|     UA|  1477|ORD|1440|   8.0|     232|   -7|
|  1| 22|  5|     UA|   620|SJC|1829|  7.98|     250|  -13|
| 11|  8|  1|     OO|  5590|SFO| 158|  7.77|      60|   88|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 10 rows



In [28]:
### We can also check the dtype
flights.dtypes

[('mon', 'int'),
 ('dom', 'int'),
 ('dow', 'int'),
 ('carrier', 'string'),
 ('flight', 'int'),
 ('org', 'string'),
 ('mile', 'int'),
 ('depart', 'double'),
 ('duration', 'int'),
 ('delay', 'int')]

In [29]:
### We next check how many flights have a missing delay value, we can do this with the filter command:
# Number of records with missing 'delay' values
print(flights.filter('delay IS NULL').count())


2978


In [30]:
### With the use of the filter we can create a new dataframe
flights_valid_delay = flights.filter('delay IS NOT NULL')
### We check how many are left
print(flights_valid_delay.count())


# We can also do it with the dropna (drops every row with NA values)
flights_na_cleaned = flights.dropna()
print(flights_na_cleaned.count())

47022
47022


In [31]:
### Now we use drop the "flight" column as it does not provide us any information
flights_cleaned = flights_na_cleaned.drop("flight")


In [32]:
from pyspark.sql.functions import round
### Now we show how to create new columns
# We do not like the imperial system hence we will create a km column and remove the mile column
flights_km = flights_cleaned.withColumn("km", round(flights.mile *1.60934, 0 )).drop('mile')

flights_km.show(5)



+---+---+---+-------+---+------+--------+-----+------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|
+---+---+---+-------+---+------+--------+-----+------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|
+---+---+---+-------+---+------+--------+-----+------+
only showing top 5 rows



In [33]:
### Next we want to see if a flight is delayed or not. A flight is officially delayed when it arives 15 minutes or later
### We again create a new column
flights_delayed = flights_km.withColumn("label", (flights_km.delay >= 15).cast('integer'))

# Next we check the results
flights_delayed.show(5)



+---+---+---+-------+---+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|
+---+---+---+-------+---+------+--------+-----+------+-----+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|
+---+---+---+-------+---+------+--------+-----+------+-----+
only showing top 5 rows



In [34]:
### Creating an indexer
from pyspark.ml.feature import StringIndexer

# Creating the object, describing the input collumn and the output column
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# The indecer needs to be fit on the data
indexer_model = indexer.fit(flights_delayed)

# Then we need to transform the data. 
flights_indexed = indexer_model.transform(flights_delayed)

# A one liner for the org column
flights_indexed = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)
flights_indexed.show(10)

+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|        1.0|    0.0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|        0.0|    1.0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|        1.0|    0.0|
|  1| 16|  6|     UA|ORD|   8.0|     232|   -7|2317.0|    0|        0.0|    0.0|
|  1| 22|  5|     UA|SJC|  7.98|     250|  -13|2943.0|    0|        0.0|    5.0|
| 11|  8|  1|     OO|SFO|  7.77|      60|   88| 254.0|    1|        2.0|    1.0|
|  4| 26|  1|     AA|SFO| 13.25|     210|  -10|2356.0|    0|        1.0|    1.0|
|  4| 25|  0|     AA|ORD| 13

In [35]:
### Indexes are not good for classification as indexes do not have a linear ordering. Hence we will create a one-hot encoding
from pyspark.ml.feature import OneHotEncoder

one_hot = OneHotEncoder(inputCols=["carrier_idx", "org_idx"], outputCols=["carrier_hot", "org_hot"])
one_hot_model = one_hot.fit(flights_indexed)
flights_hot = one_hot_model.transform(flights_indexed)

print(flights_hot.show(5))

+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+-------------+-------------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|carrier_idx|org_idx|  carrier_hot|      org_hot|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+-------------+-------------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|(8,[0],[1.0])|(7,[0],[1.0])|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|(8,[0],[1.0])|(7,[1],[1.0])|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|        1.0|    0.0|(8,[1],[1.0])|(7,[0],[1.0])|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|        0.0|    1.0|(8,[0],[1.0])|(7,[1],[1.0])|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|        1.0|    0.0|(8,[1],[1.0])|(7,[0],[1.0])|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+-------------+-------------+
only showing top 5 

In [36]:
### lastly, in pyspark we need to assemble all columns that we want to use for predictions into 1 columns. We will use the Vector Assembler for this

# Import the necessary class
from pyspark.ml.feature import VectorAssembler

# Create an assembler object
assembler = VectorAssembler(inputCols=[
    'mon', 'dom', 'dow', 'carrier_hot', 'org_hot', 'km', 'depart','duration'
], outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights_hot)

# Check the resulting column
flights_assembled.select('features', 'delay').show(5, truncate=False)

+--------------------------------------------------------------------+-----+
|features                                                            |delay|
+--------------------------------------------------------------------+-----+
|(21,[1,2,3,11,18,19,20],[22.0,2.0,1.0,1.0,509.0,16.33,82.0])        |30   |
|(21,[0,1,2,3,12,18,19,20],[2.0,20.0,4.0,1.0,1.0,542.0,6.17,82.0])   |-8   |
|(21,[0,1,2,4,11,18,19,20],[9.0,13.0,1.0,1.0,1.0,1989.0,10.33,195.0])|-5   |
|(21,[0,1,2,3,12,18,19,20],[5.0,2.0,1.0,1.0,1.0,885.0,7.98,102.0])   |2    |
|(21,[0,1,2,4,11,18,19,20],[7.0,2.0,6.0,1.0,1.0,1180.0,10.83,135.0]) |54   |
+--------------------------------------------------------------------+-----+
only showing top 5 rows



In [37]:
### Now that we cleaned the data, we will start the training
## First create a random split
# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights_assembled.randomSplit([0.8, 0.2], 17) ## 17 = seed

# Check that training set has around 80% of records
training_ratio = flights_test.count() / flights_train.count()
print(training_ratio)

0.25475650433622415


In [38]:
### Next we will use a decision tree:
# Import the Decision Tree Classifier class
from pyspark.ml.classification import DecisionTreeClassifier

# Create a classifier object and fit to the training data
tree = DecisionTreeClassifier()
tree_model = tree.fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
prediction = tree_model.transform(flights_test)
prediction.select('label', 'prediction', 'probability').show(10, False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |0.0       |[0.5996802923041791,0.400319707695821]  |
|0    |1.0       |[0.33513097072419107,0.6648690292758089]|
|0    |0.0       |[0.5996802923041791,0.400319707695821]  |
|1    |1.0       |[0.33513097072419107,0.6648690292758089]|
|1    |1.0       |[0.3125577100646353,0.6874422899353647] |
|1    |1.0       |[0.3125577100646353,0.6874422899353647] |
|1    |1.0       |[0.33513097072419107,0.6648690292758089]|
|0    |1.0       |[0.33513097072419107,0.6648690292758089]|
|1    |1.0       |[0.33513097072419107,0.6648690292758089]|
|1    |1.0       |[0.33513097072419107,0.6648690292758089]|
+-----+----------+----------------------------------------+
only showing top 10 rows



In [39]:
### Confusion matrix
# Create a confusion matrix
prediction.groupBy("label", 'prediction').count().show()

# Calculate the elements of the confusion matrix
TN = prediction.filter('prediction = 0 AND label = prediction').count()
TP = prediction.filter('prediction = 1 AND label = prediction').count()
FN = prediction.filter('prediction = 0 AND label != prediction').count()
FP = prediction.filter('prediction = 1 AND label != prediction').count()

# Accuracy measures the proportion of correct predictions
accuracy = (TN + TP)/ (TN + TP + FN + FP)
print("TN", TN)
print("TP", TP)
print("FN", FN)
print("FP", FP)
print(accuracy)

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1333|
|    0|       0.0| 2303|
|    1|       1.0| 3577|
|    0|       1.0| 2334|
+-----+----------+-----+

TN 2303
TP 3577
FN 1333
FP 2334
0.6159002828113543


In [41]:
# Import the logistic regression class
from pyspark.ml.classification import LogisticRegression

# Create a classifier object and train on training data
logistic = LogisticRegression().fit(flights_train)

# Create predictions for the testing data and show confusion matrix
prediction = logistic.transform(flights_test)
prediction.groupBy('label', 'prediction').count().show()

# Calculate the elements of the confusion matrix
TN = prediction.filter('prediction = 0 AND label = prediction').count()
TP = prediction.filter('prediction = 1 AND label = prediction').count()
FN = prediction.filter('prediction = 0 AND label != prediction').count()
FP = prediction.filter('prediction = 1 AND label != prediction').count()

# Accuracy measures the proportion of correct predictions
accuracy = (TN + TP)/ (TN + TP + FN + FP)
print("TN", TN)
print("TP", TP)
print("FN", FN)
print("FP", FP)
print(accuracy)

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1618|
|    0|       0.0| 2498|
|    1|       1.0| 3292|
|    0|       1.0| 2139|
+-----+----------+-----+

TN 2498
TP 3292
FN 1618
FP 2139
0.6064732376662826


In [42]:
### We can check the coefficients
logistic.coefficients

DenseVector([-0.0573, 0.0003, -0.0043, 0.7835, 0.924, 0.7461, 0.601, 0.6403, 0.9975, 0.3143, 0.5775, 1.3961, 1.2412, 1.2774, 1.3595, 0.7784, 0.6897, 0.7251, -0.0, 0.0789, 0.0017])

In [23]:
# Import the logistic regression class
from pyspark.ml.classification import LinearSVC

# Create a classifier object and train on training data
svc = LinearSVC().fit(flights_train)

# Create predictions for the testing data and show confusion matrix
prediction = svc.transform(flights_test)
prediction.groupBy('label', 'prediction').count().show()

# Calculate the elements of the confusion matrix
TN = prediction.filter('prediction = 0 AND label = prediction').count()
TP = prediction.filter('prediction = 1 AND label = prediction').count()
FN = prediction.filter('prediction = 0 AND label != prediction').count()
FP = prediction.filter('prediction = 1 AND label != prediction').count()

# Accuracy measures the proportion of correct predictions
accuracy = (TN + TP)/ (TN + TP + FN + FP)
print("TN", TN)
print("TP", TP)
print("FN", FN)
print("FP", FP)
print(accuracy)

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1517|
|    0|       0.0| 2403|
|    1|       1.0| 3393|
|    0|       1.0| 2234|
+-----+----------+-----+

TN 2403
TP 3393
FN 1517
FP 2234
0.6071017073426207


In [24]:
### evaluating
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: "weightedPrecision"})

print(weighted_precision)

# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(prediction, {binary_evaluator.metricName: "areaUnderROC"})
print(auc)

0.6078545760403053
0.6452004091766956
