# Demo Tree Model

### Dataset: flights.csv
- You'll build a regression model to predict flight delay or not 
- With 'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration' as a predictor

First thing to do is start a Spark Session

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('lr_demo').getOrCreate()

In [6]:
# Use Spark to read flights.csv file.
data = spark.read.csv("flights.csv",inferSchema=True,header=True)

In [7]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [8]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [9]:
data.head()

Row(mon=11, dom=20, dow=6, carrier='US', flight=19, org='JFK', mile=2153, depart=9.48, duration=351, delay='NA')

In [10]:
# for item in data.head():
#     print(item)

In [11]:
data.count()

50000

In [12]:
# Remove the 'flight' column
data = data.drop('flight')

In [13]:
# Number of records with missing 'delay' values
data.filter('delay IS NULL').count()

0

In [14]:
# Remove records with missing 'delay' values
data = data.filter('delay IS NOT NULL')

In [15]:
# Remove records with missing values in any column and get the number of remaining rows
data = data.na.drop()
data.count()

50000

In [16]:
# Import the required function
from pyspark.sql.functions import round

In [17]:
# Convert 'mile' to 'km' and drop 'mile' column
data = data.withColumn('km', round(data.mile * 1.60934, 0))

In [18]:
# Create 'label' column indicating whether flight delayed (1) or not (0)
data = data.withColumn('label', (data.delay >= 15).cast('integer'))
# Check first five records
data.show(3)

+---+---+---+-------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
only showing top 3 rows



## Categories data

In [19]:
from pyspark.ml.feature import StringIndexer

In [20]:
# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(data)

# Indexer creates a new column with numeric index values
data_indexed = indexer_model.transform(data)

# Repeat the process for the other categorical feature
data_indexed = StringIndexer(inputCol='org', outputCol='org_idx').fit(data_indexed).transform(data_indexed)

In [21]:
data_indexed.show(3)

+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| null|        6.0|    2.0|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+
only showing top 3 rows



## Setting Up DataFrame for Machine Learning 

## Assembling columns

In [22]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [23]:
data_indexed.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'org',
 'mile',
 'depart',
 'duration',
 'delay',
 'km',
 'label',
 'carrier_idx',
 'org_idx']

In [24]:
# Create an assembler object
assembler = VectorAssembler(inputCols=[
    'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration'
], outputCol='features')

In [25]:
data_pre = assembler.transform(data_indexed)

In [26]:
# Check the resulting column
data_pre.select('features', 'label').show(2, truncate=False)

+-----------------------------------------+-----+
|features                                 |label|
+-----------------------------------------+-----+
|[11.0,20.0,6.0,6.0,2.0,3465.0,9.48,351.0]|null |
|[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |1    |
+-----------------------------------------+-----+
only showing top 2 rows



In [27]:
data_pre.show(3, False)

+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|km    |label|carrier_idx|org_idx|features                                 |
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
|11 |20 |6  |US     |JFK|2153|9.48  |351     |NA   |3465.0|null |6.0        |2.0    |[11.0,20.0,6.0,6.0,2.0,3465.0,9.48,351.0]|
|0  |22 |2  |UA     |ORD|316 |16.33 |82      |30   |509.0 |1    |0.0        |0.0    |[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |
|2  |20 |4  |UA     |SFO|337 |6.17  |82      |-8   |542.0 |0    |0.0        |1.0    |[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
only showing top 3 rows



In [28]:
final_data = data_pre.select("features","label")
final_data.count()

50000

In [29]:
final_data = final_data.na.drop()
final_data.count()

47022

In [30]:
final_data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,22.0,2.0,0.0...|    1|
|[2.0,20.0,4.0,0.0...|    0|
|[9.0,13.0,1.0,1.0...|    0|
|[5.0,2.0,1.0,0.0,...|    0|
|[7.0,2.0,6.0,1.0,...|    1|
+--------------------+-----+
only showing top 5 rows



In [31]:
train_data,test_data = final_data.randomSplit([0.8,0.2])

In [32]:
train_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|             37519|
|   mean|0.5122471281217517|
| stddev|0.4998566467729511|
|    min|                 0|
|    max|                 1|
+-------+------------------+



In [33]:
test_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              9503|
|   mean|0.5076291697358729|
| stddev|0.4999680988764607|
|    min|                 0|
|    max|                 1|
+-------+------------------+



# Decision Tree
- ...

In [34]:
# Import the Decision Tree Classifier class
from pyspark.ml.classification import DecisionTreeClassifier

In [35]:
# Create a classifier object and fit to the training data
tree = DecisionTreeClassifier(featuresCol='features',
                              labelCol='label',
                              predictionCol='prediction')

In [36]:
# Fit the model to the data and call this tree_model
tree_model = tree.fit(train_data)

In [37]:
# Check test dataset
test_model = tree_model.transform(test_data)

In [38]:
# Inspect results
test_model.select('label', 'prediction', 'probability').show(3, False)

+-----+----------+---------------------------------------+
|label|prediction|probability                            |
+-----+----------+---------------------------------------+
|1    |1.0       |[0.3672396245618003,0.6327603754381997]|
|0    |1.0       |[0.3672396245618003,0.6327603754381997]|
|1    |1.0       |[0.3672396245618003,0.6327603754381997]|
+-----+----------+---------------------------------------+
only showing top 3 rows



In [39]:
# Create a confusion matrix
test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1224|
|    0|       0.0| 2466|
|    1|       1.0| 3600|
|    0|       1.0| 2213|
+-----+----------+-----+



In [40]:
# Calculate the elements of the confusion matrix
TN = test_model.filter('prediction = 0 and label = prediction').count()
TP = test_model.filter('prediction = 1 and label = prediction').count()
FN = test_model.filter('prediction = 0 and label != prediction').count()
FP = test_model.filter('prediction = 1 and label != prediction').count()

In [41]:
# Accuracy measures the proportion of correct predictions
print('Accuracy: ', (TN + TP) / (TN+TP+FN+FP))

Accuracy:  0.6383247395559297


In [42]:
# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall   = {:.2}'.format(precision, recall))

precision = 0.62
recall   = 0.75


In [44]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(test_model,
                                              {multi_evaluator.metricName: "weightedPrecision"})

In [45]:
weighted_precision

0.6434233626410526

In [46]:
# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(test_model,
                                {binary_evaluator.metricName: 'areaUnderROC'})

In [47]:
auc

0.6445129512018165

In [48]:
# Save model
tree_model.save('tree_model_flights_50k')

In [66]:
from pyspark.ml.classification import DecisionTreeClassificationModel
# Load model from
tree_model2 = DecisionTreeClassificationModel.load('tree_model_flights_50k')

In [67]:
# Predict new values (Assuming select test_data)
unlabeled_data = test_data.select('features')

In [68]:
predictions = tree_model2.transform(unlabeled_data)

In [69]:
predictions.show(2, False)

+--------------------------------------+----------------+---------------------------------------+----------+
|features                              |rawPrediction   |probability                            |prediction|
+--------------------------------------+----------------+---------------------------------------+----------+
|(8,[1,5,6,7],[6.0,538.0,20.0,84.0])   |[6495.0,11191.0]|[0.3672396245618003,0.6327603754381997]|1.0       |
|(8,[1,5,6,7],[6.0,1366.0,13.17,135.0])|[6495.0,11191.0]|[0.3672396245618003,0.6327603754381997]|1.0       |
+--------------------------------------+----------------+---------------------------------------+----------+
only showing top 2 rows



# Random Forest

In [54]:
from pyspark.ml.classification import RandomForestClassifier

In [56]:
rfc = RandomForestClassifier(featuresCol='features',
                             labelCol='label',
                             predictionCol='prediction')

In [57]:
# Fit the model to the data and call this rfc_model
rfc_model = rfc.fit(train_data)

In [59]:
# Find the number of tree and the relative importance of features
print('Number of trees:', rfc_model.getNumTrees)
print('Relative importance of features:', rfc_model.featureImportances)

Number of trees: 20
Relative importance of features: (8,[0,1,2,3,4,5,6,7],[0.21332959390852432,0.01820939604960225,0.014906799404443616,0.06302817698402711,0.2507675488599955,0.050950740177925,0.34216746370462287,0.0466402809108592])


In [60]:
# Check test dataset
rfc_test_model = rfc_model.transform(test_data)

In [61]:
# Inspect results
rfc_test_model.select('label', 'prediction', 'probability').show(3, False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.3938817093547602,0.6061182906452398] |
|0    |1.0       |[0.35297497107958875,0.6470250289204114]|
|1    |1.0       |[0.34245483144831407,0.657545168551686] |
+-----+----------+----------------------------------------+
only showing top 3 rows



In [62]:
# Create a confusion matrix
rfc_test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1621|
|    0|       0.0| 2896|
|    1|       1.0| 3203|
|    0|       1.0| 1783|
+-----+----------+-----+



In [63]:
# Calculate the elements of the confusion matrix
TN = test_model.filter('prediction = 0 and label = prediction').count()
TP = test_model.filter('prediction = 1 and label = prediction').count()
FN = test_model.filter('prediction = 0 and label != prediction').count()
FP = test_model.filter('prediction = 1 and label != prediction').count()

In [64]:
# Accuracy measures the proportion of correct predictions
print('Accuracy: ', (TN + TP) / (TN+TP+FN+FP))

Accuracy:  0.6383247395559297


In [65]:
# Save model
rfc_model.save('rfc_model_flights_50k')

In [70]:
from pyspark.ml.classification import RandomForestClassificationModel
# Load model from
rfc_model2 = RandomForestClassificationModel.load('rfc_model_flights_50k')

# Predict new values (Assuming select test_data)
unlabeled_data = test_data.select('features')

predictions = rfc_model2.transform(unlabeled_data)

predictions.show(3, False)

+--------------------------------------+---------------------------------------+----------------------------------------+----------+
|features                              |rawPrediction                          |probability                             |prediction|
+--------------------------------------+---------------------------------------+----------------------------------------+----------+
|(8,[1,5,6,7],[6.0,538.0,20.0,84.0])   |[7.877634187095204,12.122365812904796] |[0.3938817093547602,0.6061182906452398] |1.0       |
|(8,[1,5,6,7],[6.0,1366.0,13.17,135.0])|[7.0594994215917755,12.940500578408226]|[0.35297497107958875,0.6470250289204114]|1.0       |
|(8,[1,5,6,7],[6.0,2411.0,20.68,241.0])|[6.849096628966281,13.150903371033719] |[0.34245483144831407,0.657545168551686] |1.0       |
+--------------------------------------+---------------------------------------+----------------------------------------+----------+
only showing top 3 rows



# Gradient-Boosted Trees

In [72]:
from pyspark.ml.classification import GBTClassifier

In [73]:
gbt = GBTClassifier(featuresCol='features',
                    labelCol='label',
                    predictionCol='prediction')

In [74]:
# Fit the model to the data and call this gbt_model
gbt_model = gbt.fit(train_data)

In [75]:
# Find the number of trees and the relative importance of features
print('Number of trees: ', gbt_model.getNumTrees)
print('Relative importance of features:', gbt_model.featureImportances)

Number of trees:  20
Relative importance of features: (8,[0,1,2,3,4,5,6,7],[0.19310486189285184,0.16009260469429737,0.15006906228740075,0.08658397782702128,0.16370047680501348,0.06830546520387297,0.13871524602018986,0.03942830526935252])


In [77]:
# Check test dataset
gbt_test_model = gbt_model.transform(test_data)

In [78]:
# Inspect results
gbt_test_model.select('label', 'prediction', 'probability').show(3, False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.19018799066680808,0.809812009333192] |
|0    |1.0       |[0.29952519496746777,0.7004748050325322]|
|1    |1.0       |[0.16564957437919892,0.8343504256208011]|
+-----+----------+----------------------------------------+
only showing top 3 rows



In [79]:
# Create a confusion matrix
gbt_test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1377|
|    0|       0.0| 2898|
|    1|       1.0| 3447|
|    0|       1.0| 1781|
+-----+----------+-----+



In [80]:
# Calculate the elements of the confusion matrix
TN = test_model.filter('prediction = 0 and label = prediction').count()
TP = test_model.filter('prediction = 1 and label = prediction').count()
FN = test_model.filter('prediction = 0 and label != prediction').count()
FP = test_model.filter('prediction = 1 and label != prediction').count()

In [81]:
# Accuracy measures the proportion of correct predictions
print('Accuracy: ', (TN + TP) / (TN+TP+FN+FP))

Accuracy:  0.6383247395559297


In [82]:
# Save model
gbt_model.save('gbt_model_flights_50k')

In [83]:
from pyspark.ml.classification import GBTClassificationModel

In [84]:
# Load model from
gbt_model2 = GBTClassificationModel.load('gbt_model_flights_50k')

# Predict new values (Assuming select test_data)
unlabeled_data = test_data.select('features')

predictions = gbt_model2.transform(unlabeled_data)

predictions.show(3, False)

+--------------------------------------+------------------------------------------+----------------------------------------+----------+
|features                              |rawPrediction                             |probability                             |prediction|
+--------------------------------------+------------------------------------------+----------------------------------------+----------+
|(8,[1,5,6,7],[6.0,538.0,20.0,84.0])   |[-0.7243945629611589,0.7243945629611589]  |[0.19018799066680808,0.809812009333192] |1.0       |
|(8,[1,5,6,7],[6.0,1366.0,13.17,135.0])|[-0.42477993028121197,0.42477993028121197]|[0.29952519496746777,0.7004748050325322]|1.0       |
|(8,[1,5,6,7],[6.0,2411.0,20.68,241.0])|[-0.8083894646854622,0.8083894646854622]  |[0.16564957437919892,0.8343504256208011]|1.0       |
+--------------------------------------+------------------------------------------+----------------------------------------+----------+
only showing top 3 rows



## Compare 3 model results: Decision Tree, Random Forest, GBT model

In [88]:
# With MulticlassClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
dtc_predictions = tree_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

In [90]:
# Select (prediction, true label) and compute test error
acc_evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName='accuracy')

In [91]:
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
gbt_acc = acc_evaluator.evaluate(gbt_predictions)

In [92]:
print('Results')
print('-'*60)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc_acc*100))
print('-'*60)
print('A random forest ensemble has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))
print('-'*60)
print('An ensemble using GBT has an accuracy of: {0:2.2f}%'.format(gbt_acc*100))

Results
------------------------------------------------------------
A single decision tree has an accuracy of: 63.83%
------------------------------------------------------------
A random forest ensemble has an accuracy of: 64.18%
------------------------------------------------------------
An ensemble using GBT has an accuracy of: 66.77%


In [94]:
# With BinaryClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [95]:
# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()

In [98]:
dtc_acc_2 = evaluator.evaluate(dtc_predictions)
rfc_acc_2 = evaluator.evaluate(rfc_predictions)
gbt_acc_2 = evaluator.evaluate(gbt_predictions)

In [99]:
print('Results')
print('-'*60)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc_acc_2*100))
print('-'*60)
print('A random forest ensemble has an accuracy of: {0:2.2f}%'.format(rfc_acc_2*100))
print('-'*60)
print('An ensemble using GBT has an accuracy of: {0:2.2f}%'.format(gbt_acc_2*100))

Results
------------------------------------------------------------
A single decision tree has an accuracy of: 64.45%
------------------------------------------------------------
A random forest ensemble has an accuracy of: 69.05%
------------------------------------------------------------
An ensemble using GBT has an accuracy of: 73.05%
