# Demo Tree Model

### Dataset: flights.csv
- You'll build a regression model to predict flight delay or not 
- With 'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration' as a predictor

First thing to do is start a Spark Session

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [4]:
spark = SparkSession.builder.appName('dt_demo').getOrCreate()

In [5]:
# Use Spark to read flights.csv file.
data = spark.read.csv("../../Data/flights.csv",inferSchema=True,header=True)

In [6]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [7]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [8]:
data.head()

Row(mon=11, dom=20, dow=6, carrier='US', flight=19, org='JFK', mile=2153, depart=9.48, duration=351, delay='NA')

In [9]:
# for item in data.head():
#     print(item)

In [10]:
data.count()

50000

In [11]:
# Remove the 'flight' column
data = data.drop('flight')

In [12]:
# Number of records with missing 'delay' values
data.filter('delay IS NULL').count()

0

In [13]:
# Remove records with missing 'delay' values
data = data.filter('delay IS NOT NULL')

In [14]:
# Remove records with missing values in any column and get the number of remaining rows
data = data.na.drop()
data.count()

50000

In [15]:
# Import the required function
from pyspark.sql.functions import round

In [16]:
# Convert 'mile' to 'km' and drop 'mile' column
data = data.withColumn('km', round(data.mile * 1.60934, 0))

In [17]:
# Create 'label' column indicating whether flight delayed (1) or not (0)
data = data.withColumn('label', (data.delay >= 15).cast('integer'))
# Check first five records
data.show(3)

+---+---+---+-------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
only showing top 3 rows



## Categories data

In [18]:
from pyspark.ml.feature import StringIndexer

In [19]:
# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(data)

# Indexer creates a new column with numeric index values
data_indexed = indexer_model.transform(data)

# Repeat the process for the other categorical feature
data_indexed = StringIndexer(inputCol='org', outputCol='org_idx').fit(data_indexed).transform(data_indexed)

In [20]:
data_indexed.show(3)

+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| null|        6.0|    2.0|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+
only showing top 3 rows



## Setting Up DataFrame for Machine Learning 

## Assembling columns

In [21]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [22]:
data_indexed.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'org',
 'mile',
 'depart',
 'duration',
 'delay',
 'km',
 'label',
 'carrier_idx',
 'org_idx']

In [23]:
# Create an assembler object
assembler = VectorAssembler(inputCols=[
    'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration'
], outputCol='features')

In [24]:
data_pre = assembler.transform(data_indexed)

In [25]:
# Check the resulting column
data_pre.select('features', 'label').show(2, truncate=False)

+-----------------------------------------+-----+
|features                                 |label|
+-----------------------------------------+-----+
|[11.0,20.0,6.0,6.0,2.0,3465.0,9.48,351.0]|null |
|[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |1    |
+-----------------------------------------+-----+
only showing top 2 rows



In [26]:
data_pre.show(3, False)

+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|km    |label|carrier_idx|org_idx|features                                 |
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
|11 |20 |6  |US     |JFK|2153|9.48  |351     |NA   |3465.0|null |6.0        |2.0    |[11.0,20.0,6.0,6.0,2.0,3465.0,9.48,351.0]|
|0  |22 |2  |UA     |ORD|316 |16.33 |82      |30   |509.0 |1    |0.0        |0.0    |[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |
|2  |20 |4  |UA     |SFO|337 |6.17  |82      |-8   |542.0 |0    |0.0        |1.0    |[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
only showing top 3 rows



In [27]:
final_data = data_pre.select("features","label")
final_data.count()

50000

In [28]:
final_data = final_data.na.drop()
final_data.count()

47022

In [29]:
final_data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,22.0,2.0,0.0...|    1|
|[2.0,20.0,4.0,0.0...|    0|
|[9.0,13.0,1.0,1.0...|    0|
|[5.0,2.0,1.0,0.0,...|    0|
|[7.0,2.0,6.0,1.0,...|    1|
+--------------------+-----+
only showing top 5 rows



In [30]:
train_data,test_data = final_data.randomSplit([0.8,0.2])

In [31]:
train_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|             37660|
|   mean|0.5130111524163569|
| stddev|0.4998373174711939|
|    min|                 0|
|    max|                 1|
+-------+------------------+



In [32]:
test_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              9362|
|   mean|0.5044862208929716|
| stddev|0.5000065781772699|
|    min|                 0|
|    max|                 1|
+-------+------------------+



# Decision Tree

### Xây dựng model

In [33]:
# Import the Decision Tree Classifier class
from pyspark.ml.classification import DecisionTreeClassifier

In [34]:
# Create a classifier object and fit to the training data
tree = DecisionTreeClassifier(featuresCol='features', labelCol='label', predictionCol='prediction')

In [35]:
# Fit the model to the data and call this tree_model
tree_model = tree.fit(train_data)

### Đánh giá kết quả

In [36]:
# Check test datase
tree_test_model = tree_model.transform(test_data)

In [37]:
# Inspect results
tree_test_model.select('label', 'prediction', 'probability').show(3, False)

+-----+----------+---------------------------------------+
|label|prediction|probability                            |
+-----+----------+---------------------------------------+
|1    |1.0       |[0.3412434082708854,0.6587565917291146]|
|1    |1.0       |[0.3412434082708854,0.6587565917291146]|
|0    |1.0       |[0.3412434082708854,0.6587565917291146]|
+-----+----------+---------------------------------------+
only showing top 3 rows



In [38]:
# Create predictions for the testing data and show confutsion matrix
tree_test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1270|
|    0|       0.0| 2424|
|    1|       1.0| 3453|
|    0|       1.0| 2215|
+-----+----------+-----+



### Đánh giá model với test dataset

In [39]:
# Calculate the elements of the confusion matrix
TN = tree_test_model.filter('prediction = 0 AND label = prediction').count()
TP = tree_test_model.filter('prediction = 1 AND label = prediction').count()
FN = tree_test_model.filter('prediction = 0 AND label != prediction').count()
FP = tree_test_model.filter('prediction = 1 AND label != prediction').count()

In [40]:
acc = (TP + TN) / (TP + TN + FP + FN)
# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall = {:.2f}\nacc={:.2f}'.format(precision, recall, acc))

precision = 0.61
recall = 0.73
acc=0.63


### Dự đoán dữ liệu mới

In [41]:
# Predict new values (Assuming select test_data)
unlabeled_data = test_data.select('features')

In [42]:
predictions = tree_model.transform(unlabeled_data)

In [43]:
predictions.show(5)

+--------------------+---------------+--------------------+----------+
|            features|  rawPrediction|         probability|prediction|
+--------------------+---------------+--------------------+----------+
|(8,[1,5,6,7],[6.0...|[4918.0,9494.0]|[0.34124340827088...|       1.0|
|(8,[1,5,6,7],[6.0...|[4918.0,9494.0]|[0.34124340827088...|       1.0|
|(8,[1,5,6,7],[13....|[4918.0,9494.0]|[0.34124340827088...|       1.0|
|(8,[1,5,6,7],[13....|[4918.0,9494.0]|[0.34124340827088...|       1.0|
|(8,[1,5,6,7],[13....|[1464.0,1254.0]|[0.53863134657836...|       0.0|
+--------------------+---------------+--------------------+----------+
only showing top 5 rows



# Random Forest

### Xây dựng model

In [44]:
# Import the Decision Random Forest Classifier class
from pyspark.ml.classification import RandomForestClassifier

In [45]:
rfc = RandomForestClassifier(featuresCol='features', labelCol='label', predictionCol='prediction')

In [46]:
# Fit the model to the data and call this rfc_model
rfc_model = rfc.fit(train_data)

In [47]:
# Find the number of trees and the relative importance of features
print("Number of trees:", rfc_model.getNumTrees)
print("Relative importance of features:", rfc_model.featureImportances)

Number of trees: 20
Relative importance of features: (8,[0,1,2,3,4,5,6,7],[0.22603041428846674,0.008907446947624902,0.005574732834578202,0.07620316036476163,0.2053546304399398,0.05233778761130855,0.37508538954179205,0.050506437971528116])


### Đánh giá kết quả

In [48]:
# Check test dataset
rfc_test_model = rfc_model.transform(test_data)

In [49]:
# Inspect results
rfc_test_model.select('label', 'prediction', 'probability').show(3, False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.38784917500718313,0.6121508249928168]|
|1    |1.0       |[0.3360111906367321,0.6639888093632679] |
|0    |1.0       |[0.3773481068653801,0.6226518931346199] |
+-----+----------+----------------------------------------+
only showing top 3 rows



In [50]:
# Create predictions for the testing data and show confutsion matrix
rfc_test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1682|
|    0|       0.0| 2902|
|    1|       1.0| 3041|
|    0|       1.0| 1737|
+-----+----------+-----+



### Đánh giá model với test dataset

In [51]:
# Calculate the elements of the confusion matrix
TN = rfc_test_model.filter('prediction = 0 AND label = prediction').count()
TP = rfc_test_model.filter('prediction = 1 AND label = prediction').count()
FN = rfc_test_model.filter('prediction = 0 AND label != prediction').count()
FP = rfc_test_model.filter('prediction = 1 AND label != prediction').count()

In [52]:
acc = (TP + TN) / (TP + TN + FP + FN)
# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall = {:.2f}\nacc={:.2f}'.format(precision, recall, acc))

precision = 0.64
recall = 0.64
acc=0.63


### Dự đoán dữ liệu mới

In [53]:
predictions = rfc_model.transform(unlabeled_data)

In [54]:
predictions.show(5)

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[6.0...|[7.75698350014366...|[0.38784917500718...|       1.0|
|(8,[1,5,6,7],[6.0...|[6.72022381273464...|[0.33601119063673...|       1.0|
|(8,[1,5,6,7],[13....|[7.54696213730760...|[0.37734810686538...|       1.0|
|(8,[1,5,6,7],[13....|[6.78822111442580...|[0.33941105572129...|       1.0|
|(8,[1,5,6,7],[13....|[10.9307894280257...|[0.54653947140128...|       0.0|
+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



# Gradient-Boosted Trees

### Xây dựng model

In [55]:
# Import the Decision Gradient-Boosted Trees Classifier class
from pyspark.ml.classification import GBTClassifier

In [56]:
gbt = GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction')

In [57]:
# Fit the model to the data and call this rfc_model
gbt_model = gbt.fit(train_data)

In [58]:
# Find the number of trees and the relative importance of features
print("Number of trees:", gbt_model.getNumTrees)
print("Relative importance of features:", gbt_model.featureImportances)

Number of trees: 20
Relative importance of features: (8,[0,1,2,3,4,5,6,7],[0.22600051106186897,0.1592260024526632,0.13645822373863706,0.0939109357745934,0.1713998510505963,0.059770174019025325,0.13505216349969715,0.018182138402918633])


### Đánh giá kết quả

In [59]:
# Check test dataset
gbt_test_model = gbt_model.transform(test_data)

In [60]:
# Inspect results
gbt_test_model.select('label', 'prediction', 'probability').show(3, False)

+-----+----------+---------------------------------------+
|label|prediction|probability                            |
+-----+----------+---------------------------------------+
|1    |1.0       |[0.32545822929341595,0.674541770706584]|
|1    |1.0       |[0.306844386068517,0.693155613931483]  |
|0    |1.0       |[0.2953139669620007,0.7046860330379994]|
+-----+----------+---------------------------------------+
only showing top 3 rows



In [61]:
# Create predictions for the testing data and show confutsion matrix
gbt_test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1377|
|    0|       0.0| 2778|
|    1|       1.0| 3346|
|    0|       1.0| 1861|
+-----+----------+-----+



### Đánh giá model với test dataset

In [62]:
# Calculate the elements of the confusion matrix
TN = gbt_test_model.filter('prediction = 0 AND label = prediction').count()
TP = gbt_test_model.filter('prediction = 1 AND label = prediction').count()
FN = gbt_test_model.filter('prediction = 0 AND label != prediction').count()
FP = gbt_test_model.filter('prediction = 1 AND label != prediction').count()

In [63]:
acc = (TP + TN) / (TP + TN + FP + FN)
# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall = {:.2f}\nacc={:.2f}'.format(precision, recall, acc))

precision = 0.64
recall = 0.71
acc=0.65


### Dự đoán dữ liệu mới

In [64]:
predictions = gbt_model.transform(unlabeled_data)

In [65]:
predictions.show(5)

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[6.0...|[-0.3643997382705...|[0.32545822929341...|       1.0|
|(8,[1,5,6,7],[6.0...|[-0.4074568959014...|[0.30684438606851...|       1.0|
|(8,[1,5,6,7],[13....|[-0.4348566376214...|[0.29531396696200...|       1.0|
|(8,[1,5,6,7],[13....|[-0.1881301750517...|[0.40702917084277...|       1.0|
|(8,[1,5,6,7],[13....|[0.06431314063294...|[0.53211230842346...|       0.0|
+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



# So sánh kết quả 3 model

### Với MulticlassClassificationEvaluator

In [66]:
# Select (prediction, true label) and compute test error
acc_evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")

In [68]:
dtc_acc = acc_evaluator.evaluate(tree_test_model)
rfc_acc = acc_evaluator.evaluate(rfc_test_model)
gbt_acc = acc_evaluator.evaluate(gbt_test_model)

In [69]:
print("Results")
print('-'*60)
print('A single Decision Tree has an accuracy of {0:2.2f}'.format(dtc_acc*100))
print('-'*60)
print('A single Random Forest has an accuracy of {0:2.2f}'.format(rfc_acc*100))
print('-'*60)
print('An essemble using Gradient-Boosted Trees has an accuracy of {0:2.2f}'.format(gbt_acc*100))

Results
------------------------------------------------------------
A single Decision Tree has an accuracy of 62.78
------------------------------------------------------------
A single Random Forest has an accuracy of 63.48
------------------------------------------------------------
An essemble using Gradient-Boosted Trees has an accuracy of 65.41


### Với BinaryClassificationEvaluator

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
binary_evaluator = BinaryClassificationEvaluator()

In [None]:
dtc_acc2 = acc_evaluator.evaluate(tree_test_model)
rfc_acc2 = acc_evaluator.evaluate(rfc_test_model)
gbt_acc2 = acc_evaluator.evaluate(gbt_test_model)

In [None]:
print("Results")
print('-'*60)
print('A single Decision Tree has an accuracy of {0:2.2f}'.format(dtc_acc2*100))
print('-'*60)
print('A single Random Forest has an accuracy of {0:2.2f}'.format(rfc_acc2*100))
print('-'*60)
print('An essemble using Gradient-Boosted Trees has an accuracy of {0:2.2f}'.format(gbt_acc2*100))