In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
import findspark
findspark.init()

spark-2.4.0-bin-hadoop2.7/
spark-2.4.0-bin-hadoop2.7/python/
spark-2.4.0-bin-hadoop2.7/python/setup.cfg
spark-2.4.0-bin-hadoop2.7/python/pyspark/
spark-2.4.0-bin-hadoop2.7/python/pyspark/resultiterable.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/python/
spark-2.4.0-bin-hadoop2.7/python/pyspark/python/pyspark/
spark-2.4.0-bin-hadoop2.7/python/pyspark/python/pyspark/shell.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/heapq3.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/join.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/version.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/rdd.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/java_gateway.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/find_spark_home.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/_globals.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/worker.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/accumulators.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/mllib/
spark-2.4.0-bin-hadoop2.7/python/pyspark/mllib/feature.py
spark-2.4.0-bin-hadoop2.7/python/pyspark

In [2]:
# Mount Google Drive under /content/gdrive so the course folder is reachable from the notebook.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
# Work from the Chapter8 folder on Drive so relative paths such as "flights.csv" resolve there.
%cd '/content/gdrive/My Drive/LDS9_K273_ONLINE_BuiThiHuong/Chapter8/'

/content/gdrive/My Drive/LDS9_K273_ONLINE_BuiThiHuong/Chapter8


# Demo Tree Model

### Dataset: flights.csv
- You'll build a classification model to predict whether a flight is delayed or not
- With 'mon', 'dom', 'dow', 'carrier_vec', 'org_vec', 'km', 'depart', 'duration' as predictors

The first thing to do is start a Spark session.

In [4]:
import pyspark

In [5]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import round
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel, \
                                      RandomForestClassifier, RandomForestClassificationModel, \
                                      GBTClassifier, GBTClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [6]:
# Start the Spark session used by every cell below (an existing session is reused).
spark = (
    SparkSession.builder
    .appName('lr_demo')
    .getOrCreate()
)

In [7]:
# Load flights.csv with a header row and inferred column types.
# Missing delays are stored as the text 'NA', so 'delay' is inferred as a string column.
data = spark.read.csv(
    path="flights.csv",
    header=True,
    inferSchema=True,
)

In [8]:
# Print the inferred schema of the DataFrame.
# Note that 'delay' comes out as string because the column contains the text 'NA'.
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [9]:
# Preview the first 3 rows
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [10]:
# First record, returned as a Row object
data.head()

Row(mon=11, dom=20, dow=6, carrier='US', flight=19, org='JFK', mile=2153, depart=9.48, duration=351, delay='NA')

In [11]:
# for item in data.head():
#     print(item)

In [12]:
# Total number of records loaded
data.count()

50000

In [13]:
# Remove the 'flight' column (it is not used as a predictor)
data = data.drop('flight')

In [14]:
# Number of records with a null 'delay'.
# Missing delays are stored as the string 'NA' in this file, so they are NOT counted here.
data.filter('delay IS NULL').count()

0

In [15]:
# Remove records with a null 'delay' (rows holding the string 'NA' are kept by this filter)
data = data.filter('delay IS NOT NULL')

In [16]:
# Remove records with a null in any column and get the number of remaining rows.
# 'NA' strings are not nulls, so those rows survive this step as well.
data = data.na.drop()
data.count()

50000

In [17]:
# Import the required function
from pyspark.sql.functions import round

In [18]:
# Add a 'km' column converted from 'mile' (factor 1.60934, rounded to 0 decimals).
# The 'mile' column itself stays in the DataFrame; it is simply not used as a feature.
data = data.withColumn('km', round(data.mile * 1.60934, 0))

In [19]:
# Create 'label' column indicating whether the flight was delayed (1) or not (0),
# using a delay of 15 or more as the threshold.
# 'delay' was read as a string (it contains 'NA'), so cast it explicitly instead of
# relying on implicit string-to-number coercion; 'NA' rows end up with a null label.
data = data.withColumn('label', (data.delay.cast('integer') >= 15).cast('integer'))
# Check the first three records
data.show(3)

+---+---+---+-------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
only showing top 3 rows



## Categorical data

In [20]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [21]:
# Index the two categorical columns, then one-hot encode the resulting indices.

# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(data)

# Indexer creates a new column with numeric index values
data_indexed = indexer_model.transform(data)

# Repeat the process for the other categorical feature
data_indexed = StringIndexer(inputCol='org', outputCol='org_idx').fit(data_indexed).transform(data_indexed)


def _apply_one_hot(enc, df):
    """Apply a OneHotEncoder to ``df`` regardless of the installed Spark version.

    In Spark 2.4 (installed above) OneHotEncoder is a plain transformer and is
    applied directly. Newer releases turn it into an estimator that has to be
    fitted first, so fit it whenever a ``fit`` method is available.
    """
    if hasattr(enc, 'fit'):
        return enc.fit(df).transform(df)
    return enc.transform(df)


# Create the encoders (dropLast=True is the default) and add the vector columns
encoder = OneHotEncoder(inputCol="carrier_idx",
                        outputCol="carrier_vec",
                        dropLast=True)
data_indexed = _apply_one_hot(encoder, data_indexed)

encoder1 = OneHotEncoder(inputCol="org_idx",
                         outputCol="org_vec",
                         dropLast=True)
data_indexed = _apply_one_hot(encoder1, data_indexed)

In [22]:
# Preview the added index and one-hot vector columns
data_indexed.show(3)

+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|carrier_idx|org_idx|  carrier_vec|      org_vec|
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| null|        6.0|    2.0|(8,[6],[1.0])|(7,[2],[1.0])|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|(8,[0],[1.0])|(7,[0],[1.0])|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|(8,[0],[1.0])|(7,[1],[1.0])|
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+
only showing top 3 rows



## Setting Up DataFrame for Machine Learning 

## Assembling columns

In [23]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [24]:
# List the available columns before choosing the model inputs
data_indexed.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'org',
 'mile',
 'depart',
 'duration',
 'delay',
 'km',
 'label',
 'carrier_idx',
 'org_idx',
 'carrier_vec',
 'org_vec']

In [25]:
# Columns that are packed into the single 'features' vector used by the models
feature_cols = ['mon', 'dom', 'dow', 'carrier_vec', 'org_vec', 'km', 'depart', 'duration']

# Create an assembler object
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [26]:
# Append the assembled 'features' vector column
data_pre = assembler.transform(data_indexed)

In [27]:
# Check the resulting 'features' column next to the label (first 2 rows, untruncated)
data_pre.select('features', 'label').show(2, truncate=False)

+--------------------------------------------------------------------+-----+
|features                                                            |label|
+--------------------------------------------------------------------+-----+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|null |
|(21,[1,2,3,11,18,19,20],[22.0,2.0,1.0,1.0,509.0,16.33,82.0])        |1    |
+--------------------------------------------------------------------+-----+
only showing top 2 rows



In [28]:
# Show the first 3 full rows without truncating the vector columns
data_pre.show(3, False)

+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+--------------------------------------------------------------------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|km    |label|carrier_idx|org_idx|carrier_vec  |org_vec      |features                                                            |
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+--------------------------------------------------------------------+
|11 |20 |6  |US     |JFK|2153|9.48  |351     |NA   |3465.0|null |6.0        |2.0    |(8,[6],[1.0])|(7,[2],[1.0])|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|
|0  |22 |2  |UA     |ORD|316 |16.33 |82      |30   |509.0 |1    |0.0        |0.0    |(8,[0],[1.0])|(7,[0],[1.0])|(21,[1,2,3,11,18,19,20],[22.0,2.0,1.0,1.0,509.0,16.33,82.0])        |
|2  |20 |4  |UA     |SFO|337 |6.17  |82      |-8   |542.0 |0    |0.0        |1.0    |

In [29]:
# Keep only the model inputs (feature vector and label) and count the rows
final_data = data_pre.select("features","label")
final_data.count()

50000

In [30]:
# Drop rows containing nulls - here these are the rows whose label is null
# (flights whose delay was 'NA') - and count what is left for training/testing
final_data = final_data.na.drop()
final_data.count()

47022

In [31]:
# Preview the labelled modelling data
final_data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(21,[1,2,3,11,18,...|    1|
|(21,[0,1,2,3,12,1...|    0|
|(21,[0,1,2,4,11,1...|    0|
|(21,[0,1,2,3,12,1...|    0|
|(21,[0,1,2,4,11,1...|    1|
+--------------------+-----+
only showing top 5 rows



In [32]:
# Rows without a label (delay was 'NA'): kept aside as "new" data to predict on later
new_data = (
    data_pre
    .select("features", "label")
    .filter("label IS NULL")
)
new_data.show(3, False)

+--------------------------------------------------------------------+-----+
|features                                                            |label|
+--------------------------------------------------------------------+-----+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|null |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |null |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |null |
+--------------------------------------------------------------------+-----+
only showing top 3 rows



In [33]:
# Number of unlabeled rows
new_data.count()

2978

## Train, Test split

In [34]:
# Split 80/20 into training and test sets.
# A fixed seed keeps the split - and therefore every metric below - reproducible across runs.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [35]:
# Summary statistics of the training labels (the mean is the share of delayed flights)
train_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|             37723|
|   mean|0.5124990059115129|
| stddev|0.4998503757545898|
|    min|                 0|
|    max|                 1|
+-------+------------------+



In [36]:
# Summary statistics of the test labels (the mean is the share of delayed flights)
test_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              9299|
|   mean|0.5065060759221421|
| stddev|0.4999845536877082|
|    min|                 0|
|    max|                 1|
+-------+------------------+



# Decision Tree
- ...

In [37]:
# Create the decision tree classifier object (it is fitted in the next cell)
tree = DecisionTreeClassifier(featuresCol='features',
                              labelCol='label',
                              predictionCol='prediction')

In [38]:
# Fit the decision tree to the training data and call the result tree_model
tree_model = tree.fit(train_data)

## Evaluation

In [39]:
# Score the test dataset: transform() appends the rawPrediction, probability and prediction columns
test_model = tree_model.transform(test_data)

In [40]:
# Inspect the true label, predicted class and class probabilities for a few test rows
test_model.select('label','prediction','probability').show(3,False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|0    |1.0       |[0.36419944255187364,0.6358005574481264]|
|1    |1.0       |[0.36419944255187364,0.6358005574481264]|
|0    |1.0       |[0.36419944255187364,0.6358005574481264]|
+-----+----------+----------------------------------------+
only showing top 3 rows



In [41]:
# Show the confusion matrix: number of test rows per (label, prediction) pair
test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1513|
|    0|       0.0| 2583|
|    1|       1.0| 3197|
|    0|       1.0| 2006|
+-----+----------+-----+



In [42]:
# Calculate the elements of the confusion matrix from a single aggregation
# (one Spark job instead of four separate filter + count passes).
pair_counts = {
    (int(row['label']), int(row['prediction'])): row['count']
    for row in test_model.groupBy('label', 'prediction').count().collect()
    if row['label'] is not None  # rows without a label cannot be right or wrong
}
# Keys are (label, prediction); a missing combination simply counts as 0.
TN = pair_counts.get((0, 0), 0)
TP = pair_counts.get((1, 1), 0)
FN = pair_counts.get((1, 0), 0)
FP = pair_counts.get((0, 1), 0)

In [43]:
# Calculate precision and recall:
#   precision = TP / (TP + FP)  - share of predicted delays that were real delays
#   recall    = TP / (TP + FN)  - share of real delays that were predicted
# Fall back to 0.0 when a denominator is zero (e.g. the model never predicts class 1)
# instead of raising ZeroDivisionError.
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
print('precision = {:.2f}\nrecall = {:.2f}'.format(precision, recall))

precision = 0.61
recall = 0.68


In [44]:
# Accuracy: share of test rows whose prediction matches the label
n_correct = TP + TN
n_total = TP + TN + FP + FN
acc = n_correct / n_total
acc

0.6215722120658135

## Save model

In [48]:
# Save the fitted model. overwrite() allows the cell to be re-run when the target
# directory already exists; a plain save() raises an error in that case.
tree_model.write().overwrite().save('tree_model_flights_50k_new')

In [49]:
# Load the saved decision tree model back from disk
tree_model2 = DecisionTreeClassificationModel.load('tree_model_flights_50k_new')

In [50]:
# Predict new values: use the rows without a label (new_data) as unseen input,
# keeping only the feature vector
unlabeled_data = new_data.select('features')

In [51]:
# Score the unlabeled rows with the reloaded model
predictions = tree_model2.transform(unlabeled_data)

In [52]:
# Show the first 3 predictions without truncating the vector columns
predictions.show(3,False)

+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
|features                                                            |rawPrediction  |probability                             |prediction|
+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|[307.0,983.0]  |[0.237984496124031,0.762015503875969]   |1.0       |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |[1857.0,744.0] |[0.7139561707035755,0.28604382929642447]|0.0       |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |[1176.0,2053.0]|[0.36419944255187364,0.6358005574481264]|1.0       |
+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
only showing top 3 rows



# Random Forest
- ...

In [53]:
# Create the random forest classifier object (it is fitted in the next cell).
# A fixed seed makes the randomized forest construction repeatable across runs.
rfc = RandomForestClassifier(featuresCol='features',
                             labelCol='label',
                             predictionCol='prediction',
                             seed=42)

In [54]:
# Fit the random forest to the training data and call the result rfc_model
rfc_model = rfc.fit(train_data)

In [55]:
# Find the number of trees and the relative importance of each of the 21 feature slots.
# getNumTrees is read as an attribute here (no call parentheses).
print('Number of trees:', rfc_model.getNumTrees)
print('Relative importance of features:', rfc_model.featureImportances)

Number of trees: 20
Relative importance of features: (21,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],[0.19150403321116244,0.010754073519195778,0.012010725387058824,0.004152612771819334,0.008826181570605971,0.0003266624179534335,0.03651130966825634,0.0006875095813922468,0.005784621803919187,0.010518481197236836,0.023158270062908357,0.018171232563075713,0.005384912409508673,0.0022812575839863103,5.4435994605303184e-05,0.023431309291201054,0.026379435679899265,0.001916973119309293,0.07113894724656014,0.46402228758769537,0.08298472733265023])


## Evaluation

In [56]:
# Score the test dataset with the random forest
rfc_test_model = rfc_model.transform(test_data)

In [57]:
# Inspect the true label, predicted class and class probabilities for a few test rows
rfc_test_model.select('label','prediction','probability').show(3,False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|0    |1.0       |[0.39721794227816776,0.6027820577218324]|
|1    |1.0       |[0.4420340925962766,0.5579659074037234] |
|0    |1.0       |[0.36084935890249825,0.6391506410975017]|
+-----+----------+----------------------------------------+
only showing top 3 rows



In [58]:
# Show the confusion matrix: number of test rows per (label, prediction) pair
rfc_test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1442|
|    0|       0.0| 2537|
|    1|       1.0| 3268|
|    0|       1.0| 2052|
+-----+----------+-----+



In [59]:
# Calculate the elements of the confusion matrix from a single aggregation
# (one Spark job instead of four separate filter + count passes).
pair_counts = {
    (int(row['label']), int(row['prediction'])): row['count']
    for row in rfc_test_model.groupBy('label', 'prediction').count().collect()
    if row['label'] is not None  # rows without a label cannot be right or wrong
}
# Keys are (label, prediction); a missing combination simply counts as 0.
TN = pair_counts.get((0, 0), 0)
TP = pair_counts.get((1, 1), 0)
FN = pair_counts.get((1, 0), 0)
FP = pair_counts.get((0, 1), 0)

In [60]:
# Calculate precision and recall:
#   precision = TP / (TP + FP)  - share of predicted delays that were real delays
#   recall    = TP / (TP + FN)  - share of real delays that were predicted
# Fall back to 0.0 when a denominator is zero (e.g. the model never predicts class 1)
# instead of raising ZeroDivisionError.
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
print('precision = {:.2f}\nrecall = {:.2f}'.format(precision, recall))

precision = 0.61
recall = 0.69


In [61]:
# Accuracy: share of test rows whose prediction matches the label
n_correct = TP + TN
n_total = TP + TN + FP + FN
acc = n_correct / n_total
acc

0.6242606731906657

=> The RandomForest model performs about the same as the DecisionTree model (accuracy ≈ 0.62 for both)

## Save model

In [62]:
# Save the fitted model. overwrite() allows the cell to be re-run when the target
# directory already exists; a plain save() raises an error in that case.
rfc_model.write().overwrite().save('rfc_model_flights_50k_new')

In [63]:
# Load the saved random forest model back from disk
rfc_model2 = RandomForestClassificationModel.load('rfc_model_flights_50k_new')

In [64]:
# Predict new values: use the rows without a label (new_data) as unseen input,
# keeping only the feature vector
unlabeled_data = new_data.select('features')

In [65]:
# Score the unlabeled rows with the reloaded random forest
rfc_predictions = rfc_model2.transform(unlabeled_data)

In [66]:
# Show the first 3 predictions without truncating the vector columns
rfc_predictions.show(3,False)

+--------------------------------------------------------------------+---------------------------------------+----------------------------------------+----------+
|features                                                            |rawPrediction                          |probability                             |prediction|
+--------------------------------------------------------------------+---------------------------------------+----------------------------------------+----------+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|[10.387940074337656,9.612059925662344] |[0.5193970037168828,0.4806029962831172] |0.0       |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |[12.681939001241362,7.3180609987586305]|[0.6340969500620683,0.36590304993793166]|0.0       |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |[9.224927086138312,10.775072913861688] |[0.4612463543069156,0.5387536456930844] |1.0       |
+---------------------

# Gradient-Boosted Trees
- ...

In [67]:
# Create the gradient-boosted trees classifier object (it is fitted in the next cell)
gbt = GBTClassifier(featuresCol='features',
                              labelCol='label',
                              predictionCol='prediction')

In [68]:
# Fit the gradient-boosted trees to the training data and call the result gbt_model
gbt_model = gbt.fit(train_data)

In [69]:
# Find the number of trees and the relative importance of each of the 21 feature slots.
# getNumTrees is read as an attribute here (no call parentheses).
print('Number of trees:', gbt_model.getNumTrees)
print('Relative importance of features:', gbt_model.featureImportances)

Number of trees: 20
Relative importance of features: (21,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],[0.21912744754521793,0.15077266173040607,0.12142838586746277,0.0021636637239017685,0.005891807214845181,0.016092728330022312,0.012174264185565357,0.00897268213833214,0.009699548752469972,0.02452163827828701,0.008217186896669133,0.014007582661314386,0.03626532079931329,0.026262604469381192,0.02528608403861775,0.027806379837765755,0.015851757214173316,0.013163143691974568,0.05631361003694564,0.14839707686256337,0.057584425724770884])


## Evaluation

In [70]:
# Score the test dataset with the gradient-boosted trees
gbt_test_model = gbt_model.transform(test_data)

In [71]:
# Inspect the true label, predicted class and class probabilities for a few test rows
gbt_test_model.select('label','prediction','probability').show(3,False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|0    |1.0       |[0.17120461428835995,0.82879538571164]  |
|1    |1.0       |[0.22799262727500075,0.7720073727249992]|
|0    |1.0       |[0.09808180183181761,0.9019181981681824]|
+-----+----------+----------------------------------------+
only showing top 3 rows



In [72]:
# Show the confusion matrix: number of test rows per (label, prediction) pair
gbt_test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1480|
|    0|       0.0| 2864|
|    1|       1.0| 3230|
|    0|       1.0| 1725|
+-----+----------+-----+



In [73]:
# Calculate the elements of the confusion matrix from a single aggregation
# (one Spark job instead of four separate filter + count passes).
pair_counts = {
    (int(row['label']), int(row['prediction'])): row['count']
    for row in gbt_test_model.groupBy('label', 'prediction').count().collect()
    if row['label'] is not None  # rows without a label cannot be right or wrong
}
# Keys are (label, prediction); a missing combination simply counts as 0.
TN = pair_counts.get((0, 0), 0)
TP = pair_counts.get((1, 1), 0)
FN = pair_counts.get((1, 0), 0)
FP = pair_counts.get((0, 1), 0)

In [74]:
# Calculate precision and recall:
#   precision = TP / (TP + FP)  - share of predicted delays that were real delays
#   recall    = TP / (TP + FN)  - share of real delays that were predicted
# Fall back to 0.0 when a denominator is zero (e.g. the model never predicts class 1)
# instead of raising ZeroDivisionError.
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
print('precision = {:.2f}\nrecall = {:.2f}'.format(precision, recall))

precision = 0.65
recall = 0.69


In [75]:
# Accuracy: share of test rows whose prediction matches the label
n_correct = TP + TN
n_total = TP + TN + FP + FN
acc = n_correct / n_total
acc

0.6553392837939563

=> Model GBTClassifier is better than RandomForest and DecisionTree

## Save model

In [76]:
# Save the fitted model. overwrite() allows the cell to be re-run when the target
# directory already exists; a plain save() raises an error in that case.
gbt_model.write().overwrite().save('gbt_model_flights_50k_new')

In [77]:
# Load the saved gradient-boosted trees model back from disk
gbt_model2 = GBTClassificationModel.load('gbt_model_flights_50k_new')

In [78]:
# Predict new values: use the rows without a label (new_data) as unseen input,
# keeping only the feature vector
unlabeled_data = new_data.select('features')

In [79]:
# Score the unlabeled rows with the reloaded gradient-boosted trees model
gbt_predictions = gbt_model2.transform(unlabeled_data)

In [80]:
# Show the first 3 predictions without truncating the vector columns
gbt_predictions.show(3,False)

+--------------------------------------------------------------------+------------------------------------------+----------------------------------------+----------+
|features                                                            |rawPrediction                             |probability                             |prediction|
+--------------------------------------------------------------------+------------------------------------------+----------------------------------------+----------+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|[0.07448986592987994,-0.07448986592987994]|[0.537176198033139,0.462823801966861]   |0.0       |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |[0.40749104324609164,-0.40749104324609164]|[0.6931701393993993,0.3068298606006007] |0.0       |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |[-0.31162428718825264,0.31162428718825264]|[0.34904297492518616,0.6509570250748138]|1.0       |
+---

# Compare Models

In [81]:
# Score the same test set with each of the three fitted models.
# NOTE: rfc_predictions and gbt_predictions are rebound here; earlier cells used these
# names for predictions on the unlabeled rows.
dtc_predictions = tree_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

In [89]:
# Columns available to the evaluators after scoring
dtc_predictions.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [82]:
# Evaluator that compares the 'prediction' column with the true 'label' and reports accuracy
acc_evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName='accuracy')

In [83]:
# Test-set accuracy of each model
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
gbt_acc = acc_evaluator.evaluate(gbt_predictions)

In [84]:
# Report the accuracy of each model as a percentage, one separator line before each entry
print('Results')
for message, accuracy in (
    ('A single decision tree has an accuracy of: {0:2.2f}%', dtc_acc),
    ('A random forest has an accuracy of: {0:2.2f}%', rfc_acc),
    ('A ensemble using GBT has an accuracy of: {0:2.2f}%', gbt_acc),
):
    print('-'*60)
    print(message.format(accuracy*100))


Results
------------------------------------------------------------
A single decision tree has an accuracy of: 62.16%
------------------------------------------------------------
A random forest has an accuracy of: 62.43%
------------------------------------------------------------
A ensemble using GBT has an accuracy of: 65.53%


In [85]:
# Compare AUC on the testing data: area under the ROC curve, computed from the
# 'rawPrediction' and 'label' columns (the evaluator's defaults, spelled out here)
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [86]:
# Test-set AUC of each model
dtc_auc = evaluator.evaluate(dtc_predictions)
rfc_auc = evaluator.evaluate(rfc_predictions)
gbt_auc = evaluator.evaluate(gbt_predictions)

In [87]:
# Report the AUC of each model as a percentage, one separator line before each entry
print('Results')
for message, auc in (
    ('A single decision tree has an AUC of: {0:2.2f}%', dtc_auc),
    ('A random forest has an AUC of: {0:2.2f}%', rfc_auc),
    ('A ensemble using GBT has an AUC of: {0:2.2f}%', gbt_auc),
):
    print('-'*60)
    print(message.format(auc*100))


Results
------------------------------------------------------------
A single decision tree has an AUC of: 58.14%
------------------------------------------------------------
A random forest has an AUC of: 67.14%
------------------------------------------------------------
A ensemble using GBT has an AUC of: 71.94%
