In [1]:
# Initialise the app
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import *
spark = SparkSession.builder.appName('iteration4-decision-tree').getOrCreate()

In [2]:
final_table = spark.read.csv('./final_table.csv', header='true')
final_table.show()

+------------------+--------------------+----------------+------+--------------+--------------------+----------------------------+
|collision_severity|            lighting|alcohol_involved|season|party_sobriety|       cellphone_use|movement_preceding_collision|
+------------------+--------------------+----------------+------+--------------+--------------------+----------------------------+
|             fatal|dark with street ...|           false|winter|             A|Cell Phone Not in...|            making left turn|
|             fatal|            daylight|            true|winter|             H|Cell Phone Not in...|                      parked|
|             fatal|            daylight|            true|winter|             C|Cell Phone Not in...|         proceeding straight|
|             fatal|            daylight|           false|winter|             A|Cell Phone Not in...|         proceeding straight|
|             fatal|            daylight|           false|winter|             G|Cel

In [3]:
from pyspark.sql.types import BooleanType
final_table = final_table.withColumn("alcohol_involved",final_table.alcohol_involved.cast(BooleanType()))

In [4]:
final_table.printSchema()

root
 |-- collision_severity: string (nullable = true)
 |-- lighting: string (nullable = true)
 |-- alcohol_involved: boolean (nullable = true)
 |-- season: string (nullable = true)
 |-- party_sobriety: string (nullable = true)
 |-- cellphone_use: string (nullable = true)
 |-- movement_preceding_collision: string (nullable = true)



In [5]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [6]:
index = StringIndexer(inputCol='collision_severity', outputCol='collision_severity_index')
test = index.fit(final_table).transform(final_table)
test = test.drop('collision_severity')


index = StringIndexer(inputCol='lighting', outputCol='lighting_index')
test = index.fit(test).transform(test)
test = test.drop('lighting')

index = StringIndexer(inputCol='season', outputCol='season_index')
test = index.fit(test).transform(test)
test = test.drop('season')

index = StringIndexer(inputCol='party_sobriety', outputCol='party_sobriety_index')
test = index.fit(test).transform(test)
test = test.drop('party_sobriety')

index = StringIndexer(inputCol='cellphone_use', outputCol='cellphone_use_index')
test = index.fit(test).transform(test)
test = test.drop('cellphone_use')


index = StringIndexer(inputCol='movement_preceding_collision', outputCol='movement_preceding_collision_index')
test = index.fit(test).transform(test)
test = test.drop('movement_preceding_collision')

test.show()

+----------------+------------------------+--------------+------------+--------------------+-------------------+----------------------------------+
|alcohol_involved|collision_severity_index|lighting_index|season_index|party_sobriety_index|cellphone_use_index|movement_preceding_collision_index|
+----------------+------------------------+--------------+------------+--------------------+-------------------+----------------------------------+
|           false|                     3.0|           1.0|         3.0|                 0.0|                0.0|                               1.0|
|            true|                     3.0|           0.0|         3.0|                 2.0|                0.0|                               6.0|
|            true|                     3.0|           0.0|         3.0|                 4.0|                0.0|                               0.0|
|           false|                     3.0|           0.0|         3.0|                 0.0|                0.0|

In [7]:
test.dtypes

[('alcohol_involved', 'boolean'),
 ('collision_severity_index', 'double'),
 ('lighting_index', 'double'),
 ('season_index', 'double'),
 ('party_sobriety_index', 'double'),
 ('cellphone_use_index', 'double'),
 ('movement_preceding_collision_index', 'double')]

In [8]:
assembler = VectorAssembler(inputCols=[
 'lighting_index',
 'season_index',
 'party_sobriety_index',
'cellphone_use_index', 
'movement_preceding_collision_index'],outputCol='features')

In [9]:
output = assembler.transform(test)
output.show() #this dataframe now contains the features column

+----------------+------------------------+--------------+------------+--------------------+-------------------+----------------------------------+--------------------+
|alcohol_involved|collision_severity_index|lighting_index|season_index|party_sobriety_index|cellphone_use_index|movement_preceding_collision_index|            features|
+----------------+------------------------+--------------+------------+--------------------+-------------------+----------------------------------+--------------------+
|           false|                     3.0|           1.0|         3.0|                 0.0|                0.0|                               1.0|[1.0,3.0,0.0,0.0,...|
|            true|                     3.0|           0.0|         3.0|                 2.0|                0.0|                               6.0|[0.0,3.0,2.0,0.0,...|
|            true|                     3.0|           0.0|         3.0|                 4.0|                0.0|                               0.0| (5,[1,2

In [10]:
output = output.select("features",'collision_severity_index')

In [67]:
# train_data,test_data = output.randomSplit([0.7,0.3])

In [68]:
# train_data,test_data = output.randomSplit([0.5,0.5])

In [70]:
# train_data,test_data = output.randomSplit([0.9,0.1])

In [None]:
train_data,test_data = output.randomSplit([0.8,0.2])

In [71]:
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

In [72]:
dtc = DecisionTreeClassifier(labelCol='collision_severity_index',featuresCol='features')
rfc = RandomForestClassifier(labelCol='collision_severity_index',featuresCol='features')

dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)

dtc_predictions = dtc_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)

In [75]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [76]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="collision_severity_index", predictionCol="prediction", metricName="accuracy")

In [77]:
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_predictions)

In [84]:
print("Here are the Multiclass Classification results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc_acc*100))
print("Test Error = %g" % (1.0 - dtc_acc))
print('-'*40)
print('A random forest ensemble has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))
print("Test Error = %g" % (1.0 - rfc_acc))
print('-'*40)


Here are the Multiclass Classification results!
----------------------------------------
A single decision tree has an accuracy of: 64.20%
Test Error = 0.358001
----------------------------------------
A random forest ensemble has an accuracy of: 64.64%
Test Error = 0.353615
----------------------------------------


In [96]:
#similar to confusion matrix - for decision tree
totalResults = dtc_predictions.select('collision_severity_index','prediction')

correctResults = totalResults.filter(totalResults['collision_severity_index'] == totalResults['prediction'])

countTR = totalResults.count()
print("Correct: " + str(countTR))

countTC = correctResults.count()
print("Total Correct: " + str(countTC))

Correct: 25310
Total Correct: 16249


In [115]:
#similar to confusion matrix - for random forrest
totalResults = rfc_predictions.select('collision_severity_index','prediction')

correctResults = totalResults.filter(totalResults['collision_severity_index'] == totalResults['prediction'])

countTR = totalResults.count()
print("Correct: " + str(countTR))

countTC = correctResults.count()
print("Total Correct: " + str(countTC))

Correct: 25310
Total Correct: 16360


In [116]:
#Extracting tree rules from our model
print(dtc_model.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4c38811e73646aad1ae1) of depth 3 with 15 nodes
  If (feature 2 in {0.0,4.0})
   If (feature 4 in {0.0,1.0,2.0,4.0,5.0,7.0,8.0,9.0,10.0,11.0,12.0,14.0,15.0,16.0})
    If (feature 4 in {0.0,2.0,4.0,5.0,7.0,10.0,11.0,12.0,15.0})
     Predict: 0.0
    Else (feature 4 not in {0.0,2.0,4.0,5.0,7.0,10.0,11.0,12.0,15.0})
     Predict: 0.0
   Else (feature 4 not in {0.0,1.0,2.0,4.0,5.0,7.0,8.0,9.0,10.0,11.0,12.0,14.0,15.0,16.0})
    If (feature 4 in {3.0,6.0,17.0})
     Predict: 0.0
    Else (feature 4 not in {3.0,6.0,17.0})
     Predict: 0.0
  Else (feature 2 not in {0.0,4.0})
   If (feature 4 in {1.0,2.0,4.0,7.0,11.0,14.0,17.0})
    If (feature 4 in {14.0,17.0})
     Predict: 1.0
    Else (feature 4 not in {14.0,17.0})
     Predict: 0.0
   Else (feature 4 not in {1.0,2.0,4.0,7.0,11.0,14.0,17.0})
    If (feature 4 in {3.0,6.0,10.0,12.0,15.0})
     Predict: 1.0
    Else (feature 4 not in {3.0,6.0,10.0,12.0,15.0})
     Predict: 0.0



In [117]:
#Extracting tree rules from our model
print(rfc_model.toDebugString)

RandomForestClassificationModel (uid=rfc_30e053d58297) with 20 trees
  Tree 0 (weight 1.0):
    If (feature 2 in {0.0,4.0})
     If (feature 4 in {0.0,1.0,2.0,4.0,5.0,7.0,8.0,9.0,10.0,11.0,12.0,14.0,15.0,16.0})
      If (feature 4 in {0.0,2.0,4.0,5.0,7.0,10.0,11.0,12.0,14.0,15.0,16.0})
       If (feature 4 in {2.0,5.0,7.0,15.0})
        If (feature 2 in {0.0})
         Predict: 0.0
        Else (feature 2 not in {0.0})
         Predict: 0.0
       Else (feature 4 not in {2.0,5.0,7.0,15.0})
        If (feature 0 in {2.0})
         Predict: 0.0
        Else (feature 0 not in {2.0})
         Predict: 0.0
      Else (feature 4 not in {0.0,2.0,4.0,5.0,7.0,10.0,11.0,12.0,14.0,15.0,16.0})
       If (feature 0 in {0.0,3.0})
        If (feature 4 in {1.0})
         Predict: 0.0
        Else (feature 4 not in {1.0})
         Predict: 0.0
       Else (feature 0 not in {0.0,3.0})
        If (feature 1 in {2.0})
         Predict: 0.0
        Else (feature 1 not in {2.0})
         Predict: 0.0
     

In [None]:
# from sklearn.datasets import load_iris
# iris = load_iris()

# # Model (can also use single decision tree)
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(n_estimators=10)

# # Train
# model.fit(iris.data, iris.target)
# # Extract single tree
# estimator = model.estimators_[5]

# from sklearn.tree import export_graphviz
# # Export as dot file
# export_graphviz(estimator, out_file='tree.dot', 
#                 feature_names = iris.feature_names,
#                 class_names = iris.target_names,
#                 rounded = True, proportion = False, 
#                 precision = 2, filled = True)

# # Convert to png using system command (requires Graphviz)
# from subprocess import call
# call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# # Display in jupyter notebook
# from IPython.display import Image
# Image(filename = 'tree.png')

In [None]:
#Confusion Matrix
# import sklearn
# from sklearn.metrics import classification_report, confusion_matrix

# y_true = rfc_predictions.select(['collision_severity_index']).collect()
# y_pred = rfc_predictions.select(['prediction']).collect()

# print(classification_report(y_true, y_pred))