In [1]:
# Initialise the app
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import *
spark = SparkSession.builder.appName('iteration4-after-join-table').getOrCreate()

In [2]:
join_table = spark.read.csv('./join_table.csv', header='true')
join_table.show()

+-------+---------+------------------+--------------+----------------+--------------------+----------------+--------------+-------------------+-----+------+-------+--------------+--------------------+----------------------------+
|case_id|weather_1|collision_severity|killed_victims|road_condition_1|            lighting|alcohol_involved|collision_date|     formatted_date|month|season|     id|party_sobriety|       cellphone_use|movement_preceding_collision|
+-------+---------+------------------+--------------+----------------+--------------------+----------------+--------------+-------------------+-----+------+-------+--------------+--------------------+----------------------------+
|6292121|   cloudy|             fatal|             1|          normal|dark with street ...|               0|    18/01/2015|2015-01-18 00:00:00|    1|winter|4734135|             A|Cell Phone Not in...|            making left turn|
|6292127|    clear|             fatal|             1|          normal|          

## 3.5 Format the data as required

In [3]:
#Exclude data from the join table
columns_to_drop = ['id','case_id',"collision_date","formatted_date","month"]
join_table = join_table.drop(*columns_to_drop)
join_table.show()

+---------+------------------+--------------+----------------+--------------------+----------------+------+--------------+--------------------+----------------------------+
|weather_1|collision_severity|killed_victims|road_condition_1|            lighting|alcohol_involved|season|party_sobriety|       cellphone_use|movement_preceding_collision|
+---------+------------------+--------------+----------------+--------------------+----------------+------+--------------+--------------------+----------------------------+
|   cloudy|             fatal|             1|          normal|dark with street ...|               0|winter|             A|Cell Phone Not in...|            making left turn|
|    clear|             fatal|             1|          normal|            daylight|               1|winter|             H|Cell Phone Not in...|                      parked|
|    clear|             fatal|             1|          normal|            daylight|               1|winter|             C|Cell Phone No

In [4]:
print(str(join_table.count()))
print(str(len(join_table.columns)))

126108
10


In [5]:
join_table.printSchema()

root
 |-- weather_1: string (nullable = true)
 |-- collision_severity: string (nullable = true)
 |-- killed_victims: string (nullable = true)
 |-- road_condition_1: string (nullable = true)
 |-- lighting: string (nullable = true)
 |-- alcohol_involved: string (nullable = true)
 |-- season: string (nullable = true)
 |-- party_sobriety: string (nullable = true)
 |-- cellphone_use: string (nullable = true)
 |-- movement_preceding_collision: string (nullable = true)



In [6]:
from pyspark.sql.types import IntegerType, BooleanType
join_table = join_table.withColumn("alcohol_involved",join_table.alcohol_involved.cast(BooleanType()))
join_table = join_table.withColumn("killed_victims",join_table.killed_victims.cast(IntegerType()))

In [7]:
join_table.printSchema()

root
 |-- weather_1: string (nullable = true)
 |-- collision_severity: string (nullable = true)
 |-- killed_victims: integer (nullable = true)
 |-- road_condition_1: string (nullable = true)
 |-- lighting: string (nullable = true)
 |-- alcohol_involved: boolean (nullable = true)
 |-- season: string (nullable = true)
 |-- party_sobriety: string (nullable = true)
 |-- cellphone_use: string (nullable = true)
 |-- movement_preceding_collision: string (nullable = true)



# 4. Data Transformation
## 4.1 Reduce the data

In [8]:
#feature selection through RFormula

from pyspark.ml.feature import RFormula

formula = RFormula(
    formula="collision_severity ~ weather_1 + killed_victims + road_condition_1 + lighting + alcohol_involved + season + party_sobriety + cellphone_use + movement_preceding_collision",
    featuresCol="features",
    labelCol="label")

output = formula.fit(join_table).transform(join_table)
output.select("features", "label").show(truncate = False)

+-------------------------------------------------------------+-----+
|features                                                     |label|
+-------------------------------------------------------------+-----+
|(46,[1,6,7,15,22,27,30],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])       |4.0  |
|(46,[0,6,7,14,18,24,27,35],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|4.0  |
|(46,[0,6,7,14,18,26,27,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|4.0  |
|(46,[0,6,7,14,22,27,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])       |4.0  |
|(46,[0,6,7,14,23,27,42],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])       |4.0  |
|(46,[1,6,8,15,18,22,27,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|4.0  |
|(46,[1,6,8,15,18,25,27,34],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|4.0  |
|(46,[0,6,7,16,22,27,42],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])       |4.0  |
|(46,[0,6,7,14,23,27,38],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])       |4.0  |
|(46,[0,6,7,17,22,27,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])       |4.0  |
|(46,[0,6,7,16,23,27,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])       |4.0  |
|(46,[0,6,7,15,22,28

In [9]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)
from pyspark.ml import Pipeline

In [10]:
# First create a string indexer
# A number will be assigned to every category in the column.
# weather_1_indexer = StringIndexer(inputCol='weather_1',outputCol='weather_1Index')

# Now we can one hot encode these numbers. This converts the various outputs into a single vector.
# This makes it easier to process when you have multiple classes.
# weather_1_encoder = OneHotEncoder(inputCol='weather_1Index',outputCol='weather_1Vec')

In [11]:
# road_condition_1_indexer = StringIndexer(inputCol='road_condition_1',outputCol='road_condition_1Index')
# road_condition_1_encoder = OneHotEncoder(inputCol='road_condition_1Index',outputCol='road_condition_1Vec')

In [12]:
# collision_severity_indexer = StringIndexer(inputCol='collision_severity',outputCol='collision_severityIndex')
# collision_severity_encoder = OneHotEncoder(inputCol='collision_severityIndex',outputCol='collision_severityVec')

In [13]:
# lighting_indexer = StringIndexer(inputCol='lighting',outputCol='lightingIndex')
# lighting_encoder = OneHotEncoder(inputCol='lightingIndex',outputCol='lightingVec')

In [14]:
# season_indexer = StringIndexer(inputCol='season',outputCol='seasonIndex')
# season_encoder = OneHotEncoder(inputCol='seasonIndex',outputCol='seasonVec')

In [15]:
# party_sobriety_indexer = StringIndexer(inputCol='party_sobriety',outputCol='party_sobrietyIndex')
# party_sobriety_encoder = OneHotEncoder(inputCol='party_sobrietyIndex',outputCol='party_sobrietyVec')

In [16]:
# cellphone_use_indexer = StringIndexer(inputCol='cellphone_use',outputCol='cellphone_useIndex')
# cellphone_use_encoder = OneHotEncoder(inputCol='cellphone_useIndex',outputCol='cellphone_useVec')

In [17]:
# movement_preceding_collision_indexer = StringIndexer(inputCol='movement_preceding_collision',outputCol='movement_preceding_collisionIndex')
# movement_preceding_collision_encoder = OneHotEncoder(inputCol='movement_preceding_collisionIndex',outputCol='movement_preceding_collisionVec')

In [18]:
# Now we can assemble all of this as one vector in the features column. 
# assembler = VectorAssembler(inputCols=['weather_1Vec',
#  'road_condition_1Vec',
#  "alcohol_involved",
#  "killed_victims",
#  'lightingVec',
#  'seasonVec',
#  'party_sobrietyVec',
#  'cellphone_useVec',
#  'movement_preceding_collisionVec'],outputCol='features')

In [19]:
numerical_columns=['alcohol_involved', 'killed_victims']
categorical_columns = ['collision_severity','weather_1', 'road_condition_1', 'lighting', 'season', 'party_sobriety', 'cellphone_use', 'movement_preceding_collision']

In [20]:
def construct(df,categorical_columns,numerical_columns):
    stringindexer_stages = [StringIndexer(inputCol=c, outputCol='strindexed_' + c) for c in categorical_columns]
    # encode label column and add it to stringindexer_stages
    stringindexer_stages += [StringIndexer(inputCol='collision_severity', outputCol='label')]
    
    onehotencoder_stages = [OneHotEncoder(inputCol='strindexed_' + c, outputCol='onehot_' + c) for c in categorical_columns]
    
    feature_columns = ['onehot_' + c for c in categorical_columns]
    feature_columns += numerical_columns
    vectorassembler_stage = VectorAssembler(inputCols=feature_columns, outputCol='features')
    
    allc = stringindexer_stages + onehotencoder_stages + [vectorassembler_stage]
    pipeline = Pipeline(stages=allc)
    pipeline_model = pipeline.fit(df)
    final_columns = feature_columns + ['features', 'label']
    constructed = pipeline_model.transform(df).\
            select(final_columns)
            
    return constructed

In [21]:
transformed = construct(join_table,categorical_columns,numerical_columns)

In [53]:
transformed.select('features','label').show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(50,[5,10,18,24,2...|  4.0|
|(50,[4,10,17,26,2...|  4.0|
|(50,[4,10,17,28,2...|  4.0|
|(50,[4,10,17,24,2...|  4.0|
|(50,[4,10,17,25,2...|  4.0|
|(50,[5,11,18,24,2...|  4.0|
|(50,[5,11,18,27,2...|  4.0|
|(50,[4,10,19,24,2...|  4.0|
|(50,[4,10,17,25,2...|  4.0|
|(50,[4,10,20,24,2...|  4.0|
|(50,[4,10,19,25,2...|  4.0|
|(50,[4,10,18,24,3...|  4.0|
|(50,[4,10,18,25,3...|  4.0|
|(50,[4,10,17,27,2...|  4.0|
|(50,[4,10,20,25,2...|  4.0|
|(50,[4,10,20,24,2...|  4.0|
|(50,[4,10,18,28,2...|  4.0|
|(50,[4,10,18,24,2...|  4.0|
|(50,[4,10,17,23,2...|  4.0|
|(50,[4,10,17,23,2...|  4.0|
+--------------------+-----+
only showing top 20 rows



In [22]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier()
model = rf.fit(transformed)
vect = model.featureImportances

In [31]:
print(transformed.columns)

['onehot_collision_severity', 'onehot_weather_1', 'onehot_road_condition_1', 'onehot_lighting', 'onehot_season', 'onehot_party_sobriety', 'onehot_cellphone_use', 'onehot_movement_preceding_collision', 'alcohol_involved', 'killed_victims', 'features', 'label']


In [35]:
vec=[]
for i in vect:
    vec.append(i)

print((vec))

[0.25344602817241724, 0.20522745929848352, 0.265712690651679, 0.23189872750617896, 0.0, 5.9189197217171015e-06, 0.0, 9.483510089377672e-07, 0.0, 0.0, 2.855028294469659e-06, 4.542773999451342e-06, 0.0, 5.594236186767516e-07, 0.0, 2.008139607773779e-06, 0.0, 0.000528333760281106, 0.0, 9.857797970082177e-05, 3.0746060715192586e-06, 0.0, 0.00013595951771699048, 1.813462549000179e-05, 0.005032458439925677, 0.0014388848851153145, 0.0016456086228198504, 0.0005939966039349247, 0.0, 2.701064081912301e-05, 0.0003682122176242962, 9.684277200686688e-06, 0.0005452335074700762, 0.0004804284072853623, 0.0015967274332015654, 3.7331135459142444e-06, 7.582088713789109e-05, 0.002042474665061075, 3.9276239066030665e-05, 2.5255528454473792e-05, 0.00018046581210659713, 0.0, 0.0, 0.0, 4.1902160902816915e-05, 0.0002192254915474816, 0.0, 0.0, 0.0024330054829870217, 0.02611477682952356]


In [None]:
#this part of the code did not work
# import pandas as pd
# feature = pd.DataFrame({'feature': transformed.columns[:-2], 'importance':vec})

In [56]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="label")


result = selector.fit(transformed).transform(transformed)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.select("features", "label",'selectedFeatures').show()

ChiSqSelector output with top 1 features selected
+--------------------+-----+----------------+
|            features|label|selectedFeatures|
+--------------------+-----+----------------+
|(50,[5,10,18,24,2...|  4.0|       (1,[],[])|
|(50,[4,10,17,26,2...|  4.0|       (1,[],[])|
|(50,[4,10,17,28,2...|  4.0|       (1,[],[])|
|(50,[4,10,17,24,2...|  4.0|       (1,[],[])|
|(50,[4,10,17,25,2...|  4.0|       (1,[],[])|
|(50,[5,11,18,24,2...|  4.0|       (1,[],[])|
|(50,[5,11,18,27,2...|  4.0|       (1,[],[])|
|(50,[4,10,19,24,2...|  4.0|       (1,[],[])|
|(50,[4,10,17,25,2...|  4.0|       (1,[],[])|
|(50,[4,10,20,24,2...|  4.0|       (1,[],[])|
|(50,[4,10,19,25,2...|  4.0|       (1,[],[])|
|(50,[4,10,18,24,3...|  4.0|       (1,[],[])|
|(50,[4,10,18,25,3...|  4.0|       (1,[],[])|
|(50,[4,10,17,27,2...|  4.0|       (1,[],[])|
|(50,[4,10,20,25,2...|  4.0|       (1,[],[])|
|(50,[4,10,20,24,2...|  4.0|       (1,[],[])|
|(50,[4,10,18,28,2...|  4.0|       (1,[],[])|
|(50,[4,10,18,24,2...|  4.0|  

In [57]:
#Further exclude data from the join table
columns_to_drop = ['killed_victims','road_condition_1',"weather_1"]
join_table = join_table.drop(*columns_to_drop)
join_table.printSchema()

root
 |-- collision_severity: string (nullable = true)
 |-- lighting: string (nullable = true)
 |-- alcohol_involved: boolean (nullable = true)
 |-- season: string (nullable = true)
 |-- party_sobriety: string (nullable = true)
 |-- cellphone_use: string (nullable = true)
 |-- movement_preceding_collision: string (nullable = true)

126108


In [58]:
print(str(len(join_table.columns)))

7


## 4.2 Project the data
### 4.2.1 Reclassification

In [61]:
#Relabelling COLLISION_SEVERITY
from pyspark.sql.functions import when, col
join_table = join_table.withColumn("collision_severity", when(col("collision_severity") == 'other injury',"Light injury") \
      .when(col("collision_severity") == 'pain',"Light injury") \
      .when(col("collision_severity") == 'property damage only',"no injury") \
      .otherwise(join_table.collision_severity))

In [62]:
join_table.groupBy('collision_severity').count().orderBy('count', ascending=False).show()

+------------------+-----+
|collision_severity|count|
+------------------+-----+
|      Light injury|80011|
|         no injury|22568|
|     severe injury|19689|
|             fatal| 3840|
+------------------+-----+



In [63]:
join_table.printSchema()

root
 |-- collision_severity: string (nullable = true)
 |-- lighting: string (nullable = true)
 |-- alcohol_involved: boolean (nullable = true)
 |-- season: string (nullable = true)
 |-- party_sobriety: string (nullable = true)
 |-- cellphone_use: string (nullable = true)
 |-- movement_preceding_collision: string (nullable = true)



In [64]:
#Save final table as csv before data mining
join_table.write.csv('Final_Table.csv',header=True)