# US Accidents - Severity Prediction - SparkML

## Team Members
- Aditya Kamble
- Sidharth Panda

## Import Dependencies

In [103]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

## Data Preparation

In [44]:
# Reuse the running SparkContext if there is one, otherwise start a new one,
# then wrap it in a SparkSession to get access to the DataFrame reader API.
sparkContext = SparkContext.getOrCreate()
spark = SparkSession(sparkContext)

In [45]:
# Load the pre-processed accidents dataset. The first row holds the column
# names, and column types are inferred from the data rather than read as
# strings.
data_df = spark.read.csv(
    'data-processed/us_accidents_dec19.csv',
    header=True,
    inferSchema=True,
)

# Show the inferred column names and types as a sanity check.
data_df.printSchema()

root
 |-- TMC: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Distance(mi): double (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- Hour: double (nullable = true)
 |-- Severity: double (nullable = true)
 |-- Time_Duration_Min: double (nullable = true)
 |-- State_AR: integer (nullable = true)
 |-- State_AZ: integer (nullable = true)
 |-- State_CA: integer (nullable = true)
 |-- State_CO: integer (nullable = true)
 |-- State_CT: integer (nullable = true)
 |-- State_DC: integer (nullable = true)
 |-- State_DE: integer (nullable = true)
 |-- State_FL: integer (nullable = true)
 |-- State_GA: integer (nullable = true)
 |-- State_IA: integer (nullable = true)
 |-- State_ID: integer (nullable = true)
 |-- State_IL: integer (nullable = true)
 |-- State_IN: integer (nullable = true)
 |-- State_KS: integer (nullable = true)
 |

In [64]:
# Every column except the label ('Severity') is used as a model input.
# Work on an explicit copy of the column list, then drop the label in place;
# list.remove raises ValueError if the label column is missing.
input_features = list(data_df.columns)
input_features.remove('Severity')

In [68]:
# Pack all input columns into a single vector column named 'features',
# which is the column the classifiers below are configured to read.
assembler = VectorAssembler(
    inputCols=input_features,
    outputCol='features',
)

In [72]:
# Apply the assembler to the loaded data; the result is what gets split into
# training and test sets.
final_df = assembler.transform(data_df)

In [75]:
# Hold out roughly 30% of the rows for evaluation and train on the rest.
# The seed is fixed so that the same rows land in each split on every run;
# without it, row counts and accuracy figures change between executions and
# results cannot be compared or reproduced.
train, test = final_df.randomSplit([0.7, 0.3], seed=42)

In [76]:
# Random forest that predicts 'Severity' from the assembled 'features' vector.
# A random forest relies on random sampling while building its trees, so the
# seed is pinned to make the fitted model (and its accuracy) repeatable.
random_forest_classifier = RandomForestClassifier(
    labelCol='Severity',
    featuresCol='features',
    seed=42,
)

In [94]:
# Single decision tree on the same label and feature columns, for comparison
# against the random forest.
decision_tree_classifier = DecisionTreeClassifier(
    labelCol='Severity',
    featuresCol='features',
)

In [78]:
# Row counts of the (training, test) splits, to check the split proportions.
(train.count(), test.count())

(1542431, 660853)

In [79]:
# Fit the random forest on the training split.
rf_model = random_forest_classifier.fit(train)

In [80]:
# Run the fitted random forest over the held-out test split.
rf_predictions = rf_model.transform(test)

In [95]:
# Fit the decision tree on the same training split as the random forest.
dt_model = decision_tree_classifier.fit(train)

In [96]:
# Run the fitted decision tree over the held-out test split.
dt_predictions = dt_model.transform(test)

In [104]:
# Evaluator shared by both models: reports accuracy with 'Severity' as the
# ground-truth label column.
multi_evaluator = MulticlassClassificationEvaluator(
    labelCol='Severity',
    metricName='accuracy',
)

In [105]:
# Accuracy of the decision tree on the test-split predictions.
dt_accuracy = multi_evaluator.evaluate(dt_predictions)
print('Decision Tree Accuracy:', dt_accuracy)

Decision Tree Accuracy: 0.6912142337252006


In [106]:
# Accuracy of the random forest on the test-split predictions.
rf_accuracy = multi_evaluator.evaluate(rf_predictions)
print('Random Forest Accuracy:', rf_accuracy)

Random Forest Accuracy: 0.6749821821191703
