In [1]:
# import the requied libraries
from pyspark.sql import SparkSession
from pyspark.sql import Row
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
from pyspark.sql import functions as fn
from pyspark.ml import feature, regression, evaluation, Pipeline
import seaborn as sns
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer
from sklearn.metrics import classification_report
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression,RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [2]:
# Do not delete or change this cell

import os

# Define a function to determine if we are running on data bricks
# Return true if running in the data bricks environment, false otherwise
def is_databricks():
    # get the databricks runtime version
    db_env = os.getenv("DATABRICKS_RUNTIME_VERSION")
    
    # if running on data bricks
    if db_env != None:
        return True
    else:
        return False

# Define a function to read the data file.  The full path data file name is constructed
# by checking runtime environment variables to determine if the runtime environment is 
# databricks, or a student's personal computer.  The full path file name is then
# constructed based on the runtime env.
# 
# Params
#   data_file_name: The base name of the data file to load
# 
# Returns the full path file name based on the runtime env
#
def get_training_filename(data_file_name):    
    # if running on data bricks
    if is_databricks():
        # build the full path file name assuming data brick env
        full_path_name = "/FileStore/tables/%s" % data_file_name
    # else the data is assumed to be in the same dir as this notebook
    else:
        # Assume the student is running on their own computer and load the data
        # file from the same dir as this notebook
        full_path_name = data_file_name
    
    # return the full path file name to the caller
    return full_path_name

In [3]:
# importing the train data
us_train = spark.read.csv(get_training_filename('USAccident_train_OHE.csv'), header = True, inferSchema = True)

In [4]:
# importing the test data
us_test = spark.read.csv(get_training_filename('USAccident_validation_OHE.csv'), header = True, inferSchema = True)

Below we have converted class 2,3,4 to 0,1,2 for avoiding error while calculating the evaluation metrics

In [5]:
us_test=us_test.withColumn("Severity",when(us_test["Severity"]==2,0).otherwise(us_test["Severity"]))

In [6]:
us_train=us_train.withColumn("Severity",when(us_train["Severity"]==2,0).otherwise(us_train["Severity"]))

In [7]:
us_test=us_test.withColumn("Severity",when(us_test["Severity"]==3,1).otherwise(us_test["Severity"]))

In [8]:
us_train=us_train.withColumn("Severity",when(us_train["Severity"]==3,1).otherwise(us_train["Severity"]))

In [9]:
us_test=us_test.withColumn("Severity",when(us_test["Severity"]==4,2).otherwise(us_test["Severity"]))

In [10]:
us_train=us_train.withColumn("Severity",when(us_train["Severity"]==4,2).otherwise(us_train["Severity"]))

In [11]:
# Declaring the vector assembler
va = VectorAssembler().setInputCols([i for i in us_train.columns if i!='Severity']).setOutputCol('features')

In [12]:
# Centering our data for logistic regression model
center = feature.StandardScaler(withMean=True, withStd=False, inputCol='features', outputCol='centered_features',)

In [13]:
# Converting the labels from string to integers
label_stringIdx = StringIndexer(inputCol="Severity", outputCol="label")

In [14]:
# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="centered_features")

# Train model with Training Data
lrModel = Pipeline(stages=[label_stringIdx,va, center, lr])

lr_fit = lrModel.fit(us_train)

In [15]:
# Creating a multiclass evaluator
evaluator_mul = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [16]:
print("Accuracy:",evaluator_mul.evaluate(lr_fit.transform(us_test)))

Accuracy: 0.7213798370672098


In [52]:
lr_fit.stages[-1].getElasticNetParam()

0.0

In [53]:
lr_fit.stages[-1].getRegParam()

0.0

In [17]:
prediction_lrm=(lr_fit.transform(us_test)).toPandas()["prediction"]

In [18]:
true_labels=us_test.toPandas()["Severity"]

In [19]:
# prints the classification report for the evaluating our model
print(classification_report(y_pred=prediction_lrm,y_true=true_labels))

              precision    recall  f1-score   support

           0       0.74      0.91      0.82    131790
           1       0.64      0.36      0.46     58617
           2       0.54      0.11      0.18      5993

   micro avg       0.72      0.72      0.72    196400
   macro avg       0.64      0.46      0.49    196400
weighted avg       0.70      0.72      0.69    196400



# LR Multiclass Grid Search Model 

In [20]:
lr_new = LogisticRegression(labelCol="label", featuresCol="centered_features")

In [21]:
# Creating a grid for tuning our model
#paramGrid_lr = ParamGridBuilder().addGrid(lr_new.regParam, [0.01, 0.04,0.07]).addGrid(lr_new.elasticNetParam, [0.2,0.5,0.8]).build()
paramGrid_lr = ParamGridBuilder().addGrid(lr_new.regParam, [0.01]).addGrid(lr_new.elasticNetParam, [0.2]).build()

In [22]:
# Creates the pipeline for the model
cvModel_lrmu = Pipeline(stages=[label_stringIdx,va,center,lr_new])

In [23]:
# define the evaluator for checking the accuracy of our model
evaluator_mul = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [24]:
# Create a cross validator of 6 folds
cv = CrossValidator(estimator=cvModel_lrmu, estimatorParamMaps=paramGrid_lr, evaluator=evaluator_mul, numFolds=5,seed=42).fit(us_train)

In [None]:
cv.bestModel.stages[-1].extractParamMap()

In [42]:
cv.bestModel.stages[-1].getElasticNetParam()

0.2

In [43]:
cv.bestModel.stages[-1].getRegParam()

0.01

In [25]:
print("Accuracy:",evaluator_mul.evaluate(cv.bestModel.transform(us_test)))

Accuracy: 0.7184317718940937


In [26]:
# stores the LR co-efficients of all the variable
coeft_L1_m=cv.bestModel.stages[-1].coefficientMatrix.toArray()

In [27]:
# prints the total no of features eliminated
coef_L1_mul=cv.bestModel.stages[-1].coefficientMatrix.toArray()
coeft_L1_mb = np.squeeze(coeft_L1_m)
coef_one_b = coeft_L1_m[:][0]
coef_two_b = coeft_L1_m[:][1]
coef_three_b = coeft_L1_m[:][2]
coef_one_b = np.absolute(coef_one_b)
coef_two_b = np.absolute(coef_two_b)
coef_three_b = np.absolute(coef_three_b)

print('Total number of features are',len(coef_three_b))

sorted_abs = np.sort(coef_three_b)
weights_notzero = sorted_abs[sorted_abs == 0]
nonzero_weights = len(sorted_abs[sorted_abs == 0])

print('Eliminated features out of ' + str(len(coef_three_b)) +' are', nonzero_weights)

Total number of features are 120
Eliminated features out of 120 are 105


In [28]:
# gets the prediction by running on our test set
prediction_lrt=(cv.bestModel.transform(us_test)).toPandas()["prediction"]

In [29]:
# stores the true label for using it to print the classification report below
true_labels=us_test.toPandas()["Severity"]

In [30]:
print(classification_report(y_pred=prediction_lrm,y_true=true_labels))

              precision    recall  f1-score   support

           0       0.74      0.91      0.82    131790
           1       0.64      0.36      0.46     58617
           2       0.54      0.11      0.18      5993

   micro avg       0.72      0.72      0.72    196400
   macro avg       0.64      0.46      0.49    196400
weighted avg       0.70      0.72      0.69    196400

