In [1]:
#importing all required libraries
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
from pyspark.sql import Row
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
from pyspark.sql import functions as fn
from pyspark.ml import feature, regression, evaluation, Pipeline
import seaborn as sns
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import udf
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator

In [2]:
# Do not delete or change this cell

import os

# Define a function to determine if we are running on data bricks
# Return true if running in the data bricks environment, false otherwise
def is_databricks():
    """Report whether the notebook is running in a Databricks environment.

    Returns:
        True when the DATABRICKS_RUNTIME_VERSION environment variable is set,
        False otherwise.
    """
    # get the databricks runtime version
    db_env = os.getenv("DATABRICKS_RUNTIME_VERSION")
    
    # if running on data bricks
    if db_env != None:
        return True
    else:
        return False

# Define a function to read the data file.  The full path data file name is constructed
# by checking runtime environment variables to determine if the runtime environment is 
# databricks, or a student's personal computer.  The full path file name is then
# constructed based on the runtime env.
# 
# Params
#   data_file_name: The base name of the data file to load
# 
# Returns the full path file name based on the runtime env
#
def get_training_filename(data_file_name):    
    """Build the path of a data file for the current runtime environment.

    Args:
        data_file_name: The base name of the data file to load.

    Returns:
        "/FileStore/tables/<data_file_name>" when is_databricks() is true;
        otherwise data_file_name unchanged, i.e. a path relative to the
        current working directory.
    """
    # if running on data bricks
    if is_databricks():
        # build the full path file name assuming data brick env
        full_path_name = "/FileStore/tables/%s" % data_file_name
    # else the data is assumed to be in the same dir as this notebook
    else:
        # Assume the student is running on their own computer and load the data
        # file from the same dir as this notebook
        full_path_name = data_file_name
    
    # return the full path file name to the caller
    return full_path_name

In [4]:
# Load the cleaned accident data: the first row holds the column names and
# Spark infers each column's type from its values
source_path = get_training_filename("Us_clean.csv")
data = spark.read.csv(source_path, header=True, inferSchema=True)

In [5]:
# Add an "Hour" column holding the hour of the day taken from the accident's
# start time (formatted with the "H" pattern)
data = data.withColumn(
    "Hour",
    fn.date_format(fn.col("Start_Time"), "H"),
)

In [6]:
# Columns excluded from modelling:
#   - start time, end time, start latitude, start longitude and timezone are
#     not expected to help in predicting the severity of the accident
#   - city, county and state are dropped because of their high cardinality
drop_col = [
    "Start_Time",
    "End_Time",
    "Start_Lat",
    "Start_Lng",
    "City",
    "County",
    "State",
    "Timezone",
]

In [7]:
# Remove every column listed in drop_col from the dataframe
data = data.drop(*drop_col)

In [8]:
# Severity 1 is very rare (around 300 of 1 million rows), so it is folded into
# Severity 2: both classes describe low-severity accidents
data = data.withColumn(
    "Severity",
    fn.when(fn.col("Severity") == 1, 2).otherwise(fn.col("Severity")),
)

In [9]:
# TMC is considered important, so it is kept rather than dropped.
# Around 25000 values are missing, which is too many for mode imputation to
# make sense; instead the missing values get their own category, -1.
data = data.na.fill({"TMC": "-1"})

In [10]:
# Names of all columns that are treated as categorical
categorical_columns = [
    "Source",
    "Side",
    "Wind_Direction",
    "month_of_year",
    "day_of_week",
    "TMC",
    "Sunrise_Sunset",
    "Civil_Twilight",
    "Nautical_Twilight",
    "Astronomical_Twilight",
    "Hour",
]

In [11]:
# Models do not accept strings, so the categorical columns must be encoded.
# Build one StringIndexer per categorical column; each indexer writes its
# output to a new column named "<column>_Index".
stages = [
    StringIndexer(inputCol=column_name, outputCol=column_name + "_Index")
    for column_name in categorical_columns
]

In [12]:
# Encode the categorical columns: chain all indexers into one pipeline,
# fit that pipeline on the dataframe, then use the fitted model to add the
# "<column>_Index" columns to it
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(data)
data = pipelineModel.transform(data)

In [13]:
# The indexed copies replace the original categorical columns, so remove the originals
data = data.drop(*categorical_columns)

In [14]:
# Columns that hold binary values, i.e. True/False
binary_columns = [
    "Amenity",
    "Bump",
    "Crossing",
    "Give_Way",
    "Junction",
    "No_Exit",
    "Railway",
    "Roundabout",
    "Station",
    "Stop",
    "Traffic_Calming",
    "Traffic_Signal",
]

In [15]:
# Turn each True/False column into 0/1 by casting it to an integer
for flag_column in binary_columns:
    data = data.withColumn(flag_column, fn.col(flag_column).cast("int"))

In [16]:
# Lower-case the weather condition text
data = data.withColumn(
    "Weather_Condition",
    fn.lower(fn.col("Weather_Condition")),
)

In [17]:
# "T-Storm" and "Thunderstorm" are the same weather condition, so rewrite the
# abbreviation to the full word.
# Weather_Condition has already been lower-cased by this point, so a
# case-sensitive 'T-Storm' pattern would never match. The pattern is therefore
# matched case-insensitively ((?i)) and the replacement is written in lower
# case, which is also the form the keyword matching relies on ("thunderstorm").
data = data.withColumn(
    "Weather_Condition",
    regexp_replace("Weather_Condition", "(?i)t-storm", "thunderstorm"),
)

In [18]:
# Hand-picked weather keywords that we thought could help in predicting the severity
w_conditions = [
    "cloud",
    "clear",
    "whirl",
    "wind",
    "light",
    "heavy",
    "thunderstorm",
    "shower",
    "snow",
    "rain",
    "drizzle",
    "fair",
    "hail",
    "haze",
    "overcast",
    "pellets",
]

In [19]:
# Break each weather condition string on the space character, giving a list of words
data = data.withColumn(
    "Weather_Condition",
    fn.split(fn.col("Weather_Condition"), " "),
)

In [20]:
# UDF that keeps only the words longer than 3 characters and also discards
# the word "with"
def _drop_short_and_filler_words(words):
    return [word for word in words if len(word) > 3 and word != "with"]


data_clean1 = udf(_drop_short_and_filler_words, returnType=ArrayType(StringType()))

In [21]:
# Apply data_clean1 to the word lists in the weather condition column
data = data.withColumn(
    "Weather_Condition",
    data_clean1(fn.col("Weather_Condition")),
)

In [22]:
# Cast every "<column>_Index" column to string type
for column_name in categorical_columns:
    index_column = column_name + "_Index"
    data = data.withColumn(index_column, fn.col(index_column).cast("string"))

In [23]:
# UDF that reduces a word list to the keywords from w_conditions: a keyword is
# kept when it occurs inside at least one of the words (substring match), and
# the result follows the order of w_conditions
def _matching_conditions(words):
    return [
        keyword
        for keyword in w_conditions
        if any(keyword in word for word in words)
    ]


data_clean2 = udf(_matching_conditions, returnType=ArrayType(StringType()))

In [24]:
# Apply data_clean2 to the word lists in the weather condition column
data = data.withColumn(
    "Weather_Condition",
    data_clean2(fn.col("Weather_Condition")),
)

In [25]:
# One dummy-variable expression per keyword in w_conditions: 1 when the row's
# Weather_Condition list contains the keyword, 0 otherwise. Each expression is
# named after its keyword.
exprs = [
    fn.when(fn.array_contains(fn.col("Weather_Condition"), keyword), 1)
    .otherwise(0)
    .alias(keyword)
    for keyword in w_conditions
]

In [26]:
# Temporary dataframe holding the weather condition lists and their dummy variables
temp = data.select("Weather_Condition", *exprs)

In [27]:
# Tag the full dataframe (df1) and the dummy-variable dataframe (df2) with an
# "id" column so that the two can be joined.
# NOTE: monotonically_increasing_id() yields unique but partition-dependent
# ids, so matching rows by it assumes both frames are partitioned identically.
df1, df2 = (
    frame.withColumn("id", monotonically_increasing_id())
    for frame in (data, temp)
)

In [28]:
# Create the final dataframe: the weather dummy variables followed by all
# existing columns.
# The dummies are computed directly on `data` in a single select. Joining two
# projections of the same dataframe on monotonically_increasing_id() is
# fragile (the ids are unique but depend on the partition layout, so any
# mismatch produces null-padded rows in an outer join) and forces a full
# shuffle; a select needs neither and yields the same columns in the same
# order once Weather_Condition is dropped.
data = data.select(exprs + data.columns)

In [29]:
# The dummy variables have been made from the weather condition column, so the
# column itself can now be dropped
data=data.drop("Weather_Condition")

In [30]:
# 80/20 random split into training and validation sets; the seed is fixed at 42
training_df, validation_df = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [31]:
# Convert the training set to pandas and write it to a CSV file without the index
training_df.toPandas().to_csv(
    path_or_buf="USAccident_train_categorical.csv",
    index=False,
)

In [32]:
# Convert the validation set to pandas and write it to a CSV file without the index
validation_df.toPandas().to_csv(
    path_or_buf="USAccident_validation_categorical.csv",
    index=False,
)

## One Hot Encoding

In [33]:
# Names of the indexed columns ("<column>_Index") that will be one hot encoded
categorical_columns2 = ["%s_Index" % column_name for column_name in categorical_columns]

In [34]:
# Build dummy variables for every indexed categorical column.
# For each column: collect its distinct values, create one 0/1 column per value
# named "<column>_<value as integer>", and put those in front of the existing columns.
for index_column in categorical_columns2:
    distinct_rows = data.select(index_column).distinct().collect()
    levels = [row[0] for row in distinct_rows]
    dummy_columns = [
        fn.when(fn.col(index_column) == level, 1)
        .otherwise(0)
        .alias(index_column + "_" + str(int(float(level))))
        for level in levels
    ]
    data = data.select(dummy_columns + data.columns)

In [35]:
# The dummies replace the indexed categorical columns, so remove those columns
data = data.drop(*categorical_columns2)

In [36]:
# Of the n dummies made for each categorical column, drop one: the dummy for
# index 0 ("<column>_Index_0")
data = data.drop(*(column_name + "_Index_0" for column_name in categorical_columns))

In [37]:
# 80/20 random split of the one hot encoded data into training and validation
# sets; the seed is fixed at 42
training_df, validation_df = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Convert the one hot encoded training set to pandas and write it to a CSV file
# without the index
training_df.toPandas().to_csv(
    path_or_buf="USAccident_train_OHE.csv",
    index=False,
)

In [None]:
# Convert the one hot encoded validation set to pandas and write it to a CSV
# file without the index
validation_df.toPandas().to_csv(
    path_or_buf="USAccident_validation_OHE.csv",
    index=False,
)