In [1]:
# importing libraries pandas, matplotlib and seaborn to load dataframe and visualization of data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Importing “train_test_split” from “sklearn.model_selection”. 
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Importing tree, Pipeline, GridSearchCV from sklearn
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [5]:
from sklearn.metrics import make_scorer,f1_score, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [7]:
%matplotlib inline

In [8]:
# from google.colab import drive

In [9]:
# drive.mount('/content/drive')

In [8]:
%%time
# This is a countrywide car accident dataset, which covers 49 states of the USA. 
# The accident data are collected from February 2016 to June 2020, 
US_Accidents_Feb16_to_June20_data =pd.read_csv('../FinalProject/data/US_Accidents_June20.csv',encoding="utf-8")
US_Accidents_Feb16_to_June20_data.shape

Wall time: 26.1 s


(3513617, 49)

In [9]:
US_Accidents_Feb16_to_June20_data.State.unique()

array(['OH', 'WV', 'CA', 'FL', 'GA', 'SC', 'NE', 'IA', 'IL', 'MO', 'WI',
       'IN', 'MI', 'NJ', 'NY', 'CT', 'MA', 'RI', 'NH', 'PA', 'KY', 'MD',
       'VA', 'DC', 'DE', 'TX', 'WA', 'OR', 'AL', 'TN', 'NC', 'KS', 'LA',
       'OK', 'CO', 'UT', 'AZ', 'MN', 'MS', 'NV', 'ME', 'AR', 'ID', 'VT',
       'NM', 'ND', 'WY', 'SD', 'MT'], dtype=object)

In [10]:
States_records =['VA','CA','OK','KS','MO']

In [11]:
# Function to filter dataframe by state
def filter_dataframe_by_state_name(df,state):
    updated_df = df[df['State']==state]
    return updated_df

In [12]:
# function filter US accidents by States
def concat_dataframe_by_states_US_Accidents(dataframe,States_records):
    data = []
    for state in States_records:
        data.append(filter_dataframe_by_state_name(dataframe,state))

    merged_t = pd.concat(data, axis=0)
    return merged_t
    

In [13]:
US_Accidents_Feb16_to_June20_data_by_states =concat_dataframe_by_states_US_Accidents(US_Accidents_Feb16_to_June20_data,States_records)
US_Accidents_Feb16_to_June20_data_by_states.shape

(1014485, 49)

####  Feature Reduction for US_Accidents_Feb16_to_June20_data

In [14]:
# Dropping features because 70% of data is null
US_Accidents_Feb16_to_June20_data.drop(['Precipitation(in)','Number', 'End_Lng', 'End_Lat','Wind_Chill(F)','Wind_Speed(mph)','TMC'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data.shape


(3513617, 42)

In [15]:
# Dropping features which doesn't add much value for model creation
US_Accidents_Feb16_to_June20_data.drop(['Country','Zipcode','ID','Source','Description','Street','Weather_Timestamp','Turning_Loop','City'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data.shape
# Traffic_Signal, Turning_Loop


(3513617, 33)

In [16]:
US_Accidents_Feb16_to_June20_data['Start_Time'] = pd.to_datetime(US_Accidents_Feb16_to_June20_data['Start_Time'])
US_Accidents_Feb16_to_June20_data['Start_Time_Year'] = US_Accidents_Feb16_to_June20_data['Start_Time'].dt.year
US_Accidents_Feb16_to_June20_data['Start_Time_Year'].unique()

array([2016, 2017, 2020, 2019, 2018], dtype=int64)

In [17]:
# Dropping features 'Start_Time' and 'End_Time'
US_Accidents_Feb16_to_June20_data.drop(['Start_Time','End_Time'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data.shape

(3513617, 32)

In [18]:
# Dropping features 'Start_Lat' and 'Start_Lng'
US_Accidents_Feb16_to_June20_data.drop(['Start_Lat','Start_Lng'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data.shape

(3513617, 30)

In [19]:

# Dropping features where 99% of data belongs to only one category there is no variance in data
# 'Turning_Loop', 'Traffic_Calming', 'Stop', 'Station', 'Roundabout', 'Railway','No_Exit','Give_Way','Crossing', 'Bump', 'Amenity'
US_Accidents_Feb16_to_June20_data.drop(['Traffic_Calming', 'Stop', 'Station', 'Roundabout', 'Railway','No_Exit','Give_Way','Crossing', 'Bump', 'Amenity'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data.shape


(3513617, 20)

In [20]:
# Dropping features 'Timezone', 'Airport_Code'
# Dropping feature Wind_Direction - can be used for SVM later
US_Accidents_Feb16_to_June20_data.drop(['Wind_Direction','Airport_Code'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data.shape

(3513617, 18)

In [21]:
US_Accidents_Feb16_to_June20_data.shape

(3513617, 18)

### Feature transformation 

In [22]:
US_Accidents_Feb16_to_June20_data.columns

Index(['Severity', 'Distance(mi)', 'Side', 'County', 'State', 'Timezone',
       'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
       'Weather_Condition', 'Junction', 'Traffic_Signal', 'Sunrise_Sunset',
       'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
       'Start_Time_Year'],
      dtype='object')

#### Converting categorical values with two outcomes into binary value 

In [23]:
day_night_mapping = {'Day':0,'Night':1}

In [24]:
# mapping categorical values Day = 0 and night= 1
# 'Sunrise_Sunset', 'Civil_Twilight','Nautical_Twilight', 'Astronomical_Twilight'
US_Accidents_Feb16_to_June20_data.Sunrise_Sunset.replace(dict(day_night_mapping), inplace=True)
US_Accidents_Feb16_to_June20_data.Civil_Twilight.replace(dict(day_night_mapping), inplace=True)
US_Accidents_Feb16_to_June20_data.Nautical_Twilight.replace(dict(day_night_mapping), inplace=True)
US_Accidents_Feb16_to_June20_data.Astronomical_Twilight.replace(dict(day_night_mapping), inplace=True)


In [25]:
boolean_mapping = {True:1,False:0}
data_mapping = {'R':0,'L':1}

In [26]:
US_Accidents_Feb16_to_June20_data.Traffic_Signal.unique()

array([False,  True])

In [27]:
US_Accidents_Feb16_to_June20_data.Traffic_Signal.replace(dict(boolean_mapping),inplace=True)
US_Accidents_Feb16_to_June20_data.Junction.replace(dict(boolean_mapping), inplace=True)
US_Accidents_Feb16_to_June20_data.Side.replace(dict(data_mapping), inplace=True)

In [28]:
# Finding out columns with categorical values
US_Accidents_Feb16_to_June20_data.dtypes.unique() # unique data types 
US_Accidents_Feb16_to_June20_data.select_dtypes(exclude=['int','float']).columns #  Fetching columns with categorical values

Index(['Severity', 'Side', 'County', 'State', 'Timezone', 'Weather_Condition',
       'Junction', 'Traffic_Signal', 'Start_Time_Year'],
      dtype='object')

In [29]:
# Finding out columns with numerical values
US_Accidents_Feb16_to_June20_data.select_dtypes(include=['int','float']).columns

Index(['Distance(mi)', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Sunrise_Sunset', 'Civil_Twilight',
       'Nautical_Twilight', 'Astronomical_Twilight'],
      dtype='object')

In [30]:
# Dropping null values from the dataframe
US_Accidents_Feb16_to_June20_data.dropna(inplace=True)
US_Accidents_Feb16_to_June20_data.shape

(3410988, 18)

In [31]:
# Replace whitespace with ""
US_Accidents_Feb16_to_June20_data = US_Accidents_Feb16_to_June20_data.assign(County= US_Accidents_Feb16_to_June20_data['County'].str.replace(" ",""))

# Optimized datasets after feature extraction and cleaning

### Merge the datasets US accidents and demographic data based on YEAR, STATE, COUNTY

In [29]:
# del(US_Accidents_Feb16_to_June20_data_by_states)
# del(US_demographic_data)

In [30]:
# Droping duplicate features STNAME_, CTYNAME_, YEAR_
# US_Accidents_plus_Demographic_merged.drop(['STNAME_', 'CTYNAME_', 'YEAR_'], axis=1, inplace=True)
# US_Accidents_plus_Demographic_merged.shape

### Converting categorical values into numerical values using get_dummies 

In [31]:
# Categorical data types
# US_Accidents_Feb16_to_June20_data.select_dtypes(exclude=['int','float']).columns 

In [32]:
US_Accidents_Feb16_to_June20_data['Traffic_Signal'].unique()

array([0, 1], dtype=int64)

In [33]:
margin = len(US_Accidents_Feb16_to_June20_data)/ 100
margin

34109.88

In [34]:
# del US_Accidents_Feb16_to_June20_data_5_states_optimized
# del US_demographic_data_5_states_optimized

In [34]:
# Get the count of each value
value_counts = US_Accidents_Feb16_to_June20_data.Weather_Condition.value_counts()

# Select the values where the count is less than 1% of the data
to_remove = value_counts[value_counts <= margin].index

# Keep rows where the Weather_Condition column is not in to_remove
US_Accidents_Feb16_to_June20_data = US_Accidents_Feb16_to_June20_data[~US_Accidents_Feb16_to_June20_data.Weather_Condition.isin(to_remove)]

In [35]:
# Try to minimize the weather_condition features later on
Feature_list = ['State','Timezone','Weather_Condition']
US_Accidents_Feb16_to_June20_data.shape

(3271950, 18)

In [36]:
subset_dataframe_featurelist=[]
for variable in Feature_list:
    subset_dataframe_featurelist.append(pd.get_dummies(US_Accidents_Feb16_to_June20_data[variable],prefix=variable, dummy_na=False))

In [37]:
merged_subset_dataframe_featurelist = pd.concat(subset_dataframe_featurelist, axis=1)
merged_subset_dataframe_featurelist.shape

(3271950, 64)

In [38]:
%%time
US_Accidents_Feb16_to_June20_data = pd.concat([merged_subset_dataframe_featurelist,US_Accidents_Feb16_to_June20_data], axis=1)
US_Accidents_Feb16_to_June20_data.shape

Wall time: 506 ms


(3271950, 82)

In [39]:
# # 'Side', 'State', 'Timezone','Weather_Condition', 
US_Accidents_Feb16_to_June20_data.drop(['Side', 'State', 'Timezone','Weather_Condition'],axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data.shape

(3271950, 78)

In [40]:
US_Accidents_Feb16_to_June20_data.dropna(inplace = True)
US_Accidents_Feb16_to_June20_data.shape

(3271950, 78)

##### Severity as outcome binary values 0 or 1

In [41]:
US_Accidents_Feb16_to_June20_data.Severity.unique()

array([3, 2, 1, 4], dtype=int64)

In [42]:
# Binary output
Severity_mapping = {1:0,2:0,3:1,4:1}
US_Accidents_Feb16_to_June20_data.Severity.replace(dict(Severity_mapping), inplace=True)
US_Accidents_Feb16_to_June20_data.shape

(3271950, 78)

In [43]:
US_Accidents_Feb16_to_June20_data.Severity.unique()

array([1, 0], dtype=int64)

In [44]:
# US_Accidents_plus_Demographic_merged.to_csv('US_Accidents_plus_Demographic_merged_without_na.csv',index=False)

In [45]:
y = pd.Series(US_Accidents_Feb16_to_June20_data['Severity'])
y.shape

(3271950,)

In [46]:
US_Accidents_Feb16_to_June20_data.drop(['Severity'],axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data.shape

(3271950, 77)

In [47]:
y_County = pd.Series(US_Accidents_Feb16_to_June20_data.County)
y_County.shape

(3271950,)

In [48]:
US_Accidents_Feb16_to_June20_data.drop(['County'],axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data.shape

(3271950, 76)

In [49]:
X= pd.DataFrame(US_Accidents_Feb16_to_June20_data)
print(X.shape,y.shape)

(3271950, 76) (3271950,)


In [50]:
# Using train_test_split() to randomly split the given training dataset into training and cross validation 
# data by 70:30 percent ratio respectively. 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)
print(X_train.shape,X_test.shape)

(2290365, 76) (981585, 76)


In [33]:
# del X_cross_validation
# del y_cross_validation

In [36]:
X_train.to_csv('X_train_5_states_optimized.csv',index=False)
X_test.to_csv('X_test_5_states_optimized.csv',index=False)
y_train.to_csv('y_train_5_states_optimized.csv',index=False)
y_test.to_csv('y_test_5_states_optimized.csv',index=False)

In [54]:
y_test.value_counts()

0    672914
1    308671
Name: Severity, dtype: int64

###  Creating model using random forest

In [55]:
clfRandom = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [56]:
%time clfRandom.fit(X_train, y_train)

Wall time: 22min 23s


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [57]:
y_train_pred = clfRandom.predict(X_train)

In [58]:
confusion_matrix(y_train, y_train_pred)

array([[1548689,   21511],
       [  43938,  676227]], dtype=int64)

In [59]:
f1_score(y_train,y_train_pred)

0.9538409891226691

In [60]:
accuracy_score(y_train,y_train_pred)

0.9714242053122537

In [61]:
y_test_pred = clfRandom.predict(X_test)

In [62]:
confusion_matrix(y_test, y_test_pred)

array([[572680, 100234],
       [155128, 153543]], dtype=int64)

In [63]:
f1_score(y_test, y_test_pred)

0.5459811395897932

In [64]:
accuracy_score(y_test,y_test_pred)

0.7398472878049277

### Creating model using decision tree

In [67]:
model_DT = tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=40,
            max_features=60, max_leaf_nodes=16,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [68]:

clf_DT = model_DT.fit(X_train, y_train)



In [58]:

y_train_pred = clf_DT.predict(X_train)

In [59]:
confusion_matrix(y_train, y_train_pred)

array([[294092,  53086],
       [ 86598,  78962]], dtype=int64)

In [60]:
f1_score(y_train, y_train_pred)

0.5306443375178087

In [61]:
accuracy_score(y_train, y_train_pred)

0.7275723663937528

In [62]:
y_test_pred = clf_DT.predict(X_test)

In [63]:
f1_score(y_test, y_test_pred)

0.5339716673861005

In [64]:
accuracy_score(y_test, y_test_pred)

0.7297822476051787

In [65]:
confusion_matrix(y_test, y_test_pred)

array([[126348,  22614],
       [ 36765,  34018]], dtype=int64)

### Creating model using logistic regression

In [69]:
# Used MinMaxScaler() function to further normalize features on the scale of (0 to 1)
scaler = MinMaxScaler()
scaler.fit(X_train)
# Normalized traning data set
normalized_train = scaler.transform(X_train)

In [70]:
del(X_train)

In [71]:
normalized_train.shape

(2290365, 76)

In [72]:
normalized_test = scaler.transform(X_test)
normalized_test.shape

(981585, 76)

In [73]:
del(X_test)

In [1]:
from sklearn import preprocessing

In [74]:
model_Logistic = LogisticRegression(solver='liblinear', random_state=0).fit(normalized_train, y_train)

In [76]:
y_train_pred = model_Logistic.predict(normalized_train)

In [77]:
confusion_matrix(y_train, y_train_pred)

array([[1466969,  103231],
       [ 566533,  153632]], dtype=int64)

In [78]:
f1_score(y_train, y_train_pred)

0.3144884281719664

In [79]:
accuracy_score(y_train, y_train_pred)

0.7075732470588749

In [80]:
y_test_pred = model_Logistic.predict(normalized_test)

In [81]:
confusion_matrix(y_test, y_test_pred)

array([[628583,  44331],
       [242954,  65717]], dtype=int64)

In [82]:
f1_score(y_test, y_test_pred)

0.313895476441241

In [83]:
accuracy_score(y_test, y_test_pred)

0.7073253971892398

#### Model for 5 states of US

#### Random forest

In [53]:
ModelRandom_5states = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [54]:
%time ModelRandom_5states.fit(X_train, y_train)

Wall time: 22min 45s


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [55]:
y_train_pred = ModelRandom_5states.predict(X_train)

In [56]:
confusion_matrix(y_train, y_train_pred)

array([[1548550,   21650],
       [  43786,  676379]], dtype=int64)

In [57]:
f1_score(y_train,y_train_pred)

0.9538596271032033

In [58]:
accuracy_score(y_train, y_train_pred)

0.9714298812634667

In [59]:
y_test_pred = ModelRandom_5states.predict(X_test)

In [60]:
confusion_matrix(y_test, y_test_pred)

array([[572505, 100409],
       [155081, 153590]], dtype=int64)

In [61]:
f1_score(y_test, y_test_pred)

0.5459327847583841

In [62]:
accuracy_score(y_test, y_test_pred)

0.7397168864642389

#### Decision tree

In [63]:
Model_DT_5states = tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=40,
            max_features=60, max_leaf_nodes=16,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [64]:
clf_DT = Model_DT_5states.fit(X_train, y_train)



In [65]:
y_train_pred = clf_DT.predict(X_train)

In [66]:
confusion_matrix(y_train, y_train_pred)

array([[1496115,   74085],
       [ 595544,  124621]], dtype=int64)

In [67]:
f1_score(y_train, y_train_pred)

0.27124808596636524

In [68]:
accuracy_score(y_train, y_train_pred)

0.707632189629164

In [69]:
y_test_pred = clf_DT.predict(X_test)

In [70]:
f1_score(y_test, y_test_pred)

0.2711042505547403

In [71]:
accuracy_score(y_test, y_test_pred)

0.7075179429188506

#### Logistic Regression 

In [73]:
# Used MinMaxScaler() function to further normalize features on the scale of (0 to 1)
scaler = MinMaxScaler()
scaler.fit(X_train)
# Normalized traning data set
normalized_train = scaler.transform(X_train)

In [74]:
normalized_test = scaler.transform(X_test)
normalized_test.shape

(981585, 76)

In [75]:
model_Logistic = LogisticRegression(solver='liblinear', random_state=0).fit(normalized_train, y_train)

In [76]:
y_train_pred = model_Logistic.predict(normalized_train)

In [77]:
confusion_matrix(y_train, y_train_pred)

array([[1466969,  103231],
       [ 566533,  153632]], dtype=int64)

In [78]:
f1_score(y_train,y_train_pred)

0.3144884281719664

In [79]:
accuracy_score(y_train, y_train_pred)

0.7075732470588749

In [80]:
y_test_pred = model_Logistic.predict(normalized_test)

In [81]:
confusion_matrix(y_test, y_test_pred)

array([[628583,  44331],
       [242954,  65717]], dtype=int64)

In [82]:
f1_score(y_test,y_test_pred)

0.313895476441241

In [83]:
accuracy_score(y_test, y_test_pred)

0.7073253971892398