In [4]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



Load Dataset

In [5]:
# Read the accident sample into the working frame and preview its first rows.
# No date parsing is requested here, so timestamp columns arrive as strings.
data_path = '../data/US_Accidents_March23_sample.csv'
df = pd.read_csv(data_path)
df.head(n=10)

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-227993,Source2,2,2016-05-18 16:28:12,2016-05-18 16:58:12,39.81562,-82.822304,,,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day
1,A-5584682,Source1,2,2021-10-17 00:25:30,2021-10-17 00:52:30,35.110082,-80.97522,35.107982,-80.978715,0.245,...,False,False,False,False,False,False,Night,Night,Night,Night
2,A-309630,Source2,2,2016-11-13 16:55:24,2016-11-13 17:40:24,47.234718,-122.487633,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
3,A-3580182,Source1,2,2017-03-15 09:21:23,2017-03-15 15:21:23,43.17657,-71.61108,43.177915,-71.624505,0.683,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-6523880,Source1,2,2021-02-12 04:27:00,2021-02-12 08:44:00,41.944932,-83.555199,41.945482,-83.559469,0.223,...,False,False,False,False,False,False,Night,Night,Night,Night
5,A-7100647,Source1,2,2020-06-19 16:26:01,2020-06-19 16:55:16,43.61386,-116.26264,43.61386,-116.26264,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day
6,A-4190196,Source1,2,2022-02-22 09:02:46,2022-02-22 11:34:23,25.732933,-80.340203,25.733035,-80.335226,0.31,...,False,True,False,False,False,False,Day,Day,Day,Day
7,A-5962252,Source1,2,2021-08-28 21:19:00.000000000,2021-08-28 23:32:17.000000000,32.786155,-96.812695,32.783963,-96.813574,0.16,...,False,False,False,False,False,False,Night,Night,Night,Day
8,A-4658270,Source1,2,2022-06-19 10:50:00,2022-06-19 12:08:23,41.157002,-76.797775,41.175349,-76.791129,1.314,...,False,False,False,False,False,False,Day,Day,Day,Day
9,A-2099924,Source2,2,2019-05-13 06:15:11,2019-05-13 07:47:15,34.05426,-118.237823,,,0.0,...,False,False,False,False,True,False,Day,Day,Day,Day


Handle Missing Values

In [6]:
# Columns handled as continuous values (mean-imputed and z-scored further down).
num_cols = [
    'Start_Lat',
    'Start_Lng',
    'End_Lat',
    'End_Lng',
    'Distance(mi)',
]

# Columns handled as categories (mode-imputed, then one-hot encoded).
cat_cols = [
    # region / weather descriptors
    'Country', 'Timezone', 'Weather_Condition',
    # daylight indicators (Day / Night in the preview above)
    'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
    # road-feature flags near the accident location
    'Amenity', 'Bump', 'Crossing', 'Junction',
    'Railway', 'Roundabout', 'Stop',
    'Traffic_Signal', 'Turning_Loop',
]

In [7]:
# Fill gaps: column mean for the numeric features, most frequent value for the
# categorical ones. Both imputers stay available under their own names.
# NOTE(review): the imputers are fitted on every row, i.e. before the
# train/test split performed later in the notebook, so held-out rows contribute
# to the fill values.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

for _imputer, _cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    df[_cols] = _imputer.fit_transform(df[_cols])

Feature Encoding

In [8]:
# One-hot encode the categorical columns (first level of each dropped) and
# swap the encoded columns in for the originals.
df_encoded = pd.get_dummies(df[cat_cols], drop_first=True)
df_without_cats = df.drop(columns=cat_cols)
df = pd.concat([df_without_cats, df_encoded], axis=1)

Normalization of Numerical Data

In [9]:
# Z-score every numeric column in one vectorised step:
# subtract the column mean, divide by the column (sample) standard deviation.
numeric_block = df[num_cols]
df[num_cols] = (numeric_block - numeric_block.mean()) / numeric_block.std()

Feature Engineering

In [11]:
# read_csv above does not parse dates, so Start_Time / End_Time are still
# strings at this point; subtracting them (and the later `.dt` accessors on
# Start_Time) only works on real datetimes. Parse them here so the notebook
# runs top to bottom on a fresh kernel.
# The file mixes "YYYY-MM-DD HH:MM:SS" with a nanosecond-suffixed variant
# ("...21:19:00.000000000"); format='ISO8601' (pandas >= 2.0) accepts both,
# whereas format inference from the first row would reject the second layout.
# Re-running the cell is safe: columns that are already datetime pass through.
for _time_col in ('Start_Time', 'End_Time'):
    df[_time_col] = pd.to_datetime(df[_time_col], format='ISO8601')

# Accident duration in whole minutes, computed vectorised instead of with a
# per-row lambda. The result is int64, as before.
df['Duration'] = (
    (df['End_Time'] - df['Start_Time'])
    .dt.total_seconds()
    .div(60)
    .round()
    .astype('int64')
)
print("The overall mean duration is: ", (round(df['Duration'].mean(),3)), 'min')

The overall mean duration is:  415.359 min


Removing outliers using IQR

In [12]:
# Tukey fences per numeric column: a row is discarded as soon as any of its
# numeric values lies more than 1.5 * IQR outside the [Q1, Q3] range.
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

is_outlier_row = (df[num_cols].lt(lower_fence) | df[num_cols].gt(upper_fence)).any(axis=1)
df = df[~is_outlier_row]

Model Training

In [13]:
# Drop the raw timestamp columns, then one-hot encode whatever non-numeric
# columns are left (first level of each dropped).
# NOTE(review): every remaining object column is expanded here; the preview of
# the loaded frame shows text columns such as ID and Source - confirm that
# high-cardinality ones are meant to be encoded rather than dropped.
timestamp_cols = ['Start_Time', 'End_Time', 'Weather_Timestamp']
new_df = df.drop(columns=timestamp_cols)
f_dummy = pd.get_dummies(new_df, drop_first=True)
f_dummy

Unnamed: 0,Severity,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),...,Wind_Direction_SSE,Wind_Direction_SSW,Wind_Direction_SW,Wind_Direction_South,Wind_Direction_VAR,Wind_Direction_Variable,Wind_Direction_W,Wind_Direction_WNW,Wind_Direction_WSW,Wind_Direction_West
0,2,0.712455,0.684889,-1.803202e-15,0.000000,-0.335002,63.9,,51.0,30.09,...,False,False,False,False,False,False,False,False,False,False
1,2,-0.215829,0.791081,-2.952821e-01,1.092613,-0.185739,53.0,53.0,64.0,29.26,...,False,False,False,False,False,False,False,False,False,False
2,2,2.176056,-1.595559,-1.803202e-15,0.000000,-0.328910,52.5,,95.0,30.00,...,False,True,False,False,False,False,False,False,False,False
7,2,-0.674281,-0.119450,-8.850675e-01,-0.076388,-0.237524,86.0,86.0,61.0,29.42,...,False,False,False,False,False,False,False,False,False,False
9,2,-0.424116,-1.351228,-1.803202e-15,0.000000,-0.335002,63.0,63.0,81.0,29.79,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131374,2,0.554691,1.019157,6.962759e-01,1.385961,-0.232041,9.0,-6.0,44.0,29.83,...,False,False,False,False,False,False,False,True,False,False
131375,3,1.476690,0.361805,-1.803202e-15,0.000000,-0.335002,25.0,20.7,85.0,30.35,...,False,False,True,False,False,False,False,False,False,False
131377,2,-0.647291,-0.114007,-1.803202e-15,0.000000,-0.335002,34.0,22.0,65.0,29.44,...,False,False,False,False,False,False,False,False,False,False
131378,3,-0.059158,0.917893,-1.803202e-15,0.000000,-0.335002,97.0,97.0,46.0,29.60,...,False,False,True,False,False,False,False,False,False,False


In [14]:

# Calendar features taken from the accident start time (needs a datetime column).
_start = df['Start_Time']
for _part in ('year', 'month', 'day', 'hour'):
    f_dummy[_part] = getattr(_start.dt, _part)

# Keep only rows where all of these weather measurements are present.
Weather_data = [
    'Temperature(F)',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
]
f_dummy = f_dummy.dropna(subset=Weather_data)

In [15]:
# Separate the label from the features, then hold out 20% of the rows for
# testing. The split is seeded and stratified on the label.
target = 'Severity'
X = f_dummy.drop(columns=[target])
y = f_dummy[target]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

In [16]:
# Labels of the algorithms under comparison, plus the list that collects one
# accuracy score per trained model.
algo_lst = [
    'Logistic Regression',
    ' K-Nearest Neighbors',
    'Decision Trees',
    'Random Forest',
]
accuracy_lst = list()

In [17]:
# Preview the first ten training rows to check the feature layout.
X_train.head(n=10)

Unnamed: 0,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),...,Wind_Direction_VAR,Wind_Direction_Variable,Wind_Direction_W,Wind_Direction_WNW,Wind_Direction_WSW,Wind_Direction_West,year,month,day,hour
109594,1.11563,0.384817,-1.803202e-15,0.0,-0.335002,75.0,,73.0,29.81,6.0,...,False,False,False,False,False,False,2018,6,20,17
113715,0.261952,-1.567897,-1.803202e-15,0.0,-0.335002,66.0,66.0,40.0,30.07,10.0,...,True,False,False,False,False,False,2021,10,15,19
61954,-0.194623,0.806668,-1.803202e-15,0.0,-0.335002,57.0,,62.0,30.09,10.0,...,False,False,False,False,False,False,2018,2,1,17
515,2.282226,-1.303241,-1.803202e-15,0.0,-0.335002,30.0,30.0,82.0,27.88,10.0,...,False,False,False,False,False,False,2021,12,15,9
12890,-0.27328,-1.39521,-1.803202e-15,0.0,-0.335002,30.0,20.0,72.0,25.38,10.0,...,False,False,False,False,False,False,2020,11,9,6
28303,-0.376434,0.854029,-0.5010112,1.173621,-0.266768,91.0,91.0,52.0,29.97,10.0,...,False,False,False,False,False,False,2022,6,30,15
120370,0.837811,0.837421,-1.803202e-15,0.0,-0.335002,55.0,55.0,86.0,29.03,10.0,...,False,False,False,False,False,False,2022,4,21,21
44959,1.28357,1.20216,-1.803202e-15,0.0,-0.32891,63.0,,90.0,29.87,10.0,...,False,False,False,False,False,False,2016,10,18,8
78198,-0.079226,0.927993,-0.1215715,1.2697,0.299823,53.0,53.0,59.0,29.88,10.0,...,False,False,False,False,False,False,2020,4,17,8
120312,1.729253,0.082049,-1.803202e-15,0.0,-0.335002,21.0,11.4,85.0,30.24,10.0,...,False,False,False,False,False,False,2018,12,4,9


In [18]:
# Preview the matching first ten training labels.
y_train.head(n=10)

109594    3
113715    2
61954     2
515       2
12890     2
28303     2
120370    3
44959     3
78198     2
120312    3
Name: Severity, dtype: int64

In [19]:
# Train a 100-tree random forest and score it on the held-out rows.
# random_state is pinned (same seed as the train/test split) because the
# forest's bootstrap and feature sampling are random: without a seed the
# reported accuracy and the model saved afterwards change on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=21)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Fraction of test rows whose predicted severity matches the true one.
acc = accuracy_score(y_test, y_pred)
accuracy_lst.append(acc)

print("[Randon forest algorithm] accuracy_score: {:.3f}.".format(acc))

[Randon forest algorithm] accuracy_score: 0.821.


In [24]:
# Persist the fitted classifier so it can be reloaded without retraining.
filename = '../models/random_forest_model.pkl'
# joblib.dump does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, filename)

['../models/random_forest_model.pkl']

In [27]:
# Round-trip check: reload the model written to `filename` and predict on the
# held-out features with it.
model = joblib.load(filename)
predict = model.predict(X=X_test)
print(predict)

[2 3 2 ... 2 2 2]
