In [1]:
#getting started
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [2]:
data = pd.read_csv('US_Accidents_March23.csv', nrows=400000)

In [3]:
data

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,A-400008,Source2,3,2017-04-25 12:15:27,2017-04-25 12:49:00,37.516037,-121.940689,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day
399996,A-400009,Source2,3,2017-04-25 12:13:06,2017-04-25 12:44:00,37.458626,-121.924133,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day
399997,A-400010,Source2,2,2017-04-25 12:14:47,2017-04-25 12:48:00,37.355961,-121.868294,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day
399998,A-400011,Source2,2,2017-04-25 12:32:34,2017-04-25 13:03:00,37.452118,-122.126907,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 46 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ID                     400000 non-null  object 
 1   Source                 400000 non-null  object 
 2   Severity               400000 non-null  int64  
 3   Start_Time             400000 non-null  object 
 4   End_Time               400000 non-null  object 
 5   Start_Lat              400000 non-null  float64
 6   Start_Lng              400000 non-null  float64
 7   End_Lat                0 non-null       float64
 8   End_Lng                0 non-null       float64
 9   Distance(mi)           400000 non-null  float64
 10  Description            400000 non-null  object 
 11  Street                 400000 non-null  object 
 12  City                   399981 non-null  object 
 13  County                 400000 non-null  object 
 14  State                  400000 non-nu

In [5]:
#dealing with missing values
data.isna().mean()

ID                       0.000000
Source                   0.000000
Severity                 0.000000
Start_Time               0.000000
End_Time                 0.000000
Start_Lat                0.000000
Start_Lng                0.000000
End_Lat                  1.000000
End_Lng                  1.000000
Distance(mi)             0.000000
Description              0.000000
Street                   0.000000
City                     0.000048
County                   0.000000
State                    0.000000
Zipcode                  0.000107
Country                  0.000000
Timezone                 0.000107
Airport_Code             0.000110
Weather_Timestamp        0.008023
Temperature(F)           0.014788
Wind_Chill(F)            0.852263
Humidity(%)              0.016272
Pressure(in)             0.011617
Visibility(mi)           0.021948
Wind_Direction           0.008075
Wind_Speed(mph)          0.185427
Precipitation(in)        0.894887
Weather_Condition        0.020520
Amenity       

In [8]:
null_columns = ['End_Lat', 'End_Lng', 'Wind_Chill(F)', 'Precipitation(in)']

data = data.drop(null_columns, axis=1)

In [7]:
data.isna().sum()

ID                            0
Source                        0
Severity                      0
Start_Time                    0
End_Time                      0
Start_Lat                     0
Start_Lng                     0
End_Lat                  400000
End_Lng                  400000
Distance(mi)                  0
Description                   0
Street                        0
City                         19
County                        0
State                         0
Zipcode                      43
Country                       0
Timezone                     43
Airport_Code                 44
Weather_Timestamp          3209
Temperature(F)             5915
Wind_Chill(F)            340905
Humidity(%)                6509
Pressure(in)               4647
Visibility(mi)             8779
Wind_Direction             3230
Wind_Speed(mph)           74171
Precipitation(in)        357955
Weather_Condition          8208
Amenity                       0
Bump                          0
Crossing

In [9]:
data = data.dropna(axis=0).reset_index(drop=True)

In [10]:
print("Total missing values:", data.isna().sum().sum())

Total missing values: 0


In [11]:
data

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,Accident on OH-32 State Route 32 Westbound at ...,State Route 32,...,False,False,False,False,True,False,Night,Night,Day,Day
1,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,Accident on I-75 Southbound at Exits 52 52B US...,I-75 S,...,False,False,False,False,False,False,Night,Day,Day,Day
2,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,Accident on McEwen Rd at OH-725 Miamisburg Cen...,Miamisburg Centerville Rd,...,False,False,False,False,True,False,Day,Day,Day,Day
3,A-6,Source2,3,2016-02-08 07:44:26,2016-02-08 08:14:26,40.100590,-82.925194,0.01,Accident on I-270 Outerbelt Northbound near Ex...,Westerville Rd,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-7,Source2,2,2016-02-08 07:59:35,2016-02-08 08:29:35,39.758274,-84.230507,0.00,Accident on Oakridge Dr at Woodward Ave. Expec...,N Woodward Ave,...,False,False,False,False,False,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320980,A-400007,Source2,2,2017-04-25 12:08:49,2017-04-25 12:38:27,37.352116,-122.059219,0.01,Accident on Fremont Ave at Bernardo Ave.,S Bernardo Ave,...,False,False,False,False,True,False,Day,Day,Day,Day
320981,A-400008,Source2,3,2017-04-25 12:15:27,2017-04-25 12:49:00,37.516037,-121.940689,0.01,Right hand shoulder blocked due to accident on...,I-680 S,...,False,False,False,False,False,False,Day,Day,Day,Day
320982,A-400009,Source2,3,2017-04-25 12:13:06,2017-04-25 12:44:00,37.458626,-121.924133,0.01,Accident on I-880 Southbound at Exit 10 Dixon ...,Nimitz Fwy S,...,False,False,False,False,False,False,Day,Day,Day,Day
320983,A-400010,Source2,2,2017-04-25 12:14:47,2017-04-25 12:48:00,37.355961,-121.868294,0.01,Right hand shoulder blocked due to accident on...,Bayshore Fwy S,...,False,False,False,False,False,False,Day,Day,Day,Day


In [12]:
#unnecessary columns
{column: len(data[column].unique()) for column in data.columns if data.dtypes[column] == 'object'}

{'ID': 320985,
 'Source': 2,
 'Start_Time': 316633,
 'End_Time': 314443,
 'Description': 236516,
 'Street': 36216,
 'City': 4024,
 'County': 548,
 'State': 28,
 'Zipcode': 57076,
 'Country': 1,
 'Timezone': 4,
 'Airport_Code': 638,
 'Weather_Timestamp': 78674,
 'Wind_Direction': 23,
 'Weather_Condition': 67,
 'Sunrise_Sunset': 2,
 'Civil_Twilight': 2,
 'Nautical_Twilight': 2,
 'Astronomical_Twilight': 2}

In [13]:
unneeded_columns = ['ID', 'Description', 'Street', 'City', 'Zipcode', 'Country']

data = data.drop(unneeded_columns, axis=1)

In [14]:
data

Unnamed: 0,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),County,State,Timezone,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,Clermont,OH,US/Eastern,...,False,False,False,False,True,False,Night,Night,Day,Day
1,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,Montgomery,OH,US/Eastern,...,False,False,False,False,False,False,Night,Day,Day,Day
2,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,Montgomery,OH,US/Eastern,...,False,False,False,False,True,False,Day,Day,Day,Day
3,Source2,3,2016-02-08 07:44:26,2016-02-08 08:14:26,40.100590,-82.925194,0.01,Franklin,OH,US/Eastern,...,False,False,False,False,False,False,Day,Day,Day,Day
4,Source2,2,2016-02-08 07:59:35,2016-02-08 08:29:35,39.758274,-84.230507,0.00,Montgomery,OH,US/Eastern,...,False,False,False,False,False,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320980,Source2,2,2017-04-25 12:08:49,2017-04-25 12:38:27,37.352116,-122.059219,0.01,Santa Clara,CA,US/Pacific,...,False,False,False,False,True,False,Day,Day,Day,Day
320981,Source2,3,2017-04-25 12:15:27,2017-04-25 12:49:00,37.516037,-121.940689,0.01,Alameda,CA,US/Pacific,...,False,False,False,False,False,False,Day,Day,Day,Day
320982,Source2,3,2017-04-25 12:13:06,2017-04-25 12:44:00,37.458626,-121.924133,0.01,Alameda,CA,US/Pacific,...,False,False,False,False,False,False,Day,Day,Day,Day
320983,Source2,2,2017-04-25 12:14:47,2017-04-25 12:48:00,37.355961,-121.868294,0.01,Santa Clara,CA,US/Pacific,...,False,False,False,False,False,False,Day,Day,Day,Day


In [15]:
def get_years(df, column):
    return df[column].apply(lambda date: date[0:4])

def get_months(df, column):
    return df[column].apply(lambda date: date[5:7])

In [16]:
data['Start_Time_Month'] = get_months(data, 'Start_Time')
data['Start_Time_Year'] = get_years(data, 'Start_Time')

data['End_Time_Month'] = get_months(data, 'End_Time')
data['End_Time_Year'] = get_years(data, 'End_Time')

data['Weather_Timestamp_Month'] = get_months(data, 'Weather_Timestamp')
data['Weather_Timestamp_Year'] = get_years(data, 'Weather_Timestamp')


data = data.drop(['Start_Time', 'End_Time', 'Weather_Timestamp'], axis=1)

In [17]:
data

Unnamed: 0,Source,Severity,Start_Lat,Start_Lng,Distance(mi),County,State,Timezone,Airport_Code,Temperature(F),...,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Start_Time_Month,Start_Time_Year,End_Time_Month,End_Time_Year,Weather_Timestamp_Month,Weather_Timestamp_Year
0,Source2,2,39.063148,-84.032608,0.01,Clermont,OH,US/Eastern,KI69,36.0,...,Night,Night,Day,Day,02,2016,02,2016,02,2016
1,Source2,3,39.747753,-84.205582,0.01,Montgomery,OH,US/Eastern,KDAY,35.1,...,Night,Day,Day,Day,02,2016,02,2016,02,2016
2,Source2,2,39.627781,-84.188354,0.01,Montgomery,OH,US/Eastern,KMGY,36.0,...,Day,Day,Day,Day,02,2016,02,2016,02,2016
3,Source2,3,40.100590,-82.925194,0.01,Franklin,OH,US/Eastern,KCMH,37.9,...,Day,Day,Day,Day,02,2016,02,2016,02,2016
4,Source2,2,39.758274,-84.230507,0.00,Montgomery,OH,US/Eastern,KDAY,34.0,...,Day,Day,Day,Day,02,2016,02,2016,02,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320980,Source2,2,37.352116,-122.059219,0.01,Santa Clara,CA,US/Pacific,KNUQ,63.0,...,Day,Day,Day,Day,04,2017,04,2017,04,2017
320981,Source2,3,37.516037,-121.940689,0.01,Alameda,CA,US/Pacific,KNUQ,63.0,...,Day,Day,Day,Day,04,2017,04,2017,04,2017
320982,Source2,3,37.458626,-121.924133,0.01,Alameda,CA,US/Pacific,KPAO,64.4,...,Day,Day,Day,Day,04,2017,04,2017,04,2017
320983,Source2,2,37.355961,-121.868294,0.01,Santa Clara,CA,US/Pacific,KRHV,64.4,...,Day,Day,Day,Day,04,2017,04,2017,04,2017


In [18]:
#encoding
def onehot_encode(df, columns, prefixes):
    df = df.copy()
    for column, prefix in zip(columns, prefixes):
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [19]:
{column: len(data[column].unique()) for column in data.columns if data.dtypes[column] == 'object'}

{'Source': 2,
 'County': 548,
 'State': 28,
 'Timezone': 4,
 'Airport_Code': 638,
 'Wind_Direction': 23,
 'Weather_Condition': 67,
 'Sunrise_Sunset': 2,
 'Civil_Twilight': 2,
 'Nautical_Twilight': 2,
 'Astronomical_Twilight': 2,
 'Start_Time_Month': 12,
 'Start_Time_Year': 2,
 'End_Time_Month': 12,
 'End_Time_Year': 2,
 'Weather_Timestamp_Month': 12,
 'Weather_Timestamp_Year': 2}

In [21]:
data = onehot_encode(
    data,
    columns=['County', 'State', 'Timezone', 'Airport_Code', 'Wind_Direction', 'Weather_Condition'],
    prefixes=['CO', 'ST', 'TZ', 'AC', 'WD', 'WC']
)

In [22]:
data

Unnamed: 0,Source,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),...,WC_Snow,WC_Snow Grains,WC_Snow Showers,WC_Squalls,WC_T-Storm,WC_Thunder,WC_Thunder in the Vicinity,WC_Thunderstorm,WC_Thunderstorms and Rain,WC_Widespread Dust
0,Source2,2,39.063148,-84.032608,0.01,36.0,100.0,29.67,10.0,3.5,...,False,False,False,False,False,False,False,False,False,False
1,Source2,3,39.747753,-84.205582,0.01,35.1,96.0,29.64,9.0,4.6,...,False,False,False,False,False,False,False,False,False,False
2,Source2,2,39.627781,-84.188354,0.01,36.0,89.0,29.65,6.0,3.5,...,False,False,False,False,False,False,False,False,False,False
3,Source2,3,40.100590,-82.925194,0.01,37.9,97.0,29.63,7.0,3.5,...,False,False,False,False,False,False,False,False,False,False
4,Source2,2,39.758274,-84.230507,0.00,34.0,100.0,29.66,7.0,3.5,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320980,Source2,2,37.352116,-122.059219,0.01,63.0,58.0,30.12,10.0,16.1,...,False,False,False,False,False,False,False,False,False,False
320981,Source2,3,37.516037,-121.940689,0.01,63.0,58.0,30.12,10.0,16.1,...,False,False,False,False,False,False,False,False,False,False
320982,Source2,3,37.458626,-121.924133,0.01,64.4,59.0,30.11,10.0,16.1,...,False,False,False,False,False,False,False,False,False,False
320983,Source2,2,37.355961,-121.868294,0.01,64.4,49.0,30.13,10.0,9.2,...,False,False,False,False,False,False,False,False,False,False


In [23]:
def get_binary_column(df, column):
    if column == 'Source':
        return df[column].apply(lambda x: 1 if x == 'MapQuest' else 0)
    else:
        return df[column].apply(lambda x: 1 if x == 'Day' else 0)

In [24]:
data['Source'] = get_binary_column(data, 'Source')

data['Sunrise_Sunset'] = get_binary_column(data, 'Sunrise_Sunset')
data['Civil_Twilight'] = get_binary_column(data, 'Civil_Twilight')
data['Nautical_Twilight'] = get_binary_column(data, 'Nautical_Twilight')
data['Astronomical_Twilight'] = get_binary_column(data, 'Astronomical_Twilight')

In [25]:
data

Unnamed: 0,Source,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),...,WC_Snow,WC_Snow Grains,WC_Snow Showers,WC_Squalls,WC_T-Storm,WC_Thunder,WC_Thunder in the Vicinity,WC_Thunderstorm,WC_Thunderstorms and Rain,WC_Widespread Dust
0,0,2,39.063148,-84.032608,0.01,36.0,100.0,29.67,10.0,3.5,...,False,False,False,False,False,False,False,False,False,False
1,0,3,39.747753,-84.205582,0.01,35.1,96.0,29.64,9.0,4.6,...,False,False,False,False,False,False,False,False,False,False
2,0,2,39.627781,-84.188354,0.01,36.0,89.0,29.65,6.0,3.5,...,False,False,False,False,False,False,False,False,False,False
3,0,3,40.100590,-82.925194,0.01,37.9,97.0,29.63,7.0,3.5,...,False,False,False,False,False,False,False,False,False,False
4,0,2,39.758274,-84.230507,0.00,34.0,100.0,29.66,7.0,3.5,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320980,0,2,37.352116,-122.059219,0.01,63.0,58.0,30.12,10.0,16.1,...,False,False,False,False,False,False,False,False,False,False
320981,0,3,37.516037,-121.940689,0.01,63.0,58.0,30.12,10.0,16.1,...,False,False,False,False,False,False,False,False,False,False
320982,0,3,37.458626,-121.924133,0.01,64.4,59.0,30.11,10.0,16.1,...,False,False,False,False,False,False,False,False,False,False
320983,0,2,37.355961,-121.868294,0.01,64.4,49.0,30.13,10.0,9.2,...,False,False,False,False,False,False,False,False,False,False


In [26]:
#splitting and scaling 
y = data['Severity'].copy()
X = data.drop('Severity', axis=1).copy()

In [27]:
y.unique()

array([2, 3, 1, 4], dtype=int64)

In [28]:
y = y - 1

In [30]:
X = X.astype(float)

In [31]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

In [33]:
#training
X.shape

(320985, 1340)

In [34]:
inputs = tf.keras.Input(shape=(X.shape[1],))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(4, activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

batch_size = 32
epochs = 20

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [35]:
print("Test Accuracy:", model.evaluate(X_test, y_test, verbose=0)[1])

Test Accuracy: 0.7897524237632751
