# Part 4: Neural Networks

#### First, we consider the various resampling techniques 

In [478]:
import numpy as np
import pandas as pd
import csv
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE 

df = pd.read_csv('DF.csv')

This section picks the columns of interest for the statistical modeling.

In [502]:
df= df.loc[:,[ 'WEATHER_CONDITION',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'CRASH_HOUR', 'LIGHTING_CONDITION',
       'MANEUVER', 'TRAFFICWAY_TYPE', 'PRIM_CONTRIBUTORY_CAUSE',
       'POSTED_SPEED_LIMIT','COMBINED_DANGER_SCORE']]

df.head()
df_test_interest = df

We used four different techniques to address the issue of imbalanced dataset.

In [503]:
# Approach 1: Upsampling or known as oversampling

## do RANDOM SAMPLING TO PICK EQUAL NUMBER OF DATA IN EACH Y GROUP
# down sampling
df4=df.reset_index()
X=df4.drop(columns = 'COMBINED_DANGER_SCORE')
y=df4.COMBINED_DANGER_SCORE

# find the number of levels in y and number of entries associated with each level

unique_levels = np.unique(y)
unique_counts = {level: sum(y == level) for level in unique_levels}
print(unique_counts)

# find the target number of data points
unique_counts.items()
max_level = max(unique_counts.items(), key=operator.itemgetter(1))[0]
min_level = min(unique_counts.items(), key=operator.itemgetter(1))[0]
target_number = unique_counts[max_level]
target_number_min = unique_counts[min_level]

# find which data points are associated with which group

grouped_levels = {}
for ii, level in enumerate(unique_levels):
    obs_idx = [idx for idx, val in enumerate(y) if val == level]
    grouped_levels[level] = obs_idx

grouped_levels

#oversampling
sampled_levels={}

# sample indices
for i in list(unique_levels):
    if i != max_level:
        sampled_levels[i] = choices(grouped_levels[i], k=target_number )
    else:
        sampled_levels[i] = grouped_levels[i]

first = df4.iloc[sampled_levels[1]].reset_index()
second = df4.iloc[sampled_levels[2]].reset_index()
third = df4.iloc[sampled_levels[3]].reset_index()

new_oversampled = pd.concat([first,second,third], axis = 0)
new_oversampled = new_oversampled.drop(columns = ['level_0','index'])



{1.0: 153383, 2.0: 180741, 3.0: 29925}


In [504]:

# Approach 2: Downsampling 

sampled_levels={}
from random import choices
# sample indices
for i in list(unique_levels):
    if i != min_level:
        sampled_levels[i] = choices(grouped_levels[i], k=target_number_min )
    else:
        sampled_levels[i] = grouped_levels[i]

first = df4.iloc[sampled_levels[1]].reset_index()
second = df4.iloc[sampled_levels[2]].reset_index()
third = df4.iloc[sampled_levels[3]].reset_index()

new_downsampled = pd.concat([first,second,third], axis = 0)
new_downsampled = new_downsampled.drop(columns = ['level_0','index'])

### The 'df_train_interest' is either 'new_oversampled' or 'new_downsampled', based on whether we are considering the oversampled dataframe or the undersampled dataframe respectively. 

In [505]:
#df_train_interest = new_oversampled
df_train_interest = new_downsampled

We have to convert categorical variables to categories for one-hot encoding.


In [506]:
# Define the lambda function: categorize_label
# Define the lambda function: categorize_label
categorize_label = lambda x: x.astype('category')
labels = ['WEATHER_CONDITION', 'CRASH_DAY_OF_WEEK','LIGHTING_CONDITION','MANEUVER', 'TRAFFICWAY_TYPE', 'PRIM_CONTRIBUTORY_CAUSE']
# Convert df[LABELS] to a categorical type
df_train_interest[labels] = df_train_interest[labels].astype('category')
print(df_train_interest[labels].dtypes)

WEATHER_CONDITION          category
CRASH_DAY_OF_WEEK          category
LIGHTING_CONDITION         category
MANEUVER                   category
TRAFFICWAY_TYPE            category
PRIM_CONTRIBUTORY_CAUSE    category
dtype: object


In [507]:

df_test_interest[labels] = df_test_interest[labels].astype('category')


In [508]:
df_train_interest = pd.get_dummies(df_train_interest, drop_first = True)
df_test_interest = pd.get_dummies(df_test_interest, drop_first = True)


In [509]:
# machine learning algorithm
from sklearn.model_selection import train_test_split
train_df1, test_df1 = train_test_split(df_train_interest, test_size=0.2, random_state=100)
train_df2, test_df2 = train_test_split(df_test_interest, test_size=0.2, random_state=100)
#train_df.columns

X_train = train_df1.drop(columns = 'COMBINED_DANGER_SCORE')
Y_train = train_df1.COMBINED_DANGER_SCORE
X_test  = test_df2.drop(columns = 'COMBINED_DANGER_SCORE')
Y_test = test_df2.COMBINED_DANGER_SCORE


## Then we can run the Neural Network Model (changing the dataframe to be from the unsampling or downsampling methods)

In [510]:
Y_train.shape

(71820,)

In [511]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation

In [512]:
X_train.shape # no. of columns for X_train data

(71820, 63)

In [513]:
# create neural network model, 3 layers 
NNmodel = Sequential()
NNmodel.add(Dense(10, input_dim=63, activation='relu'))
# creating the first layer with the input_dim argument and setting it to 63 for the 63 input variables
NNmodel.add(Dense(63, activation='relu'))
NNmodel.add(Dense(3, activation='softmax'))

In [514]:
NNmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy']) 

In [515]:
Y_train_map = {1:0, 2:1, 3:2} #Â mapping values for Y_train
Y_train=Y_train.map(Y_train_map)

In [517]:
Y_test_map = {1:0, 2:1, 3:2} # mapping values for Y_test
Y_test=Y_test.map(Y_test_map)

In [518]:
# Convert labels to categorical one-hot encoding
from keras.utils.np_utils import to_categorical
Y_train_cat = to_categorical(Y_train, num_classes=3)

In [519]:
Y_test_cat = to_categorical(Y_test, num_classes=3)

In [520]:
# Train the model, iterating on the data in batches of 10 samples
NNmodel.fit(X_train, Y_train_cat, epochs=5, batch_size=10)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a4234e518>

In [521]:
# evaluate the model using X_test and Y_test
scores = NNmodel.evaluate(X_test, Y_test_cat)



In [522]:
print("\n%s: %.2f%%" % (NNmodel.metrics_names[1], scores[1]*100))


categorical_accuracy: 33.23%


In [523]:
#np.asarray(pd.get_dummies(Y_pred_nn.argmax(axis = 1)))
np.asarray(Y_pred_nn.argmax(axis = 1))

array([0, 0, 0, ..., 1, 2, 0])

In [524]:
#Y_true_nn = Y_test_cat
Y_true_nn = Y_test

Y_pred_nn = NNmodel.predict(X_test)

ConfusionMatrix = pd.DataFrame(confusion_matrix(Y_true_nn,np.asarray(Y_pred_nn.argmax(axis = 1))), columns = ['Predicted 1', 'Predicted 2','Predicted 3'], index = ['Actual 1', 'Actual 2','Actual 3'])
print ('Confusion matrix of test data is: \n',ConfusionMatrix)

Confusion matrix of test data is: 
           Predicted 1  Predicted 2  Predicted 3
Actual 1        12778         5489        12556
Actual 2        10899         7055        18106
Actual 3          878          684         4365


In [525]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
print(recall_score(Y_test_cat,np.asarray(pd.get_dummies(Y_pred_nn.argmax(axis = 1))),average=None))
print(precision_score(Y_test_cat,np.asarray(pd.get_dummies(Y_pred_nn.argmax(axis = 1))),average=None))

[0.41456056 0.19564615 0.73646027]
[0.52038281 0.53333837 0.12461815]


### Using SMOTE as a resampling technique

In [526]:
from imblearn.over_sampling import SMOTE

In [527]:
#Approach 3: SMOTE
categorize_label = lambda x: x.astype('category')
labels = ['WEATHER_CONDITION', 'CRASH_DAY_OF_WEEK','LIGHTING_CONDITION','MANEUVER', 'TRAFFICWAY_TYPE', 'PRIM_CONTRIBUTORY_CAUSE']
# Convert df[LABELS] to a categorical type
df[labels] = df[labels].astype('category')
df = pd.get_dummies(df, drop_first = True)

In [528]:
train_df1, test_df1 = train_test_split(df, test_size=0.2)
X_train_SMOTE = train_df1.drop(columns = 'COMBINED_DANGER_SCORE')
Y_train_SMOTE = train_df1.COMBINED_DANGER_SCORE
X_test_SMOTE = test_df1.drop(columns = 'COMBINED_DANGER_SCORE')
Y_test_SMOTE = test_df1.COMBINED_DANGER_SCORE

X_resampled_SMOTE, Y_resampled_SMOTE = SMOTE().fit_sample(X_train_SMOTE, Y_train_SMOTE)

In [544]:
Y_resampled_SMOTE_map = {1:0, 2:1, 3:2}
Y_resampled_SMOTE=pd.Series(Y_resampled_SMOTE).map(Y_resampled_SMOTE_map)

In [545]:
Y_resampled_SMOTE_cat = to_categorical(Y_resampled_SMOTE, num_classes=3)

In [547]:
NNmodel.fit(X_resampled_SMOTE, Y_resampled_SMOTE_cat, epochs=5, batch_size=10)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a3141e748>

In [548]:
# evaluate the model using X_test and Y_test
scores = NNmodel.evaluate(X_test, Y_test_cat)
print("\n%s: %.2f%%" % (NNmodel.metrics_names[1], scores[1]*100))


categorical_accuracy: 49.31%


In [549]:
Y_true_nn = Y_test
Y_pred_nn = NNmodel.predict(X_test)

ConfusionMatrix = pd.DataFrame(confusion_matrix(Y_true_nn,np.asarray(Y_pred_nn.argmax(axis = 1))), columns = ['Predicted 1', 'Predicted 2','Predicted 3'], index = ['Actual 1', 'Actual 2','Actual 3'])
print ('Confusion matrix of test data is: \n',ConfusionMatrix)

Confusion matrix of test data is: 
           Predicted 1  Predicted 2  Predicted 3
Actual 1        12456        16363         2004
Actual 2        10568        22679         2813
Actual 3          887         4270          770


In [550]:
from sklearn.metrics import precision_score, recall_score
print(recall_score(Y_test_cat,np.asarray(pd.get_dummies(Y_pred_nn.argmax(axis = 1))),average=None))
print(precision_score(Y_test_cat,np.asarray(pd.get_dummies(Y_pred_nn.argmax(axis = 1))),average=None))

[0.40411381 0.62892402 0.12991395]
[0.52093179 0.52361932 0.13781994]
