In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import time


from sklearn.neural_network import MLPClassifier

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report,confusion_matrix

#df = pd.read_csv("C:\\Users\Eric\Desktop\crash_data_one_hot_encoded_MLP.csv")
df = pd.read_csv("crash_data_one_hot_encoded_without_unknowns.csv")
df = sklearn.utils.shuffle(df)
df.fillna(0,inplace=True)
encode = LabelEncoder()

features = ['CRASH_YEAR', 'NUM_LANES', 'SPD_LIM', 'MULTI_VEH_CODE','HOLIDAY_CODE',
            'LG_REGION_DESC_CODE', 'JUNCTION_TYPE_CODE', 'DIRN_ROLE1_DESC_CODE','INTSN_MIDBLOCK_CODE',
            'FLAT_HILL_CODE','ROAD_CURVATURE_CODE','ROAD_MARKINGS_CODE','ROAD_SURFACE_CODE','ROAD_WET_CODE',
            'URBAN_CODE','LIGHT_CODE','STREET_LIGHT_CODE','WEATHER_A_CODE']

df_non_injury = df[df.CRASH_SEV_CODE==2]  # 480452 samples
df_minor = df[df.CRASH_SEV_CODE==1]       # 150834 samples
df_serious = df[df.CRASH_SEV_CODE==3]    # 37347  samples
df_fatal = df[df.CRASH_SEV_CODE==0]       # 6178   samples

df_minor_upsampled = resample(df_minor, replace=True,n_samples=480452,random_state=40)
df_serious_upsampled = resample(df_serious, replace=True,n_samples=480452,random_state=40)
df_fatal_upsampled = resample(df_fatal, replace=True,n_samples=480452,random_state=40)

df_upsampled = pd.concat([df_non_injury, df_minor_upsampled, df_serious_upsampled, df_fatal_upsampled])

df_upsampled.CRASH_SEV_CODE.value_counts()




3    480452
2    480452
1    480452
0    480452
Name: CRASH_SEV_CODE, dtype: int64

In [30]:
X = df_upsampled.drop(["CRASH_SEV_CODE"], axis=1).values
y = df_upsampled["CRASH_SEV_CODE"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.shape); print(X_test.shape)

(1729627, 92)
(192181, 92)


In [31]:
mlp = MLPClassifier(hidden_layer_sizes=(92,45,1), activation='relu', solver='adam', max_iter=500,verbose = True)

start = time.time()
mlp.fit(X_train,y_train)
stop = time.time()

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)
print(f"\n Training time: {stop - start}s")
print("\n training data\n")
print(confusion_matrix(y_train,predict_train))
print("\n")
print(classification_report(y_train,predict_train,labels=pd.unique(predict_train)))
print("\n---------------------------------------------------------------------------------------\n")
print("\n test data\n")
print(confusion_matrix(y_test,predict_test))
print("\n")
print(classification_report(y_test,predict_test,labels=pd.unique(predict_test)))

Iteration 1, loss = 1.24502974
Iteration 2, loss = 1.09342582
Iteration 3, loss = 1.05699197
Iteration 4, loss = 1.03851845
Iteration 5, loss = 1.02600029
Iteration 6, loss = 1.01707884
Iteration 7, loss = 1.01044924
Iteration 8, loss = 1.00525699
Iteration 9, loss = 1.00099569
Iteration 10, loss = 0.99724328
Iteration 11, loss = 0.99403216
Iteration 12, loss = 0.99127454
Iteration 13, loss = 0.98884170
Iteration 14, loss = 0.98661332
Iteration 15, loss = 0.98440716
Iteration 16, loss = 0.98239979
Iteration 17, loss = 0.98072854
Iteration 18, loss = 0.97905750
Iteration 19, loss = 0.97754919
Iteration 20, loss = 0.97608947
Iteration 21, loss = 0.97484625
Iteration 22, loss = 0.97356071
Iteration 23, loss = 0.97217012
Iteration 24, loss = 0.97140897
Iteration 25, loss = 0.97023478
Iteration 26, loss = 0.96915893
Iteration 27, loss = 0.96851784
Iteration 28, loss = 0.96754928
Iteration 29, loss = 0.96680218
Iteration 30, loss = 0.96595940
Iteration 31, loss = 0.96524767
Iteration 32, los

In [35]:
classification_report(y_test,predict_test,labels=pd.unique(predict_test))
print(mlp.classes_)

[0 1 2 3]


In [17]:
# only run this cell and below for feature importance
from sklearn.linear_model import Perceptron
clf = Perceptron(tol=1e-3, random_state=40, verbose=True,n_jobs = -1)
clf.fit(X,y)

coeffs = clf.coef_

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


-- Epoch 1-- Epoch 1

-- Epoch 1-- Epoch 1

Norm: 195158.69, NNZs: 18, Bias: 23.000000, T: 1921808, Avg. loss: 686148.332722
Total training time: 0.49 seconds.
-- Epoch 2
Norm: 82456.47, NNZs: 18, Bias: -59.000000, T: 1921808, Avg. loss: 745568.259439
Total training time: 0.49 seconds.
-- Epoch 2
Norm: 116749.24, NNZs: 18, Bias: -29.000000, T: 1921808, Avg. loss: 744053.898060
Total training time: 0.52 seconds.
-- Epoch 2
Norm: 78961.63, NNZs: 18, Bias: 11.000000, T: 1921808, Avg. loss: 672685.401437
Total training time: 0.52 seconds.
-- Epoch 2
Norm: 281231.62, NNZs: 18, Bias: 43.000000, T: 3843616, Avg. loss: 668819.152235
Total training time: 0.98 seconds.
-- Epoch 3
Norm: 128339.92, NNZs: 18, Bias: -118.000000, T: 3843616, Avg. loss: 743692.985429
Total training time: 1.01 seconds.
-- Epoch 3
Norm: 164033.92, NNZs: 18, Bias: -53.000000, T: 3843616, Avg. loss: 736939.681098
Total training time: 1.02 seconds.
-- Epoch 3
Norm: 94703.29, NNZs: 18, Bias: 21.000000, T: 3843616, Avg. loss

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   10.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   10.6s finished


In [40]:
features = df.drop(["CRASH_SEV_CODE"], axis=1).columns
minV = 0
maxV = 0
minPlace = 0
maxPlace = 0
count = 0
for i in coeffs:
    for j in i:
        if count == 0:
            minPlace = 0
            maxPlace = 0
            minV = j
            maxV = j
        else:
            if minV > j:
                minV =j
                minPlace = count
            if maxV < j:
                maxV = j
                maxPlace = count
        count+=1
    
    count = 0
    print("-------------------")
    print(" min: ",minV," feature: ",features[minPlace],"\n max: ",maxV," feature: ",features[maxPlace])
    print("-------------------")

predict_train2 = clf.predict(X_train)
predict_test2 = clf.predict(X_test)

print("\n training data\n")
print(confusion_matrix(y_train,predict_train2))
print("\n")
print(classification_report(y_train,predict_train2,labels=pd.unique(predict_train)))
print("\n---------------------------------------------------------------------------------------\n")
print("\n test data\n")
print(confusion_matrix(y_test,predict_test2))
print("\n")
print(classification_report(y_test,predict_test2,labels=pd.unique(predict_train)))

-------------------
 min:  -657.505707130214  feature:  ROAD_SURFACE_Unknown 
 max:  37.03248869219892  feature:  MULTI_VEH_Other
-------------------
-------------------
 min:  -1315.1133036655588  feature:  DIRN_ROLE1_DESC_0 
 max:  657.6182417089478  feature:  ROAD_SURFACE_Unknown
-------------------
-------------------
 min:  -657.5057071302133  feature:  ROAD_SURFACE_Unknown 
 max:  1315.2258382442915  feature:  DIRN_ROLE1_DESC_0
-------------------
-------------------
 min:  -657.5193937681679  feature:  ROAD_SURFACE_Unknown 
 max:  41.12664954461998  feature:  MULTI_VEH_Cyclist(s)+Pedestrian(s) only
-------------------

 training data

[[ 94361 101434  51191 185414]
 [102238 125102 112274  92853]
 [ 82660 123299 161536  65137]
 [ 98839 110351  85673 137265]]


              precision    recall  f1-score   support

           0       0.25      0.22      0.23    432400
           2       0.39      0.37      0.38    432632
           1       0.27      0.29      0.28    432467
      

numpy.ndarray