In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split


In [None]:
data = pd.read_csv("data_cleaned.csv")

data.drop(columns=['INCDTTM','year','day','month'],axis=1, inplace=True)

In [3]:
data.head(5)

Unnamed: 0,ADDRTYPE,SEVERITYCODE,COLLISIONTYPE,JUNCTIONTYPE,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SPEEDING,HITPARKEDCAR
0,Intersection,2,Pedestrian,At Intersection (intersection related),0,Clear,Dry,Daylight,Y,0,N
1,Intersection,2,Angles,At Intersection (intersection related),0,Raining,Wet,Dark - Street Lights On,Y,0,N
2,Block,1,Sideswipe,Mid-Block (not related to intersection),0,Clear,Dry,Daylight,Y,0,N
3,Intersection,1,Left Turn,At Intersection (intersection related),0,Raining,Wet,Dark - Street Lights On,Y,0,N
4,Block,1,Other,Mid-Block (not related to intersection),0,Clear,Dry,Dark - Street Lights On,Y,0,N


In [4]:
data['SEVERITYCODE'].value_counts()

1    137485
2     58698
3      3098
4       349
Name: SEVERITYCODE, dtype: int64

In [5]:
data.head(2)

Unnamed: 0,ADDRTYPE,SEVERITYCODE,COLLISIONTYPE,JUNCTIONTYPE,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SPEEDING,HITPARKEDCAR
0,Intersection,2,Pedestrian,At Intersection (intersection related),0,Clear,Dry,Daylight,Y,0,N
1,Intersection,2,Angles,At Intersection (intersection related),0,Raining,Wet,Dark - Street Lights On,Y,0,N


In [6]:
data['SPEEDING'] = data['SPEEDING'].astype(int)

In [7]:
data.dtypes

ADDRTYPE         object
SEVERITYCODE      int64
COLLISIONTYPE    object
JUNCTIONTYPE     object
UNDERINFL         int64
WEATHER          object
ROADCOND         object
LIGHTCOND        object
PEDROWNOTGRNT    object
SPEEDING          int64
HITPARKEDCAR     object
dtype: object

In [8]:
cat_columns = ['ADDRTYPE','COLLISIONTYPE','JUNCTIONTYPE','WEATHER','ROADCOND','LIGHTCOND','HITPARKEDCAR','PEDROWNOTGRNT']

In [9]:
data[cat_columns] = data[cat_columns].apply(lambda x: x.astype('category').cat.codes)

In [10]:
data.tail(6)

Unnamed: 0,ADDRTYPE,SEVERITYCODE,COLLISIONTYPE,JUNCTIONTYPE,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SPEEDING,HITPARKEDCAR
199624,1,1,0,3,0,5,0,5,0,0,0
199625,1,1,7,4,0,11,7,5,0,0,0
199626,1,2,7,4,0,2,0,5,0,0,0
199627,2,1,8,1,0,2,0,5,0,0,0
199628,1,1,0,2,0,2,0,5,0,0,0
199629,1,2,5,4,0,2,0,5,0,0,0


In [11]:
X_train = data.drop(columns=['SEVERITYCODE'],axis=1)
Y_train = data['SEVERITYCODE']

In [12]:
#apply SelectKBest to extract top 5 best features for housing df
from sklearn.feature_selection import SelectKBest, chi2, f_classif

bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X_train,Y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Score']  #naming the dataframe columns
print(featureScores.nlargest(14,'Score'))  #print 5best features

        Features        Score
1  COLLISIONTYPE  5670.606450
2   JUNCTIONTYPE  5312.475739
4        WEATHER  4875.036603
5       ROADCOND  2745.880663
9   HITPARKEDCAR  2092.978875
0       ADDRTYPE  1363.455201
3      UNDERINFL  1132.955601
8       SPEEDING   892.711942
6      LIGHTCOND   609.365801


In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.30, stratify=Y_train,random_state=66)

In [14]:
y_train.value_counts()

1    96239
2    41089
3     2169
4      244
Name: SEVERITYCODE, dtype: int64

In [15]:
y_train.value_counts()


1    96239
2    41089
3     2169
4      244
Name: SEVERITYCODE, dtype: int64

In [16]:
y_test.value_counts()

1    41246
2    17609
3      929
4      105
Name: SEVERITYCODE, dtype: int64

In [17]:
from sklearn.utils import class_weight

weights = class_weight.compute_class_weight('balanced', np.unique(y_train),y_train)

188972    1
145912    2
27076     2
190179    1
         ..
26963     2
128589    1
74827     1
170151    1
151924    2
Name: SEVERITYCODE, Length: 139741, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [18]:
weights

array([  0.36300512,   0.85023364,  16.10661595, 143.1772541 ])

In [19]:
class_weights = {}
for i in range(4):
    class_weights[i+1] = weights[i]

In [20]:
class_weights

{1: 0.3630051226633693,
 2: 0.8502336391735015,
 3: 16.106615952051637,
 4: 143.17725409836066}

In [21]:
models_info_df_4_class = pd.DataFrame([])

## Trying Different models having severity code 1,2,3,4

### Logistic Regression

In [22]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score,f1_score


# lr = LogisticRegression(random_state = 0)
# lr.fit(x_train, y_train)
# y_pred = lr.predict(x_test)


# print('Accuracy on training set:',lr.score(x_train,y_train))
# print('Accuracy on test set:',lr.score(x_test,y_test))
# print("F1 score", f1_score(y_test, y_pred,average='weighted'))

# # models_info_df_4_class = models_info_df_4_class.append([("Logistic Regression",lr.score(x_train,y_train),lr.score(x_test,y_test) )])

Accuracy on training set: 0.687500447255995
Accuracy on test set: 0.687354939972282
F1 score 0.577311384819023


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Random Forest Classifier

In [23]:
# from sklearn.ensemble import RandomForestClassifier

# rf_clf = RandomForestClassifier(n_estimators = 10, random_state = 0,class_weight=class_weights)
# rf_clf.fit(x_train, y_train)

# print('Accuracy on training set:',rf_clf.score(x_train,y_train))
# print('Accuracy on test set:',rf_clf.score(x_test,y_test))
# models_info_df_4_class = models_info_df_4_class.append([("RandomForestClassifier",rf_clf.score(x_train,y_train), rf_clf.score(x_test,y_test) )])

### Decision Tree Classifier

In [24]:
# from sklearn.tree import DecisionTreeClassifier
# dec_clf = DecisionTreeClassifier(class_weight=class_weights)
# dec_clf.fit(x_train, y_train)

# print('Accuracy on training set',dec_clf.score(x_train, y_train))
# print('Accuracy on test set:',dec_clf.score(x_test, y_test))

# models_info_df_4_class = models_info_df_4_class.append([("DecisionTreeClassifier",dec_clf.score(x_train, y_train), dec_clf.score(x_test, y_test) )])

### XGBClassifier

In [25]:
# from xgboost import XGBClassifier
# xgb_clf= XGBClassifier()
# xgb_clf.fit(x_train, y_train)

# from sklearn.metrics import accuracy_score,f1_score

# print('Accuracy score on train_data: ', accuracy_score(y_true = y_train, y_pred = xgb_clf.predict(x_train).round()))
# print('Accuracy score on test_data: ', accuracy_score(y_true = y_test, y_pred = xgb_clf.predict(x_test).round()))
# print("F1 score", f1_score(y_true = y_train, y_pred = xgb_clf.predict(x_train).round()))
# models_info_df_4_class = models_info_df_4_class.append([("XGBClassifier",accuracy_score(y_true = y_train, y_pred = xgb_clf.predict(x_train).round()),accuracy_score(y_true = y_test, y_pred = xgb_clf.predict(x_test).round()) )])

### KNeighborsClassifier

In [26]:
# from sklearn.neighbors import KNeighborsClassifier
# knn_clf = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)  
# #You can change these hyperparameters like metric etc.
# knn_clf.fit(x_train, y_train)

# print('Accuracy on training set:',knn_clf.score(x_train,y_train))
# print('Accuracy on test set:',knn_clf.score(x_test,y_test))

# models_info_df_4_class = models_info_df_4_class.append([("KNeighborsClassifier",knn_clf.score(x_train,y_train), knn_clf.score(x_test,y_test))])

In [27]:
# models_info_df_4_class.columns=  ['Model','Train Accuracy','Test Accuracy']
# models_info_df_4_class

ValueError: Length mismatch: Expected axis has 0 elements, new values have 3 elements

## Trying Different models having severity code 1,2 = 1 and 3,4 = 2

In [28]:
# # Ordering 1 & 2 as 1 and 3 & 4 as 2
data['SEVERITYCODE'].replace(2,1,inplace=True)
data['SEVERITYCODE'].replace(3,2,inplace=True)
data['SEVERITYCODE'].replace(4,2,inplace=True)
data['SEVERITYCODE'].value_counts()

1    196183
2      3447
Name: SEVERITYCODE, dtype: int64

In [29]:
X_train = data.drop(columns=['SEVERITYCODE'],axis=1)
Y_train = data['SEVERITYCODE']

In [30]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.20, random_state=66)

In [31]:
y_train.value_counts()

1    156973
2      2731
Name: SEVERITYCODE, dtype: int64

In [32]:
from sklearn.utils import class_weight

weights = class_weight.compute_class_weight('balanced', np.unique(y_train),y_train)

127614    1
11992     1
74266     1
136211    1
         ..
181965    2
62586     1
139315    1
139895    1
123708    1
Name: SEVERITYCODE, Length: 159704, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [33]:
weights

array([ 0.50869895, 29.23910655])

In [34]:
class_weights = {}
for i in range(2):
    class_weights[i+1] = weights[i]
    
class_weights    

{1: 0.5086989482267651, 2: 29.239106554375688}

In [35]:
models_info_df_2_class = pd.DataFrame([])

### Logistic Regression

In [40]:
# from sklearn.linear_model import LogisticRegression

# lr = LogisticRegression(class_weight=class_weights,random_state = 0)
# lr.fit(x_train, y_train)

# print('Accuracy on training set:',lr.score(x_train,y_train))
# print('Accuracy on test set:',lr.score(x_test,y_test))

# print(y_pred.shape)
# print(y_test.shape)
# print('Accuracy on training set:',lr.score(x_train,y_train))
# print('Accuracy on test set:',lr.score(x_test,y_test))
# print("F1 score", f1_score(y_test, y_pred,average='binary'))
# models_info_df_2_class = models_info_df_2_class.append([("Logistic Regression",lr.score(x_train,y_train),lr.score(x_test,y_test) )])

Accuracy on training set: 0.6380491409106848
Accuracy on test set: 0.6358012322797175
(59889,)
(39926,)
Accuracy on training set: 0.6380491409106848
Accuracy on test set: 0.6358012322797175


ValueError: Found input variables with inconsistent numbers of samples: [39926, 59889]

### Random Forest Classifier

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# rf_clf = RandomForestClassifier(n_estimators = 10, random_state = 0,class_weight=class_weights)
# rf_clf.fit(x_train, y_train)

# print('Accuracy on training set:',rf_clf.score(x_train,y_train))
# print('Accuracy on test set:',rf_clf.score(x_test,y_test))
# models_info_df_2_class = models_info_df_2_class.append([("RandomForestClassifier",rf_clf.score(x_train,y_train), rf_clf.score(x_test,y_test) )])

### Decision Tree Classifier

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# dec_clf = DecisionTreeClassifier(class_weight=class_weights)
# dec_clf.fit(x_train, y_train)

# print('Accuracy on training set',dec_clf.score(x_train, y_train))
# print('Accuracy on test set:',dec_clf.score(x_test, y_test))

# models_info_df_2_class = models_info_df_2_class.append([("DecisionTreeClassifier",dec_clf.score(x_train, y_train), dec_clf.score(x_test, y_test) )])

### XGBClassifier

In [None]:
# from xgboost import XGBClassifier
# xgb_clf= XGBClassifier()
# xgb_clf.fit(x_train, y_train)

# from sklearn.metrics import accuracy_score

# print('Accuracy score on train_data: ', accuracy_score(y_true = y_train, y_pred = xgb_clf.predict(x_train).round()))
# print('Accuracy score on test_data: ', accuracy_score(y_true = y_test, y_pred = xgb_clf.predict(x_test).round()))

# models_info_df_2_class = models_info_df_2_class.append([("XGBClassifier",accuracy_score(y_true = y_train, y_pred = xgb_clf.predict(x_train).round()),accuracy_score(y_true = y_test, y_pred = xgb_clf.predict(x_test).round()) )])

### KNeighborsClassifier

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# knn_clf = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)  
# #You can change these hyperparameters like metric etc.
# knn_clf.fit(x_train, y_train)

# print('Accuracy on training set:',knn_clf.score(x_train,y_train))
# print('Accuracy on test set:',knn_clf.score(x_test,y_test))

# models_info_df_2_class = models_info_df_2_class.append([("KNeighborsClassifier",knn_clf.score(x_train,y_train), knn_clf.score(x_test,y_test))])

In [None]:
# models_info_df_2_class.columns=  ['Model','Train Accuracy','Test Accuracy']
# models_info_df_2_class