In [17]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score

In [2]:
people = pd.read_csv('../DS-Unit-2-4-Build-Week/people.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
people.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
VEHICLE_ID,768713.0,372184.236701,214710.997314,2.0,187073.0,371645.0,557891.0,744984.0
SEAT_NO,153425.0,4.208121,2.169906,1.0,3.0,3.0,6.0,12.0
AGE,564067.0,38.063264,17.099626,-49.0,26.0,36.0,50.0,110.0
BAC_RESULT VALUE,952.0,0.172027,0.098773,0.0,0.13,0.17,0.22,0.99


In [4]:
people.describe(include = object).T

Unnamed: 0,count,unique,top,freq
PERSON_ID,783816,783816,O605983,1
PERSON_TYPE,783816,6,DRIVER,615209
RD_NO,783816,357877,JB187770,61
CRASH_DATE,783816,229497,11/10/2017 10:30:00 AM,64
CITY,585027,8453,CHICAGO,410601
STATE,589541,52,IL,558262
ZIPCODE,536370,9395,60629,19219
SEX,773346,4,M,410832
DRIVERS_LICENSE_STATE,473191,171,IL,436438
DRIVERS_LICENSE_CLASS,417338,215,D,363711


In [5]:
people['INJURY_CLASSIFICATION'].isnull().sum()

336

In [6]:
people = people.dropna(subset=['INJURY_CLASSIFICATION'])

In [7]:
people['INJURY_CLASSIFICATION'].value_counts(normalize = True)

NO INDICATION OF INJURY     0.925683
NONINCAPACITATING INJURY    0.040472
REPORTED, NOT EVIDENT       0.025410
INCAPACITATING INJURY       0.008032
FATAL                       0.000403
Name: INJURY_CLASSIFICATION, dtype: float64

In [8]:
people['INJURY_CLASSIFICATION'].value_counts()

NO INDICATION OF INJURY     725254
NONINCAPACITATING INJURY     31709
REPORTED, NOT EVIDENT        19908
INCAPACITATING INJURY         6293
FATAL                          316
Name: INJURY_CLASSIFICATION, dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(people.drop(['INJURY_CLASSIFICATION'], axis = 1), 
                                                    people['INJURY_CLASSIFICATION'], test_size=0.20)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((626784, 28), (156696, 28), (626784,), (156696,))

In [10]:
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier()
)

In [11]:
# pipeline.fit(X_train, y_train)
# cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=5)

In [12]:
# y_pred = pipeline.predict(X_test)
# print(classification_report(y_test, y_pred))

In [13]:
# pipeline = make_pipeline(
#     ce.OrdinalEncoder(),
#     SimpleImputer(),
#     LogisticRegressionCV(solver='lbfgs', cv=5, n_jobs=-1)    
# )

In [14]:
# pipeline.fit(X_train, y_train)
# pipeline.score(X_test, y_test)

In [15]:
# y_pred = pipeline.predict(X_test)
# print(classification_report(y_test, y_pred))

In [18]:
from scipy.stats import uniform

param_distributions = {
    'randomforestclassifier__n_estimators': range(10, 100), 
    'randomforestclassifier__max_depth': [5, 10, 15, 20], 
    'randomforestclassifier__max_features': uniform(0, 1), 
    'randomforestclassifier__min_samples_leaf': [1, 10], 
}

search = RandomizedSearchCV(
    pipeline, 
    param_distributions = param_distributions, 
    n_iter = 2, 
    cv = 3, 
    scoring = make_scorer(recall_score, average = 'macro', labels = ['FATAL', 'INCAPACITATING INJURY']),
    verbose = 10, 
    return_train_score = True, 
    n_jobs = -2
)

search.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done   2 out of   6 | elapsed:  7.2min remaining: 14.3min
[Parallel(n_jobs=-2)]: Done   3 out of   6 | elapsed:  7.2min remaining:  7.2min
[Parallel(n_jobs=-2)]: Done   4 out of   6 | elapsed:  8.9min remaining:  4.5min
[Parallel(n_jobs=-2)]: Done   6 out of   6 | elapsed:  9.1min remaining:    0.0s
[Parallel(n_jobs=-2)]: Done   6 out of   6 | elapsed:  9.1min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ordinalencoder',
                                              OrdinalEncoder(cols=None,
                                                             drop_invariant=False,
                                                             handle_missing='value',
                                                             handle_unknown='value',
                                                             mapping=None,
                                                             return_df=True,
                                                             verbose=0)),
                                             ('simpleimputer',
                                              SimpleImputer(add_indicator=False,
                                                            copy=True,
                                                            f

In [19]:
print(search.best_params_)
print(search.best_score_)

{'randomforestclassifier__max_depth': 15, 'randomforestclassifier__max_features': 0.7172893505680213, 'randomforestclassifier__min_samples_leaf': 10, 'randomforestclassifier__n_estimators': 82}
0.02420429745246174


In [20]:
pd.DataFrame(search.cv_results_).sort_values(by='rank_test_score').T

Unnamed: 0,0,1
mean_fit_time,392.065,96.7675
std_fit_time,6.5434,0.659925
mean_score_time,8.14121,5.99835
std_score_time,0.22891,0.119269
param_randomforestclassifier__max_depth,15,5
param_randomforestclassifier__max_features,0.717289,0.605028
param_randomforestclassifier__min_samples_leaf,10,1
param_randomforestclassifier__n_estimators,82,57
params,"{'randomforestclassifier__max_depth': 15, 'ran...","{'randomforestclassifier__max_depth': 5, 'rand..."
split0_test_score,0.0185185,0.00268817


In [21]:
final = search.best_estimator_
y_pred = best.predict(X_test)

In [22]:
print(classification_report(y_test, y_pred))

  'precision', 'predicted', average, warn_for)


                          precision    recall  f1-score   support

                   FATAL       0.00      0.00      0.00        55
   INCAPACITATING INJURY       0.40      0.03      0.05      1273
 NO INDICATION OF INJURY       0.97      0.99      0.98    145088
NONINCAPACITATING INJURY       0.58      0.65      0.61      6341
   REPORTED, NOT EVIDENT       0.39      0.01      0.01      3939

                accuracy                           0.95    156696
               macro avg       0.47      0.33      0.33    156696
            weighted avg       0.93      0.95      0.93    156696

