In [7]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,recall_score,precision_score,accuracy_score,f1_score
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from hyperopt import hp, tpe, fmin,STATUS_OK,Trials


import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### Load Data

In [8]:
data_le = pd.read_csv('data/training_data/crisis_resp_nt_train.csv')

In [9]:
data_le.head()

Unnamed: 0,incident_num,call_type,beat,priority,disp_category_id,region_id,cr_code
0,22110034378,1016,243,2,5,3,0
1,23020035394,1016,611,2,5,3,0
2,22010000119,1016,523,2,0,5,0
3,22010000156,1016,523,2,0,5,0
4,22010000177,1016,523,2,0,5,0


In [10]:
def modeling(X,Y,model,model_name,random_state):
    """
    Step1 - Dividing the data into train, test, validation sets
    """
    X_train,X_val,Y_train,Y_val = train_test_split(X,Y,test_size=0.1,random_state=random_state)
    X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.1,random_state=random_state)
    
    model.fit(X_train,Y_train)
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)
    Y_val_pred = model.predict(X_val)
    
    train_acc = accuracy_score(Y_train,Y_train_pred)
    val_acc = accuracy_score(Y_val,Y_val_pred)
    test_acc = accuracy_score(Y_test,Y_test_pred)
    
    train_recall = recall_score(Y_train,Y_train_pred,average="weighted")
    val_recall = recall_score(Y_val,Y_val_pred,average="weighted")
    test_recall = recall_score(Y_test,Y_test_pred,average="weighted")
    
    train_precision = precision_score(Y_train,Y_train_pred,average="weighted")
    val_precision = precision_score(Y_val,Y_val_pred,average="weighted")
    test_precision = precision_score(Y_test,Y_test_pred,average="weighted")
    
    train_f1 = f1_score(Y_train,Y_train_pred,average="weighted")
    val_f1 = f1_score(Y_val,Y_val_pred,average="weighted")
    test_f1 = f1_score(Y_test,Y_test_pred,average="weighted")
    
    df = pd.DataFrame({"TrainAccuracy":train_acc,"ValidationAccuracy":val_acc,"TestAccuracy":test_acc,
                       "TrainPrecision":train_precision,"ValidationPrecision":val_precision,"TestPrecision":test_precision,
                       "TrainRecall":train_recall,"ValidationRecall":val_recall,"TestRecall":test_recall,
                       "TrainF1":train_f1,"ValidationF1":val_f1,"TestF1":test_f1},
                     index = [model_name])
    
    return df

### Modeling For Label Encoding Data

In [11]:
data_le_X = data_le.drop(columns=["cr_code", "call_type"])
data_le_Y = data_le[["cr_code"]]

In [12]:
data_le_X.head()

Unnamed: 0,incident_num,beat,priority,disp_category_id,region_id
0,22110034378,243,2,5,3
1,23020035394,611,2,5,3
2,22010000119,523,2,0,5
3,22010000156,523,2,0,5
4,22010000177,523,2,0,5


In [13]:
dtree_df = modeling(data_le_X,
                    data_le_Y,
                    model=DecisionTreeClassifier(random_state=50),
                    model_name = "DecisionTree",
                    random_state=50)

rf_df = modeling(data_le_X,
                data_le_Y,
                model=RandomForestClassifier(random_state=50),
                model_name = "RandomForest",
                random_state=50)

bagging_df = modeling(data_le_X,
                    data_le_Y,
                    model=BaggingClassifier(random_state=50),
                    model_name = "Bagging",
                    random_state=50)

In [15]:
dtree_df.head()

Unnamed: 0,TrainAccuracy,ValidationAccuracy,TestAccuracy,TrainPrecision,ValidationPrecision,TestPrecision,TrainRecall,ValidationRecall,TestRecall,TrainF1,ValidationF1,TestF1
DecisionTree,0.977887,0.960914,0.962088,0.977544,0.944328,0.94631,0.977887,0.960914,0.962088,0.972151,0.951882,0.953445


In [16]:
rf_df.head()

Unnamed: 0,TrainAccuracy,ValidationAccuracy,TestAccuracy,TrainPrecision,ValidationPrecision,TestPrecision,TrainRecall,ValidationRecall,TestRecall,TrainF1,ValidationF1,TestF1
RandomForest,0.977871,0.962479,0.963543,0.976473,0.944865,0.94626,0.977871,0.962479,0.963543,0.972614,0.952668,0.95392


In [17]:
bagging_df.head()

Unnamed: 0,TrainAccuracy,ValidationAccuracy,TestAccuracy,TrainPrecision,ValidationPrecision,TestPrecision,TrainRecall,ValidationRecall,TestRecall,TrainF1,ValidationF1,TestF1
Bagging,0.976099,0.962606,0.962727,0.972738,0.945387,0.945743,0.976099,0.962606,0.962727,0.970729,0.95294,0.953417


### Models Trained with whole data

In [18]:
models_df_total_data = pd.concat([dtree_df,rf_df,bagging_df])
models_df_total_data

Unnamed: 0,TrainAccuracy,ValidationAccuracy,TestAccuracy,TrainPrecision,ValidationPrecision,TestPrecision,TrainRecall,ValidationRecall,TestRecall,TrainF1,ValidationF1,TestF1
DecisionTree,0.977887,0.960914,0.962088,0.977544,0.944328,0.94631,0.977887,0.960914,0.962088,0.972151,0.951882,0.953445
RandomForest,0.977871,0.962479,0.963543,0.976473,0.944865,0.94626,0.977871,0.962479,0.963543,0.972614,0.952668,0.95392
Bagging,0.976099,0.962606,0.962727,0.972738,0.945387,0.945743,0.976099,0.962606,0.962727,0.970729,0.95294,0.953417


In [19]:
models_df_total_data.sort_values(by=["ValidationAccuracy","ValidationF1"],ascending=False,inplace=True)
models_df_total_data

Unnamed: 0,TrainAccuracy,ValidationAccuracy,TestAccuracy,TrainPrecision,ValidationPrecision,TestPrecision,TrainRecall,ValidationRecall,TestRecall,TrainF1,ValidationF1,TestF1
Bagging,0.976099,0.962606,0.962727,0.972738,0.945387,0.945743,0.976099,0.962606,0.962727,0.970729,0.95294,0.953417
RandomForest,0.977871,0.962479,0.963543,0.976473,0.944865,0.94626,0.977871,0.962479,0.963543,0.972614,0.952668,0.95392
DecisionTree,0.977887,0.960914,0.962088,0.977544,0.944328,0.94631,0.977887,0.960914,0.962088,0.972151,0.951882,0.953445


In [20]:
print("Best Model with default parameters = ".upper(),models_df_total_data.index[0],end="\n")
print("with model validation accuracy = ".upper(),models_df_total_data.ValidationAccuracy[0], end="\n")
print("with model validation precision = ".upper(),models_df_total_data.ValidationPrecision[0], end="\n")
print("with model validation recall = ".upper(),models_df_total_data.ValidationRecall[0], end="\n")
print("with model validation f1score = ".upper(),models_df_total_data.ValidationF1[0], end=" ")

BEST MODEL WITH DEFAULT PARAMETERS =  Bagging
WITH MODEL VALIDATION ACCURACY =  0.9626063770337373
WITH MODEL VALIDATION PRECISION =  0.9453867434422651
WITH MODEL VALIDATION RECALL =  0.9626063770337373
WITH MODEL VALIDATION F1SCORE =  0.9529402133656835 

## Hyperparameter Tuning

### RandomForest

In [23]:
parameters ={'max_depth': [10, 20],
     'criterion' : ['gini', 'entropy'],
     'max_features': [0.3,0.5],
     'min_samples_leaf': [3,5,7],
     'min_samples_split': [2,5],
     'n_estimators': [50,100]}

param_size = ParameterGrid(parameters)

In [24]:
grid_search = RandomForestClassifier()
grid_search = GridSearchCV(
    grid_search, 
    parameters, 
    cv=5,
    scoring='accuracy',n_jobs=-1)

X_train,X_val,Y_train,Y_val = train_test_split(data_le_X.iloc[:50000],data_le_Y.iloc[0:50000],test_size=0.1,random_state=50)
X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.1,random_state=50)

grid_result= grid_search.fit(X_train, Y_train)
print('Best Params: ', grid_result.best_params_)
print('Best Score: ', grid_result.best_score_)

Best Params:  {'criterion': 'gini', 'max_depth': 10, 'max_features': 0.3, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 50}
Best Score:  1.0


# Randomized CV

In [25]:
random_search=RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions=parameters,verbose=1, n_jobs=-1,
                            n_iter=200)
random_result = random_search.fit(X_train, Y_train)
print('Best Score: ', random_result.best_score_*100)
print('Best Params: ', random_result.best_params_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  2.0min finished


Best Score:  100.0
Best Params:  {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 0.3, 'max_depth': 10, 'criterion': 'gini'}


# Bayesian model Optimization using HyperOpt

In [26]:
def accuracy_model(params):
   clf = RandomForestClassifier(**params)
   return cross_val_score(clf, X_train, Y_train).mean()

param_space = {'max_depth': hp.choice('max_depth', range(10,100)),
'max_features': hp.uniform('max_features', 0.1,1),
'n_estimators': hp.choice('n_estimators', range(50,500)),
'min_samples_leaf': hp.choice('min_samples_leaf',range(3,5)),
'min_samples_split': hp.choice('min_samples_split',range(2,10)),
'criterion': hp.choice('criterion', ["gini", "entropy"])}

best = 0

def f(params):
    global best
    acc = accuracy_model(params)
    if acc > best:
        best = acc
    return {'loss': -acc, 'status': STATUS_OK}

Trials = Trials()
best_params = fmin(f, param_space , algo=tpe.suggest,max_evals=5, trials= Trials)
print('New best:', best, best_params)
print(best_params)

100%|██████████| 5/5 [00:11<00:00,  2.36s/trial, best loss: -1.0]
New best: 1.0 {'criterion': 0, 'max_depth': 26, 'max_features': 0.7528287130296112, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 34}
{'criterion': 0, 'max_depth': 26, 'max_features': 0.7528287130296112, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 34}
