## import libraries, packages, data

### libraries, packages

In [35]:
#importing libraries & packages
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#display multiple outputs from cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# machine learning
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import svm
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
# keras
import keras
from keras.models import Sequential 
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from sklearn.utils import shuffle
from sklearn import preprocessing, model_selection
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

### import clean data from part 1

In [101]:
# import Animal Control Incidents df
df = pd.read_csv('a_control.csv')

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37902 entries, 0 to 37901
Data columns (total 11 columns):
species         37902 non-null object
breed           37902 non-null object
size            37902 non-null object
color           37902 non-null object
condition       37902 non-null object
temperment      37902 non-null object
weekday         37902 non-null float64
month           37902 non-null float64
service_type    37902 non-null object
disposition     37902 non-null object
municipality    37902 non-null object
dtypes: float64(2), object(9)
memory usage: 3.2+ MB


In [103]:
for col in ('month', 'weekday'):
    df[col] = df[col].astype(str)

In [104]:
df = df[['species', 'breed', 'size', 'condition', 'temperment', 'weekday', 'service_type', 'disposition']]

In [105]:
# # assign target
y = df["disposition"]

#remove target from features
df.drop(columns= ['disposition'], axis=1, inplace=True)

# dummy categorical features
#col_list = list(df.select_dtypes(include=['object']).columns)
col_list = list(df.columns)

# create dummies for categorical features, assign to X
df = pd.DataFrame(pd.get_dummies(data=df, drop_first=True
                   , prefix=col_list))
X = df

# X_train_d and y_train_d = using original standard test train spit dad
# that way, I can try out different resampling methods
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=33)

### functions

In [106]:
# print model report
def model_report(model, cm = True): 
    
    print('\nCross Validation Scoring:\n')
    
    train_score = model.score(X_train, y_train)
    print("train score: {:.4}%".format(train_score * 100))

    test_score = model.score(X_test, y_test)
    print("test score: {:.4}%".format(test_score * 100))
    
    if cm :

        cv_score = np.mean(cross_val_score(model, X, y, cv=3))
        print("cross val score: {:.4}%".format(cv_score * 100))

        print("\nClassification Report:")
        print(classification_report(actuals, predictions))

#         col_list = []
#         ind_list = []                     
#         for x in y_test.unique():
#             col_list.append('pred_'+ x)
#             ind_list.append('real_'+x)


        print('\nConfusion matrix:'.format(accuracy_score))
        display(pd.DataFrame(confusion_matrix(y_test, predictions)
                             #, columns=col_list, index=ind_list))
                ,columns=['pred doa','pred euth', 'pred to owner/wild', 'pred spec_case', 'pred trans']
              , index=['real euth', 'real doa', 'real to owner/wild', 'real spec_case', 'real trans']))

    else:
        pass

## prep data

### resampling

Due to the imbalance of class data, it could be beneficial to resample to compensate for the missing data. Using smote to oversample, then random undersampling is a common practice in these circumstances



In [107]:
# over = SMOTE(sampling_strategy='not majority', random_state=3)
# under = RandomUnderSampler(sampling_strategy='not minority', random_state=3)
# steps = [('o', over), ('u', under)]
# pipeline = Pipeline(steps=steps)
# # # transform the dataset
# X_train, y_train = pipeline.fit_resample(X_train_d, y_train_d)

In [108]:
print (y_test.unique())

['trans_caa' 'return_to_wild/owner' 'special_caseother' 'dead_on_arrival'
 'euthanized']


## __modeling__

In [109]:
lr = LogisticRegression(random_state=3)
lr.fit(X_train, y_train)
 
# Predict on training set
predictions = lr.predict(X_test)
actuals = y_test

model_report(dt_model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=3, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


Cross Validation Scoring:

train score: 93.73%
test score: 89.59%
cross val score: 89.54%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        11
          euthanized       0.00      0.00      0.00        24
return_to_wild/owner       0.93      0.84      0.88      1814
   special_caseother       0.69      0.33      0.45       141
           trans_caa       0.93      0.98      0.96      5591

            accuracy                           0.93      7581
           macro avg       0.51      0.43      0.46      7581
        weighted avg       0.92      0.93      0.92      7581


Confusion matrix:


Unnamed: 0,pred doa,pred euth,pred to owner/wild,pred spec_case,pred trans
real euth,0,0,1,0,10
real doa,0,0,1,0,23
real to owner/wild,0,0,1517,3,294
real spec_case,0,0,19,47,75
real trans,0,0,92,18,5481


### decision tree

__default model__

In [111]:
# initialize and fit default decision tree
dt_model = DecisionTreeClassifier(random_state=3)
dt_model.fit(X_train, y_train)
predictions = dt_model.predict(X_test)
#actuals = y_test

model_report(dt_model)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=3, splitter='best')


Cross Validation Scoring:

train score: 95.78%
test score: 89.87%
cross val score: 89.54%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        11
          euthanized       0.09      0.08      0.09        24
return_to_wild/owner       0.84      0.84      0.84      1814
   special_caseother       0.38      0.28      0.33       141
           trans_caa       0.93      0.94      0.94      5591

            accuracy                           0.90      7581
           macro avg       0.45      0.43      0.44      7581
        weighted avg       0.90      0.90      0.90      7581


Confusion matrix:


Unnamed: 0,pred doa,pred euth,pred to owner/wild,pred spec_case,pred trans
real euth,0,1,2,0,8
real doa,0,2,1,0,21
real to owner/wild,1,3,1531,7,272
real spec_case,0,2,25,40,74
real trans,14,14,265,58,5240


In [112]:
# set random search params
criterion=['gini','entropy']
max_leaf_nodes = [None]
max_features = ['auto', 'sqrt']
max_depth = [5,10,15,17,20]
min_samples_split = [10,20,30,50,100]
min_samples_leaf = [3,5,7,10,20]
bootstrap = [True, False]

random_grid = {'criterion':criterion,
               'max_leaf_nodes': max_leaf_nodes,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
               }

# random grid search
dt_random = RandomizedSearchCV(estimator = dt_model
                               , param_distributions = random_grid
                               , n_iter = 100, cv = 3, verbose=3, error_score=0
                               , scoring=('accuracy')
                               , random_state=3, n_jobs = -1)
# fit random search model
dt_random.fit(X_train, y_train);

print('best accuracy: {:.4}%'.format(dt_random.best_score_ * 100));
print(dt_random.best_params_);

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   18.2s


best accuracy: 92.58%
{'min_samples_split': 20, 'min_samples_leaf': 5, 'max_leaf_nodes': None, 'max_features': 'auto', 'max_depth': 15, 'criterion': 'entropy'}


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   19.3s finished


__grid search__

In [119]:
# grid search params
param_grid_dt = {'criterion': ['entropy','gini'],
    'max_depth': [15,17,20],
    'min_samples_split': [2],
    'min_samples_leaf': [6,7,8],
     'max_features':[40,50,60],
     'max_leaf_nodes':[None]
        }

# grid search
gs_dt = GridSearchCV(estimator=dt_model,
                    param_grid=param_grid_dt,
                     error_score=0,
                    scoring=('accuracy'),
                    cv=5, n_jobs=-1, verbose=2)

# fitting grid search
gs_dt.fit(X_train, y_train);

# best parameters
print('Best score: {:.3}%'.format(gs_dt.best_score_ * 100));
print('params:\n', gs_dt.best_params_);

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   17.3s


Best score: 92.7%
params:
 {'criterion': 'gini', 'max_depth': 15, 'max_features': 40, 'max_leaf_nodes': None, 'min_samples_leaf': 7, 'min_samples_split': 2}


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   40.0s finished


In [120]:
# using best params to fit model
dt_gs_model = DecisionTreeClassifier(criterion='gini',
                              max_depth=15,
                              max_features=40,
                              max_leaf_nodes= None,
                               min_samples_leaf=7,
                               min_samples_split=2
                                
                                    )
dt_gs_model.fit(X_train, y_train)

predictions = dt_gs_model.predict(X_test)
actuals = y_test

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=15, max_features=40, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [121]:
model_report(dt_gs_model)


Cross Validation Scoring:

train score: 92.77%
test score: 92.44%
cross val score: 92.62%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        11
          euthanized       0.00      0.00      0.00        24
return_to_wild/owner       0.93      0.82      0.87      1814
   special_caseother       0.63      0.28      0.39       141
           trans_caa       0.93      0.98      0.95      5591

            accuracy                           0.92      7581
           macro avg       0.50      0.42      0.44      7581
        weighted avg       0.92      0.92      0.92      7581


Confusion matrix:


Unnamed: 0,pred doa,pred euth,pred to owner/wild,pred spec_case,pred trans
real euth,0,0,1,0,10
real doa,0,0,1,0,23
real to owner/wild,0,0,1495,6,313
real spec_case,0,0,18,40,83
real trans,0,0,101,17,5473


### Random Forest

---
__default model__

In [122]:
#initialize and fit random forest
rf_model = RandomForestClassifier(random_state=3)
rf_model.fit(X_train, y_train)

predictions = rf_model.predict(X_test)
#actuals = y_test

model_report(rf_model)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=3, verbose=0,
                       warm_start=False)


Cross Validation Scoring:

train score: 95.78%
test score: 91.82%
cross val score: 91.95%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        11
          euthanized       0.00      0.00      0.00        24
return_to_wild/owner       0.90      0.83      0.87      1814
   special_caseother       0.53      0.27      0.36       141
           trans_caa       0.93      0.97      0.95      5591

            accuracy                           0.92      7581
           macro avg       0.47      0.41      0.43      7581
        weighted avg       0.91      0.92      0.91      7581


Confusion matrix:


Unnamed: 0,pred doa,pred euth,pred to owner/wild,pred spec_case,pred trans
real euth,0,0,3,0,8
real doa,0,0,0,0,24
real to owner/wild,0,2,1507,4,301
real spec_case,0,2,21,38,80
real trans,2,7,136,30,5416


___
__Random Grid Search__

The default model is overfitting. Some ways to avoid overfitting with random forests are: increasing the n_estimators, reducing max features, limiting max depth, and increasing min leaf samples can help construct a more robust fit. Being that it did perform well despite the overfitting, I decided not to go crazy playing with the parameters.

In [123]:
# set random search params

n_estimators = [250,500,750,1000]
# min_weight_fraction_leaf=[0.0, 0.001, 0.0001]
# min_samples_split = [3,4,5,6]
# min_samples_leaf = [2,3]
# min_impurity_split = [0.0]
# min_impurity_decrease = [0.0, 0.0001]
# max_samples = [None, 20,25,30,35,40,50]
# max_leaf_nodes = [None, 3,5,8,10,15,20,25]
# max_features = ['auto', 3,5,6,8]
max_depth = [2,3,5,10,20,30,35,40]
criterion = ['entropy', 'gini']
# bootstrap = [True, False]

# Create the random grid
random_grid = { 'n_estimators': n_estimators,
#                  'min_weight_fraction_leaf': min_weight_fraction_leaf,
#                  'min_samples_split': min_samples_split,
#                  'min_samples_leaf': min_samples_leaf,
#                  'min_impurity_split': min_impurity_split,
#                  'min_impurity_decrease': min_impurity_decrease,
#                  'max_samples': max_samples,
#                  'max_leaf_nodes': max_leaf_nodes,
#                  'max_features': max_features,
                  'max_depth': max_depth,
                 'criterion': criterion,
#                  'bootstrap': bootstrap
              }

# fit and search random param combinations
rf_random = RandomizedSearchCV(estimator = rf_model
                               , param_distributions = random_grid
                               , n_iter = 25
                               , cv = 3
                               , verbose=10
                               , random_state=1
                               , n_jobs = -1);
# Fit the random search model
rf_random.fit(X_train, y_train);

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed: 12.7min finished


In [124]:
# print best parameters
print('best accuracy: {:.4}%'.format(rf_random.best_score_ * 100));
print(rf_random.best_params_);

best accuracy: 92.77%
{'n_estimators': 750, 'max_depth': 20, 'criterion': 'gini'}


___
__grid search__

In [125]:
param_grid_rf = {
        'n_estimators': [750,1100]
#          , 'min_samples_leaf': [3,4]
#          , 'min_samples_split':[3,4,5,6]
#         , 'max_samples': [20,25,30]
#         , 'max_leaf_nodes':[7,13,20]
#           , 'max_features': [5,6,7]
         , 'max_depth': [3,5,10,20]
         , 'criterion': ['entropy', 'gini']
#          , 'bootstrap': [False, True]
}

# grid search
gs_rf = GridSearchCV(estimator=rf_model
                    ,param_grid=param_grid_rf
                    ,cv=3
                     ,refit=True
                     ,error_score=0
                     , n_jobs=-1
                     , verbose=10
                    )

# fitting grid search
gs_rf.fit(X_train, y_train);

# best parameters
print('best accuracy: {:.4}%'.format(gs_rf.best_score_ * 100));
print('params:\n', gs_rf.best_params_);

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   56.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  46 out of  48 | elapsed:  7.9min remaining:   20.6s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  8.2min finished


best accuracy: 92.77%
params:
 {'criterion': 'gini', 'max_depth': 20, 'n_estimators': 750}


___
__best model__

In [127]:
# fitting model using best params
rf_gs_model = RandomForestClassifier(
                                   # bootstrap= False
                                    criterion= 'gini'
                                    , max_depth= 20
                                   # , max_features = 'auto'
                                   # , max_leaf_nodes = None
                                   # , max_samples = None
#                                     , min_samples_leaf = 3
                                   # , min_samples_split = 8
                                    , n_estimators = 750
                                    );
rf_gs_model.fit(X_train, y_train);

predictions = rf_gs_model.predict(X_test);

# new model report
model_report(rf_gs_model)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=750,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


Cross Validation Scoring:

train score: 93.87%
test score: 92.63%
cross val score: 92.66%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        11
          euthanized       0.00      0.00      0.00        24
return_to_wild/owner       0.93      0.83      0.88      1814
   special_caseother       0.65      0.23      0.34       141
           trans_caa       0.93      0.98      0.95      5591

            accuracy                           0.93      7581
           macro avg       0.50      0.41      0.43      7581
        weighted avg       0.92      0.93      0.92      7581


Confusion matrix:


Unnamed: 0,pred doa,pred euth,pred to owner/wild,pred spec_case,pred trans
real euth,0,0,3,0,8
real doa,0,0,0,0,24
real to owner/wild,0,1,1506,2,305
real spec_case,0,1,21,33,86
real trans,0,2,90,16,5483


### XGBoost

In [128]:
# initiate model
xgb_model = XGBClassifier(random_state=3)
xgb_model.fit(X_train, y_train)

predictions = xgb_model.predict(X_test)
# actuals = y_test

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=3,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [129]:
model_report(xgb_model)


Cross Validation Scoring:

train score: 92.96%
test score: 92.94%
cross val score: 92.78%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        11
          euthanized       0.00      0.00      0.00        24
return_to_wild/owner       0.93      0.84      0.88      1814
   special_caseother       0.73      0.25      0.37       141
           trans_caa       0.93      0.98      0.96      5591

            accuracy                           0.93      7581
           macro avg       0.52      0.41      0.44      7581
        weighted avg       0.92      0.93      0.92      7581


Confusion matrix:


Unnamed: 0,pred doa,pred euth,pred to owner/wild,pred spec_case,pred trans
real euth,0,0,1,0,10
real doa,0,0,1,0,23
real to owner/wild,0,0,1519,1,294
real spec_case,0,0,23,35,83
real trans,0,0,87,12,5492


In [133]:
#set random search params
learning_rate=[.15 ,.1,  .05]
#max_depth = [2,3,4,5,6]
min_child_weight = [.5, 1,1.5]
n_estimators = [90,100, 110]
#min_weight_fraction_leaf = [0,0.0001, 0.01, 1]

random_grid = {
               'learning_rate':learning_rate,
               #'max_depth': max_depth,
               'min_child_weight':min_child_weight
              # 'n_estimators': n_estimators
              }

# searching random params
xgb_random = RandomizedSearchCV(estimator = xgb_model
                               , param_distributions = random_grid
                               , n_iter = 5, cv = 3, verbose=10
                               , random_state=3, n_jobs = -1)
# fit random search model
xgb_random.fit(X_train, y_train)
print('best accuracy: {:.4}%'.format(xgb_random.best_score_ * 100));
print(xgb_random.best_params_);

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  5.6min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:  8.4min remaining:  2.8min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  8.4min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='multi:softprob',
                                           random_state=3, reg_alpha=0,
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid='dep

best accuracy: 92.89%
{'min_child_weight': 1.5, 'learning_rate': 0.15}


In [139]:
param_grid_xgb = [
    {
#         'xgb_model__min_samples_split':[2,3],
      #   'xbg_model__gamma':[0.01,0.1,.05],
        'xgb_model__min_child_weight': [1,1.2],
    'xgb_model__learning_rate': [0.009,0.1]
    #'xgb_model__n_estimators': [150,200,250],
    #'xgb_model__max_depth': [2,3]
    }
]

# Construct Grid Search
gs_xgb = GridSearchCV(estimator=xgb_model,
                    param_grid=param_grid_xgb,
                    scoring='accuracy',random_state=3,
                    cv=3, n_jobs=-1, verbose=10)

# Fit using grid search
gs_xgb.fit(X, y)

# Best accuracy and parameters
print('best score: {:.3}%'.format(gs_xgb.best_score_ * 100))
print('params:\n', gs_xgb.best_params_);

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  8.0min remaining:  5.7min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed: 11.6min remaining:  3.9min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 11.7min finished


best score: 92.8%
params:
 {'xgb_model__learning_rate': 0.009, 'xgb_model__min_child_weight': 1}


In [140]:
# fit using best params
xgb_gs_model = XGBClassifier(learning_rate=0.009,
                           #  max_depth=3,
                             min_child_weight=1,
                          #   n_estimators=90)
xgb_gs_model.fit(X_train, y_train)

predictions = xgb_gs_model.predict(X_test)

model_report(xgb_gs_model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.009, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=90, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


Cross Validation Scoring:

train score: 92.62%
test score: 92.67%
cross val score: 92.38%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        11
          euthanized       0.00      0.00      0.00        24
return_to_wild/owner       0.92      0.84      0.88      1814
   special_caseother       0.72      0.09      0.16       141
           trans_caa       0.93      0.98      0.95      5591

            accuracy                           0.93      7581
           macro avg       0.51      0.38      0.40      7581
        weighted avg       0.92      0.93      0.92      7581


Confusion matrix:


Unnamed: 0,pred doa,pred euth,pred to owner/wild,pred spec_case,pred trans
real euth,0,0,1,0,10
real doa,0,0,1,0,23
real to owner/wild,0,0,1520,0,294
real spec_case,0,0,29,13,99
real trans,0,0,94,5,5492


### AdaBoost

In [141]:
#initialize and fit default model
ada_model = AdaBoostClassifier(random_state=3)
ada_model.fit(X_train, y_train)

predictions = ada_model.predict(X_test)
# actuals = y_test

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=3)

___
__Default Model__

In [143]:
model_report(ada_model)


Cross Validation Scoring:

train score: 92.55%
test score: 92.47%
cross val score: 89.48%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        11
          euthanized       0.00      0.00      0.00        24
return_to_wild/owner       0.91      0.84      0.87      1814
   special_caseother       0.66      0.16      0.26       141
           trans_caa       0.93      0.98      0.95      5591

            accuracy                           0.92      7581
           macro avg       0.50      0.40      0.42      7581
        weighted avg       0.92      0.92      0.92      7581


Confusion matrix:


Unnamed: 0,pred doa,pred euth,pred to owner/wild,pred spec_case,pred trans
real euth,0,0,1,0,10
real doa,0,0,2,0,22
real to owner/wild,0,0,1517,2,295
real spec_case,0,0,27,23,91
real trans,0,0,111,10,5470


__Random Grid Search__

In [144]:
#set random search params
learning_rate= [.8,.9, 1, 1.1, 1.2]

n_estimators = [5,7,10,,12,15]

random_grid = {'learning_rate':learning_rate,
               'n_estimators': n_estimators
              }

In [150]:
#random grid search
ada_random = RandomizedSearchCV(estimator = ada_model
                               , param_distributions = random_grid
                               , n_iter = 20, cv = 3, verbose=10
                               , random_state=3, n_jobs = -1);
# Fit the random search model
ada_random.fit(X_train, y_train);

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.6min finished


In [151]:
print('best recall: {:.4}%'.format(ada_random.best_score_ * 100))
print('params:\n', ada_random.best_params_)

best recall: 92.3%
params:
 {'n_estimators': 10, 'learning_rate': 1}


___
__grid search__

In [152]:
#grid search params and fitting grid search
param_grid_ada = [
    {'n_estimators': [9,10,11],
    'learning_rate': [.9,1,1.1]}
]

# Construct Grid Search
gs_ada = GridSearchCV(estimator=ada_model,
                    param_grid=param_grid_ada,
                    scoring='accuracy',
                    cv=5, n_jobs=-1, verbose=10)

# Fit using grid search
gs_ada.fit(X_train, y_train);

# Best accuracy and parameters
print('best recall: {:.4}%'.format(gs_ada.best_score_ * 100))
print('params:\n', gs_ada.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done  43 out of  45 | elapsed:   19.8s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   20.7s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=3),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'learning_rate': [0.9, 1, 1.1],
                          'n_estimators': [9, 10, 11]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

best recall: 92.37%
params:
 {'learning_rate': 1, 'n_estimators': 11}


In [153]:
# fit best params
ada_gs_model = AdaBoostClassifier(n_estimators=11,
                                   learning_rate=1)
ada_gs_model.fit(X_train, y_train);

predictions = ada_gs_model.predict(X_test);

__best model__

In [155]:
model_report(ada_gs_model)


Cross Validation Scoring:

train score: 92.43%
test score: 92.44%
cross val score: 91.68%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        11
          euthanized       0.00      0.00      0.00        24
return_to_wild/owner       0.92      0.84      0.88      1814
   special_caseother       0.34      0.14      0.20       141
           trans_caa       0.93      0.98      0.95      5591

            accuracy                           0.92      7581
           macro avg       0.44      0.39      0.41      7581
        weighted avg       0.91      0.92      0.92      7581


Confusion matrix:


Unnamed: 0,pred doa,pred euth,pred to owner/wild,pred spec_case,pred trans
real euth,0,0,1,3,7
real doa,0,0,1,2,21
real to owner/wild,0,0,1520,5,289
real spec_case,0,0,29,20,92
real trans,0,0,94,29,5468


### gradient boost

In [156]:
#initialize and fit gradient boost
gbt_model = GradientBoostingClassifier(random_state=3)
gbt_model.fit(X_train, y_train)

predictions = gbt_model.predict(X_test)
# actuals = y_test

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=3, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

__default model__

In [157]:
#baseline model report
model_report(gbt_model)


Cross Validation Scoring:

train score: 93.17%
test score: 92.84%
cross val score: 92.68%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        11
          euthanized       0.00      0.00      0.00        24
return_to_wild/owner       0.93      0.84      0.88      1814
   special_caseother       0.65      0.31      0.42       141
           trans_caa       0.93      0.98      0.95      5591

            accuracy                           0.93      7581
           macro avg       0.50      0.43      0.45      7581
        weighted avg       0.92      0.93      0.92      7581


Confusion matrix:


Unnamed: 0,pred doa,pred euth,pred to owner/wild,pred spec_case,pred trans
real euth,0,0,1,0,10
real doa,0,0,1,0,23
real to owner/wild,0,0,1516,5,293
real spec_case,0,0,17,44,80
real trans,1,4,89,19,5478


In [None]:
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=3, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

__random grid search__

In [158]:
# setting parameters for random search
learning_rate=[.09,.1, .11]
loss = ['deviance', 'exponential']
max_depth = [None,2,3,4] 
n_estimators = [50,100,200]
min_samples_split = [2,3,4]
min_samples_leaf = [1,2,3]
min_weight_fraction_leaf = [0,0.0001, 0.001]

random_grid = {'learning_rate':learning_rate,
               'loss': loss,
               'max_depth': max_depth,
               'n_estimators': n_estimators,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'min_weight_fraction_leaf': min_weight_fraction_leaf
               }

#searching random params
gbt_random = RandomizedSearchCV(estimator = gbt_model
                               , param_distributions = random_grid
                               , n_iter = 20, cv = 3, verbose=10
                               , random_state=3, n_jobs = -1)
# fit random search model
gbt_random.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  3.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  3.1min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                        criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                    

In [160]:
print('best accuracy: {:.4}%'.format(gbt_random.best_score_ * 100));
print(gbt_random.best_params_)

best accuracy: 92.87%
{'n_estimators': 100, 'min_weight_fraction_leaf': 0, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_depth': 2, 'loss': 'deviance', 'learning_rate': 0.1}


__grid search__

In [None]:
# grid search params and run search
param_grid_gbt = [
    {'n_estimators': [90,100,110],
     'min_weight_fraction_leaf':[0.0001, 0],
    'learning_rate': [0.11,0.1, .009],
    'min_samples_split': [2],
    'min_samples_leaf': [2],
    'max_depth': [2,3]
    }
]

# grid search
gs_gbt = GridSearchCV(estimator=gbt_model,
                    param_grid=param_grid_gbt,
                    scoring='accuracy',
                    cv=3, n_jobs=-1, verbose=10)

# fit grid search
gs_gbt.fit(X_train, y_train)

# best params
print('best accuracy: {:.4}%'.format(gs_gbt.best_score_ * 100))
print('params:\n', gs_gbt.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 18.1min


In [None]:
# fitting best params
gbt_gs_model = GradientBoostingClassifier(learning_rate=0.1,
                                         max_depth=2,
                                         min_samples_leaf=2,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.01,
                                         n_estimators=60)
gbt_gs_model.fit(X_train, y_train)

predictions = gbt_gs_model.predict(X_test)

__best model__

In [None]:
# new model report
model_report(gbt_gs_model)

### SVM

In [None]:
svm_model = svm.SVC(random_state=3)
svm_model.fit(X_train, y_train)

predictions = svm_model.predict(X_test)
# actuals = y_test

___
__default model__

In [None]:
model_report(svm_model, cm=False)

___
__random grid search__

In [None]:
# set random search params
clf_C = [0.07,0.1,1.0, 1.03]
clf_gamma = [0.005,0.001,0.01, 0.1]
clf_kernel = ['rbf', 'linear']

random_grid = {
               'C': clf_C,
               'gamma': clf_gamma,
               'kernel': clf_kernel
               }

# fit and search random param combinations
svm_random = RandomizedSearchCV(estimator = svm_model
                               , param_distributions = random_grid
                               , n_iter = 12 , cv = 3, verbose=10
                               , random_state=3, n_jobs = -1)
# Fit the random search model
svm_random.fit(X_train, y_train)

In [None]:
print('best accuracy: {:.4}%'.format(svm_random.best_score_ * 100));
print(svm_random.best_params_)

___
__grid search__

In [None]:
# Set grid search params
param_grid_svm = [
 
  {'C': [1.0, 1.01, 1.03], 'gamma': [0.008, 0.01, 0.012]
   , 'kernel': ['rbf', 'linear']
  }]
 

# Construct grid search
gs_svm = GridSearchCV(estimator=svm_model,
            param_grid=param_grid_svm,
            cv=5, verbose=10, return_train_score = True)

# Fit using grid search
gs_svm.fit(X_train, y_train)

# Best accuracy
print('best score: {:.4}%'.format(gs_svm.best_score_ * 100));
print('\nBest params:\n', gs_svm.best_params_)

In [None]:
svm_gs_model = svm.SVC(C=1.03
                       ,gamma=0.008
                        ,kernel='rbf'
                             )
svm_gs_model.fit(X_train, y_train)

predictions = svm_gs_model.predict(X_test)

___
__best model__

In [None]:
model_report(svm_gs_model, cm=False)