## import libraries, packages, data

### libraries, packages

In [1]:
#importing libraries & packages
import pandas as pd
import numpy as np
import seaborn as sns
sns.set(style='darkgrid')
import matplotlib.pyplot as plt
%matplotlib inline
import pandas_profiling
import datetime
import re
import pprint
pp = pprint.PrettyPrinter(indent=4)
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

#display multiple outputs from cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# machine learning
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import svm
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

#from sklearn.base import clone 
#from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

Using TensorFlow backend.


In [3]:
# keras
import keras
from keras.models import Sequential 
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from sklearn.utils import shuffle
from sklearn import preprocessing, model_selection
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

### import clean data from part 1

In [46]:
# Load the cleaned Animal Control Incidents table produced in part 1.
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('a_control.csv')

In [47]:
# Sanity check: column names, dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37902 entries, 0 to 37901
Data columns (total 11 columns):
species         37902 non-null object
breed           37902 non-null object
size            37902 non-null object
color           37902 non-null object
condition       37902 non-null object
temperment      37902 non-null object
weekday         37902 non-null float64
month           37902 non-null float64
service_type    37902 non-null object
disposition     37902 non-null object
municipality    37902 non-null object
dtypes: float64(2), object(9)
memory usage: 3.2+ MB


In [48]:
# month / weekday are loaded as floats; cast them to strings so that
# pd.get_dummies (which only encodes object/category columns by default)
# one-hot encodes them together with the other features.
calendar_cols = ['month', 'weekday']
df[calendar_cols] = df[calendar_cols].astype(str)

In [49]:
# Target vector: the incident outcome to predict.
y = df["disposition"]

# Feature frame without the target (non-mutating drop; `columns=` already
# implies axis=1, so the extra `axis` argument is not needed).
features = df.drop(columns=["disposition"])

# One-hot encode every feature column. The prefix list is kept under its own
# name so it cannot be confused with the `col_list` of confusion-matrix
# labels that model_report reads.
feature_cols = list(features.columns)
X = pd.get_dummies(data=features, drop_first=True, prefix=feature_cols)

# Later cells read the encoded frame through `df` (e.g. to size the keras
# input layer), so keep that alias pointing at the encoded features.
df = X

# X_train_d / y_train_d hold the original (un-resampled) stratified training
# split, so different resampling methods can be tried on top of it.
X_train_d, X_test, y_train_d, y_test = train_test_split(X, y, test_size=0.4, shuffle=True, stratify=y, random_state=1)

### functions

In [50]:
def model_report(model):
    """Print accuracy, cross-validation and per-class diagnostics for a fitted classifier.

    Reads the notebook-level globals ``X_train``/``y_train`` (training split),
    ``X_test``/``y_test`` (held-out split) and ``X``/``y`` (full encoded data).

    Parameters
    ----------
    model : fitted estimator
        Must expose ``score`` and ``predict``; ``classes_`` is used for the
        confusion-matrix label order when available.

    Returns
    -------
    None
        Everything is printed / displayed.
    """
    # Predict here so the report always describes `model` itself rather than
    # whatever a global `predictions` variable happened to hold.
    predictions = model.predict(X_test)

    # One explicit label order shared by the matrix and its row/column names.
    # Labelling by first-appearance order (e.g. Series.unique()) does not match
    # the sorted order confusion_matrix uses and mislabels the table.
    labels = list(getattr(model, 'classes_', np.unique(y_test)))

    print('\nCross Validation Scoring:\n')

    train_score = model.score(X_train, y_train)
    print("train score: {:.4}%".format(train_score * 100))

    test_score = model.score(X_test, y_test)
    print("test score: {:.4}%".format(test_score * 100))

    # cross_val_score re-fits clones of `model` on the full X / y frames
    cv_score = np.mean(cross_val_score(model, X, y, cv=3))
    print("cross val score: {:.4}%".format(cv_score * 100))

    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    print('\nConfusion matrix:')
    matrix = confusion_matrix(y_test, predictions, labels=labels)
    display(pd.DataFrame(matrix,
                         columns=['pred_' + str(label) for label in labels],
                         index=['real_' + str(label) for label in labels]))


In [51]:
# Row / column names for the confusion-matrix table.
# Built from y_train_d because y_train is only created by the resampling cell
# further down, and sorted because sklearn's confusion_matrix orders its rows
# and columns by sorted label, not by order of first appearance.
class_labels = sorted(y_train_d.unique())
col_list = ['pred_' + label for label in class_labels]
ind_list = ['real_' + label for label in class_labels]

## prep data

### resampling

Because the classes are heavily imbalanced, it can be beneficial to resample the training data to compensate for the under-represented classes. Oversampling with SMOTE and then applying random undersampling is a common practice in these circumstances.

In [52]:
# Oversample with SMOTE, then randomly undersample. Both steps are seeded so
# that re-running the notebook produces the same resampled training set.
over = SMOTE(sampling_strategy='minority', random_state=3)
under = RandomUnderSampler(random_state=3)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# NOTE(review): sampling_strategy='minority' asks SMOTE to oversample only the
# smallest class, yet the class counts recorded below this cell are identical
# for all five classes - confirm which strategy was actually intended.
# Only the training split is resampled; X_test / y_test stay untouched.
X_train, y_train = pipeline.fit_resample(X_train_d, y_train_d)

In [53]:
# Class balance of the resampled training target
print (y_train.value_counts())

dead_on_arrival         16772
return_to_wild/owner    16772
trans_caa               16772
special_caseother       16772
euthanized              16772
Name: disposition, dtype: int64


## __random forest__

---
__default model__

In [54]:
# Baseline random forest: default hyper-parameters, only the seed is fixed
rf_model = RandomForestClassifier(random_state=3)
rf_model.fit(X_train, y_train)

# Predictions on the held-out split; `actuals` is an alias for the true labels
predictions = rf_model.predict(X_test)
actuals = y_test

model_report(rf_model)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=3, verbose=0,
                       warm_start=False)


Cross Validation Scoring:

train score: 99.79%
test score: 92.52%
cross val score: 92.24%

Classification Report:
                      precision    recall  f1-score   support

     dead_on_arrival       0.00      0.00      0.00        22
          euthanized       0.00      0.00      0.00        48
return_to_wild/owner       0.92      0.84      0.88      3627
   special_caseother       0.55      0.30      0.39       283
           trans_caa       0.93      0.98      0.95     11181

            accuracy                           0.93     15161
           macro avg       0.48      0.42      0.44     15161
        weighted avg       0.92      0.93      0.92     15161


Confusion matrix:


Unnamed: 0,pred_dead_on_arrival,pred_euthanized,pred_return_to_wild/owner,pred_special_caseother,pred_trans_caa
real_dead_on_arrival,0,1,6,1,14
real_euthanized,1,0,4,1,42
real_return_to_wild/owner,4,2,3031,17,573
real_special_caseother,1,0,34,84,164
real_trans_caa,0,6,214,49,10912


___
__Random Grid Search__

The default model is overfitting, so increasing the n_estimators, reducing max features, limiting max depth, and increasing min leaf samples can help construct a more robust fit. 

In [72]:
# Hyper-parameters explored by the randomized search.
# Only parameters with more than one candidate value are listed: the removed
# single-valued entries merely restated RandomForestClassifier defaults, and
# two of them ('min_impurity_split', max_features='auto') are rejected by
# newer scikit-learn releases.
n_estimators = [250, 500, 750, 1000]
min_samples_split = [2, 4, 6]
min_samples_leaf = [2, 4, 6]
criterion = ['entropy', 'gini']
bootstrap = [True, False]

# NOTE(review): the text above this cell mentions reducing max_features and
# limiting max_depth, but neither is varied here - confirm the intended grid.
random_grid = {
    'n_estimators': n_estimators,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'criterion': criterion,
    'bootstrap': bootstrap,
}

# Sample 50 of the 4 * 3 * 3 * 2 * 2 = 144 combinations, 3-fold CV each
rf_random = RandomizedSearchCV(estimator=rf_model,
                               param_distributions=random_grid,
                               n_iter=50,
                               cv=3,
                               verbose=3,
                               random_state=3,
                               n_jobs=-1)

# Fit the random search model (trailing ';' suppresses the estimator repr)
rf_random.fit(X_train, y_train);

# print best parameters
print(rf_random.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  1.6min remaining:   18.6s


KeyboardInterrupt: 

__grid search__

In [65]:
# Narrow grid around the region of interest for the random forest.
# NOTE(review): an integer max_samples is the number of rows drawn per tree,
# so 20/21 trains every tree on about twenty rows - confirm this is intended.
param_grid_rf = {
    'n_estimators': [1000, 1100],
    'min_samples_split': [4, 5],
    'min_samples_leaf': [2, 3],
    'max_samples': [20, 21],
    'max_features': [50],
    'max_depth': [100],
    'criterion': ['entropy'],
    'bootstrap': [True],
}

# The target is multiclass, so plain 'recall' (binary averaging) makes every
# fit raise; with error_score=0 that failure was silently turned into a score
# of 0 for all candidates. 'recall_macro' averages recall over the classes.
gs_rf = GridSearchCV(estimator=rf_model,
                     param_grid=param_grid_rf,
                     scoring='recall_macro',
                     cv=2,
                     refit=True,
                     error_score=0,
                     n_jobs=-1,
                     verbose=3)

# fitting grid search (trailing ';' suppresses the estimator repr)
gs_rf.fit(X_train, y_train);

# best_score_ is measured with the scorer above, i.e. macro recall, not accuracy
print('best macro recall: {:.4}%'.format(gs_rf.best_score_ * 100))
print('params:\n', gs_rf.best_params_)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

__best model__

In [22]:
# Refit a random forest with the selected hyper-parameters.
# random_state is fixed (same seed as the other models in this notebook) so
# the reported scores are reproducible between runs.
# NOTE(review): n_estimators=90 and max_samples=50 are not among the values
# listed in the search grids above - confirm where these values come from.
rf_gs_model = RandomForestClassifier(
    bootstrap=True,
    criterion='entropy',
    max_depth=None,
    max_features='auto',
    max_leaf_nodes=None,
    max_samples=50,
    min_samples_leaf=2,
    min_samples_split=4,
    n_estimators=90,
    random_state=3,
)
rf_gs_model.fit(X_train, y_train);

# Predictions on the held-out split; `actuals` is an alias for the true labels
predictions = rf_gs_model.predict(X_test);
actuals = y_test

# new model report
model_report(rf_gs_model)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


Cross Validation Scoring:

train score: 99.08%
test score: 92.3%
cross val score: 92.11%

Classification Report:
                   precision    recall  f1-score   support

         doa/euth       0.20      0.02      0.03        53
  return_to_owner       0.32      0.03      0.05       434
   return_to_wild       0.92      0.97      0.95      2287
special_caseother       0.59      0.31      0.41       212
        trans_caa       0.93      0.98      0.95      8389

         accuracy                           0.92     11375
        macro avg       0.59      0.46      0.48     11375
     weighted avg       0.90      0.92      0.90     11375


Confusion matrix:


Unnamed: 0,pred trans,pred to wild,pred to owner,pred spec_case,pred euth/doa
real trans,1,1,9,1,41
real to wild,0,13,22,0,399
real to owner,0,5,2228,9,45
real spec_case,0,0,17,66,129
real_euth/doa,4,22,137,35,8191


## __SVM__

SVM is often a good choice for categorical data, and it also copes well with collinearity. Its biggest drawback is the training time and computational cost.

In [None]:
# pipe_svc = Pipeline([('svm_model', svm.SVC(random_state=3))])

In [None]:
# Baseline support vector classifier: default hyper-parameters, fixed seed
svm_model = svm.SVC(random_state=3)
svm_model.fit(X_train, y_train)

# Predictions on the held-out split; `actuals` is an alias for the true labels
predictions = svm_model.predict(X_test)
actuals = y_test

___
__default model__

In [None]:
# Scores, classification report and confusion matrix for the default SVC
model_report(svm_model)

___
__random grid search__

In [None]:
# set random search params for the SVC

# candidate values for the SVC `C` parameter
clf_C = [0.001, 0.01,.1,1.]
# candidate values for the SVC `gamma` parameter
clf_gamma = [.001,.01,1.]
# kernels to try
clf_kernel = ['rbf', 'linear']

# NOTE: this rebinds `random_grid`, replacing the random-forest grid defined earlier
random_grid = {
               'C': clf_C,
               'gamma': clf_gamma,
               'kernel': clf_kernel
               }

In [None]:
# Randomized search over `random_grid` for the SVC, 3-fold CV per candidate.
# NOTE(review): the SVM grid defined in this notebook holds 4 x 3 x 2 = 24
# combinations, fewer than n_iter=50 - confirm that n_iter is intended.
svm_random = RandomizedSearchCV(estimator = svm_model
                               , param_distributions = random_grid
                               , n_iter = 50, cv = 3, verbose=2
                               , random_state=3, n_jobs = -1)
# Fit the random search model
svm_random.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the randomized SVC search
svm_random.best_params_

___
__grid search__

In [None]:
# Grid search for the support vector classifier.
# The estimator passed to GridSearchCV is the bare SVC, not a Pipeline, so the
# parameter names must be the SVC's own ('C', 'gamma', 'kernel'). Step-style
# names such as 'svm_model__C' are only valid for a Pipeline step called
# 'svm_model' and make the search fail with an invalid-parameter error here.
param_grid_svm = [
    {
        'C': [1.0],
        'gamma': [0.001],
        'kernel': ['rbf'],
    }
]

# Construct grid search (default scorer of the estimator, 3-fold CV)
gs_svm = GridSearchCV(estimator=svm_model,
                      param_grid=param_grid_svm,
                      cv=3, verbose=1, return_train_score=True)

# Fit using grid search
gs_svm.fit(X_train, y_train)

# Best cross-validated score
print('Best score: %.3f' % gs_svm.best_score_)

# Best params
print('\nBest params:\n', gs_svm.best_params_)

In [None]:
# Refit an RBF-kernel SVC with fixed hyper-parameters and score it on the
# held-out split.
svm_gs_params = {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
svm_gs_model = svm.SVC(**svm_gs_params)
svm_gs_model.fit(X_train, y_train)

# `actuals` is an alias for the true held-out labels
actuals = y_test
predictions = svm_gs_model.predict(X_test)

___
__best model__

In [None]:
# Scores, classification report and confusion matrix for the refitted SVC
model_report(svm_gs_model)

## __XGboost__

In [None]:
# XGBClassifier is not imported by the import cells at the top of this
# notebook, so bring it into scope here before first use.
from xgboost import XGBClassifier

# Baseline gradient-boosted model: default hyper-parameters, fixed seed
xgb_model = XGBClassifier(random_state=3)
xgb_model.fit(X_train, y_train)

# Predictions on the held-out split; `actuals` is an alias for the true labels
predictions = xgb_model.predict(X_test)
actuals = y_test

In [None]:
# Scores, classification report and confusion matrix for the default XGBoost model
model_report(xgb_model)

In [None]:
# Candidate values for the randomized XGBoost search
learning_rate=[.01, .1, .2, 1]
max_depth = [2,3,6,10]
min_child_weight = [0,1,2]
n_estimators = [50,100,200,300]

# NOTE: this rebinds `random_grid`, replacing the grid used by earlier searches
random_grid = {
               'learning_rate':learning_rate,
               'max_depth': max_depth,
               'min_child_weight':min_child_weight,
               'n_estimators': n_estimators
               
              }

# Sample 20 parameter combinations, 3-fold CV each, using all cores
xgb_random = RandomizedSearchCV(estimator = xgb_model
                               , param_distributions = random_grid
                               , n_iter = 20, cv = 3, verbose=1
                               , random_state=3, n_jobs = -1)
# fit random search model, then show the best combination found
xgb_random.fit(X_train, y_train)
xgb_random.best_params_

In [None]:
# Grid search for the XGBoost classifier.
# The estimator is the bare classifier, not a Pipeline, so parameter names
# carry no 'xgb_model__' step prefix; with the prefix the search fails with an
# invalid-parameter error.
param_grid_xgb = [
    {
        'min_child_weight': [1, .1, .01],
        'learning_rate': [0.001, 0.01],
        'n_estimators': [150, 200, 250],
        'max_depth': [2, 3],
    }
]

# Construct Grid Search
gs_xgb = GridSearchCV(estimator=xgb_model,
                      param_grid=param_grid_xgb,
                      scoring='accuracy',
                      cv=3, n_jobs=-1, verbose=1)

# Fit on the training split only, like every other search in this notebook.
# Fitting on the full X / y would let the held-out test rows influence the
# chosen hyper-parameters.
gs_xgb.fit(X_train, y_train)

# Best accuracy and parameters
print('best score: {:.3}%'.format(gs_xgb.best_score_ * 100))
print('params:\n', gs_xgb.best_params_)

In [None]:
# Refit XGBoost with fixed hyper-parameters.
# NOTE(review): presumably these are the grid-search winners - no search output
# is recorded in this notebook, so confirm before relying on them.
xgb_gs_model = XGBClassifier(learning_rate=0.001,
                             max_depth=3,
                             min_child_weight=1,
                             n_estimators=150)
xgb_gs_model.fit(X_train, y_train)

# Predictions on the held-out split; `actuals` is an alias for the true labels
predictions = xgb_gs_model.predict(X_test)
actuals = y_test

In [None]:
# Scores, classification report and confusion matrix for the refitted XGBoost model
model_report(xgb_gs_model)

## __keras__

In [None]:
# Separate 90/10 split for the neural network, taken from the full encoded
# data. Stratified like the main split above: some classes are very rare, and
# an unstratified 10% test set could end up with almost none of them.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=0)

In [None]:
# categorical_crossentropy expects one-hot targets, but train_y / test_y hold
# string labels: map them to integer codes, then to one-hot rows.
label_encoder = LabelEncoder().fit(y)
n_classes = len(label_encoder.classes_)
train_y_cat = to_categorical(label_encoder.transform(train_y), num_classes=n_classes)
test_y_cat = to_categorical(label_encoder.transform(test_y), num_classes=n_classes)

# Feed the network plain float arrays rather than the dummy-encoded frame
train_x_arr = np.asarray(train_x, dtype='float32')
test_x_arr = np.asarray(test_x, dtype='float32')

# Size the input layer from the data actually fed to the network
input_dim = train_x_arr.shape[1]

model = Sequential()
model.add(Dense(8, input_dim = input_dim , activation = 'relu'))
model.add(Dense(10, activation = 'relu'))
model.add(Dense(10, activation = 'relu'))
model.add(Dense(10, activation = 'relu'))
# One output unit per class (a hard-coded 6 does not match the number of labels)
model.add(Dense(n_classes, activation = 'softmax'))

model.compile(loss = 'categorical_crossentropy' , optimizer = 'adam' , metrics = ['accuracy'] )

# NOTE(review): batch_size=2 means one weight update per two rows, which is
# very slow on a data set of this size - confirm it is intended.
model.fit(train_x_arr, train_y_cat, epochs = 10, batch_size = 2)

scores = model.evaluate(test_x_arr, test_y_cat)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))