## Thought process/logic of selecting models.  Ended selecting Random Forest Classifier
    * Supervised Classification problem
    * Not much difference between all the models we tested (RF, Logistic Regression, SVM, NN models)
    * Chose to do a deeper dive on this model because results were within 1%

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pickle
from pickle import dump

In [2]:
# Read in cleaned csv
df = pd.read_csv("Resources/LouisvilleCleanFinal.csv").drop(['Unnamed: 0'],axis=1)
df.head()

Unnamed: 0,PrimaryColor,BreedCategory,IntakeStatus,PetAgeCategory,Sex,TopBreed,BinaryOutcome
0,WHITE,PURE,NORMAL,YOUNG,MALE,PIT BULL TERRIER,DENY
1,BLACK,MIX,NORMAL,YOUNG,FEMALE,BORDER COLLIE,TAKE
2,TAN,MIX,NORMAL,YOUNG,MALE,GOLDEN RETRIEVER,TAKE
3,WHITE,PURE,NORMAL,BABY,FEMALE,LABRADOR RETRIEVER,TAKE
4,BLACK,PURE,SICK,YOUNG,MALE,PIT BULL TERRIER,DENY


In [3]:
# Make minor change to Binary Outcome
df["BinaryOutcome2"] = 0
df.loc[df["BinaryOutcome"].str.contains("TAKE"), "BinaryOutcome2"] = 1

In [4]:
df.head()

Unnamed: 0,PrimaryColor,BreedCategory,IntakeStatus,PetAgeCategory,Sex,TopBreed,BinaryOutcome,BinaryOutcome2
0,WHITE,PURE,NORMAL,YOUNG,MALE,PIT BULL TERRIER,DENY,0
1,BLACK,MIX,NORMAL,YOUNG,FEMALE,BORDER COLLIE,TAKE,1
2,TAN,MIX,NORMAL,YOUNG,MALE,GOLDEN RETRIEVER,TAKE,1
3,WHITE,PURE,NORMAL,BABY,FEMALE,LABRADOR RETRIEVER,TAKE,1
4,BLACK,PURE,SICK,YOUNG,MALE,PIT BULL TERRIER,DENY,0


## Define X / y and OneHot encoding categorical features

In [5]:
# Define X and y
X = df.drop(['BinaryOutcome','BinaryOutcome2'], axis="columns")
y = df[['BinaryOutcome2']]


In [6]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
X = ohe.fit_transform(X)

In [7]:
X.shape

(41535, 52)

## Creating a simple RF model

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Import, initialize, fit and predict
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators = 100, oob_score = 'True', n_jobs = -1,random_state =50, max_features = "auto", min_samples_leaf = 50)
rf_model.fit(X_train, y_train)

# Test
predict_y_test = rf_model.predict(X_test)

  import sys


## Some techniques used to evaluate between models, features 
    * Ran accuracy score
    * Ran AUC score
    * Ran confusion matrix
    * Ran tree nodes and height
    * Ran features importances
    * Ran cross val score

In [9]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

In [10]:
# Validate - run accuracy and AUC score
from sklearn import metrics
print("Accuracy score: ", metrics.accuracy_score(y_test, predict_y_test))
print("AUC score: ", metrics.roc_auc_score(y_test, predict_y_test))

Accuracy score:  0.7036787365177196
AUC score:  0.5801281745190704


In [11]:
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, predict_y_test))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, predict_y_test))
print('\n')

=== Confusion Matrix ===
[[6509  582]
 [2495  798]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.72      0.92      0.81      7091
           1       0.58      0.24      0.34      3293

    accuracy                           0.70     10384
   macro avg       0.65      0.58      0.58     10384
weighted avg       0.68      0.70      0.66     10384





#### Tree nodes and height

In [12]:
from rfpimp import *
print(f"{rfnnodes(rf_model):,d} tree nodes and {np.median(rfmaxdepths(rf_model))} median tree height")

23,442 tree nodes and 20.0 median tree height




#### Feature importance


In [13]:
# # summarize feature importance

importance = rf_model.feature_importances_
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.00945
Feature: 1, Score: 0.00437
Feature: 2, Score: 0.00296
Feature: 3, Score: 0.00358
Feature: 4, Score: 0.00000
Feature: 5, Score: 0.00073
Feature: 6, Score: 0.00000
Feature: 7, Score: 0.00219
Feature: 8, Score: 0.00000
Feature: 9, Score: 0.00474
Feature: 10, Score: 0.00595
Feature: 11, Score: 0.00078
Feature: 12, Score: 0.00569
Feature: 13, Score: 0.03223
Feature: 14, Score: 0.01846
Feature: 15, Score: 0.04504
Feature: 16, Score: 0.26251
Feature: 17, Score: 0.06745
Feature: 18, Score: 0.00305
Feature: 19, Score: 0.06522
Feature: 20, Score: 0.02210
Feature: 21, Score: 0.03512
Feature: 22, Score: 0.15653
Feature: 23, Score: 0.02212
Feature: 24, Score: 0.00545
Feature: 25, Score: 0.00508
Feature: 26, Score: 0.00004
Feature: 27, Score: 0.00024
Feature: 28, Score: 0.00005
Feature: 29, Score: 0.00035
Feature: 30, Score: 0.00816
Feature: 31, Score: 0.00097
Feature: 32, Score: 0.00190
Feature: 33, Score: 0.01022
Feature: 34, Score: 0.01132
Feature: 35, Score: 0.00005
Fe

#### Cross val score

In [14]:
from sklearn.model_selection import cross_val_score

# More efficient use of data (every observation is used for train/test)
# But difficult to inspect results with either confusion matrix/roc curve
rf_cv_score = cross_val_score(rf_model, X, y, cv=10, scoring='accuracy')


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [15]:
rf_cv_score.mean()

0.702612180856435

## Hyperparameter tuning the RF model


In [16]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter grid
param_grid = {
    'n_estimators': np.linspace(10, 200).astype(int),
    'max_depth': [None] + list(np.linspace(3, 20).astype(int)),
    'max_features': ['auto', 'sqrt', None] + list(np.arange(0.5, 1, 0.1)),
    'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]
}

# Estimator for use in random search
rf_model_hyper = RandomForestClassifier(random_state = 1)

# Create the random search model
rs = RandomizedSearchCV(rf_model_hyper, param_grid, n_jobs = -1, 
                        scoring = 'roc_auc', cv = 3, 
                        n_iter = 10, verbose = 1, random_state=1)

# Fit 
rs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   22.1s finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [17]:
# Print best parameters found
rs.best_params_


{'n_estimators': 106,
 'min_samples_split': 2,
 'max_leaf_nodes': 46,
 'max_features': 0.8999999999999999,
 'max_depth': 15,
 'bootstrap': True}

In [18]:
# With hyper tuning
best_model = rs.best_estimator_
best_model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15,
                       max_features=0.8999999999999999, max_leaf_nodes=46,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=106, n_jobs=None, oob_score=False,
                       random_state=1, verbose=0, warm_start=False)

In [19]:
# Re-test
hypertuning_rf_predictions_y = best_model.predict(X_test)

In [20]:
# Results after hypertuning
from sklearn import metrics
print("Accuracy score Hypertuned model: ", metrics.accuracy_score(y_test, hypertuning_rf_predictions_y))
print("AUC score Hypertuned model: ", metrics.roc_auc_score(y_test, hypertuning_rf_predictions_y))

Accuracy score Hypertuned model:  0.7103235747303543
AUC score Hypertuned model:  0.6200447070817646


#### Accuracy didn't improve much. AUC improved from .58 to .62

## Saving the model



In [21]:
# dump(best_model, open('rf_modelOneHot2.pkl', 'wb'))
