# Machine Learning

## Note: For now, you must run "merged_csv_cleaning.ipynb" first before running this file

#### ^ This is to reduce merge conflicts on github

In [41]:
# Import dependencies
import pandas as pd
import numpy as np

from datetime import datetime
from path import Path
import hvplot.pandas
import plotly.express as px

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

In [42]:
# Machine Learning imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [43]:
# Import the clean, incoded CSV that was created on merged_csv_cleaning file into a new Dataframe 
f_path = 'Resources/ml.csv'
ml_df = pd.read_csv(f_path, index_col=False)
ml_df

Unnamed: 0,outcome_type,age_upon_intake_in_days,animal_type_Bird,animal_type_Cat,animal_type_Dog,animal_type_Livestock,animal_type_Other,breed_Abyssinian,breed_Abyssinian Mix,breed_Affenpinscher,...,intake_condition_Nursing,intake_condition_Other,intake_condition_Pregnant,intake_condition_Sick,intake_condition_Space,sex_upon_intake_Intact Female,sex_upon_intake_Intact Male,sex_upon_intake_Neutered Male,sex_upon_intake_Spayed Female,sex_upon_intake_Unknown
0,Transfer,3652,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,Return to Owner,5875,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Return to Owner,5647,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Return to Owner,5505,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Return to Owner,5507,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105250,Transfer,9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
105251,Transfer,9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
105252,Transfer,9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
105253,Transfer,9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Attempt 0: Run data as-is

#### Split and scale the data

In [44]:
# Split data by Outcome and Features
X = ml_df.drop(columns = ['outcome_type'])
y = ml_df['outcome_type']

In [45]:
# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=42)

In [46]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(31576, 3097)
(73679, 3097)
(31576,)
(73679,)


In [47]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [48]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [49]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

#### Train and Test the Model

In [50]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=42)

In [51]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [52]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [53]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.60      0.67      0.63     31470
           Died       0.09      0.04      0.05       828
       Disposal       0.33      0.08      0.12       418
     Euthanasia       0.71      0.65      0.68      5866
        Missing       0.06      0.03      0.04        36
       Relocate       0.00      0.00      0.00        15
Return to Owner       0.53      0.52      0.53     10097
      Rto-Adopt       0.03      0.01      0.02       335
       Transfer       0.56      0.50      0.53     24614

       accuracy                           0.58     73679
      macro avg       0.32      0.28      0.29     73679
   weighted avg       0.57      0.58      0.57     73679



In [14]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Adoption", "Actual Died", 'Disposal', 'Euthanasia', 'Missing', 'Relocate', 'Return to Owner', 'Rto-Adopt', 'Transfer'], 
    columns=["Predicted Adoption", "Predicted Died", 'Disposal', 'Euthanasia', 'Missing', 'Relocate', 'Return to Owner', 'Rto-Adopt', 'Transfer'])

cm_df

Unnamed: 0,Predicted Adoption,Predicted Died,Disposal,Euthanasia,Missing,Relocate,Return to Owner,Rto-Adopt,Transfer
Actual Adoption,7803,27,5,138,5,0,871,28,2352
Actual Died,100,13,3,55,0,0,13,0,94
Disposal,6,1,11,119,0,0,1,0,23
Euthanasia,298,16,6,1311,1,1,103,4,318
Missing,12,0,0,1,0,0,3,0,2
Relocate,1,0,0,3,0,0,0,0,1
Return to Owner,1177,1,0,53,1,0,1881,13,546
Rto-Adopt,58,0,0,5,0,0,21,4,30
Transfer,3332,45,4,157,0,1,643,13,4580


In [15]:
# check important features
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df.head(20)

Unnamed: 0,feature,importance
0,age_upon_intake_in_days,0.373837
3096,sex_upon_intake_Unknown,0.025859
3079,intake_type_Wildlife,0.022941
3077,intake_type_Public Assist,0.021355
3076,intake_type_Owner Surrender,0.018216
3078,intake_type_Stray,0.017718
5,animal_type_Other,0.014787
3092,sex_upon_intake_Intact Female,0.014678
3086,intake_condition_Normal,0.014443
3094,sex_upon_intake_Neutered Male,0.012786


### Attempt 1: Combine some outcomes together

#### Reorganize Data

In [8]:
outcome_counts = ml_df['outcome_type'].value_counts()
outcome_counts

Adoption           44866
Transfer           35150
Return to Owner    14524
Euthanasia          8399
Died                1184
Disposal             593
Rto-Adopt            461
Missing               54
Relocate              24
Name: outcome_type, dtype: int64

In [9]:
# Determine which values to replace if counts are less than ...?
replace_outcome = list(outcome_counts[outcome_counts < 1000].index)

# Replace in dataframe
for outcome in replace_outcome:
    ml_df.outcome_type = ml_df.outcome_type.replace(outcome,"Other")

In [10]:
# Combine Died with Euthanasia
ml_df.outcome_type = ml_df.outcome_type.replace('Died',"Euthanasia")

# Check to make sure binning was successful
ml_df.outcome_type.value_counts()

Adoption           44866
Transfer           35150
Return to Owner    14524
Euthanasia          9583
Other               1132
Name: outcome_type, dtype: int64

#### Machine Learning

In [8]:
# Split data by Outcome and Features
X = ml_df.drop(columns = ['outcome_type'])
y = ml_df['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=42) 

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [11]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.61      0.69      0.65     11229
     Euthanasia       0.71      0.61      0.65      2336
          Other       0.15      0.05      0.07       302
Return to Owner       0.53      0.51      0.52      3672
       Transfer       0.58      0.52      0.55      8775

       accuracy                           0.60     26314
      macro avg       0.52      0.48      0.49     26314
   weighted avg       0.59      0.60      0.59     26314



In [14]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Adoption", "Actual Euthanasia", 'Other', 'Return to Owner', 'Transfer'], 
    columns=["Predicted Adoption", "Predicted Euthanasia", 'Other', 'Return to Owner', 'Transfer'])

cm_df

Unnamed: 0,Predicted Adoption,Predicted Euthanasia,Other,Return to Owner,Transfer
Actual Adoption,7802,182,38,877,2330
Actual Euthanasia,384,1417,15,109,411
Other,74,131,15,23,59
Return to Owner,1185,51,16,1875,545
Transfer,3340,218,16,650,4551


In [12]:
# check important features
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df.head(20)

Unnamed: 0,feature,importance
0,age_upon_intake_in_days,0.371861
3096,sex_upon_intake_Unknown,0.025652
3077,intake_type_Public Assist,0.021426
3079,intake_type_Wildlife,0.021039
3076,intake_type_Owner Surrender,0.017853
3078,intake_type_Stray,0.017363
5,animal_type_Other,0.016236
3086,intake_condition_Normal,0.015927
3092,sex_upon_intake_Intact Female,0.015758
3,animal_type_Dog,0.013136


### Attempt 2: No Scaling

In [11]:
# Split data by Outcome and Features
X = ml_df.drop(columns = ['outcome_type'])
y = ml_df['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [5]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=42)

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test)

In [6]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.61      0.69      0.65     11229
           Died       0.12      0.05      0.07       278
       Disposal       0.38      0.07      0.12       161
     Euthanasia       0.71      0.64      0.67      2058
        Missing       0.00      0.00      0.00        18
       Relocate       0.00      0.00      0.00         5
Return to Owner       0.53      0.51      0.52      3672
      Rto-Adopt       0.06      0.03      0.04       118
       Transfer       0.58      0.52      0.55      8775

       accuracy                           0.59     26314
      macro avg       0.33      0.28      0.29     26314
   weighted avg       0.59      0.59      0.59     26314



In [7]:
# check important features
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df.head(20)

Unnamed: 0,feature,importance
0,age_upon_intake_in_days,0.373837
3096,sex_upon_intake_Unknown,0.025859
3079,intake_type_Wildlife,0.022941
3077,intake_type_Public Assist,0.021355
3076,intake_type_Owner Surrender,0.018216
3078,intake_type_Stray,0.017718
5,animal_type_Other,0.014787
3092,sex_upon_intake_Intact Female,0.014678
3086,intake_condition_Normal,0.014443
3094,sex_upon_intake_Neutered Male,0.012786


### Attempt 3: Use AdaBoost

joblib, pickle

In [13]:
from sklearn.ensemble import AdaBoostClassifier

In [14]:
# Split data by Outcome and Features
X = ml_df.drop(columns = ['outcome_type'])
y = ml_df['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# Create adaboost random forest classifier.
abd_model = AdaBoostClassifier(random_state=42, n_estimators=200)

# Fitting the model
abd_model = abd_model.fit(X_train, y_train)

# Making predictions using the testing data.
predictions = abd_model.predict(X_test)

In [17]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.59      0.80      0.68     11229
     Euthanasia       0.72      0.59      0.65      2336
          Other       0.14      0.01      0.01       302
Return to Owner       0.59      0.52      0.56      3672
       Transfer       0.61      0.42      0.50      8775

       accuracy                           0.61     26314
      macro avg       0.53      0.47      0.48     26314
   weighted avg       0.60      0.61      0.59     26314



In [18]:
# check important features
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": abd_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df.head(20)

Unnamed: 0,feature,importance
0,age_upon_intake_in_days,0.12
3096,sex_upon_intake_Unknown,0.02
3077,intake_type_Public Assist,0.015
1877,breed_Pit Bull Mix,0.015
3076,intake_type_Owner Surrender,0.01
3086,intake_condition_Normal,0.01
1,animal_type_Bird,0.01
2213,breed_Siamese Mix,0.01
3079,intake_type_Wildlife,0.01
1876,breed_Pit Bull,0.01


In [10]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# ------------------------------------------

### Attempt 4: Cut down to just Cats and Dogs

In [None]:
# Import the cats and dogs CSV
f_path = 'Resources/ml_cats_dogs.csv'
ml_cats_dogs_df = pd.read_csv(f_path, index_col=False)
ml_cats_dogs_df

In [4]:
# Split data by Outcome and Features
X = ml_cats_dogs_df.drop(columns = ['outcome_type'])
y = ml_cats_dogs_df['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [5]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=42)

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test)

In [6]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.61      0.69      0.64     10949
           Died       0.07      0.04      0.05       246
       Disposal       0.20      0.07      0.11        28
     Euthanasia       0.37      0.23      0.28       903
        Missing       0.00      0.00      0.00        12
Return to Owner       0.54      0.52      0.53      3657
      Rto-Adopt       0.00      0.00      0.00       118
       Transfer       0.57      0.52      0.55      8522

       accuracy                           0.58     24435
      macro avg       0.29      0.26      0.27     24435
   weighted avg       0.57      0.58      0.57     24435



In [7]:
# check important features
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df.head(20)

Unnamed: 0,feature,importance
0,age_upon_intake_in_days,0.419131
2835,intake_type_Public Assist,0.023572
2853,sex_upon_intake_Unknown,0.020944
2834,intake_type_Owner Surrender,0.018481
2851,sex_upon_intake_Neutered Male,0.01568
2849,sex_upon_intake_Intact Female,0.014844
2836,intake_type_Stray,0.014022
2843,intake_condition_Normal,0.013064
2850,sex_upon_intake_Intact Male,0.013057
2,animal_type_Dog,0.012073


In [9]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Adoption", "Actual Died", 'Disposal', 'Euthanasia', 'Missing', 'Return to Owner', 'Rto-Adopt', 'Transfer'], 
    columns=["Predicted Adoption", "Predicted Died", 'Disposal', 'Euthanasia', 'Missing', 'Return to Owner', 'Rto-Adopt', 'Transfer'])

cm_df

Unnamed: 0,Predicted Adoption,Predicted Died,Disposal,Euthanasia,Missing,Return to Owner,Rto-Adopt,Transfer
Actual Adoption,7545,39,1,126,2,903,18,2315
Actual Died,104,9,0,20,0,12,0,101
Disposal,4,3,2,5,0,0,1,13
Euthanasia,282,17,3,205,0,93,5,298
Missing,5,0,0,0,0,2,0,5
Return to Owner,1167,3,1,48,1,1904,10,523
Rto-Adopt,59,0,0,4,0,25,0,30
Transfer,3288,52,3,153,1,586,10,4429


### Attempt 5: Cats & Dogs Only, Combine Some Outcomes

In [10]:
outcome_counts = ml_cats_dogs_df['outcome_type'].value_counts()
outcome_counts

Adoption           44102
Transfer           34060
Return to Owner    14433
Euthanasia          3543
Died                 968
Rto-Adopt            460
Disposal             121
Missing               49
Relocate               4
Name: outcome_type, dtype: int64

In [11]:
# Determine which values to replace if counts are less than ...?
replace_outcome = list(outcome_counts[outcome_counts < 900].index)

# Replace in dataframe
for outcome in replace_outcome:
    ml_cats_dogs_df.outcome_type = ml_cats_dogs_df.outcome_type.replace(outcome,"Other")

In [12]:
# Combine Died with Euthanasia
ml_cats_dogs_df.outcome_type = ml_cats_dogs_df.outcome_type.replace('Died',"Euthanasia")

# Check to make sure binning was successful
ml_cats_dogs_df.outcome_type.value_counts()

Adoption           44102
Transfer           34060
Return to Owner    14433
Euthanasia          4511
Other                634
Name: outcome_type, dtype: int64

In [13]:
# Split data by Outcome and Features
X = ml_cats_dogs_df.drop(columns = ['outcome_type'])
y = ml_cats_dogs_df['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=42)

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test)

In [15]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.61      0.69      0.65     10949
     Euthanasia       0.37      0.23      0.28      1149
          Other       0.07      0.03      0.04       158
Return to Owner       0.54      0.52      0.53      3657
       Transfer       0.57      0.52      0.54      8522

       accuracy                           0.58     24435
      macro avg       0.43      0.40      0.41     24435
   weighted avg       0.57      0.58      0.57     24435



In [16]:
# check important features
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df.head(20)

Unnamed: 0,feature,importance
0,age_upon_intake_in_days,0.416122
2835,intake_type_Public Assist,0.022405
2853,sex_upon_intake_Unknown,0.020016
2834,intake_type_Owner Surrender,0.019463
2849,sex_upon_intake_Intact Female,0.015613
2851,sex_upon_intake_Neutered Male,0.015226
2836,intake_type_Stray,0.014385
2843,intake_condition_Normal,0.014054
2,animal_type_Dog,0.013246
2850,sex_upon_intake_Intact Male,0.012987


### Attempt 6: Dogs & Cats Adaboost

In [None]:
# Create adaboost random forest classifier.
abd_model = AdaBoostClassifier(random_state=42, n_estimators=200)

# Fitting the model
abd_model = abd_model.fit(X_train, y_train)

# Making predictions using the testing data.
predictions = abd_model.predict(X_test)

In [None]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
# check important features
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": abd_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df.head(20)

# Import Cats and Dogs and Bin the Outcomes

In [3]:
# Import the cats and dogs CSV
ml_cats_df = pd.read_csv('Resources/ml_cats.csv', index_col=False)
ml_dogs_df = pd.read_csv('Resources/ml_dogs.csv', index_col=False)

In [4]:
# Combine Died with Euthanasia
ml_cats_df.outcome_type = ml_cats_df.outcome_type.replace('Died',"Euthanasia")
ml_dogs_df.outcome_type = ml_dogs_df.outcome_type.replace('Died',"Euthanasia")

In [5]:
# Bin the outcomes
replace_outcome = ['Disposal', 'Rto-Adopt', 'Missing', 'Relocate']

# Replace in dataframe
for outcome in replace_outcome:
    ml_cats_df.outcome_type = ml_cats_df.outcome_type.replace(outcome,"Other")
    ml_dogs_df.outcome_type = ml_dogs_df.outcome_type.replace(outcome,"Other")

### Cats Only Adaboost

In [6]:
# Split data by Outcome and Features
X = ml_cats_df.drop(columns = ['outcome_type'])
y = ml_cats_df['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Create adaboost random forest classifier.
abd_model = AdaBoostClassifier(random_state=42, n_estimators=200)

# Fitting the model
abd_model = abd_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = abd_model.predict(X_test_scaled)

In [11]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.67      0.75      0.70      4909
     Euthanasia       0.39      0.24      0.30       698
          Other       0.14      0.01      0.02        78
Return to Owner       0.34      0.26      0.30       439
       Transfer       0.67      0.65      0.66      5038

       accuracy                           0.65     11162
      macro avg       0.44      0.38      0.40     11162
   weighted avg       0.64      0.65      0.64     11162



In [13]:
# check important features
cat_feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": abd_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
cat_feature_importances_df.head(20)

Unnamed: 0,feature,importance
0,age_upon_intake_in_days,0.485
395,sex_upon_intake_Neutered Male,0.02
396,sex_upon_intake_Spayed Female,0.02
397,sex_upon_intake_Unknown,0.02
38,breed_Domestic Shorthair,0.015
379,intake_type_Public Assist,0.015
199,color_Calico,0.01
388,intake_condition_Nursing,0.01
28,breed_Domestic Longhair,0.01
319,color_Tortie,0.01


### Dogs Only Adaboost

In [24]:
# Split data by Outcome and Features
X = ml_dogs_df.drop(columns = ['outcome_type'])
y = ml_dogs_df['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [25]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [26]:
# Create adaboost random forest classifier.
abd_model = AdaBoostClassifier(random_state=42, n_estimators=200)

# Fitting the model
abd_model = abd_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = abd_model.predict(X_test_scaled)

In [27]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.57      0.85      0.68      6144
     Euthanasia       0.33      0.23      0.27       448
          Other       0.00      0.00      0.00        98
Return to Owner       0.63      0.53      0.58      3114
       Transfer       0.51      0.16      0.25      3469

       accuracy                           0.57     13273
      macro avg       0.41      0.36      0.36     13273
   weighted avg       0.56      0.57      0.52     13273



In [28]:
# check important features
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": abd_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df.head(20)

Unnamed: 0,feature,importance
0,age_upon_intake_in_days,0.12
2554,intake_condition_Normal,0.015
1650,breed_Pit Bull Mix,0.015
1310,breed_Labrador Retriever,0.015
702,breed_Chihuahua Shorthair,0.01
2562,sex_upon_intake_Spayed Female,0.01
2561,sex_upon_intake_Neutered Male,0.01
2555,intake_condition_Nursing,0.01
2546,intake_type_Public Assist,0.01
1807,breed_Rat Terrier Mix,0.01


### Refining Cats

In [47]:
drop_cat_features = list(cat_feature_importances_df['feature'][50:])

In [52]:
ml_cats_df2 = ml_cats_df.drop(columns=drop_cat_features)

In [54]:
ml_cats_df2

Unnamed: 0,outcome_type,age_upon_intake_in_days,breed_Domestic Longhair,breed_Domestic Medium Hair,breed_Domestic Shorthair,breed_Maine Coon Mix,breed_Manx,breed_Persian Mix,breed_Russian Blue Mix,breed_Siamese/Angora,...,color_White,color_White/Black,color_White/Blue,intake_type_Owner Surrender,intake_type_Public Assist,intake_condition_Injured,intake_condition_Nursing,sex_upon_intake_Neutered Male,sex_upon_intake_Spayed Female,sex_upon_intake_Unknown
0,Return to Owner,5928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Transfer,5435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Euthanasia,7228,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Transfer,6582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Return to Owner,4659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44643,Transfer,9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44644,Transfer,9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44645,Transfer,9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44646,Transfer,9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# Split data by Outcome and Features
X = ml_cats_df.drop(columns = ['outcome_type'])
y = ml_cats_df['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [56]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [57]:
# Create adaboost random forest classifier.
abd_model = AdaBoostClassifier(random_state=42, n_estimators=200)

# Fitting the model
abd_model = abd_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = abd_model.predict(X_test_scaled)

In [58]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.67      0.75      0.70      4909
     Euthanasia       0.39      0.24      0.30       698
          Other       0.14      0.01      0.02        78
Return to Owner       0.34      0.26      0.30       439
       Transfer       0.67      0.65      0.66      5038

       accuracy                           0.65     11162
      macro avg       0.44      0.38      0.40     11162
   weighted avg       0.64      0.65      0.64     11162



In [59]:
# check important features
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": abd_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df.head(20)

Unnamed: 0,feature,importance
0,age_upon_intake_in_days,0.485
395,sex_upon_intake_Neutered Male,0.02
396,sex_upon_intake_Spayed Female,0.02
397,sex_upon_intake_Unknown,0.02
38,breed_Domestic Shorthair,0.015
379,intake_type_Public Assist,0.015
199,color_Calico,0.01
388,intake_condition_Nursing,0.01
28,breed_Domestic Longhair,0.01
319,color_Tortie,0.01


### Cats Only XGBoost

In [17]:
import xgboost as xgb

In [18]:
xgb_model = xgb.XGBClassifier()

# Fitting the model
xgb_cats_model = xgb_model.fit(X_train, y_train)

# Making predictions using the testing data.
xgb_predictions = xgb_cats_model.predict(X_test)





In [19]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, xgb_predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.67      0.77      0.72      4909
     Euthanasia       0.48      0.31      0.38       698
          Other       0.27      0.04      0.07        78
Return to Owner       0.39      0.21      0.28       439
       Transfer       0.70      0.66      0.68      5038

       accuracy                           0.67     11162
      macro avg       0.50      0.40      0.42     11162
   weighted avg       0.66      0.67      0.66     11162



In [21]:
# check important features
XGCat_feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": xgb_cats_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
XGCat_feature_importances_df.head(20)

Unnamed: 0,feature,importance
397,sex_upon_intake_Unknown,0.116297
388,intake_condition_Nursing,0.072406
387,intake_condition_Normal,0.071474
396,sex_upon_intake_Spayed Female,0.039578
378,intake_type_Owner Surrender,0.039333
379,intake_type_Public Assist,0.032001
395,sex_upon_intake_Neutered Male,0.031394
380,intake_type_Stray,0.027391
0,age_upon_intake_in_days,0.020625
383,intake_condition_Feral,0.012011


### Cats Only Catboost

In [7]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score

In [6]:
# Split data by Outcome and Features
X = ml_cats_df.drop(columns = ['outcome_type'])
y = ml_cats_df['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
model = CatBoostClassifier(
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent'
)

In [11]:
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
#     logging_level='Verbose',  # you can uncomment this for text output
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [12]:
predictions = model.predict(X_test)

In [13]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.66      0.77      0.71      4909
     Euthanasia       0.49      0.31      0.38       698
          Other       0.25      0.01      0.02        78
Return to Owner       0.39      0.19      0.26       439
       Transfer       0.70      0.66      0.68      5038

       accuracy                           0.66     11162
      macro avg       0.50      0.39      0.41     11162
   weighted avg       0.65      0.66      0.65     11162



### Cats Only XGBoost 1: Run model with Top 50 Features

In [22]:
drop_cat_features = list(XGCat_feature_importances_df['feature'][50:])

ml_XGCats_df2 = ml_cats_df.drop(columns=drop_cat_features)

In [23]:
ml_XGCats_df2

Unnamed: 0,outcome_type,age_upon_intake_in_days,breed_Balinese Mix,breed_Domestic Longhair,breed_Domestic Longhair Mix,breed_Domestic Medium Hair Mix,breed_Domestic Shorthair,breed_Domestic Shorthair Mix,breed_Manx Mix,breed_Siamese,...,intake_condition_Injured,intake_condition_Normal,intake_condition_Nursing,intake_condition_Other,intake_condition_Sick,sex_upon_intake_Intact Female,sex_upon_intake_Intact Male,sex_upon_intake_Neutered Male,sex_upon_intake_Spayed Female,sex_upon_intake_Unknown
0,Return to Owner,5928,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,Transfer,5435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Euthanasia,7228,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Transfer,6582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Return to Owner,4659,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44643,Transfer,9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
44644,Transfer,9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
44645,Transfer,9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
44646,Transfer,9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [24]:
# Split data by Outcome and Features
X = ml_XGCats_df2.drop(columns = ['outcome_type'])
y = ml_XGCats_df2['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [25]:
xgb_model = xgb.XGBClassifier()

# Fitting the model
xgb_cats_model = xgb_model.fit(X_train, y_train)

# Making predictions using the testing data.
xgb_predictions = xgb_cats_model.predict(X_test)





In [27]:
# Generate classification report
print("XGBoost Top 50 Classification Report")
print(classification_report(y_test, xgb_predictions))

XGBoost Top 50 Classification Report
                 precision    recall  f1-score   support

       Adoption       0.67      0.77      0.71      4909
     Euthanasia       0.49      0.32      0.38       698
          Other       0.23      0.04      0.07        78
Return to Owner       0.40      0.23      0.29       439
       Transfer       0.70      0.66      0.68      5038

       accuracy                           0.67     11162
      macro avg       0.50      0.40      0.43     11162
   weighted avg       0.66      0.67      0.66     11162



In [28]:
# check important features
XGCat_feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": xgb_cats_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
XGCat_feature_importances_df.head(20)

Unnamed: 0,feature,importance
49,sex_upon_intake_Unknown,0.187219
41,intake_condition_Normal,0.094807
42,intake_condition_Nursing,0.08066
36,intake_type_Owner Surrender,0.053735
48,sex_upon_intake_Spayed Female,0.048486
37,intake_type_Public Assist,0.047142
47,sex_upon_intake_Neutered Male,0.042382
38,intake_type_Stray,0.042288
0,age_upon_intake_in_days,0.028767
39,intake_condition_Feral,0.017639


### Cats Only XGBoost 2: Run model with Top 20 Features

In [29]:
drop_cat_features = list(XGCat_feature_importances_df['feature'][20:])

ml_XGCats_df = ml_cats_df.drop(columns=drop_cat_features)

In [30]:
# Split data by Outcome and Features
X = ml_XGCats_df2.drop(columns = ['outcome_type'])
y = ml_XGCats_df2['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [31]:
xgb_model = xgb.XGBClassifier()

# Fitting the model
xgb_cats_model = xgb_model.fit(X_train, y_train)

# Making predictions using the testing data.
xgb_predictions = xgb_cats_model.predict(X_test)





In [32]:
# Generate classification report
print("XGBoost Top 20 Classification Report")
print(classification_report(y_test, xgb_predictions))

XGBoost Top 20 Classification Report
                 precision    recall  f1-score   support

       Adoption       0.67      0.77      0.71      4909
     Euthanasia       0.48      0.31      0.38       698
          Other       0.27      0.04      0.07        78
Return to Owner       0.40      0.23      0.29       439
       Transfer       0.70      0.66      0.68      5038

       accuracy                           0.67     11162
      macro avg       0.50      0.40      0.43     11162
   weighted avg       0.66      0.67      0.66     11162



In [33]:
# check important features
XGCat_feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": xgb_cats_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
XGCat_feature_importances_df.head(20)

Unnamed: 0,feature,importance
367,sex_upon_intake_Unknown,0.132028
357,intake_condition_Normal,0.092424
358,intake_condition_Nursing,0.070275
348,intake_type_Owner Surrender,0.046173
365,sex_upon_intake_Neutered Male,0.040557
366,sex_upon_intake_Spayed Female,0.040419
350,intake_type_Stray,0.037605
349,intake_type_Public Assist,0.037098
0,age_upon_intake_in_days,0.023693
353,intake_condition_Feral,0.016789


### Dogs Only XGBoost 0

In [34]:
# Split data by Outcome and Features
X = ml_dogs_df.drop(columns = ['outcome_type'])
y = ml_dogs_df['outcome_type']

# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [37]:
xgb_model = xgb.XGBClassifier()

# Fitting the model
xgb_dogs_model = xgb_model.fit(X_train, y_train)

# Making predictions using the testing data.
xgb_predictions = xgb_dogs_model.predict(X_test)





In [38]:
# Generate classification report
print("XGBoost Dogs Classification Report")
print(classification_report(y_test, xgb_predictions))

XGBoost Dogs Classification Report
                 precision    recall  f1-score   support

       Adoption       0.59      0.85      0.70      6144
     Euthanasia       0.61      0.18      0.27       448
          Other       0.00      0.00      0.00        98
Return to Owner       0.62      0.62      0.62      3114
       Transfer       0.59      0.20      0.30      3469

       accuracy                           0.60     13273
      macro avg       0.48      0.37      0.38     13273
   weighted avg       0.59      0.60      0.56     13273



In [40]:
# check important features
XGDog_feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": xgb_dogs_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
XGDog_feature_importances_df.head(20)

Unnamed: 0,feature,importance
2546,intake_type_Public Assist,0.065726
2554,intake_condition_Normal,0.024046
2544,intake_type_Euthanasia Request,0.014377
2563,sex_upon_intake_Unknown,0.010188
2545,intake_type_Owner Surrender,0.009997
1923,breed_Shih Tzu Mix,0.008713
2547,intake_type_Stray,0.008378
1922,breed_Shih Tzu,0.008152
1411,breed_Lhasa Apso Mix,0.007187
2555,intake_condition_Nursing,0.006339
