In [2]:
# Import dependencies
import pandas as pd
import numpy as np

from datetime import datetime
from path import Path
import hvplot.pandas
import plotly.express as px

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

# Machine Learning imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import joblib

In [5]:
# Load the clean, encoded CSV (created in the merged_csv_cleaning file) into a new DataFrame.
# index_col=False tells pandas not to use the first column as the index.
f_path = 'Resources/ml.csv'
ml_df = pd.read_csv(filepath_or_buffer=f_path, index_col=False)
ml_df

Unnamed: 0,outcome_type,animal_type_Bird,animal_type_Cat,animal_type_Dog,animal_type_Livestock,animal_type_Other,breed_Abyssinian,breed_Abyssinian/Mix,breed_Affenpinscher,breed_Affenpinscher/Mix,...,intake_condition_Nursing,intake_condition_Other,intake_condition_Pregnant,intake_condition_Sick,intake_condition_Space,sex_upon_intake_Intact Female,sex_upon_intake_Intact Male,sex_upon_intake_Neutered Male,sex_upon_intake_Spayed Female,sex_upon_intake_Unknown
0,Transfer,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,Return to Owner,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Return to Owner,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Return to Owner,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Return to Owner,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105343,Transfer,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
105344,Transfer,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
105345,Transfer,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
105346,Transfer,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [8]:
# Separate the target (outcome_type) from the feature columns
y = ml_df.loc[:, 'outcome_type']
X = ml_df.drop('outcome_type', axis=1)

In [25]:
# Train-test split with a fixed seed.
# The fitted model is persisted with joblib and reloaded on later runs, so the split has to be
# reproducible: with an unseeded split, rows the stored model was trained on can land in the
# test set of a later run and inflate the reported metrics.
# NOTE: a model file dumped before this seed was introduced was trained on a different split;
# retrain and re-dump it once so the stored model and this split agree.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [26]:
# Show the shape of each split, one per line: X_train, X_test, y_train, y_test
print(
    *(part.shape for part in (X_train, X_test, y_train, y_test)),
    sep="\n",
)

(79011, 3069)
(26337, 3069)
(79011,)
(26337,)


In [27]:
# Creating StandardScaler instance (unfitted here; it is fitted on the training split below)
scaler = StandardScaler()

In [28]:
# Fit the StandardScaler on the training features only, so that no statistics
# from the test split leak into the scaling parameters
X_scaler = scaler.fit(X=X_train)

In [51]:
# Apply the fitted scaler to both splits (train first, then test)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [54]:
# Build an (unfitted) random forest classifier: 128 trees, seeded for repeatable training.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=42,
)

### Joblib

In [None]:
# ------------------------------------------------
# On the first run, execute 'a' (fit the model) and 'b' (dump it to disk); afterwards comment them out and keep only 'c' (load),
### so you don't have to retrain the model every time.

In [55]:
## a) Fitting the model
# rf_model = rf_model.fit(X_train_scaled, y_train)

In [56]:
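# # b) Dump the trained model to a joblib file
# # (do not assign the result back to rf_model: joblib.dump returns the list of written filenames, not the model)
# joblib.dump(rf_model, 'trained_data.joblib')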

In [57]:
# c) Load the dumped and trained joblib file; if it does not exist yet, train once and dump it.
# This keeps the notebook runnable from a fresh kernel / fresh checkout without manually
# commenting cells in and out.
# NOTE: a stored model is only a valid basis for the evaluation below if it was trained on the
# same train/test split; delete 'trained_data.joblib' to force retraining.
# NOTE: joblib files are pickle-based; only load files you created yourself.
try:
    rf_model = joblib.load('trained_data.joblib')
except FileNotFoundError:
    rf_model = rf_model.fit(X_train_scaled, y_train)
    joblib.dump(rf_model, 'trained_data.joblib')

In [None]:
# ---------------------------------------------

In [58]:
# Predict the outcome label for every row of the scaled test features
predictions = rf_model.predict(X_test_scaled)

In [59]:
# Print a heading followed by the per-class precision / recall / f1 report
print(
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Classification Report
                 precision    recall  f1-score   support

       Adoption       0.65      0.79      0.71     11195
           Died       0.73      0.18      0.29       294
       Disposal       0.67      0.14      0.22       148
     Euthanasia       0.81      0.75      0.78      2137
        Missing       1.00      0.06      0.11        17
       Relocate       1.00      0.29      0.44         7
Return to Owner       0.73      0.66      0.69      3586
      Rto-Adopt       0.86      0.28      0.43       113
       Transfer       0.67      0.57      0.62      8840

       accuracy                           0.68     26337
      macro avg       0.79      0.41      0.48     26337
   weighted avg       0.68      0.68      0.67     26337



In [60]:
# Calculating the confusion matrix.
# The class labels are derived from the data instead of being hard-coded: the sorted union of
# true and predicted labels is the same ordering confusion_matrix uses by default, and passing
# it explicitly guarantees that the matrix rows/columns and the DataFrame labels stay in sync
# even if the set of outcome classes changes.
cm_labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=cm_labels)

# Create a DataFrame from the confusion matrix (rows = actual class, columns = predicted class).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in cm_labels],
    columns=[f"Predicted {label}" for label in cm_labels],
)

cm_df

Unnamed: 0,Predicted Adoption,Predicted Died,Disposal,Euthanasia,Missing,Relocate,Return to Owner,Rto-Adopt,Transfer
Actual Adoption,8797,3,0,86,0,0,473,2,1834
Actual Died,96,54,3,42,0,0,5,0,94
Disposal,4,2,20,110,0,0,1,0,11
Euthanasia,234,2,2,1602,0,0,55,0,242
Missing,5,0,0,0,1,0,1,0,10
Relocate,1,0,0,3,0,2,0,0,1
Return to Owner,914,0,1,24,0,0,2363,0,284
Rto-Adopt,45,1,0,4,0,0,13,32,18
Transfer,3354,12,4,106,0,0,327,3,5034


In [67]:
# Rank the features by the importance reported by the fitted random forest
importance_table = pd.DataFrame(
    {
        "feature": X.columns.tolist(),
        "importance": rf_model.feature_importances_,
    }
)
feature_importances_df = importance_table.sort_values(by="importance", ascending=False)

# Display the 20 highest-ranked features
feature_importances_df.head(20)

Unnamed: 0,feature,importance
3068,sex_upon_intake_Unknown,0.051978
3049,intake_type_Public Assist,0.036297
3051,intake_type_Wildlife,0.029986
3048,intake_type_Owner Surrender,0.028131
3050,intake_type_Stray,0.027759
3064,sex_upon_intake_Intact Female,0.027025
4,animal_type_Other,0.027011
3066,sex_upon_intake_Neutered Male,0.024459
3058,intake_condition_Normal,0.02337
2,animal_type_Dog,0.020908


## Attempt 1: Combine some outcomes together

In [62]:
# Number of rows per outcome type, most frequent first
outcome_counts = ml_df.loc[:, 'outcome_type'].value_counts()
outcome_counts

Adoption           44919
Transfer           35175
Return to Owner    14526
Euthanasia          9596
Other               1132
Name: outcome_type, dtype: int64

In [63]:
# Outcomes that occur fewer than 1000 times are considered rare and are binned together
replace_outcome = list(outcome_counts[outcome_counts < 1000].index)

# Replace all rare outcomes with "Other" in a single vectorized pass
# (instead of one full-column replace per rare label); a no-op when the list is empty
ml_df["outcome_type"] = ml_df["outcome_type"].mask(
    ml_df["outcome_type"].isin(replace_outcome), "Other"
)

In [64]:
# Fold the 'Died' outcome into 'Euthanasia'
ml_df["outcome_type"] = ml_df["outcome_type"].replace({'Died': "Euthanasia"})

# Confirm the binning by listing the remaining outcome categories and their counts
ml_df["outcome_type"].value_counts()

Adoption           44919
Transfer           35175
Return to Owner    14526
Euthanasia          9596
Other               1132
Name: outcome_type, dtype: int64

In [68]:
# Feature importances of the current rf_model, highest first.
# NOTE(review): no retraining is visible between the outcome binning above and this cell, so
# this presumably reproduces the earlier importance table - confirm whether a refit was intended.
feature_importances_df = pd.DataFrame(
    dict(
        feature=X.columns.tolist(),
        importance=rf_model.feature_importances_,
    )
).sort_values(by="importance", ascending=False)

# Show the top 20 rows
feature_importances_df.head(20)

Unnamed: 0,feature,importance
3068,sex_upon_intake_Unknown,0.051978
3049,intake_type_Public Assist,0.036297
3051,intake_type_Wildlife,0.029986
3048,intake_type_Owner Surrender,0.028131
3050,intake_type_Stray,0.027759
3064,sex_upon_intake_Intact Female,0.027025
4,animal_type_Other,0.027011
3066,sex_upon_intake_Neutered Male,0.024459
3058,intake_condition_Normal,0.02337
2,animal_type_Dog,0.020908
