In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt

In [2]:
# Read LouisvilleClean.csv into a DataFrame and preview its first rows.
# NOTE(review): the file carries an 'Unnamed: 0' column (visible in the
# columns listing below), presumably a saved index from an earlier to_csv;
# it is dropped in a later cell.
dfLouis = pd.read_csv("LouisvilleClean.csv")
dfLouis.head()

Unnamed: 0.1,Unnamed: 0,PrimaryColor,PrimaryBreed,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory,Pet_age_category,Binary_outcome,Sex
0,4,WHITE,PIT BULL TERRIER,NORMAL,HEALTHY,FERTILE,EUTH,12.17,29.0,Pure,Young,Deny,Male
1,51,BLACK,BORDER COLLIE,NORMAL,HEALTHY,ALTERED,ADOPTION,10.17,108.0,Mix,Young,Take,Female
2,69,TAN,GOLDEN RETRIEVER,NORMAL,HEALTHY,ALTERED,ADOPTION,23.43,56.0,Mix,Young,Take,Male
3,99,WHITE,LABRADOR RETRIEVER,NORMAL,HEALTHY,ALTERED,ADOPTION,0.17,183.0,Pure,Baby,Take,Female
4,103,BLACK,PIT BULL TERRIER,SICK,HEALTHY,FERTILE,EUTH,11.9,0.0,Pure,Young,Deny,Male


In [3]:
# List the Louisville column labels before choosing which ones to drop.
dfLouis.columns

Index(['Unnamed: 0', 'PrimaryColor', 'PrimaryBreed', 'IntakeInternalStatus',
       'IntakeAsilomarStatus', 'ReproductiveStatusAtIntake', 'OutcomeType',
       'AgeInMonths', 'DurationInShelter', 'BreedCategory', 'Pet_age_category',
       'Binary_outcome', 'Sex'],
      dtype='object')

In [4]:
# Discard the Louisville columns that are not used as features or target:
# the saved index, intake/reproductive status, raw outcome type, and the
# numeric age and shelter-duration fields.
dfLouis2 = dfLouis.drop(
    labels=[
        'Unnamed: 0',
        'IntakeInternalStatus',
        'IntakeAsilomarStatus',
        'ReproductiveStatusAtIntake',
        'OutcomeType',
        'AgeInMonths',
        'DurationInShelter',
    ],
    axis="columns",
)

In [5]:
# Preview the trimmed Louisville frame to confirm which columns remain.
dfLouis2.head()

Unnamed: 0,PrimaryColor,PrimaryBreed,BreedCategory,Pet_age_category,Binary_outcome,Sex
0,WHITE,PIT BULL TERRIER,Pure,Young,Deny,Male
1,BLACK,BORDER COLLIE,Mix,Young,Take,Female
2,TAN,GOLDEN RETRIEVER,Mix,Young,Take,Male
3,WHITE,LABRADOR RETRIEVER,Pure,Baby,Take,Female
4,BLACK,PIT BULL TERRIER,Pure,Young,Deny,Male


In [6]:
# Read AustinClean.csv into a DataFrame and preview its first rows.
dfAustin = pd.read_csv("AustinClean.csv")
dfAustin.head()

Unnamed: 0.1,Unnamed: 0,OutcomeType,Sex,AgeMonth,Breed,Color,BreedCategory,PrimaryBreed,SecondaryBreed,PrimaryColor,SecondColor,Pet_age_category,Binary_outcome
0,0,Adoption,Female,1.0,Border Collie Mix,Chocolate/White,Mix,Border Collie Mix,,Chocolate,White,Baby,Take
1,1,Transfer,Male,0.066667,German Shepherd Mix,Brown,Mix,German Shepherd Mix,,Brown,,Baby,Deny
2,2,Transfer,Male,0.066667,Labrador Retriever Mix,Black/White,Mix,Labrador Retriever Mix,,Black,White,Baby,Deny
3,4,Transfer,Female,0.066667,Dachshund Mix,Brown,Mix,Dachshund Mix,,Brown,,Baby,Deny
4,6,Transfer,Female,0.066667,Pit Bull Mix,Chocolate/White,Mix,Pit Bull Mix,,Chocolate,White,Baby,Deny


In [7]:
# Discard the Austin columns that have no counterpart among the columns kept
# for Louisville: the saved index, raw outcome type, numeric age, the combined
# breed/color strings, and the secondary breed/color.
dfAustin2 = dfAustin.drop(
    labels=[
        'Unnamed: 0',
        'OutcomeType',
        'AgeMonth',
        'Breed',
        'Color',
        'SecondaryBreed',
        'SecondColor',
    ],
    axis="columns",
)

In [8]:
# List the remaining Austin column labels (their order differs from Louisville's).
dfAustin2.columns

Index(['Sex', 'BreedCategory', 'PrimaryBreed', 'PrimaryColor',
       'Pet_age_category', 'Binary_outcome'],
      dtype='object')

In [9]:
# Count missing values per column in the Louisville frame.
dfLouis2.isna().sum()

PrimaryColor           0
PrimaryBreed           0
BreedCategory          0
Pet_age_category    1287
Binary_outcome         0
Sex                    0
dtype: int64

In [10]:
# Keep only the Louisville rows that have a Pet_age_category value.
has_age_category = dfLouis2["Pet_age_category"].notna()
dfLouis2 = dfLouis2.loc[has_age_category]

In [11]:
# Re-count missing values to confirm no Pet_age_category gaps remain.
dfLouis2.isna().sum()

PrimaryColor        0
PrimaryBreed        0
BreedCategory       0
Pet_age_category    0
Binary_outcome      0
Sex                 0
dtype: int64

In [12]:
# Put the Austin columns in the same order as the Louisville frame.
dfAustin2 = dfAustin2.loc[
    :,
    [
        'PrimaryColor',
        'PrimaryBreed',
        'BreedCategory',
        'Pet_age_category',
        'Binary_outcome',
        'Sex',
    ],
]

In [13]:
# Report the (rows, columns) size of the reordered Austin frame.
# Only the last expression of a cell is displayed, so this cell holds just
# the one value it is meant to show.
dfAustin2.shape

(48025, 6)

In [14]:
# Count missing values per column in the Austin frame.
dfAustin2.isna().sum()

PrimaryColor        0
PrimaryBreed        0
BreedCategory       0
Pet_age_category    2
Binary_outcome      0
Sex                 0
dtype: int64

In [15]:
# Keep only the Austin rows that have a Pet_age_category value.
has_age_category = dfAustin2["Pet_age_category"].notna()
dfAustin2 = dfAustin2.loc[has_age_category]

In [16]:
# Re-count missing values to confirm no Pet_age_category gaps remain.
dfAustin2.isna().sum()

PrimaryColor        0
PrimaryBreed        0
BreedCategory       0
Pet_age_category    0
Binary_outcome      0
Sex                 0
dtype: int64

### Merge dataframes

In [17]:
# Stack the Louisville rows on top of the Austin rows (both frames share the
# same six columns in the same order).
# ignore_index=True gives the combined frame a fresh 0..n-1 index; otherwise
# both source indexes are kept, so labels repeat across the two cities and
# carry the gaps left by the earlier row drops.
df = pd.concat([dfLouis2, dfAustin2], axis=0, ignore_index=True)

In [18]:
# Preview the combined frame (the first rows come from the Louisville data).
df.head()

Unnamed: 0,PrimaryColor,PrimaryBreed,BreedCategory,Pet_age_category,Binary_outcome,Sex
0,WHITE,PIT BULL TERRIER,Pure,Young,Deny,Male
1,BLACK,BORDER COLLIE,Mix,Young,Take,Female
2,TAN,GOLDEN RETRIEVER,Mix,Young,Take,Male
3,WHITE,LABRADOR RETRIEVER,Pure,Baby,Take,Female
4,BLACK,PIT BULL TERRIER,Pure,Young,Deny,Male


In [19]:
# v2 - drop PrimaryColor and PrimaryBreed, then preview what is left
# NOTE(review): df is reassigned from itself, so re-running this cell raises
# KeyError because the two columns are already gone.
df = df.drop(columns=['PrimaryColor','PrimaryBreed'])
df.head()

Unnamed: 0,BreedCategory,Pet_age_category,Binary_outcome,Sex
0,Pure,Young,Deny,Male
1,Mix,Young,Take,Female
2,Mix,Young,Take,Male
3,Pure,Baby,Take,Female
4,Pure,Young,Deny,Male


In [20]:
# One-hot encode the four categorical columns. drop_first=True removes the
# first level of each column, leaving k-1 indicator columns per k levels.
categorical_cols = ['BreedCategory', 'Pet_age_category', 'Binary_outcome', 'Sex']
df2 = pd.get_dummies(df[categorical_cols], drop_first=True)

In [21]:
# List the indicator columns produced by the dummy encoding.
df2.columns

Index(['BreedCategory_Pure', 'BreedCategory_Two', 'Pet_age_category_Baby',
       'Pet_age_category_Senior', 'Pet_age_category_Young',
       'Binary_outcome_Take', 'Sex_Male'],
      dtype='object')

In [22]:
# Separate the target indicator (y) from the predictor columns (X).
# y is kept as a single-column DataFrame.
target_col = 'Binary_outcome_Take'
y = df2[[target_col]]
X = df2.drop(columns=[target_col])

In [23]:
# (rows, feature columns) of the predictor matrix.
X.shape

(89558, 6)

In [24]:
# (rows, 1): y is a single-column DataFrame, not a 1-D Series.
y.shape

(89558, 1)

### RF model

In [25]:
# Hold out a test set with a fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split

# test_size=0.25 is the value scikit-learn uses when neither test_size nor
# train_size is given; spelled out here so the split ratio is visible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [26]:
# Import, initialize, fit and predict
from sklearn.ensemble import RandomForestClassifier

# 100 trees with a fixed seed so the fitted forest is reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# y_train is a single-column DataFrame of shape (n, 1). Passing it as-is makes
# fit() emit a DataConversionWarning ("A column-vector y was passed when a 1d
# array was expected"), so flatten it to shape (n,) first. The fitted model is
# the same either way.
rf_model.fit(X_train, y_train.values.ravel())

# Class predictions for the held-out rows.
predict_y_test = rf_model.predict(X_test)

  after removing the cwd from sys.path.


In [27]:
# Validate - fraction of held-out rows whose predicted class matches the label
from sklearn import metrics

test_accuracy = metrics.accuracy_score(y_test, predict_y_test)
print("Accuracy score: ", test_accuracy)

Accuracy score:  0.6184903974988835
