In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Read in csv
df = pd.read_csv("Austin_Animal_Center_Outcomes.csv")

In [3]:
df.head()

Unnamed: 0,Animal_ID,Name,DateTime,MonthYear,Date_of_Birth,Outcome_Type,Outcome_Subtype,Fixed,Sex,Age_numeric,Age_time,Age_month,Breed,Color
0,A805930,*Rachel,11/8/2019 18:37,11/8/2019 18:37,9/11/2019,Adoption,Foster,Spayed,Female,1,month,1.0,Border Collie Mix,Chocolate/White
1,A705114,,06/13/2015 03:47:00 PM,06/13/2015 03:47:00 PM,6/11/2015,Transfer,Partner,Intact,Male,2,days,0.066667,German Shepherd Mix,Brown
2,A680143,,05/31/2014 11:21:00 AM,05/31/2014 11:21:00 AM,05/29/2014,Transfer,Partner,Intact,Male,2,days,0.066667,Labrador Retriever Mix,Black/White
3,A670236,,1/5/2014 15:12,1/5/2014 15:12,1/3/2014,Transfer,Partner,Unknown,,2,days,0.066667,Pit Bull Mix,Blue/White
4,A811594,,1/6/2020 17:09,1/6/2020 17:09,1/4/2020,Transfer,Partner,Intact,Female,2,days,0.066667,Dachshund Mix,Brown


In [4]:
df.shape

(66804, 14)

In [5]:
# Drop unnessary columns
df2 = df.drop(columns=['Animal_ID', 'Name', 'DateTime', 'MonthYear', 'Date_of_Birth', 'Outcome_Subtype', 
                       'Age_numeric', 'Age_time' ])

In [6]:
df2.shape
# df2.head()

(66804, 6)

In [7]:
# Drop rows where Outcome_Type is null

df2 = df2.loc[df2.Outcome_Type.notna(), ["Outcome_Type", "Fixed", "Sex", "Age_month", "Breed" ,"Color"]]

In [8]:
df2.head()

Unnamed: 0,Outcome_Type,Fixed,Sex,Age_month,Breed,Color
0,Adoption,Spayed,Female,1.0,Border Collie Mix,Chocolate/White
1,Transfer,Intact,Male,0.066667,German Shepherd Mix,Brown
2,Transfer,Intact,Male,0.066667,Labrador Retriever Mix,Black/White
3,Transfer,Unknown,,0.066667,Pit Bull Mix,Blue/White
4,Transfer,Intact,Female,0.066667,Dachshund Mix,Brown


In [9]:
df2.groupby('Outcome_Type').count()

Unnamed: 0_level_0,Fixed,Sex,Age_month,Breed,Color
Outcome_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adoption,31397,31395,31397,31397,31397
Died,235,209,235,235,235
Disposal,25,22,25,25,25
Euthanasia,1731,1694,1731,1731,1731
Missing,29,29,29,29,29
Return to Owner,18469,18400,18470,18470,18470
Rto-Adopt,498,498,498,498,498
Transfer,14419,14178,14419,14419,14419


In [11]:
# Drop all rows 'Return to Owner' ???  

df3 = df2.drop(df2[df2.Outcome_Type == "Return to Owner"].index)

In [12]:
# Verify shape
df3.shape

(48334, 6)

In [13]:
# Verify group by counts
df3.groupby('Outcome_Type').count()

Unnamed: 0_level_0,Fixed,Sex,Age_month,Breed,Color
Outcome_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adoption,31397,31395,31397,31397,31397
Died,235,209,235,235,235
Disposal,25,22,25,25,25
Euthanasia,1731,1694,1731,1731,1731
Missing,29,29,29,29,29
Rto-Adopt,498,498,498,498,498
Transfer,14419,14178,14419,14419,14419


In [14]:
# Create new BINARY_OUTCOME column and set to 0

df3["Binary_outcome"]=0
df3.head()

Unnamed: 0,Outcome_Type,Fixed,Sex,Age_month,Breed,Color,Binary_outcome
0,Adoption,Spayed,Female,1.0,Border Collie Mix,Chocolate/White,0
1,Transfer,Intact,Male,0.066667,German Shepherd Mix,Brown,0
2,Transfer,Intact,Male,0.066667,Labrador Retriever Mix,Black/White,0
3,Transfer,Unknown,,0.066667,Pit Bull Mix,Blue/White,0
4,Transfer,Intact,Female,0.066667,Dachshund Mix,Brown,0


In [15]:
# Find rows containing word "Adoption" and set to 1 (for Positive Outcome)
# All other rows will be 0 (for Negative Outcome)

df3.loc[df3["Outcome_Type"].str.contains("Adoption"), "Binary_outcome"] = 1

In [18]:
df3.loc[200:300, :]

Unnamed: 0,Outcome_Type,Fixed,Sex,Age_month,Breed,Color,Binary_outcome
200,Transfer,Intact,Female,0.2,Dachshund,Tan,0
201,Transfer,Intact,Male,0.2,Dachshund,Tan,0
202,Transfer,Unknown,,0.2,Labrador Retriever Mix,Brown,0
203,Transfer,Unknown,,0.2,Labrador Retriever Mix,White,0
204,Transfer,Unknown,,0.2,Chihuahua Shorthair/Pit Bull,White,0
...,...,...,...,...,...,...,...
296,Transfer,Intact,Female,1.0,German Shepherd Mix,Brown,0
297,Transfer,Intact,Male,1.0,Australian Cattle Dog Mix,Cream/Brown Merle,0
298,Adoption,Spayed,Female,1.0,Labrador Retriever Mix,Tan,1
299,Transfer,Intact,Male,1.0,Jack Russell Terrier Mix,Sable,0


### Perform get_dummies  ( Changing categorical values to numeric values )

In [19]:
# Passing get_dummies on all columns except Outcome_rating and Pet_ageMonths
df4 = pd.get_dummies(data=df3, columns=['Fixed', 'Sex', 'Breed', 'Color', 'Fixed'])

In [21]:
df4.head()

Unnamed: 0,Outcome_Type,Age_month,Binary_outcome,Fixed_Intact,Fixed_Neutered,Fixed_Spayed,Fixed_Unknown,Sex_Female,Sex_Male,Breed_Affenpinscher Mix,...,Color_Yellow Brindle/Blue,Color_Yellow Brindle/White,Color_Yellow/Black,Color_Yellow/Cream,Color_Yellow/Tan,Color_Yellow/White,Fixed_Intact.1,Fixed_Neutered.1,Fixed_Spayed.1,Fixed_Unknown.1
0,Adoption,1.0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Transfer,0.066667,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,Transfer,0.066667,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,Transfer,0.066667,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Transfer,0.066667,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [22]:
# Drop these columns as they are duplicates
df4 = df4.drop(columns=['Sex_Female'])

In [23]:
df4.shape

(48334, 2357)

In [25]:
# Define X and y
X = df4.drop(['Outcome_Type', 'Binary_outcome'], axis="columns")
y = df4[['Binary_outcome']]

In [26]:
X.shape

(48334, 2355)

In [27]:
y.shape

(48334, 1)

### Random Forest model


In [28]:
# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [29]:
# Import, initialize, fit and predict
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators = 100, random_state = 10) 
rf_model.fit(X_train, y_train)
predict_y_test = rf_model.predict(X_test)

  after removing the cwd from sys.path.


In [30]:
# Validate - run accuracy score
from sklearn import metrics
print("Accuracy score: ", metrics.accuracy_score(y_test, predict_y_test))

Accuracy score:  0.7803707381661702


In [31]:
# Validate with cross validation

from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf_model, X, y, cv=10, scoring="accuracy")
print(scores)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[0.92635499 0.83202317 0.49731072 0.43442284 0.74881026 0.70722119
 0.73349886 0.67246017 0.43140906 0.40906269]


In [35]:
scores.mean()

0.6392573947504918

### Logistic Regression

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [33]:
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()
logReg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
print(f"Training Data Score: {logReg.score(X_train, y_train)}")
print(f"Testing Data Score: {logReg.score(X_test, y_test)}")

Training Data Score: 0.8055172413793104
Testing Data Score: 0.8104104601125455
