In [146]:
import pandas as pd
import numpy as np

In [147]:
df=pd.read_csv('all_records.csv')

## Preprocessing

In [148]:
df.head()

Unnamed: 0.1,Unnamed: 0,Animal ID,Name_intake,DateTime_intake,MonthYear_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex,...,beagle,terrier,boxer,poodle,rottweiler,dachshund,chihuahua,pit bull,DateTime_length,Days_length
0,0,A730601,,2016-07-07 12:11:00,07/07/2016 12:11:00 PM,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,...,0,0,0,0,0,0,0,0,0 days 20:49:00.000000000,0-7 days
1,1,A683644,*Zoey,2014-07-13 11:02:00,07/13/2014 11:02:00 AM,Austin (TX),Owner Surrender,Nursing,Dog,Intact Female,...,0,0,0,0,0,0,0,0,115 days 23:04:00.000000000,12 weeks - 6 months
2,2,A676515,Rico,2014-04-11 08:45:00,04/11/2014 08:45:00 AM,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,...,0,0,0,0,0,0,0,1,3 days 09:53:00.000000000,0-7 days
3,3,A742953,,2017-01-31 13:30:00,01/31/2017 01:30:00 PM,S Hwy 183 And Thompson Lane in Austin (TX),Stray,Normal,Dog,Intact Male,...,0,0,0,0,0,0,0,0,4 days 00:47:00.000000000,0-7 days
4,4,A679549,*Gilbert,2014-05-22 15:43:00,05/22/2014 03:43:00 PM,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,...,0,0,0,0,0,0,0,0,24 days 22:11:00.000000000,3-6 weeks


In [149]:
# Drop unnecessary columns
# Drop Name_intake since there is a Intake_type column
# Drop all single breed type column since we need to do our own encoding for breed_intake
df=df.drop(columns=['Unnamed: 0','Name_intake','Name_outcome','MonthYear_intake','MonthYear_outcome',
                    'gender_intake','gender_outcome','fixed_intake','fixed_outcome','Days_length','Outcome_Subtype',
                   'retriever','shepherd','beagle','terrier','boxer','poodle','rottweiler','dachshund','chihuahua',
                   'pit bull','Age','Age_upon_Outcome'])

In [150]:
df=df.dropna()

In [151]:
# Combine Died & Disposal Outcomes
df.loc[df.Outcome_Type == 'Disposal', 'Outcome_Type'] = 'Died'
df.Outcome_Type.value_counts()

Adoption           32408
Transfer           20799
Return to Owner    17394
Euthanasia          5470
Died                 810
Missing               51
Rto-Adopt             23
Relocate              13
Name: Outcome_Type, dtype: int64

In [152]:
# Combine Transfer & Relocate Outcomes
df.loc[df.Outcome_Type == 'Relocate', 'Outcome_Type'] = 'Transfer'
df.Outcome_Type.value_counts()

Adoption           32408
Transfer           20812
Return to Owner    17394
Euthanasia          5470
Died                 810
Missing               51
Rto-Adopt             23
Name: Outcome_Type, dtype: int64

In [153]:
# Combine Rto-Adopt & Return to Owner
df.loc[df.Outcome_Type == 'Rto-Adopt', 'Outcome_Type'] = 'Return to Owner'
df.Outcome_Type.value_counts()

Adoption           32408
Transfer           20812
Return to Owner    17417
Euthanasia          5470
Died                 810
Missing               51
Name: Outcome_Type, dtype: int64

In [154]:
# Update DateTime_intake to datatime datatype
df.DateTime_intake = pd.to_datetime(df.DateTime_intake, format='%Y-%m-%d')

In [155]:
# Update DateTime_outcome to datatime datatype
df.DateTime_outcome = pd.to_datetime(df.DateTime_outcome, format='%Y-%m-%d')

In [156]:
# Create a new column and calculate the length of stay
df['datetime_length']=df['DateTime_outcome']-df['DateTime_intake']

In [157]:
# Create a temporary 'int' column and convert datetime to int
df['int'] = df['datetime_length'].astype(np.int64)

In [158]:
# replace negative values to nan
df.loc[df.int < 0, 'int'] = None

In [159]:
# drop null values
df=df.dropna()

In [160]:
# drop temporary 'int' column and original DateTime_length column
df=df.drop(columns=['int','DateTime_length'])

# Rename calculated length of stay column
df.rename(columns = {'datetime_length':'DateTime_length'}, inplace = True)

In [161]:
# Combine Aged and Feral with Other for IntakeCondition
df.loc[(df.IntakeCondition == 'Aged') | (df.IntakeCondition == 'Feral'), 'IntakeCondition'] = 'Other'

In [162]:
# Combine Injured and Sick for Intake Condition
df.loc[(df.IntakeCondition == 'Injured') | (df.IntakeCondition == 'Sick'), 'IntakeCondition'] = 'Medical'

In [163]:
# Combine Pregnant and Nursing for  IntakeCondition
df.loc[(df.IntakeCondition == 'Nursing') | (df.IntakeCondition == 'Pregnant'), 'IntakeCondition' ] = 'Maternity'

In [164]:
# Combine Bird and Livestock with Other for Animal_Type_intake
df.loc[(df.Animal_Type_intake == 'Bird') | (df.Animal_Type_intake == 'Livestock'), 'Animal_Type_intake'] = 'Other'

In [165]:
df.rename(columns={'Sex': 'Sex_Intake'}, inplace=True)

In [166]:
new_breed=[]
for i in df['Breed_intake']:
    if 'Mix' in i:
        i='Mix'
    elif "/" in i:
        i="Mix"
    elif 'Retriever' in i:
        i='Retriever'
    elif 'Shepherd' in i:
        i='Shepherd'
    elif 'Beagle' in i:
        i='Beagle'
    elif 'Terrier' in i:
        i='Terrier'
    elif 'Boxer' in i:
        i='Boxer'
    elif 'Poodle' in i:
        i='Poodle'
    elif 'Rottweiler' in i:
        i='Rottweiler'
    elif 'Dachshund' in i:
        i='Dachshund'
    elif 'Chihuahua' in i:
        i='Chihuahua'
    elif 'Pit Bull' in i:
        i='Pit Bull'
    elif 'Bulldog' in i:
        i='Bulldog'
    elif 'Pointer' in i:
        i='Pointer'
    else:
        i=i
    new_breed.append(i)
df["new_breed"]=new_breed

In [167]:
breed_counts=df['new_breed'].value_counts()
replace_breed=breed_counts.loc[breed_counts<100].index.tolist()
for i in replace_breed:
    df.new_breed = df.new_breed.replace(i,"Other")

In [168]:
# drop and rename
df=df.drop(columns=['Breed_intake'])
df=df.rename(columns={"new_breed": "Breed_Type"})

In [169]:
colorNew=[]
for i in df.Color_intake:
    if "/" in i:
        i="Bicolor"
    elif "Tabby" in i:
        i='Tabby'
    elif "Brindle" in i:
        i="Tabby"
    elif "Merle" in i:
        i="Merle"
    elif "Tiger" in i:
        i="Tiger"
    elif "Tortie" in i:
        i="Tabby"
    elif "Calico" in i:
        i="Tabby"
    elif "Torbie" in i:
        i="Tabby"
    elif i=="Apricot" or i=="Gold" or i=="Yellow" or i=="Fawn":
        i="Orange"
    elif "Blue" in i:
        i="Blue"
    elif "Black" in i or i=="Sable":
        i="Black"
    elif i=="Liver":
        i="Brown"
    elif "Point" in i:
        i='Point'
    elif "Tick" in i:
        i="Point"
    else:
        i=i
    colorNew.append(i)

In [170]:
df['Color_intake']=colorNew

In [171]:
color_counts=df['Color_intake'].value_counts()
replace_color=color_counts.loc[color_counts<500].index.tolist()
for i in replace_color:
    df.Color_intake = df.Color_intake.replace(i,"Other")

In [172]:
df.head()

Unnamed: 0,Animal ID,DateTime_intake,Found_Location,Intake_Type,IntakeCondition,Animal_Type_intake,Sex_Intake,Color_intake,DateTime_outcome,Outcome_Type,Sex_upon_Outcome,fixed_changed,Age_Bucket,DateTime_length,Breed_Type
0,A730601,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,Intact Male,Tabby,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,0 days 20:49:00,Mix
1,A683644,2014-07-13 11:02:00,Austin (TX),Owner Surrender,Maternity,Dog,Intact Female,Bicolor,2014-11-06 10:06:00,Adoption,Spayed Female,1,1-6 weeks,115 days 23:04:00,Mix
2,A676515,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Intact Male,Bicolor,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,3 days 09:53:00,Mix
3,A742953,2017-01-31 13:30:00,S Hwy 183 And Thompson Lane in Austin (TX),Stray,Normal,Dog,Intact Male,Bicolor,2017-02-04 14:17:00,Transfer,Intact Male,0,1-3 years,4 days 00:47:00,Other
4,A679549,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Intact Male,Bicolor,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,24 days 22:11:00,Mix


# Machine Learning Model

In [173]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

## Encoding Text Columns

In [174]:
# Drop columns with to many unique values
df.drop(['Animal ID', 'Found_Location', 'DateTime_intake', 'DateTime_outcome', 'DateTime_length'], axis=1, inplace=True)

In [175]:
# encode text columns with dummy values
df_encoded = pd.get_dummies(df, columns=['Intake_Type', 'IntakeCondition', 'Animal_Type_intake', 'Sex_Intake', 
                    'Color_intake', 'Sex_upon_Outcome', 'fixed_changed', 'Age_Bucket', 'Breed_Type'])

In [176]:
le = LabelEncoder()
df_encoded['Outcome_Type'] = le.fit_transform(df_encoded['Outcome_Type'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'Adoption': 0, 'Died': 1, 'Euthanasia': 2, 'Missing': 3, 'Return to Owner': 4, 'Transfer': 5}


In [177]:
df_encoded.head()

Unnamed: 0,Outcome_Type,Intake_Type_Euthanasia Request,Intake_Type_Owner Surrender,Intake_Type_Public Assist,Intake_Type_Stray,Intake_Type_Wildlife,IntakeCondition_Maternity,IntakeCondition_Medical,IntakeCondition_Normal,IntakeCondition_Other,...,Breed_Type_Domestic Shorthair,Breed_Type_Mix,Breed_Type_Other,Breed_Type_Pit Bull,Breed_Type_Raccoon,Breed_Type_Retriever,Breed_Type_Shepherd,Breed_Type_Shih Tzu,Breed_Type_Siberian Husky,Breed_Type_Terrier
0,5,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,4,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,5,0,0,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,5,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


## Train Test Split Data

In [139]:
y = df_encoded.Outcome_Type
X = df_encoded.drop(columns='Outcome_Type')

In [140]:
# Split data to train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y,  random_state=1)

In [141]:
# create model instance
model = KNeighborsClassifier()

In [142]:
# fit model
model.fit(X_train, y_train)

KNeighborsClassifier()

## Prediction

In [143]:
# Predict and view results
y_pred = model.predict(X_test)
results = pd.DataFrame({
   "Prediction": y_pred,
   "Actual": y_test
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,5,2
1,0,0
2,4,4
3,0,4
4,5,5


## Evaluation Metrics

In [178]:
# print confusion matrix
cm = confusion_matrix(y_pred, y_test)
cm_df = pd.DataFrame(cm,
                     index = ['Adoption', 'Died', 'Euthanasia', 'Missing', 'Return to Owner', 'Transfer'], 
                     columns = ['Adoption', 'Died', 'Euthanasia', 'Missing', 'Return to Owner', 'Transfer'])
cm_df

Unnamed: 0,Adoption,Died,Euthanasia,Missing,Return to Owner,Transfer
Adoption,5445,14,147,2,922,1306
Died,2,10,2,0,4,4
Euthanasia,26,93,830,1,30,97
Missing,0,0,0,0,3,0
Return to Owner,859,12,159,3,2401,828
Transfer,900,73,238,3,259,2720


In [183]:
# print classification report
print(classification_report(y_pred, y_test, target_names=['Adoption', 'Died', 'Euthanasia', 'Missing', 'Return to Owner', 'Transfer']))

                 precision    recall  f1-score   support

       Adoption       0.75      0.69      0.72      7836
           Died       0.05      0.45      0.09        22
     Euthanasia       0.60      0.77      0.68      1077
        Missing       0.00      0.00      0.00         3
Return to Owner       0.66      0.56      0.61      4262
       Transfer       0.55      0.65      0.59      4193

       accuracy                           0.66     17393
      macro avg       0.44      0.52      0.45     17393
   weighted avg       0.67      0.66      0.66     17393

