In [23]:
import pandas as pd
from pandas import Series, DataFrame
from patsy import dmatrices
import warnings
%pylab inline
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
from StringIO import StringIO
from sklearn.model_selection import cross_val_score, StratifiedKFold

Populating the interactive namespace from numpy and matplotlib


In [24]:
# Read the shelter-outcomes CSV from the working directory and preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer='aac_shelter_outcomes.csv')
df.head(n=5)

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome
0,2 weeks,A684346,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,,Partner,Transfer,Intact Male
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male
4,5 months,A683115,Other,Bat Mix,Brown,2014-01-07T00:00:00,2014-07-07T14:04:00,2014-07-07T14:04:00,,Rabies Risk,Euthanasia,Unknown


In [25]:
# Remove the `animal_id` and `monthyear` columns.
# Rebinding `df` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError because the columns are already gone.
df = df.drop(labels=['animal_id', 'monthyear'], axis=1, errors='ignore')
df.head()

Unnamed: 0,age_upon_outcome,animal_type,breed,color,date_of_birth,datetime,name,outcome_subtype,outcome_type,sex_upon_outcome
0,2 weeks,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,,Partner,Transfer,Intact Male
1,1 year,Dog,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,Dog,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,Dog,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male
4,5 months,Other,Bat Mix,Brown,2014-01-07T00:00:00,2014-07-07T14:04:00,,Rabies Risk,Euthanasia,Unknown


In [26]:
# Number of records per animal type.
df.loc[:, 'animal_type'].value_counts()

Dog          44242
Cat          29422
Other         4249
Bird           334
Livestock        9
Name: animal_type, dtype: int64

In [27]:
# Number of records per outcome type.
df.loc[:, 'outcome_type'].value_counts()

Adoption           33112
Transfer           23499
Return to Owner    14354
Euthanasia          6080
Died                 680
Disposal             307
Rto-Adopt            150
Missing               46
Relocate              16
Name: outcome_type, dtype: int64

In [28]:
# Most frequent breeds among rows whose animal_type is "Dog".
# Row filter and column selection are done in a single .loc call.
df.loc[df['animal_type'] == "Dog", "breed"].value_counts().head()

Pit Bull Mix                 6135
Chihuahua Shorthair Mix      4733
Labrador Retriever Mix       4608
German Shepherd Mix          1892
Australian Cattle Dog Mix    1059
Name: breed, dtype: int64

In [29]:
# Most frequent breeds among rows whose animal_type is 'Cat'.
# Row filter and column selection are done in a single .loc call.
df.loc[df['animal_type'] == 'Cat', 'breed'].value_counts().head()

Domestic Shorthair Mix      23335
Domestic Medium Hair Mix     2323
Domestic Longhair Mix        1228
Siamese Mix                   998
Domestic Shorthair            386
Name: breed, dtype: int64

In [30]:
# Most frequent breeds among rows whose animal_type is "Other".
# Row filter and column selection are done in a single .loc call.
df.loc[df['animal_type'] == "Other", 'breed'].value_counts().head()

Bat Mix          1286
Bat               799
Raccoon Mix       465
Rabbit Sh Mix     291
Raccoon           230
Name: breed, dtype: int64

In [31]:
def classify_breed(breed):
    """Reduce a slash-separated cross to "<first breed> Mix".

    Parameters
    ----------
    breed : str
        Breed label. A "/" separates the breeds of a cross,
        e.g. "Papillon/Border Collie".

    Returns
    -------
    str
        The text before the first "/" with " Mix" appended when the label
        contains a "/"; otherwise the label unchanged.
    """
    # partition() yields an empty separator when no "/" is present.
    primary, slash, _ = breed.partition("/")
    return primary + " Mix" if slash else primary

In [97]:
# Binary flags: 1 when the animal has a recorded name / when the outcome was
# an adoption, else 0. They are built directly from boolean Series rather
# than with chained indexing (`df[col][mask] = value`), which triggers
# SettingWithCopyWarning and is not guaranteed to write back into `df`.
df['has_name'] = df['name'].notnull().astype(int)
df['adopted'] = (df['outcome_type'] == 'Adoption').astype(int)

# The raw values are ISO 8601 timestamps such as 2014-07-22T16:04:00, which
# the date-only format '%Y-%m-%d' does not describe; let pandas infer the
# format so the time component is parsed instead of rejected.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
df['datetime'] = pd.to_datetime(df['datetime'])

# Age at outcome in whole days.
# NOTE(review): the lowest quantile bin produced for this data starts below
# zero, i.e. some rows have an outcome earlier than the recorded birth date;
# consider cleaning those rows before modelling.
df['days_alive'] = (df['datetime'] - df['date_of_birth']).dt.days

# Five quantile-based (equal-frequency) age buckets.
df['age_binned'] = pd.qcut(df['days_alive'], 5)

# Preview only; avoid rendering the entire frame.
df.head(10)

Unnamed: 0,age_upon_outcome,animal_type,breed,color,date_of_birth,datetime,name,outcome_subtype,outcome_type,sex_upon_outcome,has_name,adopted,days_alive,age_binned
0,2 weeks,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07,2014-07-22 16:04:00,,Partner,Transfer,Intact Male,0,0,15,"(-123.001, 78.0]"
1,1 year,Dog,Beagle Mix,White/Brown,2012-11-06,2013-11-07 11:47:00,Lucy,Partner,Transfer,Spayed Female,1,0,366,"(306.0, 669.0]"
2,1 year,Dog,Pit Bull,Blue/White,2013-03-31,2014-06-03 14:20:00,*Johnny,,Adoption,Neutered Male,1,1,429,"(306.0, 669.0]"
3,9 years,Dog,Miniature Schnauzer Mix,White,2005-06-02,2014-06-15 15:50:00,Monday,Partner,Transfer,Neutered Male,1,0,3300,"(1222.0, 9137.0]"
4,5 months,Other,Bat Mix,Brown,2014-01-07,2014-07-07 14:04:00,,Rabies Risk,Euthanasia,Unknown,0,0,181,"(78.0, 306.0]"
5,4 months,Dog,Leonberger Mix,Brown/White,2013-06-03,2013-10-07 13:06:00,*Edgar,Partner,Transfer,Intact Male,1,0,126,"(78.0, 306.0]"
6,1 year,Other,Squirrel Mix,Tan,2013-12-13,2014-12-13 12:20:00,,Suffering,Euthanasia,Unknown,0,0,365,"(306.0, 669.0]"
7,3 years,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23,2014-12-08 15:55:00,*Ella,Partner,Transfer,Spayed Female,1,0,1111,"(669.0, 1222.0]"
8,1 month,Cat,Domestic Shorthair Mix,Blue Tabby/White,2014-06-16,2014-08-14 18:45:00,Lucy,,Adoption,Intact Female,1,1,59,"(-123.001, 78.0]"
9,3 months,Cat,Domestic Shorthair Mix,White/Black,2014-03-26,2014-06-29 17:45:00,*Frida,Offsite,Adoption,Spayed Female,1,1,95,"(78.0, 306.0]"


## Decision Tree - All Animals

In [98]:
# Build the response frame (Y) and the one-hot encoded design matrix (X) from
# a patsy formula; the leading "0 +" drops the intercept column.
Y, X = dmatrices(
    'adopted ~ 0 + animal_type + has_name + color + age_binned + sex_upon_outcome',
    data=df,
    return_type='dataframe',
)

# Flatten the single-column response into a plain array for scikit-learn.
y = Y['adopted'].values
X.head()

Unnamed: 0,animal_type[Bird],animal_type[Cat],animal_type[Dog],animal_type[Livestock],animal_type[Other],color[T.Agouti/Brown Tabby],color[T.Agouti/Gray],color[T.Agouti/White],color[T.Apricot],color[T.Apricot/Brown],...,color[T.Yellow/Yellow],"age_binned[T.Interval(78.0, 306.0, closed='right')]","age_binned[T.Interval(306.0, 669.0, closed='right')]","age_binned[T.Interval(669.0, 1222.0, closed='right')]","age_binned[T.Interval(1222.0, 9137.0, closed='right')]",sex_upon_outcome[T.Intact Male],sex_upon_outcome[T.Neutered Male],sex_upon_outcome[T.Spayed Female],sex_upon_outcome[T.Unknown],has_name
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [99]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [100]:
# Decision tree using information gain (entropy) as the split criterion.
# scikit-learn permutes candidate features randomly at each split, so the
# seed is pinned to make the fitted tree (and the reported accuracy)
# reproducible across runs.
model = tree.DecisionTreeClassifier(criterion='entropy', random_state=1)

In [101]:
# Fit on the training split, then score the model on that same split
# (training accuracy).
result = model.fit(X_train, y_train)
prediction_train = model.predict(X_train)
# print(...) with a single argument behaves identically on Python 2 and 3,
# unlike the Python 2-only print statement.
print(metrics.accuracy_score(y_train, prediction_train))

0.7910984537305803


In [102]:
# Accuracy on the held-out test split.
prediction = model.predict(X_test)
# print(...) with a single argument behaves identically on Python 2 and 3,
# unlike the Python 2-only print statement.
print(metrics.accuracy_score(y_test, prediction))

0.7620649997870256


## Decision Tree - Only Dogs and Cats

In [103]:
# Keep only dogs and cats. Filtering first and copying the result avoids
# duplicating the full frame and leaves `df_copy` as an independent frame
# (no SettingWithCopyWarning if it is modified later); `df` is untouched.
mask = df['animal_type'].isin(["Dog", "Cat"])
df_copy = df[mask].copy()

# Preview only; avoid rendering the entire frame.
df_copy.head(10)

Unnamed: 0,age_upon_outcome,animal_type,breed,color,date_of_birth,datetime,name,outcome_subtype,outcome_type,sex_upon_outcome,has_name,adopted,days_alive,age_binned
0,2 weeks,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07,2014-07-22 16:04:00,,Partner,Transfer,Intact Male,0,0,15,"(-123.001, 78.0]"
1,1 year,Dog,Beagle Mix,White/Brown,2012-11-06,2013-11-07 11:47:00,Lucy,Partner,Transfer,Spayed Female,1,0,366,"(306.0, 669.0]"
2,1 year,Dog,Pit Bull,Blue/White,2013-03-31,2014-06-03 14:20:00,*Johnny,,Adoption,Neutered Male,1,1,429,"(306.0, 669.0]"
3,9 years,Dog,Miniature Schnauzer Mix,White,2005-06-02,2014-06-15 15:50:00,Monday,Partner,Transfer,Neutered Male,1,0,3300,"(1222.0, 9137.0]"
5,4 months,Dog,Leonberger Mix,Brown/White,2013-06-03,2013-10-07 13:06:00,*Edgar,Partner,Transfer,Intact Male,1,0,126,"(78.0, 306.0]"
7,3 years,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23,2014-12-08 15:55:00,*Ella,Partner,Transfer,Spayed Female,1,0,1111,"(669.0, 1222.0]"
8,1 month,Cat,Domestic Shorthair Mix,Blue Tabby/White,2014-06-16,2014-08-14 18:45:00,Lucy,,Adoption,Intact Female,1,1,59,"(-123.001, 78.0]"
9,3 months,Cat,Domestic Shorthair Mix,White/Black,2014-03-26,2014-06-29 17:45:00,*Frida,Offsite,Adoption,Spayed Female,1,1,95,"(78.0, 306.0]"
10,1 year,Cat,Domestic Medium Hair Mix,Black/White,2013-03-27,2014-03-28 14:55:00,Stella Luna,,Return to Owner,Spayed Female,1,0,366,"(306.0, 669.0]"
11,2 years,Dog,Papillon/Border Collie,Black/White,2012-02-28,2014-03-28 14:39:00,Fancy,Partner,Transfer,Neutered Male,1,0,759,"(669.0, 1222.0]"


In [105]:
# Build the response frame (Y) and one-hot design matrix (X) for the
# dog/cat subset; the leading "0 +" drops the intercept column.
# Note that this formula does not include age_binned.
Y, X = dmatrices(
    'adopted ~ 0 + animal_type + has_name + color + sex_upon_outcome',
    data=df_copy,
    return_type='dataframe',
)

# Flatten the single-column response into a plain array for scikit-learn.
y = Y['adopted'].values
X.head()

Unnamed: 0,animal_type[Cat],animal_type[Dog],color[T.Agouti/Brown Tabby],color[T.Agouti/White],color[T.Apricot],color[T.Apricot/Brown],color[T.Apricot/Tricolor],color[T.Apricot/White],color[T.Black],color[T.Black Brindle],...,color[T.Yellow/Orange],color[T.Yellow/Orange Tabby],color[T.Yellow/Tan],color[T.Yellow/White],color[T.Yellow/Yellow],sex_upon_outcome[T.Intact Male],sex_upon_outcome[T.Neutered Male],sex_upon_outcome[T.Spayed Female],sex_upon_outcome[T.Unknown],has_name
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [106]:
# Hold out 30% of the dog/cat rows for evaluation (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Entropy-criterion decision tree. scikit-learn permutes candidate features
# randomly at each split, so the seed is pinned to make the fitted tree
# reproducible across runs.
model = tree.DecisionTreeClassifier(criterion='entropy', random_state=1)

In [109]:
# Fit on the dog/cat training split, then score the model on that same split
# (training accuracy).
result = model.fit(X_train, y_train)
prediction_train = model.predict(X_train)
# print(...) with a single argument behaves identically on Python 2 and 3,
# unlike the Python 2-only print statement.
print(metrics.accuracy_score(y_train, prediction_train))

0.7101991738261932


In [110]:
# Accuracy on the held-out dog/cat test split.
prediction = model.predict(X_test)
# print(...) with a single argument behaves identically on Python 2 and 3,
# unlike the Python 2-only print statement.
print(metrics.accuracy_score(y_test, prediction))

0.6932440381917734
