### Initial Setup

In [36]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression

In [37]:
# Read in data
raw_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [38]:
# Examine the data a little bit
print raw_data.shape
print raw_data.head()

(26729, 10)
  AnimalID     Name             DateTime      OutcomeType OutcomeSubtype  \
0  A671945  Hambone  2014-02-12 18:22:00  Return_to_owner            NaN   
1  A656520    Emily  2013-10-13 12:44:00       Euthanasia      Suffering   
2  A686464   Pearce  2015-01-31 12:28:00         Adoption         Foster   
3  A683430      NaN  2014-07-11 19:09:00         Transfer        Partner   
4  A667013      NaN  2013-11-15 12:52:00         Transfer        Partner   

  AnimalType SexuponOutcome AgeuponOutcome                        Breed  \
0        Dog  Neutered Male         1 year        Shetland Sheepdog Mix   
1        Cat  Spayed Female         1 year       Domestic Shorthair Mix   
2        Dog  Neutered Male        2 years                 Pit Bull Mix   
3        Cat    Intact Male        3 weeks       Domestic Shorthair Mix   
4        Dog  Neutered Male        2 years  Lhasa Apso/Miniature Poodle   

         Color  
0  Brown/White  
1  Cream Tabby  
2   Blue/White  
3   Blue Cre

### Exploratory Data Analysis

In [39]:
raw_data['OutcomeType'].value_counts()

Adoption           10769
Transfer            9422
Return_to_owner     4786
Euthanasia          1555
Died                 197
Name: OutcomeType, dtype: int64

In [40]:
raw_data['OutcomeSubtype'].value_counts()

Partner                7816
Foster                 1800
SCRP                   1599
Suffering              1002
Aggressive              320
Offsite                 165
In Kennel               114
Behavior                 86
Rabies Risk              74
Medical                  66
In Foster                52
Enroute                   8
Court/Investigation       6
At Vet                    4
In Surgery                3
Barn                      2
Name: OutcomeSubtype, dtype: int64

In [41]:
pd.crosstab(raw_data['OutcomeSubtype'], raw_data['OutcomeType'])

OutcomeType,Adoption,Died,Euthanasia,Transfer
OutcomeSubtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aggressive,0,0,320,0
At Vet,0,4,0,0
Barn,1,0,0,1
Behavior,0,0,86,0
Court/Investigation,0,0,6,0
Enroute,0,8,0,0
Foster,1800,0,0,0
In Foster,0,52,0,0
In Kennel,0,114,0,0
In Surgery,0,3,0,0


In [42]:
raw_data['AnimalType'].value_counts()

Dog    15595
Cat    11134
Name: AnimalType, dtype: int64

In [43]:
raw_data['SexuponOutcome'].value_counts()

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
Name: SexuponOutcome, dtype: int64

In [44]:
raw_data['AgeuponOutcome'].value_counts()

1 year       3969
2 years      3742
2 months     3397
3 years      1823
1 month      1281
3 months     1277
4 years      1071
5 years       992
4 months      888
6 years       670
3 weeks       659
5 months      652
6 months      588
8 years       536
7 years       531
2 weeks       529
10 months     457
10 years      446
8 months      402
4 weeks       334
9 years       288
7 months      288
12 years      234
9 months      224
1 weeks       171
11 months     166
1 week        146
13 years      143
11 years      126
3 days        109
2 days         99
14 years       97
15 years       85
1 day          66
6 days         50
4 days         50
16 years       36
5 days         24
0 years        22
17 years       17
5 weeks        11
18 years       10
19 years        3
20 years        2
Name: AgeuponOutcome, dtype: int64

In [45]:
raw_data_rev = raw_data['AgeuponOutcome'].apply(lambda x: pd.Series(str(x).split(' ')))

raw_data_rev[0]=raw_data_rev[0].apply(pd.to_numeric, args=('coerce',))

print type(raw_data_rev[0][0])

<type 'numpy.float64'>


In [46]:
raw_data_rev2 = pd.concat([raw_data, raw_data_rev], axis=1, join='inner')

raw_data_rev2.rename(columns = {0:"Time",1:"Units"}, inplace = True)

def daycalc(x):
    if x['Units']=='year':
        return 365.0
    elif x['Units']=='years':
        return 365.0*x['Time']
    elif x['Units']=='month':
        return 31.0
    elif x['Units']=='months':
        return 31.0*x['Time']
    elif x['Units']=='week':
        return 7.0
    elif x['Units']=='weeks':
        return 7.0*x['Time']
    
raw_data_rev2['days'] = raw_data_rev2.apply(lambda x:daycalc(x), axis=1 )

print np.mean(raw_data_rev2['days'])

807.460418804


In [47]:
raw_data = raw_data_rev2

In [48]:
# Seperate out label and data
labels = raw_data.OutcomeType
data = raw_data.drop('OutcomeType', 1)

### Data manipulation: Convert string to numeric categories


In [49]:
data['Year'] = pd.to_datetime(data['DateTime']).dt.year
data['Month'] = pd.to_datetime(data['DateTime']).dt.month
data['Day_Num']= pd.to_datetime(data['DateTime']).dt.day
data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Time,Units,days,Year,Month,Day_Num
0,A671945,Hambone,2014-02-12 18:22:00,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,1.0,year,365.0,2014,2,12
1,A656520,Emily,2013-10-13 12:44:00,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1.0,year,365.0,2013,10,13
2,A686464,Pearce,2015-01-31 12:28:00,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,2.0,years,730.0,2015,1,31
3,A683430,,2014-07-11 19:09:00,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,3.0,weeks,21.0,2014,7,11
4,A667013,,2013-11-15 12:52:00,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,2.0,years,730.0,2013,11,15


In [50]:
data['HasName'] = -pd.isnull(data['Name'])

In [51]:
data['fixed']=data['SexuponOutcome'].str.split(' ', expand=True)[0]
data['gender']=data['SexuponOutcome'].str.split(' ', expand=True)[1]
print data['fixed'].value_counts()
print data['gender'].value_counts()

Neutered    9779
Spayed      8820
Intact      7036
Unknown     1093
Name: fixed, dtype: int64
Male      13304
Female    12331
Name: gender, dtype: int64


In [55]:
# Replace animal type to numeric: Dog = 1, Cat = 0
data.replace(to_replace='Dog', value=1, inplace=True)
data.replace(to_replace='Cat', value=0, inplace=True)

test_data.replace(to_replace='Dog', value=1, inplace=True)
test_data.replace(to_replace='Cat', value=0, inplace=True)


In [56]:
# Gender
data.replace(to_replace='Male', value=0, inplace=True)
data.replace(to_replace='Female', value=1, inplace=True)
data.replace(to_replace='nan', value=99, inplace=True)


data.replace(to_replace='Neutered', value=0, inplace=True)
data.replace(to_replace='Spayed', value=1, inplace=True)
data.replace(to_replace='Intact', value=2, inplace=True)
data.replace(to_replace='Unknown', value=99, inplace=True)
data.replace(to_replace='nan', value=99, inplace=True)


In [57]:
# Replace outcome label: Return to owner = 1, Adoption = 2, Euthanasia = 3, Transfer = 4, Died = 5
labels.replace(to_replace='Return_to_owner', value=1, inplace=True)
labels.replace(to_replace='Adoption', value=2, inplace=True)
labels.replace(to_replace='Euthanasia', value=3, inplace=True)
labels.replace(to_replace='Transfer', value=4, inplace=True)
labels.replace(to_replace='Died', value=5, inplace=True)

In [58]:
# Splitting data into train set and development set
train_data, dev_data, train_labels, dev_labels = train_test_split(data, labels, test_size=0.3, random_state=0)

In [59]:
train_data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Time,Units,days,Year,Month,Day_Num,HasName,fixed,gender
11109,A673979,Curly,2014-03-09 17:33:00,,1,Neutered Male,7 years,Toy Poodle Mix,White,7.0,years,2555.0,2014,3,9,True,0.0,0.0
18649,A692191,,2014-11-20 14:36:00,Suffering,1,Intact Female,8 years,Beagle Mix,Tricolor,8.0,years,2920.0,2014,11,20,False,2.0,1.0
15515,A705499,,2015-06-18 13:44:00,Partner,0,99,1 week,Domestic Shorthair Mix,White/Blue,1.0,week,7.0,2015,6,18,False,99.0,99.0
12078,A682480,Pebble,2014-08-01 11:28:00,,0,Neutered Male,4 months,Domestic Shorthair Mix,Black/White,4.0,months,124.0,2014,8,1,True,0.0,0.0
8455,A689097,,2014-10-04 19:15:00,,0,Spayed Female,2 months,Domestic Shorthair Mix,Tortie,2.0,months,62.0,2014,10,4,False,1.0,1.0


### Predictions

In [62]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
parameter = {'n_estimators':[100,400,800,1000]}
predictors = ['AnimalType','days','HasName','Year','Month','Day_Num','fixed','gender']
dt = GradientBoostingClassifier()
gs = GridSearchCV(dt,parameter)

gs.fit(train_data[predictors], train_labels)
prediction = gs.predict(dev_data[predictors])

# Prediction accuracy
print 'Prediction accuracy:', np.mean(prediction == dev_labels)


Prediction accuracy: 0.650953984287
