### Initial Setup

In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression

In [2]:
# Read in data
raw_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# Examine the data a little bit
print raw_data.shape
print raw_data.head()

(26729, 10)
  AnimalID     Name             DateTime      OutcomeType OutcomeSubtype  \
0  A671945  Hambone  2014-02-12 18:22:00  Return_to_owner            NaN   
1  A656520    Emily  2013-10-13 12:44:00       Euthanasia      Suffering   
2  A686464   Pearce  2015-01-31 12:28:00         Adoption         Foster   
3  A683430      NaN  2014-07-11 19:09:00         Transfer        Partner   
4  A667013      NaN  2013-11-15 12:52:00         Transfer        Partner   

  AnimalType SexuponOutcome AgeuponOutcome                        Breed  \
0        Dog  Neutered Male         1 year        Shetland Sheepdog Mix   
1        Cat  Spayed Female         1 year       Domestic Shorthair Mix   
2        Dog  Neutered Male        2 years                 Pit Bull Mix   
3        Cat    Intact Male        3 weeks       Domestic Shorthair Mix   
4        Dog  Neutered Male        2 years  Lhasa Apso/Miniature Poodle   

         Color  
0  Brown/White  
1  Cream Tabby  
2   Blue/White  
3   Blue Cre

### Exploratory Data Analysis

In [4]:
raw_data['OutcomeType'].value_counts()

Adoption           10769
Transfer            9422
Return_to_owner     4786
Euthanasia          1555
Died                 197
Name: OutcomeType, dtype: int64

In [5]:
raw_data['OutcomeSubtype'].value_counts()

Partner                7816
Foster                 1800
SCRP                   1599
Suffering              1002
Aggressive              320
Offsite                 165
In Kennel               114
Behavior                 86
Rabies Risk              74
Medical                  66
In Foster                52
Enroute                   8
Court/Investigation       6
At Vet                    4
In Surgery                3
Barn                      2
Name: OutcomeSubtype, dtype: int64

In [6]:
pd.crosstab(raw_data['OutcomeSubtype'], raw_data['OutcomeType'])

OutcomeType,Adoption,Died,Euthanasia,Transfer
OutcomeSubtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aggressive,0,0,320,0
At Vet,0,4,0,0
Barn,1,0,0,1
Behavior,0,0,86,0
Court/Investigation,0,0,6,0
Enroute,0,8,0,0
Foster,1800,0,0,0
In Foster,0,52,0,0
In Kennel,0,114,0,0
In Surgery,0,3,0,0


In [7]:
raw_data['AnimalType'].value_counts()

Dog    15595
Cat    11134
Name: AnimalType, dtype: int64

In [8]:
raw_data['SexuponOutcome'].value_counts()

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
Name: SexuponOutcome, dtype: int64

In [9]:
raw_data['AgeuponOutcome'].value_counts()

1 year       3969
2 years      3742
2 months     3397
3 years      1823
1 month      1281
3 months     1277
4 years      1071
5 years       992
4 months      888
6 years       670
3 weeks       659
5 months      652
6 months      588
8 years       536
7 years       531
2 weeks       529
10 months     457
10 years      446
8 months      402
4 weeks       334
9 years       288
7 months      288
12 years      234
9 months      224
1 weeks       171
11 months     166
1 week        146
13 years      143
11 years      126
3 days        109
2 days         99
14 years       97
15 years       85
1 day          66
6 days         50
4 days         50
16 years       36
5 days         24
0 years        22
17 years       17
5 weeks        11
18 years       10
19 years        3
20 years        2
Name: AgeuponOutcome, dtype: int64

In [10]:
#Separate age into a number and a string
raw_data['Age_num'] = raw_data['AgeuponOutcome'].str[:2].str.strip().astype(int, raise_on_error=False)
raw_data['Age_str'] = raw_data['AgeuponOutcome'].str[2:].str.strip()

test_data['Age_num'] = test_data['AgeuponOutcome'].str[:2].str.strip().astype(int, raise_on_error=False)
test_data['Age_str'] = test_data['AgeuponOutcome'].str[2:].str.strip()

#Create an age factor column based on each possible unit of measure
raw_data['Age_factor'] = 0
raw_data['Age_factor'][raw_data['Age_str'] == 'years'] = 365
raw_data['Age_factor'][raw_data['Age_str'] == 'months'] = 30
raw_data['Age_factor'][raw_data['Age_str'] == 'year'] = 365
raw_data['Age_factor'][raw_data['Age_str'] == 'weeks'] = 7
raw_data['Age_factor'][raw_data['Age_str'] == 'month'] = 30
raw_data['Age_factor'][raw_data['Age_str'] == 'days'] = 1
raw_data['Age_factor'][raw_data['Age_str'] == 'week'] = 7
raw_data['Age_factor'][raw_data['Age_str'] == 'day'] = 1

test_data['Age_factor'] = 0
test_data['Age_factor'][test_data['Age_str'] == 'years'] = 365
test_data['Age_factor'][test_data['Age_str'] == 'months'] = 30
test_data['Age_factor'][test_data['Age_str'] == 'year'] = 365
test_data['Age_factor'][test_data['Age_str'] == 'weeks'] = 7
test_data['Age_factor'][test_data['Age_str'] == 'month'] = 30
test_data['Age_factor'][test_data['Age_str'] == 'days'] = 1
test_data['Age_factor'][test_data['Age_str'] == 'week'] = 7
test_data['Age_factor'][test_data['Age_str'] == 'day'] = 1

#Multiply the number in the age by the factor for comparable numerical column.  Drop and derivitive columns
raw_data['Age_num'] = raw_data['Age_num'].astype(float)
raw_data['Age_days'] = raw_data['Age_num'].mul(raw_data['Age_factor'], axis='index').fillna(0)
raw_data.drop('AgeuponOutcome', 1, inplace = True)
raw_data.drop('Age_num', 1, inplace = True)
raw_data.drop('Age_str', 1, inplace = True)
raw_data.drop('Age_factor', 1, inplace = True)

test_data['Age_num'] = test_data['Age_num'].astype(float)
test_data['Age_days'] = test_data['Age_num'].mul(test_data['Age_factor'], axis='index').fillna(0)
test_data.drop('AgeuponOutcome', 1, inplace = True)
test_data.drop('Age_num', 1, inplace = True)
test_data.drop('Age_str', 1, inplace = True)
test_data.drop('Age_factor', 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [11]:
# Seperate out label and data
labels = raw_data.OutcomeType
data = raw_data.drop('OutcomeType', 1)

### Data manipulation: Convert string to numeric categories


In [12]:
data['Year'] = pd.to_datetime(data['DateTime']).dt.year.astype(str)
data['Month'] = pd.to_datetime(data['DateTime']).dt.month.astype(str)
data['Day_Num']= pd.to_datetime(data['DateTime']).dt.day.astype(str)
data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeSubtype,AnimalType,SexuponOutcome,Breed,Color,Age_days,Year,Month,Day_Num
0,A671945,Hambone,2014-02-12 18:22:00,,Dog,Neutered Male,Shetland Sheepdog Mix,Brown/White,365.0,2014,2,12
1,A656520,Emily,2013-10-13 12:44:00,Suffering,Cat,Spayed Female,Domestic Shorthair Mix,Cream Tabby,365.0,2013,10,13
2,A686464,Pearce,2015-01-31 12:28:00,Foster,Dog,Neutered Male,Pit Bull Mix,Blue/White,730.0,2015,1,31
3,A683430,,2014-07-11 19:09:00,Partner,Cat,Intact Male,Domestic Shorthair Mix,Blue Cream,21.0,2014,7,11
4,A667013,,2013-11-15 12:52:00,Partner,Dog,Neutered Male,Lhasa Apso/Miniature Poodle,Tan,730.0,2013,11,15


In [13]:
data['fixed']=data['SexuponOutcome'].str.split(' ', expand=True)[0]
data['gender']=data['SexuponOutcome'].str.split(' ', expand=True)[1]
print data['fixed'].value_counts()
print data['gender'].value_counts()

Neutered    9779
Spayed      8820
Intact      7036
Unknown     1093
Name: fixed, dtype: int64
Male      13304
Female    12331
Name: gender, dtype: int64


In [14]:
# Replace outcome label: Return to owner = 1, Adoption = 2, Euthanasia = 3, Transfer = 4, Died = 5
labels.replace(to_replace='Return_to_owner', value=1, inplace=True)
labels.replace(to_replace='Adoption', value=2, inplace=True)
labels.replace(to_replace='Euthanasia', value=3, inplace=True)
labels.replace(to_replace='Transfer', value=4, inplace=True)
labels.replace(to_replace='Died', value=5, inplace=True)

In [15]:
data['HasName']=-data['Name'].isnull()

In [16]:
temp = pd.to_datetime(data['DateTime']).dt.hour
bins = [3, 7, 10, 14, 17, 20, 24]
names = ['midnight','morning', 'lunch', 'afternoon','night', 'late night']
data['OutcomeHour'] = pd.cut(temp, bins, labels=names)

In [17]:
temp = pd.to_datetime(data['DateTime']).dt.weekday
data['weekend'] = temp.isin([5,6])

In [43]:
#Does the breed contain a mix?
data['Mix_Breed'] = data['Breed'].map(lambda x: x.find('Mix') != -1)
#Split breed into first and second
data['First_Breed'] = data['Breed'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
data['Second_Breed'] = data['Breed'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)

In [44]:
#Split breed into first and second
data['Mix_Color'] = data['Color'].map(lambda x: x.find('Mix') != -1)

data['First_Color'] = data['Color'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
data['Second_Color'] = data['Color'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)

In [45]:
first_breeds = len(data['First_Breed'].value_counts())
second_breeds = len(data['Second_Breed'].value_counts())

rare_first_breeds=np.array((data['First_Breed'].value_counts()).index[int(first_breeds*.7):])
rare_second_breeds=np.array((data['Second_Breed'].value_counts()).index[int(second_breeds*.1):])

for first_breed in rare_first_breeds:
    data['First_Breed'].replace(to_replace=first_breed, value='Rare', inplace=True)

for sec_breed in rare_second_breeds:
    data['Second_Breed'].replace(to_replace=sec_breed, value='Rare', inplace=True)

In [46]:
first_colors = len(data['First_Color'].value_counts())
second_colors = len(data['Second_Color'].value_counts())

rare_first_colors=np.array((data['First_Color'].value_counts()).index[int(first_colors*.45):])
rare_second_colors=np.array((data['Second_Color'].value_counts()).index[int(second_colors*.2):])

for first_color in rare_first_colors:
    data['First_Color'].replace(to_replace=first_color, value='Rare', inplace=True)

for sec_color in rare_second_colors:
    data['Second_Color'].replace(to_replace=sec_color, value='Rare', inplace=True)

In [47]:
# Splitting data into train set and development set
train_data, dev_data, train_labels, dev_labels = train_test_split(data, labels, test_size=0.3, random_state=0)

In [48]:
train_data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeSubtype,AnimalType,SexuponOutcome,Breed,Color,Age_days,Year,...,gender,HasName,OutcomeHour,weekend,Mix_Breed,First_Breed,Second_Breed,Mix_Color,First_Color,Second_Color
11109,A673979,Curly,2014-03-09 17:33:00,,Dog,Neutered Male,Toy Poodle Mix,White,2555.0,2014,...,Male,True,afternoon,True,True,Toy Poodle,,False,White,
18649,A692191,,2014-11-20 14:36:00,Suffering,Dog,Intact Female,Beagle Mix,Tricolor,2920.0,2014,...,Female,False,lunch,False,True,Beagle,,False,Tricolor,
15515,A705499,,2015-06-18 13:44:00,Partner,Cat,Unknown,Domestic Shorthair Mix,White/Blue,7.0,2015,...,,False,lunch,False,True,Domestic Shorthair,,False,White,Blue
12078,A682480,Pebble,2014-08-01 11:28:00,,Cat,Neutered Male,Domestic Shorthair Mix,Black/White,120.0,2014,...,Male,True,lunch,False,True,Domestic Shorthair,,False,Black,White
8455,A689097,,2014-10-04 19:15:00,,Cat,Spayed Female,Domestic Shorthair Mix,Tortie,60.0,2014,...,Female,False,night,True,True,Domestic Shorthair,,False,Tortie,


### Hashing

In [49]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
predictors = ['AnimalType','fixed','gender','Year','Month','Day_Num','First_Breed', 'Second_Breed','Mix_Breed','Mix_Color','First_Color','Second_Color','Age_days','HasName','OutcomeHour','weekend']

hashed_train = pd.DataFrame(dv.fit_transform(train_data[predictors].to_dict(orient='records')),columns=dv.get_feature_names()).fillna(0)

hashed_dev = pd.DataFrame(dv.transform(dev_data[predictors].to_dict(orient='records')),columns=dv.get_feature_names()).fillna(0)


In [50]:
hashed_train.shape

(18710, 276)

In [51]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
#parameter = {'n_estimators':[100,400,800,1000]}
dt = GradientBoostingClassifier()
#gs = GridSearchCV(dt,parameter)

dt.fit(hashed_train.to_sparse(), train_labels)
prediction = dt.predict(hashed_dev.to_sparse())

# Prediction accuracy
print 'Prediction accuracy:', np.mean(prediction == dev_labels)


Prediction accuracy: 0.668412520264


### Kaggle Evaluation

In [52]:
pred_prob = dt.predict_proba(hashed_dev.to_sparse())
pred_prob
pred_prob = pred_prob + 0.000000001   # A hack to deal with log transformation of zero
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

0.79911846057664027

In [53]:
from sklearn.metrics import confusion_matrix

test = pd.DataFrame(prediction, columns = ['prediction'])
print ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
confusion_matrix(dev_labels, prediction)

['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']


array([[ 667,  579,    7,  199,    0],
       [ 307, 2716,    8,  202,    0],
       [ 105,   56,   87,  227,    0],
       [ 268,  625,   31, 1887,    1],
       [   3,    4,    4,   33,    3]])

### Tune Percentage of Breed/Color Features to Include (don't put in final writeup)

In [41]:
tuner=1
while tuner>0:
    #Does the breed contain a mix?
    data['Mix_Breed'] = data['Breed'].map(lambda x: x.find('Mix') != -1)
    #Split breed into first and second
    data['First_Breed'] = data['Breed'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
    data['Second_Breed'] = data['Breed'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)

    #Split breed into first and second
    data['Mix_Color'] = data['Color'].map(lambda x: x.find('Mix') != -1)

    data['First_Color'] = data['Color'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
    data['Second_Color'] = data['Color'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)

    rare_first_breeds=np.array((data['First_Breed'].value_counts()).index[int(first_breeds*.7):])
    rare_second_breeds=np.array((data['Second_Breed'].value_counts()).index[int(second_breeds*.1):])

    for first_breed in rare_first_breeds:
        data['First_Breed'].replace(to_replace=first_breed, value='Rare', inplace=True)

    for sec_breed in rare_second_breeds:
        data['Second_Breed'].replace(to_replace=sec_breed, value='Rare', inplace=True)

    rare_first_colors=np.array((data['First_Color'].value_counts()).index[int(first_colors*.45):])
    rare_second_colors=np.array((data['Second_Color'].value_counts()).index[int(second_colors*tuner):])

    for first_color in rare_first_colors:
        data['First_Color'].replace(to_replace=first_color, value='Rare', inplace=True)

    for sec_color in rare_second_colors:
        data['Second_Color'].replace(to_replace=sec_color, value='Rare', inplace=True)

    train_data, dev_data, train_labels, dev_labels = train_test_split(data, labels, test_size=0.3, random_state=0)

    dv = DictVectorizer(sparse=False)
    predictors = ['AnimalType','fixed','gender','Year','Month','Day_Num','First_Breed', 'Second_Breed','Mix_Breed','Mix_Color','First_Color','Second_Color','Age_days','HasName','OutcomeHour','weekend']

    hashed_train = pd.DataFrame(dv.fit_transform(train_data[predictors].to_dict(orient='records')),columns=dv.get_feature_names()).fillna(0)

    hashed_dev = pd.DataFrame(dv.transform(dev_data[predictors].to_dict(orient='records')),columns=dv.get_feature_names()).fillna(0)

    print 'Second Color includes ', tuner 

    print 'Shape: ', hashed_train.shape

    dt = GradientBoostingClassifier()
    #gs = GridSearchCV(dt,parameter)

    dt.fit(hashed_train.to_sparse(), train_labels)
    prediction = dt.predict(hashed_dev.to_sparse())

    # Prediction accuracy
    print 'Prediction accuracy:', np.mean(prediction == dev_labels)

    pred_prob = dt.predict_proba(hashed_dev.to_sparse())
    pred_prob
    pred_prob = pred_prob + 0.000000001   # A hack to deal with log transformation of zero
    print 'Log likelihood', -np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])
    tuner-=.05

Second Color includes  1
Shape:  (18710, 290)
Prediction accuracy: 0.669160743235
Log likelihood 0.798185951927
Second Color includes  0.95
Shape:  (18710, 289)
Prediction accuracy: 0.668537224093
Log likelihood 0.797885890019
Second Color includes  0.9
Shape:  (18710, 288)
Prediction accuracy: 0.669784262377
Log likelihood 0.799012152288
Second Color includes  0.85
Shape:  (18710, 286)
Prediction accuracy: 0.669285447063
Log likelihood 0.798588529062
Second Color includes  0.8
Shape:  (18710, 284)
Prediction accuracy: 0.668412520264
Log likelihood 0.799683995703
Second Color includes  0.75
Shape:  (18710, 281)
Prediction accuracy: 0.668537224093
Log likelihood 0.799081113682
Second Color includes  0.7
Shape:  (18710, 279)
Prediction accuracy: 0.668038408779
Log likelihood 0.798137462779
Second Color includes  0.65
Shape:  (18710, 277)
Prediction accuracy: 0.667664297294
Log likelihood 0.798905648631
Second Color includes  0.6
Shape:  (18710, 275)
Prediction accuracy: 0.668412520264
Lo