### Initial Setup

In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Read in data
raw_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# Examine the data a little bit
print raw_data.shape
print raw_data.head()

(26729, 10)
  AnimalID     Name             DateTime      OutcomeType OutcomeSubtype  \
0  A671945  Hambone  2014-02-12 18:22:00  Return_to_owner            NaN   
1  A656520    Emily  2013-10-13 12:44:00       Euthanasia      Suffering   
2  A686464   Pearce  2015-01-31 12:28:00         Adoption         Foster   
3  A683430      NaN  2014-07-11 19:09:00         Transfer        Partner   
4  A667013      NaN  2013-11-15 12:52:00         Transfer        Partner   

  AnimalType SexuponOutcome AgeuponOutcome                        Breed  \
0        Dog  Neutered Male         1 year        Shetland Sheepdog Mix   
1        Cat  Spayed Female         1 year       Domestic Shorthair Mix   
2        Dog  Neutered Male        2 years                 Pit Bull Mix   
3        Cat    Intact Male        3 weeks       Domestic Shorthair Mix   
4        Dog  Neutered Male        2 years  Lhasa Apso/Miniature Poodle   

         Color  
0  Brown/White  
1  Cream Tabby  
2   Blue/White  
3   Blue Cre

### Exploratory Data Analysis

In [None]:
raw_data['OutcomeType'].value_counts()

In [None]:
raw_data['OutcomeSubtype'].value_counts()

In [None]:
pd.crosstab(raw_data['OutcomeSubtype'], raw_data['OutcomeType'])

In [None]:
raw_data['AnimalType'].value_counts()

In [None]:
raw_data['SexuponOutcome'].value_counts()

In [None]:
raw_data['AgeuponOutcome'].value_counts()

In [3]:
#Separate age into a number and a string
raw_data['Age_num'] = raw_data['AgeuponOutcome'].str[:2].str.strip().astype(int, raise_on_error=False)
raw_data['Age_str'] = raw_data['AgeuponOutcome'].str[2:].str.strip()

test_data['Age_num'] = test_data['AgeuponOutcome'].str[:2].str.strip().astype(int, raise_on_error=False)
test_data['Age_str'] = test_data['AgeuponOutcome'].str[2:].str.strip()

#Create an age factor column based on each possible unit of measure
raw_data['Age_factor'] = 0
raw_data['Age_factor'][raw_data['Age_str'] == 'years'] = 365
raw_data['Age_factor'][raw_data['Age_str'] == 'months'] = 30
raw_data['Age_factor'][raw_data['Age_str'] == 'year'] = 365
raw_data['Age_factor'][raw_data['Age_str'] == 'weeks'] = 7
raw_data['Age_factor'][raw_data['Age_str'] == 'month'] = 30
raw_data['Age_factor'][raw_data['Age_str'] == 'days'] = 1
raw_data['Age_factor'][raw_data['Age_str'] == 'week'] = 7
raw_data['Age_factor'][raw_data['Age_str'] == 'day'] = 1

test_data['Age_factor'] = 0
test_data['Age_factor'][test_data['Age_str'] == 'years'] = 365
test_data['Age_factor'][test_data['Age_str'] == 'months'] = 30
test_data['Age_factor'][test_data['Age_str'] == 'year'] = 365
test_data['Age_factor'][test_data['Age_str'] == 'weeks'] = 7
test_data['Age_factor'][test_data['Age_str'] == 'month'] = 30
test_data['Age_factor'][test_data['Age_str'] == 'days'] = 1
test_data['Age_factor'][test_data['Age_str'] == 'week'] = 7
test_data['Age_factor'][test_data['Age_str'] == 'day'] = 1

#Multiply the number in the age by the factor for comparable numerical column.  Drop and derivitive columns
raw_data['Age_num'] = raw_data['Age_num'].astype(float)
raw_data['Age_days'] = raw_data['Age_num'].mul(raw_data['Age_factor'], axis='index').fillna(0)
raw_data.drop('AgeuponOutcome', 1, inplace = True)
raw_data.drop('Age_num', 1, inplace = True)
raw_data.drop('Age_str', 1, inplace = True)
raw_data.drop('Age_factor', 1, inplace = True)

test_data['Age_num'] = test_data['Age_num'].astype(float)
test_data['Age_days'] = test_data['Age_num'].mul(test_data['Age_factor'], axis='index').fillna(0)
test_data.drop('AgeuponOutcome', 1, inplace = True)
test_data.drop('Age_num', 1, inplace = True)
test_data.drop('Age_str', 1, inplace = True)
test_data.drop('Age_factor', 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [4]:
# Seperate out label and data
labels = raw_data.OutcomeType
data = raw_data.drop('OutcomeType', 1)

### Data manipulation: Convert string to numeric categories


In [5]:
data['Year'] = pd.to_datetime(data['DateTime']).dt.year.astype(str)
data['Month'] = pd.to_datetime(data['DateTime']).dt.month.astype(str)
data['Day_Num']= pd.to_datetime(data['DateTime']).dt.day.astype(str)
data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeSubtype,AnimalType,SexuponOutcome,Breed,Color,Age_days,Year,Month,Day_Num
0,A671945,Hambone,2014-02-12 18:22:00,,Dog,Neutered Male,Shetland Sheepdog Mix,Brown/White,365.0,2014,2,12
1,A656520,Emily,2013-10-13 12:44:00,Suffering,Cat,Spayed Female,Domestic Shorthair Mix,Cream Tabby,365.0,2013,10,13
2,A686464,Pearce,2015-01-31 12:28:00,Foster,Dog,Neutered Male,Pit Bull Mix,Blue/White,730.0,2015,1,31
3,A683430,,2014-07-11 19:09:00,Partner,Cat,Intact Male,Domestic Shorthair Mix,Blue Cream,21.0,2014,7,11
4,A667013,,2013-11-15 12:52:00,Partner,Dog,Neutered Male,Lhasa Apso/Miniature Poodle,Tan,730.0,2013,11,15


In [6]:
data['fixed']=data['SexuponOutcome'].str.split(' ', expand=True)[0]
data['gender']=data['SexuponOutcome'].str.split(' ', expand=True)[1]
print data['fixed'].value_counts()
print data['gender'].value_counts()

Neutered    9779
Spayed      8820
Intact      7036
Unknown     1093
Name: fixed, dtype: int64
Male      13304
Female    12331
Name: gender, dtype: int64


In [7]:
# Replace outcome label: Return to owner = 1, Adoption = 2, Euthanasia = 3, Transfer = 4, Died = 5
labels.replace(to_replace='Adoption', value=1, inplace=True)
labels.replace(to_replace='Died', value=2, inplace=True)
labels.replace(to_replace='Euthanasia', value=3, inplace=True)
labels.replace(to_replace='Return_to_owner', value=4, inplace=True)
labels.replace(to_replace='Transfer', value=5, inplace=True)

In [8]:
data['HasName']=-data['Name'].isnull()

In [9]:
temp = pd.to_datetime(data['DateTime']).dt.hour
bins = [3, 7, 10, 14, 17, 20, 24]
names = ['midnight','morning', 'lunch', 'afternoon','night', 'late night']
data['OutcomeHour'] = pd.cut(temp, bins, labels=names)

In [10]:
temp = pd.to_datetime(data['DateTime']).dt.weekday
data['weekend'] = temp.isin([5,6])

In [11]:
#Does the breed contain a mix?
data['Mix_Breed'] = data['Breed'].map(lambda x: x.find('Mix') != -1)
#Split breed into first and second
data['First_Breed'] = data['Breed'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
data['Second_Breed'] = data['Breed'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)

In [12]:
#Split breed into first and second
data['Mix_Color'] = data['Color'].map(lambda x: x.find('Mix') != -1)

data['First_Color'] = data['Color'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
data['Second_Color'] = data['Color'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)

In [13]:
first_breeds = len(data['First_Breed'].value_counts())
second_breeds = len(data['Second_Breed'].value_counts())

rare_first_breeds=np.array((data['First_Breed'].value_counts()).index[int(first_breeds*.7):])
rare_second_breeds=np.array((data['Second_Breed'].value_counts()).index[int(second_breeds*.1):])

for first_breed in rare_first_breeds:
    data['First_Breed'].replace(to_replace=first_breed, value='Rare', inplace=True)

for sec_breed in rare_second_breeds:
    data['Second_Breed'].replace(to_replace=sec_breed, value='Rare', inplace=True)

In [14]:
first_colors = len(data['First_Color'].value_counts())
second_colors = len(data['Second_Color'].value_counts())

rare_first_colors=np.array((data['First_Color'].value_counts()).index[int(first_colors*.45):])
rare_second_colors=np.array((data['Second_Color'].value_counts()).index[int(second_colors*.2):])

for first_color in rare_first_colors:
    data['First_Color'].replace(to_replace=first_color, value='Rare', inplace=True)

for sec_color in rare_second_colors:
    data['Second_Color'].replace(to_replace=sec_color, value='Rare', inplace=True)

In [None]:
int(first_colors*.45)

In [15]:
# Splitting data into train set and development set
train_data, dev_data, train_labels, dev_labels = train_test_split(data, labels, test_size=0.3, random_state=0)

In [17]:
train_data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeSubtype,AnimalType,SexuponOutcome,Breed,Color,Age_days,Year,...,gender,HasName,OutcomeHour,weekend,Mix_Breed,First_Breed,Second_Breed,Mix_Color,First_Color,Second_Color
11109,A673979,Curly,2014-03-09 17:33:00,,Dog,Neutered Male,Toy Poodle Mix,White,2555.0,2014,...,Male,True,afternoon,True,True,Toy Poodle,,False,White,
18649,A692191,,2014-11-20 14:36:00,Suffering,Dog,Intact Female,Beagle Mix,Tricolor,2920.0,2014,...,Female,False,lunch,False,True,Beagle,,False,Tricolor,
15515,A705499,,2015-06-18 13:44:00,Partner,Cat,Unknown,Domestic Shorthair Mix,White/Blue,7.0,2015,...,,False,lunch,False,True,Domestic Shorthair,,False,White,Blue
12078,A682480,Pebble,2014-08-01 11:28:00,,Cat,Neutered Male,Domestic Shorthair Mix,Black/White,120.0,2014,...,Male,True,lunch,False,True,Domestic Shorthair,,False,Black,White
8455,A689097,,2014-10-04 19:15:00,,Cat,Spayed Female,Domestic Shorthair Mix,Tortie,60.0,2014,...,Female,False,night,True,True,Domestic Shorthair,,False,Tortie,


### Hashing

In [16]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
predictors = ['AnimalType','fixed','gender','Year','Month','Day_Num','First_Breed', 'Second_Breed','Mix_Breed','Mix_Color','First_Color','Second_Color','Age_days','HasName','OutcomeHour','weekend']

hashed_train = pd.DataFrame(dv.fit_transform(train_data[predictors].to_dict(orient='records')),columns=dv.get_feature_names()).fillna(0)

hashed_dev = pd.DataFrame(dv.transform(dev_data[predictors].to_dict(orient='records')),columns=dv.get_feature_names()).fillna(0)


In [None]:
hashed_train.shape

In [17]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
#parameter = {'n_estimators':[100,400,800,1000]}
dt = GradientBoostingClassifier()
#gs = GridSearchCV(dt,parameter)

dt.fit(hashed_train.to_sparse(), train_labels)
prediction = dt.predict(hashed_dev.to_sparse())

# Prediction accuracy
print 'Prediction accuracy:', np.mean(prediction == dev_labels)


Prediction accuracy: 0.668537224093


### Kaggle Evaluation

In [18]:
pred_prob = dt.predict_proba(hashed_dev.to_sparse())
pred_prob
pred_prob = pred_prob + 0.000000001   # A hack to deal with log transformation of zero
-np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])

0.79960915256039089

In [None]:
from sklearn.metrics import confusion_matrix

test = pd.DataFrame(prediction, columns = ['prediction'])
print ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
confusion_matrix(dev_labels, prediction)

### Tune Percentage of Breed/Color Features to Include (don't put in final writeup)

In [19]:
tuner=1
while tuner>0:
    #Does the breed contain a mix?
    data['Mix_Breed'] = data['Breed'].map(lambda x: x.find('Mix') != -1)
    #Split breed into first and second
    data['First_Breed'] = data['Breed'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
    data['Second_Breed'] = data['Breed'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)

    #Split breed into first and second
    data['Mix_Color'] = data['Color'].map(lambda x: x.find('Mix') != -1)

    data['First_Color'] = data['Color'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
    data['Second_Color'] = data['Color'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)

    rare_first_breeds=np.array((data['First_Breed'].value_counts()).index[int(first_breeds*.7):])
    rare_second_breeds=np.array((data['Second_Breed'].value_counts()).index[int(second_breeds*.1):])

    for first_breed in rare_first_breeds:
        data['First_Breed'].replace(to_replace=first_breed, value='Rare', inplace=True)

    for sec_breed in rare_second_breeds:
        data['Second_Breed'].replace(to_replace=sec_breed, value='Rare', inplace=True)

    rare_first_colors=np.array((data['First_Color'].value_counts()).index[int(first_colors*.45):])
    rare_second_colors=np.array((data['Second_Color'].value_counts()).index[int(second_colors*tuner):])

    for first_color in rare_first_colors:
        data['First_Color'].replace(to_replace=first_color, value='Rare', inplace=True)

    for sec_color in rare_second_colors:
        data['Second_Color'].replace(to_replace=sec_color, value='Rare', inplace=True)

    train_data, dev_data, train_labels, dev_labels = train_test_split(data, labels, test_size=0.3, random_state=0)

    dv = DictVectorizer(sparse=False)
    predictors = ['AnimalType','fixed','gender','Year','Month','Day_Num','First_Breed', 'Second_Breed','Mix_Breed','Mix_Color','First_Color','Second_Color','Age_days','HasName','OutcomeHour','weekend']

    hashed_train = pd.DataFrame(dv.fit_transform(train_data[predictors].to_dict(orient='records')),columns=dv.get_feature_names()).fillna(0)

    hashed_dev = pd.DataFrame(dv.transform(dev_data[predictors].to_dict(orient='records')),columns=dv.get_feature_names()).fillna(0)

    print 'Second Color includes ', tuner 

    print 'Shape: ', hashed_train.shape

    dt = GradientBoostingClassifier()
    #gs = GridSearchCV(dt,parameter)

    dt.fit(hashed_train.to_sparse(), train_labels)
    prediction = dt.predict(hashed_dev.to_sparse())

    # Prediction accuracy
    print 'Prediction accuracy:', np.mean(prediction == dev_labels)

    pred_prob = dt.predict_proba(hashed_dev.to_sparse())
    pred_prob
    pred_prob = pred_prob + 0.000000001   # A hack to deal with log transformation of zero
    print 'Log likelihood', -np.mean([np.log(pred_prob[x][dev_labels.iloc[x]-1]) for x in range(0, len(pred_prob))])
    tuner-=.05

Second Color includes  1
Shape:  (18710, 309)


KeyboardInterrupt: 

### Submitting to Kaggle

In [None]:
test_data.head()

In [20]:
test_data['Year'] = pd.to_datetime(test_data['DateTime']).dt.year.astype(str)
test_data['Month'] = pd.to_datetime(test_data['DateTime']).dt.month.astype(str)
test_data['Day_Num']= pd.to_datetime(test_data['DateTime']).dt.day.astype(str)

test_data['fixed']=test_data['SexuponOutcome'].str.split(' ', expand=True)[0]
test_data['gender']=test_data['SexuponOutcome'].str.split(' ', expand=True)[1]

test_data['HasName']=-test_data['Name'].isnull()

temp = pd.to_datetime(test_data['DateTime']).dt.hour
bins = [3, 7, 10, 14, 17, 20, 24]
names = ['midnight','morning', 'lunch', 'afternoon','night', 'late night']
test_data['OutcomeHour'] = pd.cut(temp, bins, labels=names)

temp = pd.to_datetime(test_data['DateTime']).dt.weekday
test_data['weekend'] = temp.isin([5,6])

#Does the breed contain a mix?
test_data['Mix_Breed'] = test_data['Breed'].map(lambda x: x.find('Mix') != -1)
#Split breed into first and second
test_data['First_Breed'] = test_data['Breed'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
test_data['Second_Breed'] = test_data['Breed'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)

#Split breed into first and second
test_data['Mix_Color'] = test_data['Color'].map(lambda x: x.find('Mix') != -1)

test_data['First_Color'] = test_data['Color'].map(lambda x: x.split('/')[0].replace('Mix', '').strip())
test_data['Second_Color'] = test_data['Color'].map(lambda x: x.split('/')[1].replace('Mix', '').strip() if len(x.split('/')) > 1 else np.nan)

first_breeds = len(test_data['First_Breed'].value_counts())
second_breeds = len(test_data['Second_Breed'].value_counts())

rare_first_breeds=np.array((test_data['First_Breed'].value_counts()).index[int(first_breeds*.7):])
rare_second_breeds=np.array((test_data['Second_Breed'].value_counts()).index[int(second_breeds*.1):])

for first_breed in rare_first_breeds:
    test_data['First_Breed'].replace(to_replace=first_breed, value='Rare', inplace=True)

for sec_breed in rare_second_breeds:
    test_data['Second_Breed'].replace(to_replace=sec_breed, value='Rare', inplace=True)
    
first_colors = len(test_data['First_Color'].value_counts())
second_colors = len(test_data['Second_Color'].value_counts())

rare_first_colors=np.array((test_data['First_Color'].value_counts()).index[int(first_colors*.45):])
rare_second_colors=np.array((test_data['Second_Color'].value_counts()).index[int(second_colors*.2):])

for first_color in rare_first_colors:
    test_data['First_Color'].replace(to_replace=first_color, value='Rare', inplace=True)

for sec_color in rare_second_colors:
    test_data['Second_Color'].replace(to_replace=sec_color, value='Rare', inplace=True)
    
dv = DictVectorizer(sparse=False)
predictors = ['AnimalType','fixed','gender','Year','Month','Day_Num','First_Breed','Second_Breed','Mix_Breed','Mix_Color','First_Color','Second_Color','Age_days','HasName','OutcomeHour','weekend']

hashed_train = pd.DataFrame(dv.fit_transform(train_data[predictors].to_dict(orient='records')),columns=dv.get_feature_names()).fillna(0)

hashed_test = pd.DataFrame(dv.transform(test_data[predictors].to_dict(orient='records')),columns=dv.get_feature_names()).fillna(0)

dt = GradientBoostingClassifier()

dt.fit(hashed_train.to_sparse(), train_labels)
prediction = dt.predict(hashed_test.to_sparse())

In [29]:
prediction

array([5, 1, 5, ..., 5, 1, 4], dtype=int64)

In [21]:
# Format an output file
out = np.zeros((prediction.shape[0], 5))

for i in range(out.shape[0]):
    out[i][prediction[i]-1] = 1

# Convert to panda dataframe
out = pd.DataFrame(data=out, index=test_data.ID, columns=['Adoption','Died','Euthanasia','Return_to_owner','Transfer'])

In [None]:
out.head()

In [22]:
out.to_csv('results_4_23_v2.csv')