In [492]:
import xgboost as xgb
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing, pipeline, metrics, grid_search, cross_validation
import time
import random
import numpy as np
import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.externals import joblib

## 1. Load data

In [239]:
# Load data
start = time.time() 
train = pd.read_csv('./data/train.csv', parse_dates=['DateTime'])
print ("Loading train data finished in %0.3fs" % (time.time() - start))        

test = pd.read_csv('./data/test.csv', parse_dates=['DateTime'])
print ("Loading test data finished in %0.3fs" % (time.time() - start))        

Loading train data finished in 0.069s
Loading test data finished in 0.093s


In [240]:
train.columns

Index([u'AnimalID', u'Name', u'DateTime', u'OutcomeType', u'OutcomeSubtype',
       u'AnimalType', u'SexuponOutcome', u'AgeuponOutcome', u'Breed',
       u'Color'],
      dtype='object')

In [241]:
test.columns

Index([u'ID', u'Name', u'DateTime', u'AnimalType', u'SexuponOutcome',
       u'AgeuponOutcome', u'Breed', u'Color'],
      dtype='object')

In [242]:
# OutcomeType, OutcomeSubtype not shown in test data

In [243]:
train.dtypes

AnimalID                  object
Name                      object
DateTime          datetime64[ns]
OutcomeType               object
OutcomeSubtype            object
AnimalType                object
SexuponOutcome            object
AgeuponOutcome            object
Breed                     object
Color                     object
dtype: object

## 2. Classifiy columns into categorical, numerical, label and id

In [442]:
# seperate column names by categorical, numerical, label, and id
data_types = train.dtypes  
categorical_cols = list(data_types[data_types=='object'].index) + list(data_types[data_types=='datetime64[ns]'].index)
numerical_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

dict_cols = dict()

# categorical columns
dict_cols['categorical_cols'] = categorical_cols
dict_cols['categorical_cols'].remove('AnimalID') # remove ids
dict_cols['categorical_cols'].remove('OutcomeType') # remove labels
dict_cols['categorical_cols'].remove('OutcomeSubtype')

# numeric columns
dict_cols['numerical_cols'] = numerical_cols

# id columns
dict_cols['id_col'] = ['AnimalID']

# label columns
dict_cols['label_col'] = ['OutcomeType']

print dict_cols

{'label_col': ['OutcomeType'], 'id_col': ['AnimalID'], 'numerical_cols': [], 'categorical_cols': ['Name', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color', 'DateTime']}


## 3. Transform features

In [443]:
# merge train and test data

In [444]:
# Merge train and test
dict_cols['train_size'] = train.shape[0]
dict_cols['test_size'] = test.shape[0]
print 'train data size: %s' % dict_cols['train_size']
print 'test data size: %s' % dict_cols['test_size']

train data size: 26729
test data size: 11456


In [410]:
full_data = pd.concat([train[dict_cols['categorical_cols'] + dict_cols['numerical_cols'] + dict_cols['label_col']]
                     , test[dict_cols['categorical_cols'] + dict_cols['numerical_cols']]
                    ])

full_data.shape

(38185, 8)

In [411]:
full_data.head()

Unnamed: 0,AgeuponOutcome,AnimalType,Breed,Color,DateTime,Name,OutcomeType,SexuponOutcome
0,1 year,Dog,Shetland Sheepdog Mix,Brown/White,2014-02-12 18:22:00,Hambone,Return_to_owner,Neutered Male
1,1 year,Cat,Domestic Shorthair Mix,Cream Tabby,2013-10-13 12:44:00,Emily,Euthanasia,Spayed Female
2,2 years,Dog,Pit Bull Mix,Blue/White,2015-01-31 12:28:00,Pearce,Adoption,Neutered Male
3,3 weeks,Cat,Domestic Shorthair Mix,Blue Cream,2014-07-11 19:09:00,,Transfer,Intact Male
4,2 years,Dog,Lhasa Apso/Miniature Poodle,Tan,2013-11-15 12:52:00,,Transfer,Neutered Male


In [412]:
full_data.describe()

Unnamed: 0,AgeuponOutcome,AnimalType,Breed,Color,DateTime,Name,OutcomeType,SexuponOutcome
count,38161,38185,38185,38185,38185,27269,26729,38184
unique,45,2,1678,411,31413,7968,5,5
top,1 year,Dog,Domestic Shorthair Mix,Black/White,2015-08-11 00:00:00,Bella,Adoption,Neutered Male
freq,5737,22251,12587,4043,25,195,10769,14014
first,,,,,2013-10-01 09:31:00,,,
last,,,,,2016-02-21 19:17:00,,,


In [413]:
# transofrm datatime into year, month, weekday

In [414]:
full_data['year'] = full_data['DateTime'].dt.year

In [415]:
full_data['month'] = full_data['DateTime'].dt.month

In [416]:
full_data['weekday'] = full_data['DateTime'].dt.weekday

In [417]:
# transform age into years

In [418]:
def calc_age_in_years(x):
    x = str(x)
    if x == 'nan': return 0
    age = int(x.split()[0])
    if x.find('year') > -1: return age 
    if x.find('month')> -1: return age / 12.
    if x.find('week')> -1: return age / 52.
    if x.find('day')> -1: return age / 365.
    else: return 0

In [419]:
full_data['AgeInYears'] = full_data.AgeuponOutcome.apply(calc_age_in_years)

In [420]:
# transform name into have name or not

In [421]:
full_data['Name'] = pd.isnull(full_data['Name'])

In [406]:
# drop DateTime and AgeuponOutcome

In [423]:
full_data.drop(['DateTime', 'AgeuponOutcome'], axis=1, inplace=True)

In [424]:
full_data.head()

Unnamed: 0,AnimalType,Breed,Color,Name,OutcomeType,SexuponOutcome,year,month,weekday,AgeInYears
0,Dog,Shetland Sheepdog Mix,Brown/White,False,Return_to_owner,Neutered Male,2014,2,2,1.0
1,Cat,Domestic Shorthair Mix,Cream Tabby,False,Euthanasia,Spayed Female,2013,10,6,1.0
2,Dog,Pit Bull Mix,Blue/White,False,Adoption,Neutered Male,2015,1,5,2.0
3,Cat,Domestic Shorthair Mix,Blue Cream,True,Transfer,Intact Male,2014,7,4,0.057692
4,Dog,Lhasa Apso/Miniature Poodle,Tan,True,Transfer,Neutered Male,2013,11,4,2.0


In [426]:
full_data.to_pickle('./data/full_data.pkl')

In [None]:
# update dict_cols

In [447]:
dict_cols['categorical_cols'].remove('DateTime')
dict_cols['categorical_cols'].remove('AgeuponOutcome')

In [449]:
dict_cols['categorical_cols'].append('year')
dict_cols['categorical_cols'].append('month')
dict_cols['categorical_cols'].append('weekday')
dict_cols['categorical_cols'].append('AgeInYears')

In [450]:
dict_cols

{'categorical_cols': ['Name',
  'AnimalType',
  'SexuponOutcome',
  'Breed',
  'Color',
  'year',
  'month',
  'weekday',
  'AgeInYears'],
 'id_col': ['AnimalID'],
 'label_col': ['OutcomeType'],
 'numerical_cols': [],
 'test_size': 11456,
 'train_size': 26729}

## 4. Label encoding categorical columns for train and test data

In [451]:
# label encoding categorical columns
for col in dict_cols['categorical_cols']:
    print("Label encoding column: %s" % (col))
    LBL = preprocessing.LabelEncoder()
    LBL.fit(full_data[col])
    full_data[col] = LBL.transform(full_data[col])

Label encoding column: Name
Label encoding column: AnimalType
Label encoding column: SexuponOutcome
Label encoding column: Breed
Label encoding column: Color
Label encoding column: year
Label encoding column: month
Label encoding column: weekday
Label encoding column: AgeInYears


In [452]:
# record the label mapping
LBL = preprocessing.LabelEncoder()
LBL.fit(full_data['OutcomeType'])
label_mapping = dict(zip(full_data['OutcomeType'].unique(), LBL.transform(full_data['OutcomeType'].unique())))
print("Label mapping: %s" % (label_mapping))

Label mapping: {nan: 0, 'Return_to_owner': 4, 'Transfer': 5, 'Adoption': 1, 'Euthanasia': 3, 'Died': 2}


In [453]:
# label encode categorical label
full_data['OutcomeType'] = LBL.transform(full_data['OutcomeType'])
print "Label encoding column: OutcomeType"

Label encoding column: OutcomeType


In [454]:
full_data.shape

(38185, 10)

In [455]:
full_data.head()

Unnamed: 0,AnimalType,Breed,Color,Name,OutcomeType,SexuponOutcome,year,month,weekday,AgeInYears
0,1,1482,146,0,4,3,1,1,2,23
1,0,775,184,0,3,4,0,9,6,23
2,1,1293,97,0,1,3,2,0,5,24
3,0,775,47,1,5,2,1,6,4,9
4,1,1101,311,1,5,3,0,10,4,24


## 5. Search models

In [456]:
## Defind function to grid search the best model
def search_model(train_x, train_y, est, param_grid, n_jobs, cv, refit=False):
    model = grid_search.GridSearchCV(estimator  = est, 
                                     param_grid = param_grid, 
                                     scoring = 'log_loss', 
                                     verbose = 10, 
                                     n_jobs  = n_jobs, 
                                     iid = True, # is identically distributed 
                                     refit = refit,
                                     cv = cv)
    # Fit Grid Search Model
    model.fit(train_x, train_y)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:", model.best_params_)
    print("Scores:", model.grid_scores_)
    return model

In [502]:
train_X_label = full_data[:dict_cols['train_size']][dict_cols['categorical_cols']]

In [503]:
train_y_label = full_data[:dict_cols['train_size']][dict_cols['label_col']].values.reshape(dict_cols['train_size'])

In [459]:
param_grid = {'criterion':['gini', 'entropy'], 'n_estimators':[500], 'random_state' : [1234]}

In [460]:
model = search_model(train_X
            , train_y
            , RandomForestClassifier()
            , param_grid
            , 1
            , 4)

Fitting 4 folds for each of 2 candidates, totalling 8 fits
[CV] n_estimators=500, random_state=1234, criterion=gini .............
[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.945090 -  11.5s
[CV] n_estimators=500, random_state=1234, criterion=gini .............


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   11.5s


[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.953191 -  11.5s
[CV] n_estimators=500, random_state=1234, criterion=gini .............
[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.933401 -  11.7s
[CV] n_estimators=500, random_state=1234, criterion=gini .............
[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.974801 -  11.5s
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........


[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   46.3s


[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.967128 -  15.0s
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........
[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.957786 -  14.4s
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........
[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.923045 -  14.5s
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........


[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:  1.5min


[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.952309 -  14.4s
Best score: -0.950
('Best parameters set:', {'n_estimators': 500, 'random_state': 1234, 'criterion': 'entropy'})
('Scores:', [mean: -0.95162, std: 0.01512, params: {'n_estimators': 500, 'random_state': 1234, 'criterion': 'gini'}, mean: -0.95007, std: 0.01648, params: {'n_estimators': 500, 'random_state': 1234, 'criterion': 'entropy'}])


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.7min finished


In [515]:
rfc_label = RandomForestClassifier(n_estimators=500, random_state=1234, criterion='gini')

In [516]:
rfc_label_model = rfc_label.fit(train_X_label, train_y_label)

In [517]:
preds_label = rfc_label_model.predict(full_data[dict_cols['train_size']:][dict_cols['categorical_cols']])

In [518]:
preds_label[1:10]

array([1, 5, 5, 1, 4, 5, 1, 1, 1])

In [519]:
label_mapping

{nan: 0,
 'Adoption': 1,
 'Died': 2,
 'Euthanasia': 3,
 'Return_to_owner': 4,
 'Transfer': 5}

## Write out submission

In [468]:
submission = pd.read_csv('./data/sample_submission.csv')

In [469]:
submission['Adoption'] = 0

In [470]:
label_mapping

{nan: 0,
 'Adoption': 1,
 'Died': 2,
 'Euthanasia': 3,
 'Return_to_owner': 4,
 'Transfer': 5}

In [471]:
for idx, val in enumerate(preds):
    #print index, value
    if val == 0:
        submission.iloc[idx]['Died'] = 1
    else:
        submission.iloc[idx][val] = 1

In [472]:
path = './data/submission_' + str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.csv'

In [473]:
submission.to_csv(path, index=False)

#### Homeworks:
1. Apply One-Hot-Encoder for all the categorical columns then remove the original categoricals.
2. Check the difference between Label Encoding and One-Hot-Encoding.
    Tips: scikit-learn one-hot-encoder or pandas get_dummies()
    Question: what if there are levels existing in traning data but not in test data?
3. Create submissions using both encoders, as well as a submission that is the average of them. Submit all of the three to see if ensemble helps 

## One Hot encoding

In [474]:
enc = OneHotEncoder()

In [475]:
enc.fit(full_data[dict_cols['categorical_cols']])

OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [476]:
enc.feature_indices_

array([   0,    2,    4,   10, 1688, 2099, 2103, 2115, 2122, 2166])

In [477]:
one_hot_predictors = enc.transform(full_data[dict_cols['categorical_cols']])

In [478]:
one_hot_predictors.shape

(38185, 2166)

In [495]:
train_X_onehot = one_hot_predictors[:dict_cols['train_size']]
train_y_onehot = full_data[:dict_cols['train_size']][dict_cols['label_col']].values.reshape(dict_cols['train_size'])

In [496]:
train_X_onehot.shape

(26729, 2166)

In [497]:
train_y_onehot.shape

(26729,)

In [483]:
model = search_model(train_X_onehot
            , train_y_onehot
            , RandomForestClassifier()
            , param_grid
            , 1
            , 4)

Fitting 4 folds for each of 2 candidates, totalling 8 fits
[CV] n_estimators=500, random_state=1234, criterion=gini .............
[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.894292 - 2.3min
[CV] n_estimators=500, random_state=1234, criterion=gini .............


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:  2.3min


[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.921493 - 2.3min
[CV] n_estimators=500, random_state=1234, criterion=gini .............
[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.927556 - 2.4min
[CV] n_estimators=500, random_state=1234, criterion=gini .............
[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.915387 - 2.4min
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........


[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:  9.4min


[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.930241 - 2.4min
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........
[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.938151 - 2.4min
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........
[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.899984 - 2.3min
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........


[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed: 16.5min


[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.909402 - 2.4min
Best score: -0.915
('Best parameters set:', {'n_estimators': 500, 'random_state': 1234, 'criterion': 'gini'})
('Scores:', [mean: -0.91468, std: 0.01253, params: {'n_estimators': 500, 'random_state': 1234, 'criterion': 'gini'}, mean: -0.91945, std: 0.01538, params: {'n_estimators': 500, 'random_state': 1234, 'criterion': 'entropy'}])


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 18.9min finished


In [494]:
rfc_onehot = RandomForestClassifier(n_estimators=500, random_state=1234, criterion='gini')

In [498]:
rfc_onehot_model = rfc_onehot.fit(train_X_onehot, train_y_onehot)

In [539]:
preds_onehot = rfc_onehot_model.predict(one_hot_predictors[dict_cols['train_size']:])

In [487]:
submission = pd.read_csv('./data/sample_submission.csv')

In [488]:
submission['Adoption'] = 0

In [489]:
for idx, val in enumerate(preds):
    #print index, value
    if val == 0:
        submission.iloc[idx]['Died'] = 1
    else:
        submission.iloc[idx][val] = 1

In [490]:
path = './data/submission_' + str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.csv'

In [491]:
submission.to_csv(path, index=False)

## Averge both models

In [530]:
preds_label

array([5, 1, 5, ..., 5, 1, 4])

In [531]:
preds_onehot

array([5, 1, 1, ..., 5, 1, 4])

In [524]:
label_proba = rfc_label_model.predict_proba(full_data[dict_cols['train_size']:][dict_cols['categorical_cols']])

In [525]:
onehot_proba = rfc_onehot_model.predict_proba(one_hot_predictors[dict_cols['train_size']:])

In [526]:
both_proba = label_proba + onehot_proba

In [527]:
label_proba

array([[ 0.026     ,  0.        ,  0.058     ,  0.374     ,  0.542     ],
       [ 0.69      ,  0.        ,  0.        ,  0.17      ,  0.14      ],
       [ 0.38526667,  0.        ,  0.        ,  0.134     ,  0.48073333],
       ..., 
       [ 0.        ,  0.        ,  0.026     ,  0.002     ,  0.972     ],
       [ 0.474     ,  0.        ,  0.01      ,  0.466     ,  0.05      ],
       [ 0.088     ,  0.        ,  0.286     ,  0.418     ,  0.208     ]])

In [528]:
onehot_proba

array([[ 0.124,  0.   ,  0.061,  0.274,  0.541],
       [ 0.788,  0.   ,  0.004,  0.106,  0.102],
       [ 0.509,  0.002,  0.002,  0.222,  0.265],
       ..., 
       [ 0.   ,  0.002,  0.13 ,  0.   ,  0.868],
       [ 0.462,  0.   ,  0.022,  0.398,  0.118],
       [ 0.112,  0.   ,  0.164,  0.384,  0.34 ]])

In [529]:
both_proba

array([[ 0.15      ,  0.        ,  0.119     ,  0.648     ,  1.083     ],
       [ 1.478     ,  0.        ,  0.004     ,  0.276     ,  0.242     ],
       [ 0.89426667,  0.002     ,  0.002     ,  0.356     ,  0.74573333],
       ..., 
       [ 0.        ,  0.002     ,  0.156     ,  0.002     ,  1.84      ],
       [ 0.936     ,  0.        ,  0.032     ,  0.864     ,  0.168     ],
       [ 0.2       ,  0.        ,  0.45      ,  0.802     ,  0.548     ]])

In [538]:
preds_both = pd.DataFrame(both_proba).idxmax(axis=1) + 1

In [542]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['Adoption'] = 0
for idx, val in enumerate(preds_both):
    #print index, value
    if val == 0:
        submission.iloc[idx]['Adoption'] = 1
    else:
        submission.iloc[idx][val] = 1
        
path = './data/submission_' + str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.csv'
submission.to_csv(path, index=False)