In [121]:
import xgboost as xgb
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing, pipeline, metrics, grid_search, cross_validation
import time
import random
import numpy as np

## 1. Load data

In [122]:
# Load data, timing each read on its own clock
start = time.time()
train = pd.read_csv('train.csv')
print("Loading train data finished in %0.3fs" % (time.time() - start))

# Restart the clock so the reported test timing excludes the train load above
start = time.time()
test = pd.read_csv('test.csv')
print("Loading test data finished in %0.3fs" % (time.time() - start))

Loading train data finished in 0.058s
Loading test data finished in 0.074s


In [123]:
# Show the column names of the training frame
train.columns

Index([u'AnimalID', u'Name', u'DateTime', u'OutcomeType', u'OutcomeSubtype',
       u'AnimalType', u'SexuponOutcome', u'AgeuponOutcome', u'Breed',
       u'Color'],
      dtype='object')

In [124]:
# Show the column names of the test frame for comparison with train
test.columns

Index([u'ID', u'Name', u'DateTime', u'AnimalType', u'SexuponOutcome',
       u'AgeuponOutcome', u'Breed', u'Color'],
      dtype='object')

In [139]:
# OutcomeType and OutcomeSubtype are not present in the test data

In [125]:
# Show the dtype pandas inferred for each training column
train.dtypes

AnimalID          object
Name              object
DateTime          object
OutcomeType       object
OutcomeSubtype    object
AnimalType        object
SexuponOutcome    object
AgeuponOutcome    object
Breed             object
Color             object
dtype: object

## 2. Classify columns into categorical, numerical, label and id

In [126]:
# Separate column names into categorical, numerical, label, and id groups
data_types = train.dtypes

id_cols = ['AnimalID']
label_cols = ['OutcomeType']
# OutcomeSubtype is excluded as well: it is not a feature (it is absent from test)
non_feature_cols = set(id_cols + label_cols + ['OutcomeSubtype'])

# Object-typed columns are treated as categorical features, minus ids and labels
categorical_cols = [col for col in data_types[data_types == 'object'].index
                    if col not in non_feature_cols]
# Integer and float columns are treated as numerical features
numerical_cols = (list(data_types[data_types == 'int64'].index)
                  + list(data_types[data_types == 'float64'].index))

dict_cols = {
    'categorical_cols': categorical_cols,
    'numerical_cols': numerical_cols,
    'id_col': id_cols,
    'label_col': label_cols,
}

print(dict_cols)

{'label_col': ['OutcomeType'], 'id_col': ['AnimalID'], 'numerical_cols': [], 'categorical_cols': ['Name', 'DateTime', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color']}


## 3. Label encoding categorical columns for train and test data

In [117]:
# merge train and test data

In [128]:
# Record the train/test row counts alongside the column groups
dict_cols['train_size'] = len(train)
dict_cols['test_size'] = len(test)
print('train data size: %s' % dict_cols['train_size'])
print('test data size: %s' % dict_cols['test_size'])

train data size: 26729
test data size: 11456


In [142]:
# Stack train and test rows into one frame so both are encoded together
feature_cols = dict_cols['categorical_cols'] + dict_cols['numerical_cols']
full_data = pd.concat(
    [train[feature_cols + dict_cols['label_col']],  # train rows carry the label
     test[feature_cols]],                           # test rows have no label column
    ignore_index=True  # both frames are indexed from 0; avoid duplicate index labels
)

full_data.shape

(38185, 8)

In [143]:
# Replace each categorical column with integer codes, using one encoder per column.
# The encoder is fit on the merged frame, so train and test share the same codes.
# NOTE(review): missing values are passed to the encoder as-is; confirm the
# installed scikit-learn accepts columns that mix strings and NaN.
for col in dict_cols['categorical_cols']:
    print("Label encoding column: %s" % (col))
    LBL = preprocessing.LabelEncoder().fit(full_data[col])
    full_data[col] = LBL.transform(full_data[col])

Label encoding column: Name
Label encoding column: DateTime
Label encoding column: AnimalType
Label encoding column: SexuponOutcome
Label encoding column: AgeuponOutcome
Label encoding column: Breed
Label encoding column: Color


In [144]:
# Record which integer code each outcome label is mapped to.
# NOTE: the encoder is fit on the merged frame, so the missing labels of the
# test rows are encoded as a level of their own.
LBL = preprocessing.LabelEncoder().fit(full_data['OutcomeType'])
outcome_levels = full_data['OutcomeType'].unique()
label_mapping = dict(zip(outcome_levels, LBL.transform(outcome_levels)))
print("Label mapping: %s" % (label_mapping))

Label mapping: {nan: 0, 'Return_to_owner': 4, 'Transfer': 5, 'Adoption': 1, 'Euthanasia': 3, 'Died': 2}


In [145]:
# Replace the OutcomeType values with the integer codes of the fitted encoder
full_data['OutcomeType'] = LBL.transform(full_data['OutcomeType'])
print("Label encoding column: OutcomeType")

Label encoding column: OutcomeType


In [148]:
# Sanity check: encoding must not change the number of rows or columns
full_data.shape

(38185, 8)

## 5. Search models

In [149]:
## Define a function to grid search the best model
def search_model(train_x, train_y, est, param_grid, n_jobs, cv, refit=False, scoring='log_loss'):
    """Run a cross-validated grid search and print a summary of the results.

    Parameters
    ----------
    train_x : features passed to ``GridSearchCV.fit``.
    train_y : targets passed to ``GridSearchCV.fit``.
    est : estimator whose parameters are searched.
    param_grid : parameter grid forwarded to ``GridSearchCV``.
    n_jobs : number of parallel jobs forwarded to ``GridSearchCV``.
    cv : cross-validation setting forwarded to ``GridSearchCV``.
    refit : forwarded to ``GridSearchCV``; defaults to False.
    scoring : scoring name forwarded to ``GridSearchCV``; defaults to
        'log_loss', the value that used to be hard-coded here.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    model = grid_search.GridSearchCV(estimator=est,
                                     param_grid=param_grid,
                                     scoring=scoring,
                                     verbose=10,
                                     n_jobs=n_jobs,
                                     iid=True,  # is identically distributed
                                     refit=refit,
                                     cv=cv)
    # Fit Grid Search Model
    model.fit(train_x, train_y)

    # Use %-formatting with a single argument so the output is a plain line
    # of text rather than the repr of a tuple under the Python 2 print statement.
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set: %s" % (model.best_params_,))
    print("Scores: %s" % (model.grid_scores_,))
    return model

In [154]:
# Training features: the categorical columns of the first train_size rows
train_X = full_data.iloc[:dict_cols['train_size']][dict_cols['categorical_cols']]

In [160]:
# Training labels: the label column of the first train_size rows, flattened to 1-D
train_y = full_data.iloc[:dict_cols['train_size']][dict_cols['label_col']].values.ravel()

In [156]:
# Candidate settings: both split criteria, with a single forest size and seed
param_grid = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [500],
    'random_state': [1234],
}

In [162]:
# Grid search a random forest over param_grid with 4-fold cross-validation;
# n_jobs and cv are passed by keyword so the bare numbers are self-describing.
search_model(train_X,
             train_y,
             RandomForestClassifier(),
             param_grid,
             n_jobs=-1,
             cv=4)

Fitting 4 folds for each of 2 candidates, totalling 8 fits
[CV] n_estimators=500, random_state=1234, criterion=gini .............
[CV] n_estimators=500, random_state=1234, criterion=gini .............
[CV] n_estimators=500, random_state=1234, criterion=gini .............
[CV] n_estimators=500, random_state=1234, criterion=gini .............
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........
[CV] n_estimators=500, random_state=1234, criterion=entropy ..........
[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.947547 -  22.5s
[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.993872 -  22.6s


[Parallel(n_jobs=-1)]: Done   9 out of   8 | elapsed:   22.6s remaining:   -2.5s


[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.964288 -  22.9s
[CV]  n_estimators=500, random_state=1234, criterion=gini, score=-0.966900 -  23.0s


[Parallel(n_jobs=-1)]: Done   9 out of   8 | elapsed:   22.9s remaining:   -2.5s
[Parallel(n_jobs=-1)]: Done   9 out of   8 | elapsed:   23.1s remaining:   -2.6s


[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.988436 -  32.2s
[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.959271 -  32.2s
[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.977050 -  32.3s


[Parallel(n_jobs=-1)]: Done   9 out of   8 | elapsed:   32.3s remaining:   -3.6s
[Parallel(n_jobs=-1)]: Done   9 out of   8 | elapsed:   32.3s remaining:   -3.6s
[Parallel(n_jobs=-1)]: Done   9 out of   8 | elapsed:   32.5s remaining:   -3.6s


[CV]  n_estimators=500, random_state=1234, criterion=entropy, score=-0.993002 -  32.6s


[Parallel(n_jobs=-1)]: Done   9 out of   8 | elapsed:   32.7s remaining:   -3.6s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   32.7s finished


Best score: -0.968
('Best parameters set:', {'n_estimators': 500, 'random_state': 1234, 'criterion': 'gini'})
('Scores:', [mean: -0.96816, std: 0.01660, params: {'n_estimators': 500, 'random_state': 1234, 'criterion': 'gini'}, mean: -0.97944, std: 0.01301, params: {'n_estimators': 500, 'random_state': 1234, 'criterion': 'entropy'}])


GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [500], 'random_state': [1234], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=False, scoring='log_loss',
       verbose=10)

#### Homework:
1. Apply a one-hot encoder to all the categorical columns, then remove the original categorical columns.
2. Check the difference between label encoding and one-hot encoding.
    Tips: use the scikit-learn one-hot encoder or pandas get_dummies().
    Question: what if there are levels that exist in the training data but not in the test data?
3. Create submissions using both encoders, as well as a submission that is the average of the two. Submit all three to see whether ensembling helps.