In [42]:
import xgboost as xgb
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing, pipeline, metrics, grid_search, cross_validation
import datetime
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

###### Week 3 we will be focused on XGBoost tuning and will be using engineered features from week 2 which by the end of this notebook will lead you to a score between 0.736-0.739. But feel free to use whichever features that  work for you.

In [2]:
# load train and test data and output full data and column dictionary
def load_data():
    # Load data
    start = time.time() 
    train = pd.read_csv('./data/train.csv', parse_dates=['DateTime'])
    print ("Loading train data finished in %0.3fs" % (time.time() - start))        

    test = pd.read_csv('./data/test.csv', parse_dates=['DateTime'])
    print ("Loading test data finished in %0.3fs" % (time.time() - start))  

    # seperate column names by categorical, numerical, label, and id
    data_types = train.dtypes  
    categorical_cols = list(data_types[data_types=='object'].index) + list(data_types[data_types=='datetime64[ns]'].index)
    numerical_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

    dict_cols = dict()
    # categorical columns
    dict_cols['categorical_cols'] = categorical_cols
    dict_cols['categorical_cols'].remove('AnimalID') # remove ids
    dict_cols['categorical_cols'].remove('OutcomeType') # remove labels
    dict_cols['categorical_cols'].remove('OutcomeSubtype')

    # numeric columns
    dict_cols['numerical_cols'] = numerical_cols

    # id columns
    dict_cols['id_col'] = ['AnimalID']

    # label columns
    dict_cols['label_col'] = ['OutcomeType']
    
    # Merge train and test
    dict_cols['train_size'] = train.shape[0]
    dict_cols['test_size'] = test.shape[0]
    print 'train data size: %s' % dict_cols['train_size']
    print 'test data size: %s' % dict_cols['test_size']

    full_data = pd.concat([train[dict_cols['categorical_cols'] + dict_cols['numerical_cols'] + dict_cols['label_col']]
                         , test[dict_cols['categorical_cols'] + dict_cols['numerical_cols']]
                        ])
    
    return dict_cols, full_data

In [3]:
## function to grid search the best model
def search_model(train_x, train_y, est, param_grid, n_jobs, cv, refit=False):
    model = grid_search.GridSearchCV(estimator  = est, 
                                     param_grid = param_grid, 
                                     scoring = 'log_loss', 
                                     verbose = 10, 
                                     n_jobs  = n_jobs, 
                                     iid = True, # is identically distributed 
                                     refit = refit,
                                     cv = cv)
    # Fit Grid Search Model
    model.fit(train_x, train_y)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:", model.best_params_)
    print("Scores:", model.grid_scores_)
    return model

In [4]:
# function to plot feature importance
def plot_feature_importance(feature_importances, feature_names):
    ftr_imp_df = pd.DataFrame(sorted(zip(feature_names, feature_importances)
                          , key=lambda x: x[1], reverse = False)
                   )
    y_pos = np.arange(ftr_imp_df.shape[0])

    plt.barh(y_pos, ftr_imp_df[1], align='center', alpha=0.4)
    plt.yticks(y_pos, ftr_imp_df[0])
    plt.xlabel('Feature Importance')

    plt.show()

In [5]:
# function to label encoding, and output label mapping
def label_encoding(cols, full_data):
    label_mapping = {}
    for col in cols:
        print("Label encoding column: %s" % (col))
        LBL = preprocessing.LabelEncoder()
        LBL.fit(full_data[col])
        if col == 'OutcomeType':
            label_mapping = dict(zip(full_data['OutcomeType'].unique(), LBL.transform(full_data['OutcomeType'].unique())))
#             print("Label mapping: %s" % (label_mapping))
        full_data[col] = LBL.transform(full_data[col])
        
    return label_mapping

In [6]:
# function to transform ages into days
def age2days(age):
    age = str(age)
    if 'day' in age:
        days = int(age.split(' ')[0])
    if 'week' in age:
        days= int(age.split(' ')[0])*7
    if 'month' in age:
        days = int(age.split(' ')[0])*30
    if 'year' in age:
        days = int(age.split(' ')[0])*365    
    else:
        days = 0
    return days   

In [7]:
# Load data
dict_cols, full_data = load_data()

Loading train data finished in 0.252s
Loading test data finished in 0.299s
train data size: 26729
test data size: 11456


In [8]:
full_data['AgeuponOutcome'] = full_data['AgeuponOutcome'].apply(age2days)
full_data['Breed'] = full_data['Breed'].apply(lambda x: '-'.join(sorted(list(set(x.split("/"))))))
full_data['NameLength'] = full_data['Name'].apply(lambda x: len(str(x)) if x else 0)

In [9]:
dict_cols['categorical_cols'].remove('AgeuponOutcome')
dict_cols['numerical_cols'].append('AgeuponOutcome')
dict_cols['numerical_cols'].append('NameLength')

In [10]:
full_data['year'] = full_data['DateTime'].dt.year
full_data['month'] = full_data['DateTime'].dt.month
full_data['day'] = full_data['DateTime'].dt.day
full_data['weekday'] = full_data['DateTime'].dt.dayofweek
full_data['weekyear'] = full_data['DateTime'].dt.weekofyear
full_data['hour'] = full_data['DateTime'].dt.hour

In [11]:
dict_cols['numerical_cols'].append('year')
dict_cols['numerical_cols'].append('month')
dict_cols['numerical_cols'].append('day')
dict_cols['numerical_cols'].append('weekday')
dict_cols['numerical_cols'].append('weekyear')
dict_cols['numerical_cols'].append('hour') 

In [12]:
full_data["DateTime"] = full_data["DateTime"].apply(lambda x:str(x))

In [13]:
label_encoding(dict_cols['categorical_cols'] + dict_cols['label_col'], full_data)

Label encoding column: Name


  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


Label encoding column: AnimalType
Label encoding column: SexuponOutcome
Label encoding column: Breed
Label encoding column: Color
Label encoding column: DateTime
Label encoding column: OutcomeType


{nan: 0,
 'Adoption': 1,
 'Died': 2,
 'Euthanasia': 3,
 'Return_to_owner': 4,
 'Transfer': 5}

In [14]:
full_data.dtypes

AgeuponOutcome    int64
AnimalType        int64
Breed             int64
Color             int64
DateTime          int64
Name              int64
OutcomeType       int64
SexuponOutcome    int64
NameLength        int64
year              int64
month             int64
day               int64
weekday           int64
weekyear          int64
hour              int64
dtype: object

In [15]:
full_data.head()

Unnamed: 0,AgeuponOutcome,AnimalType,Breed,Color,DateTime,Name,OutcomeType,SexuponOutcome,NameLength,year,month,day,weekday,weekyear,hour
0,365,1,1282,146,4641,2911,4,3,7,2014,2,12,2,7,18
1,365,0,866,184,482,2266,3,4,5,2013,10,13,6,41,12
2,730,1,1210,97,17382,5501,1,3,6,2015,1,31,5,5,12
3,0,0,866,47,9918,0,5,2,3,2014,7,11,4,28,19
4,730,1,1090,311,1710,0,5,3,3,2013,11,15,4,46,12


# Model Blending

Train 4 models: RF (gini), RF(entropy), ExtraTree(gini) and ExtraTree(entropy)

Split training data into 4 folds.


In [16]:
full_cols = dict_cols['categorical_cols'] + dict_cols['numerical_cols']
num_class = 5
train_size = dict_cols['train_size']
(train_X, train_y, test_X) = (full_data[:train_size][full_cols].fillna(0).values
                        ,full_data[:train_size]['OutcomeType'].fillna(0).values
                        ,full_data[train_size:][full_cols].fillna(0).values)

skf = list(cross_validation.StratifiedKFold(train_y, 4))


clfs = [
        ExtraTreesClassifier(n_estimators = 500, criterion='entropy', n_jobs=-1), 
        ExtraTreesClassifier(n_estimators = 500,criterion='gini', n_jobs=-1),
        RandomForestClassifier(n_estimators = 500, criterion='entropy', n_jobs=-1), 
        RandomForestClassifier(n_estimators = 500,criterion='gini', n_jobs=-1)
       ]

train_blend_X = np.zeros((train_X.shape[0], len(clfs)*num_class))
test_blend_X = np.zeros((test_X.shape[0], len(clfs)*num_class))
scores = np.zeros ((len(skf),len(clfs)))

for j, clf in enumerate(clfs):
    print ("Blending model",j+1, clf)
    test_blend_X_j = np.zeros((test_X.shape[0], num_class))
    for i, (train, val) in enumerate(skf):
        print ("Model %d fold %d" %(j+1,i+1))
        train_X_fold = train_X[train]
        train_y_fold = train_y[train]
        val_X_fold = train_X[val]
        val_y_fold = train_y[val]
        clf.fit(train_X_fold, train_y_fold)
        val_y_predict_fold = clf.predict_proba(val_X_fold)
        score = metrics.log_loss(val_y_fold,val_y_predict_fold)
        print ("LOGLOSS: ", score)
        scores[i,j]=score
        train_blend_X[val, j*num_class:j*num_class+num_class] = val_y_predict_fold
        test_blend_X_j = test_blend_X_j + clf.predict_proba(test_X)
    test_blend_X[:,j*num_class:j*num_class+num_class] = test_blend_X_j/len(skf)
    print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))

('Blending model', 1, ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))
Model 1 fold 1




('LOGLOSS: ', 0.82406020879480113)
Model 1 fold 2
('LOGLOSS: ', 0.82428199531061053)
Model 1 fold 3
('LOGLOSS: ', 0.83134782113998185)
Model 1 fold 4
('LOGLOSS: ', 0.8174301854291306)
Score for model 1 is 0.824280
('Blending model', 2, ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))
Model 2 fold 1
('LOGLOSS: ', 0.82560147693243957)
Model 2 fold 2
('LOGLOSS: ', 0.82199034063022347)
Model 2 fold 3
('LOGLOSS: ', 0.83733983993745675)
Model 2 fold 4
('LOGLOSS: ', 0.81924663385318142)
Score for model 2 is 0.826045
('Blending model', 3, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samp

# Cross Validation/ Grid Search with blending

In [18]:
print "Grid Search on Blending."
param_grid = {
              }
model = search_model(train_blend_X
                                         , train_y
                                         , LogisticRegression()
                                         , param_grid
                                         , n_jobs=1
                                         , cv=8
                                         , refit=True)   

print ("best params:", model.best_params_)

Grid Search on Blending.
Fitting 8 folds for each of 1 candidates, totalling 8 fits
[CV]  ................................................................
[CV] ...................................... , score=-0.790337 -   1.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    1.0s


[CV] ...................................... , score=-0.775729 -   0.7s
[CV]  ................................................................
[CV] ...................................... , score=-0.770499 -   0.7s
[CV]  ................................................................
[CV] ...................................... , score=-0.771240 -   0.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    3.2s


[CV] ...................................... , score=-0.782286 -   0.7s
[CV]  ................................................................
[CV] ...................................... , score=-0.769832 -   0.7s
[CV]  ................................................................
[CV] ...................................... , score=-0.760914 -   0.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:    5.3s


[CV] ...................................... , score=-0.770329 -   0.7s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    5.9s finished


Best score: -0.774
('Best parameters set:', {})
('Scores:', [mean: -0.77390, std: 0.00837, params: {}])
('best params:', {})


### Write out submission

In [27]:
full_data

Unnamed: 0,AgeuponOutcome,AnimalType,Breed,Color,DateTime,Name,OutcomeType,SexuponOutcome,NameLength,year,month,day,weekday,weekyear,hour
0,365,1,1282,146,4641,2911,4,3,7,2014,2,12,2,7,18
1,365,0,866,184,482,2266,3,4,5,2013,10,13,6,41,12
2,730,1,1210,97,17382,5501,1,3,6,2015,1,31,5,5,12
3,0,0,866,47,9918,0,5,2,3,2014,7,11,4,28,19
4,730,1,1090,311,1710,0,5,3,3,2013,11,15,4,46,12
5,0,1,558,40,6899,2250,5,1,4,2014,4,25,4,17,13
6,0,0,866,70,19028,3415,5,2,5,2015,3,28,5,13,13
7,0,0,866,117,20025,0,5,5,3,2015,4,30,3,18,17
8,0,1,78,283,4432,4256,1,4,4,2014,2,4,1,6,17
9,365,1,554,359,7147,0,1,4,3,2014,5,3,5,18,7


In [40]:
np.array(xrange(1, dict_cols['test_size'] + 1))

array([    1,     2,     3, ..., 11454, 11455, 11456])

In [43]:
preds = model.predict_proba(test_blend_X)
submission = pd.DataFrame()
submission['ID'], submission['Adoption'], submission['Died'], submission['Euthanasia'], submission['Return_to_owner'], submission['Transfer'] \
            = np.array(xrange(1, dict_cols['test_size'] + 1)).astype(int),  preds[:,0], preds[:,1], preds[:,2], preds[:,3], preds[:,4]
path = './data/submission_' + str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.csv'
submission.to_csv(path, index=False)
print ("Submission created.")

Submission created.


# Homework
1. Implement grid search for weighted averaging
2. Try different combination of classifiers for level 1
3. Try different classifiers for level 2 (Elastic Net, Neural Network, etc)