In [2]:
import pandas as pd
import matplotlib as plt
import numpy as np

%matplotlib inline

In [3]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [4]:
empty_percent = train.apply(pd.isnull, axis = 0).sum() / train.shape[0] * 100
empty_percent.sort_values(ascending=False)

OutcomeSubtype    50.925961
Name              28.773991
AgeuponOutcome     0.067343
SexuponOutcome     0.003741
Color              0.000000
Breed              0.000000
AnimalType         0.000000
OutcomeType        0.000000
DateTime           0.000000
AnimalID           0.000000
dtype: float64

In [5]:
def transform_dates(val):
#     print(float(val.split(" ")[0]) * 365)
#     print(val)
    num_val = float(val.split(" ")[0])
    if "year" in val:
        return num_val * 365
    elif "month" in val:
        return num_val * 31
    elif "week" in val:
        return num_val * 7


def transform_dataset(df, columns_to_dropna=["AgeuponOutcome", "SexuponOutcome"]):
    result = df.dropna(subset=columns_to_dropna, axis=0)
    result.loc[:, "AgeuponOutcome"] = result.loc[:, "AgeuponOutcome"].apply(transform_dates)
    # result.drop(\"Name\")
    return result

train["Train"] = True
test["Train"] = False
train.drop('AnimalID', axis=1, inplace=True)
test.drop('ID', axis=1, inplace=True)

dataset = pd.concat([train, test])
dataset = transform_dataset(dataset)
# dataset.drop(['AnimalID', 'ID'], axis=1, inplace=True)
# test_t = transform_dataset(test)
dataset.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,AgeuponOutcome,AnimalType,Breed,Color,DateTime,Name,OutcomeSubtype,OutcomeType,SexuponOutcome,Train
0,365,Dog,Shetland Sheepdog Mix,Brown/White,2014-02-12 18:22:00,Hambone,,Return_to_owner,Neutered Male,True
1,365,Cat,Domestic Shorthair Mix,Cream Tabby,2013-10-13 12:44:00,Emily,Suffering,Euthanasia,Spayed Female,True
2,730,Dog,Pit Bull Mix,Blue/White,2015-01-31 12:28:00,Pearce,Foster,Adoption,Neutered Male,True


Is there any empty values?

In [6]:
dataset.isnull().any()

AgeuponOutcome     True
AnimalType        False
Breed             False
Color             False
DateTime          False
Name               True
OutcomeSubtype     True
OutcomeType        True
SexuponOutcome    False
Train             False
dtype: bool

AgeuponOutcome needs fixing since it is qualitative variable

In [7]:
dataset["AgeuponOutcome"].fillna(0, inplace=True)

We want to split all mixed breeds. Breed column contains a low number of strings like "Black/Tan Hound Mix" so we will remove colors fom those so we can split all values as we wish

In [8]:
import re

# First we remove "Mix" from all breeds and add additional categorical variable to the dataset
dataset["Mix"] = False
dataset.loc[dataset["Breed"].str.contains("Mix"), "Mix"] = True
breeds = dataset["Breed"].apply(lambda x: x.split(" Mix")[0])

# Next we remove all of the colors which cause problems when we try to split mixed breeds 
breeds = breeds.apply(lambda x: re.sub('Black\s?|Tan\s?', '', x))

# After that let's remove dirty substrings left from previous replacements
breeds = breeds.apply(lambda x: re.sub('^/', '', x))
breeds = breeds.str.replace("//", "")

# Finally, lets split the breeds and modify out dataset
breeds = breeds.apply(lambda x: pd.Series(x.split("/")))
breeds.columns = ['Breed', 'SecondaryBreed']

dataset.drop('Breed', inplace=True, axis = 1)
dataset = dataset.join(breeds)

In [9]:
# dataset.reset_index(inplace=True)
dataset.head(3)

Unnamed: 0,AgeuponOutcome,AnimalType,Color,DateTime,Name,OutcomeSubtype,OutcomeType,SexuponOutcome,Train,Mix,Breed,SecondaryBreed
0,365,Dog,Brown/White,2014-02-12 18:22:00,Hambone,,Return_to_owner,Neutered Male,True,True,Shetland Sheepdog,
0,365,Dog,Brown/White,2014-02-12 18:22:00,Hambone,,Return_to_owner,Neutered Male,True,True,Labrador Retriever,
0,310,Dog,Red/White,2015-10-12 12:15:00,Summer,,,Intact Female,False,True,Shetland Sheepdog,


Now we split DateTime to several features

In [10]:
from datetime import *

dates = dataset['DateTime'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

def time_of_day(hour):
    if hour > 7 and hour <= 11:
        return "morning"
    elif hour > 11 and hour <= 18:
        return "day"
    elif hour > 18 and hour <= 22:
        return "evening"
    else:
        return "night"

dataset["Year"] = dates.apply(lambda x: x.year)
dataset["Month"] = dates.apply(lambda x: x.month)
dataset["TimeOfDay"] = dates.apply(lambda x: time_of_day(x.hour))
dataset.drop("DateTime", axis=1, inplace=True)

Name can be converted to HasName categorical feature which will be more useful

In [11]:
dataset['HasName'] = dataset['Name'].isnull()
dataset.drop(['Name', 'OutcomeSubtype'], axis=1, inplace=True)

There will probaby be way to much features for colors, so we will transform them too

In [12]:
pd.value_counts(dataset['Color']).shape

(411,)

57 + 47 categorical features instead of 411, nice improvement

In [13]:
split_colors = pd.DataFrame(dataset['Color'].str.split('/').tolist(), columns=["Color1", "Color2"])
dataset.drop('Color', axis = 1, inplace=True)
dataset = dataset.join(split_colors)

len(split_colors['Color1'].unique()), len(split_colors['Color2'].unique())

(57, 47)

In [14]:
sex = pd.DataFrame(dataset['SexuponOutcome'].str.split(" ").tolist(), columns=["Sterialized", "Sex"])
sex.head(2)
dataset = dataset.join(sex)
dataset.drop('SexuponOutcome', axis = 1, inplace=True)

Convert AnimalType to boolean feature

In [15]:
dataset['IsDog'] = dataset['AnimalType'].apply(lambda x: x == "Dog")
dataset['IsMale'] = dataset['Sex'].apply(lambda x: x == "Male")
dataset.drop(['AnimalType', 'Sex'], axis = 1, inplace = True)

In [16]:
cols = dataset.columns.difference(["Name", "AgeuponOutcome", "OutcomeType", "index", "Mix", "Month", "Train", "Year", "level_0", "AnimalType", "HasName"])
dataset_d = pd.get_dummies(dataset, columns = cols)
dataset_d.head(3)

Unnamed: 0,AgeuponOutcome,OutcomeType,Train,Mix,Year,Month,HasName,Breed_Abyssinian,Breed_Affenpinscher,Breed_Afghan Hound,...,SecondaryBreed_Yorkshire,SecondaryBreed_Yorkshire Terrier,Sterialized_Intact,Sterialized_Neutered,Sterialized_Spayed,Sterialized_Unknown,TimeOfDay_day,TimeOfDay_evening,TimeOfDay_morning,TimeOfDay_night
0,365,Return_to_owner,True,True,2014,2,False,0,0,0,...,0,0,0,1,0,0,1,0,0,0
0,365,Return_to_owner,True,True,2014,2,False,0,0,0,...,0,0,0,1,0,0,1,0,0,0
0,310,,False,True,2015,10,False,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [17]:
# dataset_d.drop(['index'], axis=1, inplace=True)
dataset_d.loc[:, dataset_d.columns.str.contains("OutcomeType")].head(1)

Unnamed: 0,OutcomeType
0,Return_to_owner


Now we fit XGBoost model on full dataset in order to perform feature selection

In [20]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold

train = dataset_d[dataset['Train'] == True]
test = dataset_d[dataset['Train'] == False]

train.drop('Train', axis=1, inplace=True)
test.drop('Train', axis=1, inplace=True)

train_x = train.loc[:, train.columns.difference(["OutcomeType"])]
train_y = train.loc[:, "OutcomeType"]

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3)

test_x = test.loc[:, test.columns.difference(["OutcomeType"])]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Error estimates for the full model will help us not to throw off important features during selection 

In [21]:
import xgboost as xgb
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# xgb_param_dist = {"n_estimators"     : 150,
#                   "max_depth"        : 8,
#                   "learning_rate"    : 0.1,
#                   "colsample_bytree" : 0.8}

xgb_param_dist = {"n_estimators"     : 200,
                  "max_depth"        : 10,
                  "learning_rate"    : 0.08,
                  "colsample_bytree" : 0.7,
                  "objective"        : "multi:softmax",
                  "num_class" : 5}

enc = LabelEncoder()
enc.fit(train_y)
train_yt = enc.transform(train_y)
xgb.cv(xgb_param_dist, xgb.DMatrix(train_x, train_yt), nfold=5)

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.435649,0.006023,0.401432,0.002125
1,0.413219,0.010682,0.370455,0.010213
2,0.404044,0.015731,0.363649,0.017732
3,0.402808,0.015923,0.361814,0.017228
4,0.396255,0.013393,0.354437,0.015069
5,0.393447,0.012647,0.350786,0.014011
6,0.390264,0.008513,0.345815,0.009056
7,0.387605,0.006445,0.342754,0.007748
8,0.385546,0.00485,0.339758,0.005976
9,0.384048,0.003194,0.336969,0.00554


In [25]:
xgb_param_dist.pop("num_class", None)
cf = xgb.XGBClassifier(**xgb_param_dist)
%time cf.fit(train_x, train_y)

CPU times: user 21min 43s, sys: 24.5 s, total: 22min 8s
Wall time: 3min 15s


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.08, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='multi:softmax', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

Let's get feature importance map and see top 100 features

In [26]:
fscores = cf.booster().get_fscore()
np.sort(list(fscores.values()))[::-1][0:100]

array([16270, 13009,  6384,  3837,  3292,  3000,  2875,  2581,  2448,
        2275,  1932,  1921,  1831,  1713,  1614,  1510,  1450,  1407,
        1026,  1008,   941,   932,   865,   806,   802,   797,   751,
         743,   718,   716,   673,   600,   498,   497,   495,   491,
         445,   412,   396,   390,   383,   379,   355,   307,   298,
         293,   283,   282,   239,   237,   229,   228,   216,   216,
         200,   199,   198,   193,   187,   183,   181,   172,   168,
         162,   156,   155,   154,   154,   143,   138,   134,   130,
         127,   126,   123,   121,   121,   119,   118,   117,   113,
         111,   109,   109,   109,   109,   107,   104,   104,    99,
          94,    93,    92,    91,    88,    88,    83,    82,    81,    78])

Perform feature selection based on a `fscore` threshold

In [27]:
filtered_fscores = {k: v for k, v in fscores.items() if v > 100}

print("{0:1.2f}% features left".format((len(filtered_fscores) / len(fscores)) * 100))
# model = SelectFromModel(cf ,prefit=True)
# train_xr = model.transform(train_x)

27.05% features left


Estimate the new model

In [28]:
filtered_cols = list(filtered_fscores.keys())
train_xr = train_x[filtered_cols]
val_xr = val_x[filtered_cols]

# cfr = xgb.XGBClassifier(**xgb_param_dist)
# cfr.fit(train_xr, train_y)
xgb.cv(xgb_param_dist, xgb.DMatrix(train_xr, train_yt), 15, nfold=5)

XGBoostError: b'value 0for Parameter num_class should be greater equal to 1'

Now it is time for hyperparameter optimization

In [36]:
from scipy.stats import uniform
from scipy.stats import randint as sp_randint
from sklearn.grid_search import RandomizedSearchCV

xgb_param_dist = {"n_estimators" : np.arange(10, 250, 10),
                    "max_depth": sp_randint(2, 31),
                    "learning_rate" : uniform(loc = 0.01, scale=0.2),
                    "colsample_bytree" : uniform(loc = 0.3, scale = 0.7),
                    "subsample" : uniform(loc = 0.0, scale = 0.7),
                    "objective" : "multi:softmax"}

xgb_clf = xgb.XGBClassifier()
n_iter_search = 30

xgb_random_search = RandomizedSearchCV(xgb_clf, param_distributions = xgb_param_dist,
                                       n_iter = n_iter_search, random_state = 123, n_jobs = 8, verbose = 1)
%time xgb_random_search.fit(train_xr, train_yt)
                                       
xgb_clf = xgb_random_search.best_estimator_

print("Best randomized search score - %s" % xgb_random_search.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed: 12.5min
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed: 30.3min finished


CPU times: user 2min 30s, sys: 1.69 s, total: 2min 32s
Wall time: 30min 36s
Best randomized search score - 0.636734693878


In [38]:
xgb_clf.booster().save_model('random_search_best_est.model')

In [39]:
xgb_clf

XGBClassifier(base_score=0.5, colsample_bylevel=1,
       colsample_bytree=0.767666166020837, gamma=0,
       learning_rate=0.0772553896305663, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=None, n_estimators=70, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True,
       subsample=0.5315892750132439)