In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np

%matplotlib inline

In [2]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [3]:
empty_percent = train.apply(pd.isnull, axis = 0).sum() / train.shape[0] * 100
empty_percent.sort_values(ascending=False)

OutcomeSubtype    50.925961
Name              28.773991
AgeuponOutcome     0.067343
SexuponOutcome     0.003741
Color              0.000000
Breed              0.000000
AnimalType         0.000000
OutcomeType        0.000000
DateTime           0.000000
AnimalID           0.000000
dtype: float64

In [4]:
def transform_dates(val):
#     print(float(val.split(" ")[0]) * 365)
#     print(val)
    num_val = float(val.split(" ")[0])
    if "year" in val:
        return num_val * 365
    elif "month" in val:
        return num_val * 31
    elif "week" in val:
        return num_val * 7


def transform_dataset(df, columns_to_dropna=["AgeuponOutcome", "SexuponOutcome"]):
    result = df.dropna(subset=columns_to_dropna, axis=0)
    result.loc[:, "AgeuponOutcome"] = result.loc[:, "AgeuponOutcome"].apply(transform_dates)
    # result.drop(\"Name\")
    return result

train["Train"] = True
test["Train"] = False
train.drop('AnimalID', axis=1, inplace=True)
test.drop('ID', axis=1, inplace=True)

dataset = pd.concat([train, test])
dataset = transform_dataset(dataset)
# dataset.drop(['AnimalID', 'ID'], axis=1, inplace=True)
# test_t = transform_dataset(test)
dataset.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,AgeuponOutcome,AnimalType,Breed,Color,DateTime,Name,OutcomeSubtype,OutcomeType,SexuponOutcome,Train
0,365.0,Dog,Shetland Sheepdog Mix,Brown/White,2014-02-12 18:22:00,Hambone,,Return_to_owner,Neutered Male,True
1,365.0,Cat,Domestic Shorthair Mix,Cream Tabby,2013-10-13 12:44:00,Emily,Suffering,Euthanasia,Spayed Female,True
2,730.0,Dog,Pit Bull Mix,Blue/White,2015-01-31 12:28:00,Pearce,Foster,Adoption,Neutered Male,True


Is there any empty values?

In [5]:
dataset.isnull().any()

AgeuponOutcome     True
AnimalType        False
Breed             False
Color             False
DateTime          False
Name               True
OutcomeSubtype     True
OutcomeType        True
SexuponOutcome    False
Train             False
dtype: bool

AgeuponOutcome needs fixing since it is qualitative variable

In [6]:
dataset["AgeuponOutcome"].fillna(0, inplace=True)

We want to split all mixed breeds. Breed column contains a low number of strings like "Black/Tan Hound Mix" so we will remove colors fom those so we can split all values as we wish

In [7]:
import re

# First we remove "Mix" from all breeds and add additional categorical variable to the dataset
dataset["Mix"] = False
dataset.loc[dataset["Breed"].str.contains("Mix"), "Mix"] = True
breeds = dataset["Breed"].apply(lambda x: x.split(" Mix")[0])

# Next we remove all of the colors which cause problems when we try to split mixed breeds 
breeds = breeds.apply(lambda x: re.sub('Black\s?|Tan\s?', '', x))

# After that let's remove dirty substrings left from previous replacements
breeds = breeds.apply(lambda x: re.sub('^/', '', x))
breeds = breeds.str.replace("//", "")

# Finally, lets split the breeds and modify out dataset
breeds = breeds.apply(lambda x: pd.Series(x.split("/")))
breeds.columns = ['Breed', 'SecondaryBreed']

dataset.drop('Breed', inplace=True, axis = 1)
dataset = dataset.join(breeds)

In [8]:
# dataset.reset_index(inplace=True)
dataset.head(3)

Unnamed: 0,AgeuponOutcome,AnimalType,Color,DateTime,Name,OutcomeSubtype,OutcomeType,SexuponOutcome,Train,Mix,Breed,SecondaryBreed
0,365.0,Dog,Brown/White,2014-02-12 18:22:00,Hambone,,Return_to_owner,Neutered Male,True,True,Shetland Sheepdog,
0,365.0,Dog,Brown/White,2014-02-12 18:22:00,Hambone,,Return_to_owner,Neutered Male,True,True,Labrador Retriever,
0,310.0,Dog,Red/White,2015-10-12 12:15:00,Summer,,,Intact Female,False,True,Shetland Sheepdog,


Now we split DateTime to several features

In [9]:
from datetime import *

dates = dataset['DateTime'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

def time_of_day(hour):
    if hour > 7 and hour <= 11:
        return "morning"
    elif hour > 11 and hour <= 18:
        return "day"
    elif hour > 18 and hour <= 22:
        return "evening"
    else:
        return "night"

dataset["Year"] = dates.apply(lambda x: x.year)
dataset["Month"] = dates.apply(lambda x: x.month)
dataset["TimeOfDay"] = dates.apply(lambda x: time_of_day(x.hour))
dataset.drop("DateTime", axis=1, inplace=True)

Name can be converted to HasName categorical feature which will be more useful

In [10]:
dataset['HasName'] = dataset['Name'].isnull()
dataset.drop(['Name', 'OutcomeSubtype'], axis=1, inplace=True)

There will probaby be way to much features for colors, so we will transform them too

In [11]:
pd.value_counts(dataset['Color']).shape

(411,)

57 + 47 categorical features instead of 411, nice improvement

In [12]:
split_colors = pd.DataFrame(dataset['Color'].str.split('/').tolist(), columns=["Color1", "Color2"])
dataset.drop('Color', axis = 1, inplace=True)
dataset = dataset.join(split_colors)

len(split_colors['Color1'].unique()), len(split_colors['Color2'].unique())

(57, 47)

In [13]:
sex = pd.DataFrame(dataset['SexuponOutcome'].str.split(" ").tolist(), columns=["Sterialized", "Sex"])
sex.head(2)
dataset = dataset.join(sex)
dataset.drop('SexuponOutcome', axis = 1, inplace=True)

Convert AnimalType to boolean feature

In [14]:
dataset['IsDog'] = dataset['AnimalType'].apply(lambda x: x == "Dog")
dataset['IsMale'] = dataset['Sex'].apply(lambda x: x == "Male")
dataset.drop(['AnimalType', 'Sex'], axis = 1, inplace = True)

In [15]:
cols = dataset.columns.difference(["Name", "AgeuponOutcome", "OutcomeType", "index", "Mix", "Month", "Train", "Year", "level_0", "AnimalType", "HasName"])
dataset_d = pd.get_dummies(dataset, columns = cols)
dataset_d.head(3)

Unnamed: 0,AgeuponOutcome,OutcomeType,Train,Mix,Year,Month,HasName,Breed_Abyssinian,Breed_Affenpinscher,Breed_Afghan Hound,...,SecondaryBreed_Yorkshire,SecondaryBreed_Yorkshire Terrier,Sterialized_Intact,Sterialized_Neutered,Sterialized_Spayed,Sterialized_Unknown,TimeOfDay_day,TimeOfDay_evening,TimeOfDay_morning,TimeOfDay_night
0,365.0,Return_to_owner,True,True,2014,2,False,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
0,365.0,Return_to_owner,True,True,2014,2,False,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
0,310.0,,False,True,2015,10,False,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [18]:
# dataset_d.drop(['index'], axis=1, inplace=True)
dataset_d.loc[:, dataset_d.columns.str.contains("OutcomeType")].head(1)

Unnamed: 0,OutcomeType
0,Return_to_owner


Now we fit XGBoost model on full dataset in order to perform feature selection

In [137]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold

train = dataset_d[dataset['Train'] == True]
test = dataset_d[dataset['Train'] == False]

train.drop('Train', axis=1, inplace=True)
test.drop('Train', axis=1, inplace=True)

train_x = train.loc[:, train.columns.difference(["OutcomeType"])]
train_y = train.loc[:, "OutcomeType"]

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3)

test_x = test.loc[:, test.columns.difference(["OutcomeType"])]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Error estimates for the full model will help us not to throw off important features during selection 

In [158]:
import xgboost as xgb
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# xgb_param_dist = {"n_estimators"     : 150,
#                   "max_depth"        : 8,
#                   "learning_rate"    : 0.1,
#                   "colsample_bytree" : 0.8}

xgb_param_dist = {"n_estimators"     : 200,
                  "max_depth"        : 10,
                  "learning_rate"    : 0.08,
                  "colsample_bytree" : 0.7,
                  "objective"        : "multi:softmax",
                  "num_class" : 5}

# cf = xgb.XGBClassifier(**xgb_param_dist)
enc = LabelEncoder()
enc.fit(train_y)
train_yt = enc.transform(train_y)
xgb.cv(xgb_param_dist, xgb.DMatrix(train_x, train_yt), nfold=5)
# %time cf.fit(train_x, train_y)

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.434001,0.005089,0.399392,0.002601
1,0.410597,0.013566,0.372926,0.010571
2,0.402771,0.017161,0.36555,0.016779
3,0.402771,0.015115,0.363977,0.017415
4,0.396518,0.01481,0.356759,0.015913
5,0.393372,0.015036,0.35132,0.014245
6,0.390039,0.013453,0.346274,0.009113
7,0.388729,0.012634,0.342988,0.005281
8,0.388354,0.010322,0.340573,0.003146
9,0.38783,0.009501,0.337559,0.002711


Let's get feature importance map and see top 100 features

In [207]:
fscores = cf.booster().get_fscore()
np.sort(list(fscores.values()))[::-1][0:100]

array([16156, 12963,  6516,  3782,  3189,  2868,  2801,  2688,  2243,
        2067,  1884,  1858,  1735,  1581,  1518,  1398,  1354,  1336,
        1031,  1012,   999,   938,   877,   867,   858,   829,   805,
         716,   664,   642,   622,   542,   508,   476,   475,   471,
         469,   462,   439,   407,   371,   368,   363,   360,   330,
         269,   252,   248,   246,   212,   208,   203,   203,   196,
         194,   190,   183,   182,   180,   179,   163,   155,   150,
         148,   148,   145,   137,   132,   131,   126,   125,   125,
         125,   123,   121,   119,   117,   117,   116,   114,   113,
         112,   109,   108,   106,   105,   102,   102,   100,    98,
          97,    97,    95,    94,    93,    93,    93,    92,    90,    86])

Perform feature selection based on a `fscore` threshold

In [208]:
filtered_fscores = {k: v for k, v in fscores.items() if v > 100}

print("{0:1.2f}% features left".format((len(filtered_fscores) / len(fscores)) * 100))
# model = SelectFromModel(cf ,prefit=True)
# train_xr = model.transform(train_x)

27.50% features left


Estimate the new model

In [209]:
filtered_cols = list(filtered_fscores.keys())
train_xr = train_x[filtered_cols]
val_xr = val_x[filtered_cols]

# cfr = xgb.XGBClassifier(**xgb_param_dist)
# cfr.fit(train_xr, train_y)
xgb.cv(xgb_param_dist, xgb.DMatrix(train_xr, train_yt), 15, nfold=5)

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.411309,0.005203,0.37229,0.002794
1,0.402921,0.012241,0.362516,0.005214
2,0.397828,0.009148,0.355589,0.006363
3,0.397042,0.012291,0.354606,0.008428
4,0.392136,0.009479,0.347884,0.003765
5,0.391275,0.009725,0.347332,0.004423
6,0.389964,0.008976,0.344439,0.00251
7,0.389365,0.008314,0.343278,0.003083
8,0.387231,0.007982,0.340882,0.002833
9,0.387793,0.009118,0.338551,0.003041


Now it is time for hyperparameter optimization

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ValueError: X has a different shape than during fitting.