In [262]:
import pandas as pd
import matplotlib as plt
import numpy as np

%matplotlib inline

In [450]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
train.shape, test.shape

((26729, 10), (11456, 8))

In [451]:
empty_percent = train.apply(pd.isnull, axis = 0).sum() / train.shape[0] * 100
empty_percent.sort_values(ascending=False)

OutcomeSubtype    50.925961
Name              28.773991
AgeuponOutcome     0.067343
SexuponOutcome     0.003741
Color              0.000000
Breed              0.000000
AnimalType         0.000000
OutcomeType        0.000000
DateTime           0.000000
AnimalID           0.000000
dtype: float64

In [452]:
def transform_dates(val):
#     print(float(val.split(" ")[0]) * 365)
#     print(val)
    if pd.isnull(val):
        return val
    
    num_val = float(val.split(" ")[0])
    if "year" in val:
        return num_val * 365
    elif "month" in val:
        return num_val * 31
    elif "week" in val:
        return num_val * 7


def transform_dataset(df, columns_to_dropna=["AgeuponOutcome", "SexuponOutcome"]):
    result = df
#     result = df.dropna(subset=columns_to_dropna, axis=0)
    result.loc[:, "AgeuponOutcome"] = result.loc[:, "AgeuponOutcome"].apply(transform_dates)
    result["AgeuponOutcome"].fillna(result['AgeuponOutcome'].dropna().mean(), inplace = True)
    return result

train["Train"] = True
test["Train"] = False
train.drop('AnimalID', axis=1, inplace=True)
test.drop('ID', axis=1, inplace=True)

dataset = pd.concat([train, test])
dataset = transform_dataset(dataset)
# dataset.drop(['AnimalID', 'ID'], axis=1, inplace=True)
# test_t = transform_dataset(test)
dataset.reset_index()
dataset.head(3)

Unnamed: 0,AgeuponOutcome,AnimalType,Breed,Color,DateTime,Name,OutcomeSubtype,OutcomeType,SexuponOutcome,Train
0,365.0,Dog,Shetland Sheepdog Mix,Brown/White,2014-02-12 18:22:00,Hambone,,Return_to_owner,Neutered Male,True
1,365.0,Cat,Domestic Shorthair Mix,Cream Tabby,2013-10-13 12:44:00,Emily,Suffering,Euthanasia,Spayed Female,True
2,730.0,Dog,Pit Bull Mix,Blue/White,2015-01-31 12:28:00,Pearce,Foster,Adoption,Neutered Male,True


In [453]:
dataset.shape, dataset[dataset['Train'] == False].shape, test.shape

((38185, 10), (11456, 10), (11456, 8))

Is there any empty values?

In [454]:
dataset.isnull().any()

AgeuponOutcome    False
AnimalType        False
Breed             False
Color             False
DateTime          False
Name               True
OutcomeSubtype     True
OutcomeType        True
SexuponOutcome     True
Train             False
dtype: bool

We want to split all mixed breeds. Breed column contains a low number of strings like "Black/Tan Hound Mix" so we will remove colors fom those so we can split all values as we wish

In [455]:
import re

# First we remove "Mix" from all breeds and add additional categorical variable to the dataset
dataset["Mix"] = False
dataset.loc[dataset["Breed"].str.contains("Mix"), "Mix"] = True
breeds = dataset["Breed"].apply(lambda x: x.split(" Mix")[0])

# Next we remove all of the colors which cause problems when we try to split mixed breeds 
breeds = breeds.apply(lambda x: re.sub('Black\s?|Tan\s?', '', x))

# After that let's remove dirty substrings left from previous replacements
breeds = breeds.apply(lambda x: re.sub('^/', '', x))
breeds = breeds.str.replace("//", "")

# Finally, lets split the breeds and modify out dataset
breeds = breeds.apply(lambda x: pd.Series(x.split("/")))
breeds.columns = ['Breed', 'SecondaryBreed']

dataset.drop('Breed', inplace=True, axis = 1)
dataset = pd.concat([dataset, breeds], axis = 1)

In [456]:
breeds.head(2)

Unnamed: 0,Breed,SecondaryBreed
0,Shetland Sheepdog,
1,Domestic Shorthair,


In [457]:
# dataset.reset_index(inplace=True)
dataset.head(3)

Unnamed: 0,AgeuponOutcome,AnimalType,Color,DateTime,Name,OutcomeSubtype,OutcomeType,SexuponOutcome,Train,Mix,Breed,SecondaryBreed
0,365.0,Dog,Brown/White,2014-02-12 18:22:00,Hambone,,Return_to_owner,Neutered Male,True,True,Shetland Sheepdog,
1,365.0,Cat,Cream Tabby,2013-10-13 12:44:00,Emily,Suffering,Euthanasia,Spayed Female,True,True,Domestic Shorthair,
2,730.0,Dog,Blue/White,2015-01-31 12:28:00,Pearce,Foster,Adoption,Neutered Male,True,True,Pit Bull,


Now we split DateTime to several features

In [458]:
from datetime import *

dates = dataset['DateTime'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

def time_of_day(hour):
    if hour > 7 and hour <= 11:
        return "morning"
    elif hour > 11 and hour <= 18:
        return "day"
    elif hour > 18 and hour <= 22:
        return "evening"
    else:
        return "night"

dataset["Year"] = dates.apply(lambda x: x.year)
dataset["Month"] = dates.apply(lambda x: x.month)
dataset["TimeOfDay"] = dates.apply(lambda x: time_of_day(x.hour))
dataset.drop("DateTime", axis=1, inplace=True)

Name can be converted to HasName categorical feature which will be more useful

In [459]:
dataset['HasName'] = dataset['Name'].isnull()
dataset.drop(['Name', 'OutcomeSubtype'], axis=1, inplace=True)

There will probaby be way to much features for colors, so we will transform them too

In [460]:
pd.value_counts(dataset['Color']).shape

(411,)

57 + 47 categorical features instead of 411, nice improvement

In [461]:
split_colors = pd.DataFrame(dataset['Color'].str.split('/').tolist(), columns=["Color1", "Color2"])
dataset.drop('Color', axis = 1, inplace=True)
dataset = dataset.join(split_colors)

len(split_colors['Color1'].unique()), len(split_colors['Color2'].unique())

(57, 47)

In [462]:
dataset['SexuponOutcome'].value_counts()

Neutered Male    14014
Spayed Female    12633
Intact Female     5004
Intact Male       4985
Unknown           1548
Name: SexuponOutcome, dtype: int64

In [463]:
dataset['SexuponOutcome'].fillna('Neutered Male', inplace=True)

In [464]:
sex = pd.DataFrame(dataset['SexuponOutcome'].str.split(" ").tolist(), columns=["Sterialized", "Sex"])
sex.head(2)
dataset = dataset.join(sex)
dataset.drop('SexuponOutcome', axis = 1, inplace=True)

Convert AnimalType to boolean feature

In [465]:
dataset['IsDog'] = dataset['AnimalType'].apply(lambda x: x == "Dog")
dataset['IsMale'] = dataset['Sex'].apply(lambda x: x == "Male")
dataset.drop(['AnimalType', 'Sex'], axis = 1, inplace = True)

In [466]:
cols = dataset.columns.difference(["Name", "AgeuponOutcome", "OutcomeType", "index", "Mix", "Month", "Train", "Year", "level_0", "AnimalType", "HasName"])
dataset_d = pd.get_dummies(dataset, columns = cols)
dataset_d.head(3)

Unnamed: 0,AgeuponOutcome,OutcomeType,Train,Mix,Year,Month,HasName,Breed_Abyssinian,Breed_Affenpinscher,Breed_Afghan Hound,...,SecondaryBreed_Yorkshire,SecondaryBreed_Yorkshire Terrier,Sterialized_Intact,Sterialized_Neutered,Sterialized_Spayed,Sterialized_Unknown,TimeOfDay_day,TimeOfDay_evening,TimeOfDay_morning,TimeOfDay_night
0,365.0,Return_to_owner,True,True,2014,2,False,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
0,310.0,,False,True,2015,10,False,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,365.0,Euthanasia,True,True,2013,10,False,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [467]:
# dataset_d.drop(['index'], axis=1, inplace=True)
dataset_d.loc[:, dataset_d.columns.str.contains("OutcomeType")].head(1)

Unnamed: 0,OutcomeType
0,Return_to_owner


Now we fit XGBoost model on full dataset in order to perform feature selection

In [468]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold

train = dataset_d[dataset['Train'] == True]
test = dataset_d[dataset['Train'] == False]

train.drop('Train', axis=1, inplace=True)
test.drop('Train', axis=1, inplace=True)

train_x = train.loc[:, train.columns.difference(["OutcomeType"])]
train_y = train.loc[:, "OutcomeType"]

# train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3)

test_x = test.loc[:, test.columns.difference(["OutcomeType"])]

train_x.shape, test_x.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


((26729, 509), (11456, 509))

Error estimates for the full model will help us not to throw off important features during selection 

In [469]:
import xgboost as xgb
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# xgb_param_dist = {"n_estimators"     : 150,
#                   "max_depth"        : 8,
#                   "learning_rate"    : 0.1,
#                   "colsample_bytree" : 0.8}

xgb_param_dist = {"n_estimators"     : 200,
                  "max_depth"        : 10,
                  "learning_rate"    : 0.08,
                  "colsample_bytree" : 0.7,
                  "objective"        : "multi:softmax",
                  "num_class" : 5}

enc = LabelEncoder()
enc.fit(train_y)
train_yt = enc.transform(train_y)
xgb.cv(xgb_param_dist, xgb.DMatrix(train_x, train_yt), nfold=5)

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.434509,0.008521,0.394861,0.003466
1,0.416888,0.010158,0.376061,0.012159
2,0.404878,0.009945,0.36177,0.009739
3,0.399192,0.008425,0.353511,0.00409
4,0.394702,0.007734,0.350518,0.002687
5,0.396498,0.007327,0.347946,0.000908
6,0.395001,0.007997,0.34688,0.002591
7,0.394889,0.009517,0.346319,0.003439
8,0.394777,0.007713,0.345243,0.003385
9,0.394478,0.007703,0.343494,0.003396


In [470]:
# xgb_param_dist_sk = 
# xgb_param_dist.pop("num_class", None)
# cf = xgb.Booster(**xgb_param_dist)
# %time cf.fit(train_x, train_y)

bst = xgb.train(xgb_param_dist, xgb.DMatrix(train_x, train_yt))

Let's get feature importance map and see top 100 features

In [None]:
cf.booster().save_model('full.model')

In [471]:
fscores = bst.get_fscore()
# fscores = bst.get_fscore()
np.sort(list(fscores.values()))[::-1][0:100]

array([1750, 1245,  735,  428,  417,  341,  325,  287,  268,  228,  186,
        183,  181,  180,  158,  156,  144,  139,  116,  109,  108,  107,
        107,  100,   92,   90,   86,   85,   85,   83,   78,   77,   73,
         65,   53,   50,   50,   48,   43,   40,   39,   39,   38,   38,
         38,   37,   36,   36,   35,   35,   34,   33,   31,   31,   30,
         29,   28,   28,   27,   26,   26,   26,   25,   23,   22,   22,
         21,   21,   21,   20,   20,   18,   18,   17,   17,   16,   16,
         15,   15,   15,   15,   14,   14,   14,   14,   14,   14,   14,
         13,   13,   13,   13,   13,   13,   13,   11,   11,   11,   11,
         10])

Perform feature selection based on a `fscore` threshold

In [472]:
from sklearn.feature_selection import SelectFromModel
filtered_fscores = {k: v for k, v in fscores.items() if v > 10}

print("{0:1.2f}% features left".format((len(filtered_fscores) / len(fscores)) * 100))
# model = SelectFromModel(cf ,prefit=True)
# train_xr = model.transform(train_x)

50.51% features left


Estimate the new model

In [473]:
filtered_cols = list(filtered_fscores.keys())
train_xr = train_x[filtered_cols]
# val_xr = val_x[filtered_cols]
test_xr = test_x[filtered_cols]

# cfr = xgb.XGBClassifier(**xgb_param_dist)
# cfr.fit(train_xr, train_y)
xgb.cv(xgb_param_dist, xgb.DMatrix(train_xr, train_yt), 15, nfold=5)

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.411688,0.005885,0.375004,0.005052
1,0.404579,0.006576,0.365642,0.005319
2,0.397396,0.007987,0.360966,0.004618
3,0.394328,0.008577,0.355962,0.006045
4,0.393355,0.009763,0.3531,0.005223
5,0.392832,0.007587,0.349649,0.004565
6,0.391934,0.008113,0.347226,0.004189
7,0.39141,0.009768,0.346019,0.004753
8,0.390961,0.008086,0.345197,0.004084
9,0.390998,0.00822,0.343204,0.004283


Now it is time for hyperparameter optimization

In [157]:
from scipy.stats import uniform
from scipy.stats import randint as sp_randint
from sklearn.grid_search import RandomizedSearchCV

xgb_param_dist = {"n_estimators" : np.arange(10, 250, 10),
                    "max_depth": sp_randint(2, 31),
                    "learning_rate" : uniform(loc = 0.01, scale=0.2),
                    "colsample_bytree" : uniform(loc = 0.3, scale = 0.7),
                    "subsample" : uniform(loc = 0.0, scale = 0.7),
                    "objective" : "multi:softmax"}

xgb_clf = xgb.XGBClassifier()
n_iter_search = 30

xgb_random_search = RandomizedSearchCV(xgb_clf, param_distributions = xgb_param_dist,
                                       n_iter = n_iter_search, random_state = 123, n_jobs = 8, verbose = 1)
%time xgb_random_search.fit(train_xr, train_yt)
                                       
xgb_clf = xgb_random_search.best_estimator_

print("Best randomized search score - %s" % xgb_random_search.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed: 35.6min
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed: 76.8min finished


CPU times: user 1min 28s, sys: 2.35 s, total: 1min 31s
Wall time: 1h 16min 58s
Best randomized search score - 0.605478374836


In [148]:
xgb_clf.booster().save_model('random_search_best_est.model')

In [158]:
import xgboost as xgb
best_m = xgb.Booster({'nthread' : 8}) #init model
best_m.load_model('/home/kdubovikov/MEGA/random_search_best_est.model') # load data

pred = best_m.predict(xgb.DMatrix(test_x))
enc.inverse_transform(pred.astype(int))

array([['Adoption', 'Adoption', 'Adoption', 'Adoption', 'Adoption'],
       ['Adoption', 'Adoption', 'Adoption', 'Adoption', 'Adoption'],
       ['Adoption', 'Adoption', 'Adoption', 'Adoption', 'Adoption'],
       ..., 
       ['Adoption', 'Adoption', 'Adoption', 'Adoption', 'Adoption'],
       ['Adoption', 'Adoption', 'Adoption', 'Adoption', 'Adoption'],
       ['Adoption', 'Adoption', 'Adoption', 'Adoption', 'Adoption']], dtype=object)

In [171]:
xgb_random_search.best_params_

{'colsample_bytree': 0.7082178433723527,
 'learning_rate': 0.062116170866098705,
 'max_depth': 7,
 'n_estimators': 50,
 'num_class': 5,
 'objective': 'multi:softmax',
 'subsample': 0.31904123137425433}

In [474]:
#todo estimate

best_params ={'colsample_bytree': 0.7082178433723527,
             'learning_rate': 0.06,
             'max_depth': 10,
             'n_estimators': 50,
             'num_class': 5,
             'objective': 'multi:softmax',
             'subsample': 0.3}
best_params['num_class']= 5
best_params['objective'] = 'multi:softmax'

filtered_fscores_t = {k: v for k, v in fscores.items() if v > 0}
filtered_cols_t = list(filtered_fscores_t.keys())
train_xrt = train_x[filtered_cols_t]
# val_xr = val_x[filtered_cols]
test_xrt = test_x[filtered_cols_t]

# cfr = xgb.XGBClassifier(**xgb_param_dist)
# cfr.fit(train_xr, train_y)
# xgb.cv(best_params, xgb.DMatrix(train_x, train_yt), 15, nfold=5)
%time xgb.cv(best_params, xgb.DMatrix(train_xr, train_yt), nfold=5)



CPU times: user 50.8 s, sys: 604 ms, total: 51.4 s
Wall time: 6.9 s


Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.427251,0.00706,0.406244,0.0029
1,0.408171,0.008911,0.386715,0.005646
2,0.401549,0.007607,0.379765,0.007178
3,0.400614,0.006822,0.377034,0.006619
4,0.398743,0.007662,0.375351,0.004983
5,0.397733,0.007002,0.37247,0.004635
6,0.396947,0.007669,0.371656,0.00362
7,0.395563,0.007639,0.370468,0.003319
8,0.396386,0.007279,0.369795,0.003685
9,0.396274,0.008441,0.367672,0.003782


In [475]:
%time booster = xgb.train(best_params, xgb.DMatrix(train_xr, train_yt))
np.unique(booster.predict(xgb.DMatrix(test_xr)))

CPU times: user 12.8 s, sys: 168 ms, total: 12.9 s
Wall time: 1.71 s


array([ 0.,  2.,  3.,  4.], dtype=float32)

In [476]:
preds = booster.predict(xgb.DMatrix(test_xr))
oh_results = enc.inverse_transform(preds.astype(int))
np.unique(oh_results)

array(['Adoption', 'Euthanasia', 'Return_to_owner', 'Transfer'], dtype=object)

In [477]:
oh_enc = preprocessing.OneHotEncoder(sparse=False)
results = oh_enc.fit_transform(preds.reshape(-1, 1))
results = results.astype(int)
results

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       ..., 
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [482]:
# test_results = pd.read_csv("./test.csv")
# test_results['ID'] = test_results['ID'].astype(int)
# final_results = pd.concat([test_results["ID"], pd.DataFrame(results)], axis = 1)
final_results = pd.DataFrame(results)
final_results.columns = ['Adoption', 'Euthanasia', 'Return_to_owner', 'Transfer']
final_results['Died'] = 0
final_results.index.name = 'ID'
final_results.index = final_results.index + 1

final_results = final_results[['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']]

final_results.to_csv('results.csv')

In [483]:
final_results.shape

(11456, 5)