In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np

%matplotlib inline

In [2]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
train.shape, test.shape

((26729, 10), (11456, 8))

In [3]:
empty_percent = train.apply(pd.isnull, axis = 0).sum() / train.shape[0] * 100
empty_percent.sort_values(ascending=False)

OutcomeSubtype    50.925961
Name              28.773991
AgeuponOutcome     0.067343
SexuponOutcome     0.003741
Color              0.000000
Breed              0.000000
AnimalType         0.000000
OutcomeType        0.000000
DateTime           0.000000
AnimalID           0.000000
dtype: float64

In [4]:
def transform_dates(val):
#     print(float(val.split(" ")[0]) * 365)
#     print(val)
    if pd.isnull(val):
        return val
    
    num_val = float(val.split(" ")[0])
    if "year" in val:
        return num_val * 365
    elif "month" in val:
        return num_val * 30.5
    elif "week" in val:
        return num_val * 7


def transform_dataset(df, columns_to_dropna=["AgeuponOutcome", "SexuponOutcome"]):
    result = df
#     result = df.dropna(subset=columns_to_dropna, axis=0)
    result.loc[:, "AgeuponOutcome"] = result.loc[:, "AgeuponOutcome"].apply(transform_dates)
    result["AgeuponOutcome"].fillna(result['AgeuponOutcome'].dropna().mean(), inplace = True)
    return result

train["Train"] = True
test["Train"] = False
train.drop('AnimalID', axis=1, inplace=True)
test.drop('ID', axis=1, inplace=True)

dataset = pd.concat([train, test])
dataset = transform_dataset(dataset)
# dataset.drop(['AnimalID', 'ID'], axis=1, inplace=True)
# test_t = transform_dataset(test)
dataset.reset_index()
dataset.head(3)

Unnamed: 0,AgeuponOutcome,AnimalType,Breed,Color,DateTime,Name,OutcomeSubtype,OutcomeType,SexuponOutcome,Train
0,365.0,Dog,Shetland Sheepdog Mix,Brown/White,2014-02-12 18:22:00,Hambone,,Return_to_owner,Neutered Male,True
1,365.0,Cat,Domestic Shorthair Mix,Cream Tabby,2013-10-13 12:44:00,Emily,Suffering,Euthanasia,Spayed Female,True
2,730.0,Dog,Pit Bull Mix,Blue/White,2015-01-31 12:28:00,Pearce,Foster,Adoption,Neutered Male,True


In [5]:
dataset.shape, dataset[dataset['Train'] == False].shape, test.shape

((38185, 10), (11456, 10), (11456, 8))

Is there any empty values?

In [6]:
dataset.isnull().any()

AgeuponOutcome    False
AnimalType        False
Breed             False
Color             False
DateTime          False
Name               True
OutcomeSubtype     True
OutcomeType        True
SexuponOutcome     True
Train             False
dtype: bool

We want to split all mixed breeds. Breed column contains a low number of strings like "Black/Tan Hound Mix" so we will remove colors fom those so we can split all values as we wish

In [7]:
import re

# First we remove "Mix" from all breeds and add additional categorical variable to the dataset
dataset["Mix"] = False
dataset.loc[dataset["Breed"].str.contains("Mix"), "Mix"] = True

# DEPRECATRED - those features seem too be better removed
# breeds = dataset["Breed"].apply(lambda x: x.split(" Mix")[0])

# # Next we remove all of the colors which cause problems when we try to split mixed breeds 
# breeds = breeds.apply(lambda x: re.sub('Black\s?|Tan\s?', '', x))

# # After that let's remove dirty substrings left from previous replacements
# breeds = breeds.apply(lambda x: re.sub('^/', '', x))
# breeds = breeds.str.replace("//", "")

# # Finally, lets split the breeds and modify out dataset
# breeds = breeds.apply(lambda x: pd.Series(x.split("/")))
# breeds.columns = ['Breed', 'SecondaryBreed']

dataset.drop('Breed', inplace=True, axis = 1)
# dataset = pd.concat([dataset, breeds], axis = 1)

In [8]:
breeds.head(2)

NameError: name 'breeds' is not defined

In [9]:
# dataset.reset_index(inplace=True)
dataset.head(3)

Unnamed: 0,AgeuponOutcome,AnimalType,Color,DateTime,Name,OutcomeSubtype,OutcomeType,SexuponOutcome,Train,Mix
0,365.0,Dog,Brown/White,2014-02-12 18:22:00,Hambone,,Return_to_owner,Neutered Male,True,True
1,365.0,Cat,Cream Tabby,2013-10-13 12:44:00,Emily,Suffering,Euthanasia,Spayed Female,True,True
2,730.0,Dog,Blue/White,2015-01-31 12:28:00,Pearce,Foster,Adoption,Neutered Male,True,True


Now we split DateTime to several features

In [10]:
from datetime import *

dates = dataset['DateTime'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

# def time_of_day(hour):
#     if hour > 7 and hour <= 11:
#         return "morning"
#     elif hour > 11 and hour <= 18:
#         return "day"
#     elif hour > 18 and hour <= 22:
#         return "evening"
#     else:
#         return "night"

dataset["Year"] = dates.apply(lambda x: x.year)
dataset["Month"] = dates.apply(lambda x: x.month)
dataset["Hour"] = dates.apply(lambda x: x.hour)
dataset["Weekday"] = dates.apply(lambda x: x.weekday())
# dataset["TimeOfDay"] = dates.apply(lambda x: time_of_day(x.hour))
dataset.head(5)
dataset.drop("DateTime", axis=1, inplace=True)

Name can be converted to HasName categorical feature which will be more useful

In [11]:
dataset['HasName'] = dataset['Name'].isnull()
dataset.drop(['Name', 'OutcomeSubtype'], axis=1, inplace=True)

There will probaby be way to much features for colors, so we will transform them too

In [12]:
pd.value_counts(dataset['Color']).shape

(411,)

2 quantitative features instead of 411, nice improvement

In [13]:
from sklearn.preprocessing import LabelEncoder

split_colors = pd.DataFrame(dataset['Color'].str.split('/').tolist(), columns=["Color1", "Color2"])
dataset.drop('Color', axis = 1, inplace=True)

l_enc = LabelEncoder()
split_colors['Color2'].fillna("None", inplace=True)
split_colors['Color1'] = l_enc.fit_transform(split_colors['Color1'])
split_colors['Color2'] = l_enc.fit_transform(split_colors['Color2'])
dataset = dataset.join(split_colors)

In [14]:
dataset['SexuponOutcome'].value_counts()

Neutered Male    14014
Spayed Female    12633
Intact Female     5004
Intact Male       4985
Unknown           1548
Name: SexuponOutcome, dtype: int64

In [15]:
dataset['SexuponOutcome'].fillna('Neutered Male', inplace=True)

In [16]:
sex = pd.DataFrame(dataset['SexuponOutcome'].str.split(" ").tolist(), columns=["Sterialized", "Sex"])
sex.head(2)
dataset = dataset.join(sex)
dataset.drop('SexuponOutcome', axis = 1, inplace=True)

Convert AnimalType to boolean feature

In [17]:
dataset['IsDog'] = dataset['AnimalType'].apply(lambda x: x == "Dog")
dataset['IsMale'] = dataset['Sex'].apply(lambda x: x == "Male")
dataset.drop(['AnimalType', 'Sex'], axis = 1, inplace = True)

In [21]:
cols = ['Sterialized']
dataset_d = pd.get_dummies(dataset, columns = cols)
dataset_d.head(3)

Unnamed: 0,AgeuponOutcome,OutcomeType,Train,Mix,Year,Month,Hour,Weekday,HasName,Color1,Color2,IsDog,IsMale,Sterialized_Intact,Sterialized_Neutered,Sterialized_Spayed,Sterialized_Unknown
0,365.0,Return_to_owner,True,True,2014,2,18,2,False,15,44,True,True,0.0,1.0,0.0,0.0
0,305.0,,False,True,2015,10,12,0,False,15,44,True,True,0.0,1.0,0.0,0.0
1,365.0,Euthanasia,True,True,2013,10,12,6,False,26,30,False,False,1.0,0.0,0.0,0.0


In [22]:
# dataset_d.drop(['index'], axis=1, inplace=True)
dataset_d.loc[:, dataset_d.columns.str.contains("OutcomeType")].head(1)

Unnamed: 0,OutcomeType
0,Return_to_owner


Now we fit XGBoost model on full dataset in order to perform feature selection

In [23]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold

train = dataset_d[dataset['Train'] == True]
test = dataset_d[dataset['Train'] == False]

train.drop('Train', axis=1, inplace=True)
test.drop('Train', axis=1, inplace=True)

train_x = train.loc[:, train.columns.difference(["OutcomeType"])]
train_y = train.loc[:, "OutcomeType"]

# train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3)

test_x = test.loc[:, test.columns.difference(["OutcomeType"])]

train_x.shape, test_x.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


((26729, 15), (11456, 15))

Error estimates for the full model will help us not to throw off important features during selection 

In [24]:
import xgboost as xgb
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# xgb_param_dist = {"n_estimators"     : 150,
#                   "max_depth"        : 8,
#                   "learning_rate"    : 0.1,
#                   "colsample_bytree" : 0.8}

xgb_param_dist = {"n_estimators"     : 200,
                  "max_depth"        : 10,
                  "learning_rate"    : 0.08,
                  "colsample_bytree" : 0.7,
                  "objective"        : "multi:softmax",
                  "num_class" : 5}

enc = LabelEncoder()
enc.fit(train_y)
train_yt = enc.transform(train_y)
xgb.cv(xgb_param_dist, xgb.DMatrix(train_x, train_yt), nfold=5)

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.384564,0.009673,0.337657,0.003144
1,0.371132,0.011231,0.320728,0.002881
2,0.368514,0.010524,0.316202,0.002632
3,0.364174,0.009378,0.311908,0.003598
4,0.360208,0.009688,0.309486,0.00406
5,0.36032,0.010197,0.306998,0.003276
6,0.36032,0.010291,0.305932,0.003393
7,0.360582,0.01034,0.303781,0.003818
8,0.359422,0.011482,0.301237,0.004059
9,0.359572,0.010973,0.299852,0.003193


In [25]:
# xgb_param_dist_sk = 
# xgb_param_dist.pop("num_class", None)
# cf = xgb.Booster(**xgb_param_dist)
# %time cf.fit(train_x, train_y)

bst = xgb.train(xgb_param_dist, xgb.DMatrix(train_x, train_yt))

Let's get feature importance map and see top 100 features

In [None]:
cf.booster().save_model('full.model')

In [26]:
fscores = bst.get_fscore()
# fscores = bst.get_fscore()
np.sort(list(fscores.values()))[::-1][0:100]

array([2838, 2326, 2128, 1690, 1616, 1492,  866,  458,  451,  386,  336,
        322,  315,  179,  116])

Perform feature selection based on a `fscore` threshold

In [27]:
from sklearn.feature_selection import SelectFromModel
filtered_fscores = {k: v for k, v in fscores.items() if v > 0}

print("{0:1.2f}% features left".format((len(filtered_fscores) / len(fscores)) * 100))
# model = SelectFromModel(cf ,prefit=True)
# train_xr = model.transform(train_x)

100.00% features left


Estimate the new model

In [28]:
filtered_cols = list(filtered_fscores.keys())
train_xr = train_x[filtered_cols]
# val_xr = val_x[filtered_cols]
test_xr = test_x[filtered_cols]

# cfr = xgb.XGBClassifier(**xgb_param_dist)
# cfr.fit(train_xr, train_y)
xgb.cv(xgb_param_dist, xgb.DMatrix(train_xr, train_yt), 15, nfold=5)

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.407946,0.011191,0.360115,0.002929
1,0.376108,0.009387,0.329043,0.00231
2,0.372816,0.008835,0.321636,0.004978
3,0.368551,0.008986,0.314003,0.005295
4,0.366044,0.011464,0.308906,0.00425
5,0.366456,0.011807,0.307868,0.005097
6,0.365932,0.010996,0.306184,0.006489
7,0.364847,0.010819,0.304173,0.005556
8,0.363463,0.00962,0.301143,0.005665
9,0.363986,0.009493,0.29931,0.006536


Now it is time for hyperparameter optimization

In [None]:
from scipy.stats import uniform
from scipy.stats import randint as sp_randint
from sklearn.grid_search import RandomizedSearchCV

xgb_param_dist = {"n_estimators" : np.arange(10, 250, 10),
                    "max_depth": sp_randint(2, 31),
                    "learning_rate" : uniform(loc = 0.01, scale=0.2),
                    "colsample_bytree" : uniform(loc = 0.3, scale = 0.7),
                    "subsample" : uniform(loc = 0.0, scale = 0.7),
                    "objective" : "multi:softmax"}

xgb_clf = xgb.XGBClassifier()
n_iter_search = 30

xgb_random_search = RandomizedSearchCV(xgb_clf, param_distributions = xgb_param_dist,
                                       n_iter = n_iter_search, random_state = 123, n_jobs = 8, verbose = 1)
%time xgb_random_search.fit(train_xr, train_yt)
                                       
xgb_clf = xgb_random_search.best_estimator_

print("Best randomized search score - %s" % xgb_random_search.best_score_)

In [None]:
xgb_clf.booster().save_model('random_search_best_est.model')

In [None]:
import xgboost as xgb
best_m = xgb.Booster({'nthread' : 8}) #init model
best_m.load_model('/home/kdubovikov/MEGA/random_search_best_est.model') # load data

pred = best_m.predict(xgb.DMatrix(test_x))
enc.inverse_transform(pred.astype(int))

In [None]:
xgb_random_search.best_params_

In [29]:
best_params ={'colsample_bytree': 0.7082178433723527,
             'learning_rate': 0.06,
             'max_depth': 10,
             'n_estimators': 50,
             'num_class': 5,
             'objective': 'multi:softprob',
             'subsample': 0.8}
# best_params['num_class']= 5
# best_params['objective'] = 'multi:softmax'

filtered_fscores_t = {k: v for k, v in fscores.items() if v > 0}
filtered_cols_t = list(filtered_fscores_t.keys())
train_xrt = train_x[filtered_cols_t]
# val_xr = val_x[filtered_cols]
test_xrt = test_x[filtered_cols_t]

# cfr = xgb.XGBClassifier(**xgb_param_dist)
# cfr.fit(train_xr, train_y)
# xgb.cv(best_params, xgb.DMatrix(train_x, train_yt), 15, nfold=5)
%time xgb.cv(best_params, xgb.DMatrix(train_xr, train_yt), nfold=5)



CPU times: user 19.1 s, sys: 284 ms, total: 19.4 s
Wall time: 2.57 s


Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.439409,0.026508,0.384508,0.021507
1,0.397657,0.016269,0.338378,0.009976
2,0.384152,0.01352,0.324451,0.005137
3,0.372778,0.010664,0.317081,0.00302
4,0.369748,0.009544,0.311843,0.004166
5,0.36769,0.010728,0.308064,0.004302
6,0.365371,0.007909,0.307653,0.004837
7,0.364248,0.008385,0.305071,0.004907
8,0.363238,0.00904,0.304454,0.005292
9,0.363238,0.0085,0.303659,0.005369


In [30]:
%time booster = xgb.train(best_params, xgb.DMatrix(train_xr, train_yt))
np.unique(booster.predict(xgb.DMatrix(test_xr)))

CPU times: user 4.52 s, sys: 124 ms, total: 4.64 s
Wall time: 634 ms


array([ 0.12148891,  0.12329598,  0.12351845, ...,  0.46134725,
        0.46157441,  0.4620828 ], dtype=float32)

In [None]:
xgb.plot_importance(booster)

In [None]:
# This wont work properly with softprob
preds = booster.predict(xgb.DMatrix(test_xr))
oh_results = enc.inverse_transform(preds.astype(int))
np.unique(oh_results)

In [None]:
oh_enc = preprocessing.OneHotEncoder(sparse=False)
results = oh_enc.fit_transform(preds.reshape(-1, 1))
results = results.astype(int)
results

Model stacking

In [31]:
boosters = np.array([])
predictions = []

for i in range(0, 100):
    %time booster = xgb.train(best_params, xgb.DMatrix(train_xr, train_yt))
    boosters = np.append(boosters, booster)
    predictions.append(booster.predict(xgb.DMatrix(test_xr)))
    

# %time booster = xgb.train(best_params, xgb.DMatrix(train_xr, train_yt))
# np.unique(booster.predict(xgb.DMatrix(test_xr)))

CPU times: user 4.68 s, sys: 72 ms, total: 4.76 s
Wall time: 653 ms
CPU times: user 4.9 s, sys: 76 ms, total: 4.98 s
Wall time: 630 ms
CPU times: user 4.82 s, sys: 120 ms, total: 4.94 s
Wall time: 625 ms
CPU times: user 4.97 s, sys: 80 ms, total: 5.05 s
Wall time: 647 ms
CPU times: user 4.87 s, sys: 76 ms, total: 4.95 s
Wall time: 626 ms
CPU times: user 4.8 s, sys: 96 ms, total: 4.89 s
Wall time: 619 ms
CPU times: user 4.85 s, sys: 96 ms, total: 4.94 s
Wall time: 626 ms
CPU times: user 4.93 s, sys: 24 ms, total: 4.96 s
Wall time: 628 ms
CPU times: user 4.9 s, sys: 100 ms, total: 5 s
Wall time: 635 ms
CPU times: user 4.91 s, sys: 92 ms, total: 5 s
Wall time: 637 ms
CPU times: user 4.83 s, sys: 72 ms, total: 4.9 s
Wall time: 620 ms
CPU times: user 4.86 s, sys: 96 ms, total: 4.96 s
Wall time: 627 ms
CPU times: user 5.09 s, sys: 116 ms, total: 5.21 s
Wall time: 666 ms
CPU times: user 5.84 s, sys: 124 ms, total: 5.96 s
Wall time: 780 ms
CPU times: user 5.89 s, sys: 148 ms, total: 6.04 s
Wal

In [32]:
from sklearn.ensemble import RandomForestClassifier
rf_models = np.array([])
# predictions = [] already have those

for i in range(0, 10):
    rf_clf = RandomForestClassifier(n_estimators=300, criterion='gini', n_jobs=8)
    %time rf_clf.fit(train_x, train_yt)
    rf_models = np.append(rf_models, rf_clf)
    predictions.append(rf_clf.predict_proba(test_x))

CPU times: user 15.5 s, sys: 332 ms, total: 15.9 s
Wall time: 2.26 s
CPU times: user 13.5 s, sys: 364 ms, total: 13.9 s
Wall time: 2.22 s
CPU times: user 13.6 s, sys: 260 ms, total: 13.8 s
Wall time: 2.22 s
CPU times: user 13.6 s, sys: 264 ms, total: 13.9 s
Wall time: 2.01 s
CPU times: user 13.4 s, sys: 260 ms, total: 13.7 s
Wall time: 2.01 s
CPU times: user 13.4 s, sys: 296 ms, total: 13.7 s
Wall time: 2.01 s
CPU times: user 13.6 s, sys: 308 ms, total: 13.9 s
Wall time: 2.01 s
CPU times: user 13.5 s, sys: 304 ms, total: 13.8 s
Wall time: 2.02 s
CPU times: user 13.5 s, sys: 268 ms, total: 13.8 s
Wall time: 2.01 s
CPU times: user 13.3 s, sys: 336 ms, total: 13.6 s
Wall time: 2.01 s


In [33]:
preds = np.mean(predictions, axis = 0)
preds = np.argmax(preds, axis = 1)
np.unique(preds)

array([0, 1, 2, 3, 4])

In [34]:
oh_enc = preprocessing.OneHotEncoder(sparse=False)
results = oh_enc.fit_transform(preds.reshape(-1, 1))
results = results.astype(int)
results_encoded = results
results_encoded

array([[0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ..., 
       [0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]])

In [35]:
# test_results = pd.read_csv("./test.csv")
# test_results['ID'] = test_results['ID'].astype(int)
# final_results = pd.concat([test_results["ID"], pd.DataFrame(results)], axis = 1)
final_results = pd.DataFrame(results_encoded)
final_results.columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
# final_results['Died'] = 0
final_results.index.name = 'ID'
final_results.index = final_results.index + 1

final_results = final_results[['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']]

final_results.to_csv('results_1.csv')

In [36]:
final_results.shape

(11456, 5)