In [19]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from scipy import stats
import seaborn as sns
import xgboost as xgb

import category_encoders as ce
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

<h1>Load the Data</h1>

In [None]:
train_data = pd.read_csv('/DataScience/cat-in-the-dat-ii/train.csv')
test_data = pd.read_csv('/DataScience/cat-in-the-dat-ii/test.csv')

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
train_data.info()

In [None]:
test_data.info()

In [None]:
print('Percent of train data kept if we drop all rows with null values: %', 100 * len(train_data.dropna())/len(train_data))
print('Percent of test data kept if we drop all rows with null values: %', 100 * len(test_data.dropna())/len(test_data))

id column, target in the train set, and 23 categorical features. Each feature column has null values in both the test and the train sets. In order to score well, I will need to impute values so I can use as much of my data as possible. Also, according to the competition notes, there are interactions between the features.

<h1>Evaluation</h1>

This competition is evaluated by the area under the ROC curve between the predicted probability and the target. As a baseline, I will drop all null values and ignore feature interaction.

<h1>Feature Analysis and Cleaning</h1>

In [None]:
train_data = pd.read_csv('/DataScience/cat-in-the-dat-ii/train.csv')
train_data.dropna(inplace=True)
train_data.reset_index(inplace=True, drop=True)
# Naively filling in the test data
test_data.fillna(method='ffill', inplace=True)

In [None]:
train_data.head()

In [None]:
train_data.info()
# During cleaning I will be using less memory-intensive data types

<h3>Binned</h3

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 3))
axes[0].hist(train_data.bin_0)
axes[1].hist(train_data.bin_1)
axes[2].hist(train_data.bin_2)
plt.show()

# These are all 1s or 0s so I can use int8. They are all imbalanced towards the '0' class

In [None]:
train_data.iloc[:, 1] = train_data.iloc[:, 1].astype('int8')
train_data.iloc[:, 2] = train_data.iloc[:, 2].astype('int8')
train_data.iloc[:, 3] = train_data.iloc[:, 3].astype('int8')

test_data.iloc[:, 1] = test_data.iloc[:, 1].astype('int8')
test_data.iloc[:, 2] = test_data.iloc[:, 2].astype('int8')
test_data.iloc[:, 3] = test_data.iloc[:, 3].astype('int8')

In [None]:
# What about the non-numeric bins?
print(train_data.bin_3.unique())
print(train_data.bin_4.unique())

In [None]:
# Since they are binary, I will change them to 0s and 1s.
train_data.iloc[:, 4] = train_data.bin_3.map({'F':0, 'T':1}).astype('int8')
train_data.iloc[:, 5] = train_data.bin_4.map({'N':0, 'Y':1}).astype('int8')

test_data.iloc[:, 4] = test_data.bin_3.map({'F':0, 'T':1}).astype('int8')
test_data.iloc[:, 5] = test_data.bin_4.map({'N':0, 'Y':1}).astype('int8')

In [None]:
# The last 2 bins are more evenly distributed
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
axes[0].hist(train_data.bin_3)
axes[1].hist(train_data.bin_4)
plt.show()

<h3>Nominal</h3>

In [None]:
print("Number unique values nom 0:", len(train_data.nom_0.unique()))
print("Number unique values nom 1:", len(train_data.nom_1.unique()))
print("Number unique values nom 2:", len(train_data.nom_2.unique()))
print("Number unique values nom 3:", len(train_data.nom_3.unique()))
print("Number unique values nom 4:", len(train_data.nom_4.unique()))
print("Number unique values nom 5:", len(train_data.nom_5.unique()))
print("Number unique values nom 6:", len(train_data.nom_6.unique()))
print("Number unique values nom 7:", len(train_data.nom_7.unique()))
print("Number unique values nom 8:", len(train_data.nom_8.unique()))
print("Number unique values nom 9:", len(train_data.nom_9.unique()))

In [None]:
# Get dummies for nominal variables with few unique values. They other features will need to be binned.
# Binning will depend on the other features, so for this first run I will just drop them.
train_data = pd.get_dummies(train_data, columns=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'], drop_first=True)
train_data.drop(['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'], 1, inplace=True)

test_data = pd.get_dummies(test_data, columns=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'], drop_first=True)
test_data.drop(['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'], 1, inplace=True)

In [None]:
train_data.columns

<h3>Ordinal</h3>

In [None]:
# Ordinal features. It might be correct to one hot encode, or to scale them. For the 

print('Ordinal value 0. Number unique values:', len(train_data.ord_0.unique()))
for value in train_data.ord_0.unique():
    print(value, len(train_data[train_data.ord_0 == value]))
print('\n')
print('Ordinal value 1. Number unique values:', len(train_data.ord_1.unique()))
for value in train_data.ord_1.unique():
    print(value, len(train_data[train_data.ord_1 == value]))
print('\n')
print('Ordinal value 2. Number unique values:', len(train_data.ord_2.unique()))
for value in train_data.ord_2.unique():
    print(value, len(train_data[train_data.ord_2 == value]))
print('\n')
print('Ordinal value 3. Number unique values:', len(train_data.ord_3.unique()))
for value in train_data.ord_3.unique():
    print(value, len(train_data[train_data.ord_3 == value]))
print('\n')
print('Ordinal value 4. Number unique values:', len(train_data.ord_4.unique()))
for value in train_data.ord_4.unique():
    print(value, len(train_data[train_data.ord_4 == value]))
print('\n')
print('Ordinal value 5. Number unique values:', len(train_data.ord_5.unique()))
for value in train_data.ord_5.unique():
    print(value, len(train_data[train_data.ord_5 == value]))
print('\n')

In [None]:
ord_3_map = {}
sorted_ord_3 = sorted(train_data.ord_3.unique())
for value in range(len(sorted_ord_3)):
    ord_3_map[sorted_ord_3[value]] = value
    
ord_4_map = {}
sorted_ord_4 = sorted(train_data.ord_4.unique())
for value in range(len(sorted_ord_4)):
    ord_4_map[sorted_ord_4[value]] = value
    
ord_5_map = {}
sorted_ord_5 = sorted(train_data.ord_5.unique())
for value in range(len(sorted_ord_5)):
    ord_5_map[sorted_ord_5[value]] = value
    
train_data.iloc[:, 6] = train_data.iloc[:, 6].astype('int8')
train_data.iloc[:, 7] = train_data.iloc[:, 7].map({'Novice':0, 'Contributor':1, 'Expert':2, 'Master':3, 'Grandmaster':4}).astype('int8')
train_data.iloc[:, 8] = train_data.iloc[:, 8].map({'Freezing':0, 'Cold':1, 'Warm':2, 'Hot':3, 'Boiling Hot':4, 'Lava Hot':5}).astype('int8')
train_data.iloc[:, 9] = train_data.iloc[:, 9].map(ord_3_map).astype('int8')
train_data.iloc[:, 10] = train_data.iloc[:, 10].map(ord_4_map).astype('int8')
train_data.iloc[:, 11] = train_data.iloc[:, 11].map(ord_5_map).astype('int32')
    
test_data.iloc[:, 6] = test_data.iloc[:, 6].astype('int8')
test_data.iloc[:, 7] = test_data.iloc[:, 7].map({'Novice':0, 'Contributor':1, 'Expert':2, 'Master':3, 'Grandmaster':4}).astype('int8')
test_data.iloc[:, 8] = test_data.iloc[:, 8].map({'Freezing':0, 'Cold':1, 'Warm':2, 'Hot':3, 'Boiling Hot':4, 'Lava Hot':5}).astype('int8')
test_data.iloc[:, 9] = test_data.iloc[:, 9].map(ord_3_map).astype('int8')
test_data.iloc[:, 10] = test_data.iloc[:, 10].map(ord_4_map).astype('int8')
test_data.iloc[:, 11] = test_data.iloc[:, 11].map(ord_5_map).astype('int32')

<h3>Cyclical</h3>

Thank you https://www.kaggle.com/avanwyk/encoding-cyclical-features-for-deep-learning for teaching me this method for handling cyclical data. In the future I could try with a different weight on the month since they are different lengths.

In [None]:
train_data['month_sin'] = np.sin(2 * np.pi * train_data['month']/12)
train_data['month_cos'] = np.cos(2 * np.pi * train_data['month']/12)
train_data['day_sin'] = np.sin(2 * np.pi * train_data['day']/7)
train_data['day_cos'] = np.cos(2 * np.pi * train_data['day']/7)
train_data.drop(['day', 'month'], 1, inplace=True)

test_data['month_sin'] = np.sin(2 * np.pi * test_data['month']/12)
test_data['month_cos'] = np.cos(2 * np.pi * test_data['month']/12)
test_data['day_sin'] = np.sin(2 * np.pi * test_data['day']/7)
test_data['day_cos'] = np.cos(2 * np.pi * test_data['day']/7)
test_data.drop(['day', 'month'], 1, inplace=True)

In [None]:
rfc = RandomForestClassifier(n_estimators=100)
X_train = train_data.drop('target', 1)
y_train = train_data.target
X_test = test_data
rfc.fit(X_train, y_train)
preds = rfc.predict_proba(X_test)

probs = []
for i in range(len(preds)):
    probs.append(preds[i][1])

In [None]:
submission_df = pd.DataFrame()
submission_df['id'] = X_test.id
submission_df['target'] = probs
submission_df.to_csv('/DataScience/cat-in-the-dat-ii/submission.csv', index=False)

# .72089 score, just below the 'RF Benchmark'

<h1>Second Attempt</h1>

This time I will impute missing values in the train data and utilize feature dependencies.

In [23]:
train_data = pd.read_csv('/DataScience/cat-in-the-dat-ii/train.csv')
test_data = pd.read_csv('/DataScience/cat-in-the-dat-ii/test.csv')

In [5]:
train_data.columns.drop('id')

Index(['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2',
       'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_0',
       'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month', 'target'],
      dtype='object')

In [25]:
for feature in train_data.columns.drop(['id', 'target']):
    feature_list = list(train_data[feature].dropna())
    mode = stats.mode(feature_list)[0][0]
    train_data[feature] = train_data[feature].fillna(mode)
    test_data[feature] = test_data[feature].fillna(mode)



In [26]:
train_data.iloc[:, 1] = train_data.iloc[:, 1].astype('int8', errors='ignore')
train_data.iloc[:, 2] = train_data.iloc[:, 2].astype('int8', errors='ignore')
train_data.iloc[:, 3] = train_data.iloc[:, 3].astype('int8', errors='ignore')

test_data.iloc[:, 1] = test_data.iloc[:, 1].astype('int8', errors='ignore')
test_data.iloc[:, 2] = test_data.iloc[:, 2].astype('int8', errors='ignore')
test_data.iloc[:, 3] = test_data.iloc[:, 3].astype('int8', errors='ignore')

In [27]:
# Since they are binary, I will change them to 0s and 1s.
train_data.iloc[:, 4] = train_data.bin_3.map({'F':0, 'T':1}).astype('int8', errors='ignore')
train_data.iloc[:, 5] = train_data.bin_4.map({'N':0, 'Y':1}).astype('int8', errors='ignore')

test_data.iloc[:, 4] = test_data.bin_3.map({'F':0, 'T':1}).astype('int8', errors='ignore')
test_data.iloc[:, 5] = test_data.bin_4.map({'N':0, 'Y':1}).astype('int8', errors='ignore')

In [28]:
# Get dummies for nominal variables with few unique values. They other features will need to be binned.
# Binning will depend on the other features, so for this first run I will just drop them.
train_data = pd.get_dummies(train_data, columns=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'], drop_first=False)
test_data = pd.get_dummies(test_data, columns=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'], drop_first=False)

In [29]:
train_data.drop(['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'], 1, inplace=True)
test_data.drop(['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'], 1, inplace=True)

In [30]:
ord_3_map = {}
uniques = []
for value in train_data.ord_3.unique():
    if value == value:
        uniques.append(value)
sorted_ord_3 = sorted(uniques)
for value in range(len(sorted_ord_3)):
    ord_3_map[sorted_ord_3[value]] = value
    
ord_4_map = {}
uniques = []
for value in train_data.ord_4.unique():
    if value == value:
        uniques.append(value)
sorted_ord_4 = sorted(uniques)
for value in range(len(sorted_ord_4)):
    ord_4_map[sorted_ord_4[value]] = value
    
ord_5_map = {}
uniques = []
for value in train_data.ord_5.unique():
    if value == value:
        uniques.append(value)
sorted_ord_5 = sorted(uniques)
for value in range(len(sorted_ord_5)):
    ord_5_map[sorted_ord_5[value]] = value
    
train_data.iloc[:, 6] = train_data.iloc[:, 6].astype('int8', errors='ignore')
train_data.iloc[:, 7] = train_data.iloc[:, 7].map({'Novice':0, 'Contributor':1, 'Expert':2, 'Master':3, 'Grandmaster':4}).astype('int8', errors='ignore')
train_data.iloc[:, 8] = train_data.iloc[:, 8].map({'Freezing':0, 'Cold':1, 'Warm':2, 'Hot':3, 'Boiling Hot':4, 'Lava Hot':5}).astype('int8', errors='ignore')
train_data.iloc[:, 9] = train_data.iloc[:, 9].map(ord_3_map).astype('int8', errors='ignore')
train_data.iloc[:, 10] = train_data.iloc[:, 10].map(ord_4_map).astype('int8', errors='ignore')
train_data.iloc[:, 11] = train_data.iloc[:, 11].map(ord_5_map).astype('int32', errors='ignore')
    
test_data.iloc[:, 6] = test_data.iloc[:, 6].astype('int8', errors='ignore')
test_data.iloc[:, 7] = test_data.iloc[:, 7].map({'Novice':0, 'Contributor':1, 'Expert':2, 'Master':3, 'Grandmaster':4}).astype('int8', errors='ignore')
test_data.iloc[:, 8] = test_data.iloc[:, 8].map({'Freezing':0, 'Cold':1, 'Warm':2, 'Hot':3, 'Boiling Hot':4, 'Lava Hot':5}).astype('int8', errors='ignore')
test_data.iloc[:, 9] = test_data.iloc[:, 9].map(ord_3_map).astype('int8', errors='ignore')
test_data.iloc[:, 10] = test_data.iloc[:, 10].map(ord_4_map).astype('int8', errors='ignore')
test_data.iloc[:, 11] = test_data.iloc[:, 11].map(ord_5_map).astype('int32', errors='ignore')

In [31]:
train_data['month_sin'] = np.sin(2 * np.pi * train_data['month']/12)
train_data['month_cos'] = np.cos(2 * np.pi * train_data['month']/12)
train_data['day_sin'] = np.sin(2 * np.pi * train_data['day']/7)
train_data['day_cos'] = np.cos(2 * np.pi * train_data['day']/7)
train_data.drop(['day', 'month'], 1, inplace=True)

test_data['month_sin'] = np.sin(2 * np.pi * test_data['month']/12)
test_data['month_cos'] = np.cos(2 * np.pi * test_data['month']/12)
test_data['day_sin'] = np.sin(2 * np.pi * test_data['day']/7)
test_data['day_cos'] = np.cos(2 * np.pi * test_data['day']/7)
test_data.drop(['day', 'month'], 1, inplace=True)

In [32]:
rfc = RandomForestClassifier(n_estimators=200)
X_train = train_data.drop('target', 1)
y_train = train_data.target
X_test = test_data
rfc.fit(X_train, y_train)
preds = rfc.predict_proba(X_test)

probs = []
for i in range(len(preds)):
    probs.append(preds[i][1])

In [33]:
submission_df = pd.DataFrame()
submission_df['id'] = X_test.id
submission_df['target'] = probs
submission_df.to_csv('/DataScience/cat-in-the-dat-ii/submission.csv', index=False)

# .72089 score, just below the 'RF Benchmark'

In [47]:
X = train_data.drop('target', 1)
Y = train_data.target
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)
param = {'max_depth':4, 'eta':.75, 'objective':'binary:logistic' }
num_round = 500
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)

print('AUC:', roc_auc_score(y_test, preds))

  if getattr(data, 'base', None) is not None and \


KeyboardInterrupt: 

In [50]:
submission_df = pd.DataFrame()
submission_df['id'] = test_data.id
submission_df['target'] = preds
submission_df.to_csv('/DataScience/cat-in-the-dat-ii/submission.csv', index=False)

In [71]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_res, y_res)
preds = rfc.predict_proba(X_test)
probs = []
for i in range(len(preds)):
    probs.append(preds[i][1])

In [74]:
train_data

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,target,nom_0_Blue,nom_0_Green,nom_0_Red,nom_1_Circle,nom_1_Polygon,nom_1_Square,nom_1_Star,nom_1_Trapezoid,nom_1_Triangle,nom_2_Axolotl,nom_2_Cat,nom_2_Dog,nom_2_Hamster,nom_2_Lion,nom_2_Snake,nom_3_Canada,nom_3_China,nom_3_Costa Rica,nom_3_Finland,nom_3_India,nom_3_Russia,nom_4_Bassoon,nom_4_Oboe,nom_4_Piano,nom_4_Theremin,month_sin,month_cos,day_sin,day_cos
0,0,0,0,0,0,0,3,1,3,2,20,56,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1.000000e+00,6.123234e-17,-7.818315e-01,0.623490
1,1,1,1,0,0,1,3,4,2,4,23,150,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,-5.000000e-01,-8.660254e-01,-2.449294e-16,1.000000
2,2,0,1,0,0,0,3,0,0,13,15,105,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,-1.000000e+00,-1.836970e-16,-9.749279e-01,-0.222521
3,3,0,0,0,0,0,1,0,5,0,2,21,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1.000000e+00,6.123234e-17,4.338837e-01,-0.900969
4,4,0,0,0,1,0,3,4,1,7,2,50,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,-2.449294e-16,1.000000e+00,-9.749279e-01,-0.222521
5,5,0,0,1,1,0,2,2,3,1,16,180,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,8.660254e-01,-5.000000e-01,4.338837e-01,-0.900969
6,6,0,0,0,0,0,1,4,1,2,17,158,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1.224647e-16,-1.000000e+00,-9.749279e-01,-0.222521
7,7,0,0,1,1,0,3,2,1,1,24,54,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,5.000000e-01,8.660254e-01,7.818315e-01,0.623490
8,8,0,0,0,0,0,1,0,4,2,13,136,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1.000000e+00,6.123234e-17,-7.818315e-01,0.623490
9,9,0,0,0,0,1,3,1,5,13,8,50,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,-8.660254e-01,-5.000000e-01,7.818315e-01,0.623490


<h1>Third Attempt</h1>



Getting stuck around the same submission score. My imputing method is fairly naive and I was not able to utilize feature interactions after looking at the correlation heatmap. The last ordinal variable I sorted alphabetically and it's not clear if that's the correct method. The last 4 nominal variables all have too many categories to make dummies, and so I just filled null values at the mode but then ended up dropping them without a good way of binning them.

Now I will try using some of the methods of other high scoring notebooks on Kaggle.

In [6]:
# https://www.kaggle.com/itsbitan/encode-categorical-features

train_data = pd.read_csv('/DataScience/cat-in-the-dat-ii/train.csv')
test_data = pd.read_csv('/DataScience/cat-in-the-dat-ii/test.csv')

In [8]:
y_train = train_data.target
test_id = test_data.id

train_data.drop(['target', 'id'], 1, inplace=True)
test_data.drop('id', 1, inplace=True)

In [12]:
cat_feat_to_encode = train_data.columns.tolist()
smoothing=0.20
oof = pd.DataFrame([])
for tr_idx, oof_idx in StratifiedKFold(n_splits=5, random_state= 1032, shuffle=True).split(train_data, y_train):
    ce_target_encoder = ce.TargetEncoder(cols = cat_feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(train_data.iloc[tr_idx, :], y_train.iloc[tr_idx])
    oof = oof.append(ce_target_encoder.transform(train_data.iloc[oof_idx, :]), ignore_index=False)
ce_target_encoder = ce.TargetEncoder(cols = cat_feat_to_encode, smoothing=smoothing)
ce_target_encoder.fit(train_data, y_train)
train_data = oof.sort_index()
test_data = ce_target_encoder.transform(test_data)

In [14]:
x_train = train_data.iloc[:,:].values
x_test = test_data.iloc[:,:].values

In [15]:
from xgboost import XGBClassifier
classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, 
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
classifier.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [16]:
y_pred = classifier.predict_proba(x_test)[:,1]

submission_df = pd.DataFrame()
submission_df['id'] = test_id
submission_df['target'] = y_pred
submission_df.to_csv('/DataScience/cat-in-the-dat-ii/submission.csv', index=False)

<h1>Fourth Attempt</h1>

After copying itsbitan's solution using the category encoders library, I decided to explore the library more.

In [72]:
# https://www.kaggle.com/itsbitan/encode-categorical-features

train_data = pd.read_csv('/DataScience/cat-in-the-dat-ii/train.csv')

X_train, X_test, y_train, y_test = train_test_split(train_data.drop('target', 1), train_data.target, test_size=0.2)
test_data = X_test
train_data = X_train
test_id = test_data.id

train_data.drop('id', 1, inplace=True)
test_data.drop('id', 1, inplace=True)

cat_feat_to_encode = train_data.columns.tolist()
smoothing=0.20
oof = pd.DataFrame([])
for tr_idx, oof_idx in StratifiedKFold(n_splits=5, random_state= 1032, shuffle=True).split(train_data, y_train):
    ce_target_encoder = ce.TargetEncoder(cols = cat_feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(train_data.iloc[tr_idx, :], y_train.iloc[tr_idx])
    oof = oof.append(ce_target_encoder.transform(train_data.iloc[oof_idx, :]), ignore_index=False)
ce_target_encoder = ce.TargetEncoder(cols = cat_feat_to_encode, smoothing=smoothing)
ce_target_encoder.fit(train_data, y_train)
train_data = oof.sort_index()
test_data = ce_target_encoder.transform(test_data)

x_train = train_data.iloc[:,:].values
x_test = test_data.iloc[:,:].values

from xgboost import XGBClassifier
classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, 
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
classifier.fit(x_train, y_train)

y_pred = classifier.predict_proba(x_test)[:,1]

results = roc_auc_score(y_test, y_pred)
results

0.5154855710033948

In [80]:
x_test

array([[0.19416644, 0.19019071, 0.17105909, ..., 0.12202381, 0.21246559,
        0.14649171],
       [0.1145159 , 0.19019071, 0.22818497, ..., 0.25952045, 0.1646316 ,
        0.12296517],
       [0.19416644, 0.19019071, 0.17105909, ..., 0.16638121, 0.17969351,
        0.24870834],
       ...,
       [0.19416644, 0.19019071, 0.17105909, ..., 0.18467195, 0.19848522,
        0.23107372],
       [0.1145159 , 0.19019071, 0.17105909, ..., 0.24026167, 0.2001032 ,
        0.12296517],
       [0.19416644, 0.1721898 , 0.17105909, ..., 0.18942993, 0.21181575,
        0.14649171]])

In [66]:
train_data = pd.read_csv('/DataScience/cat-in-the-dat-ii/train.csv')
test_data = pd.read_csv('/DataScience/cat-in-the-dat-ii/test.csv')

X_train, X_test, y_train, y_test = train_test_split(train_data.drop('target', 1), train_data.target, test_size=0.2)

test_id = X_test.id

X_train.drop('id', 1, inplace=True)
X_test.drop('id', 1, inplace=True)

oof = pd.DataFrame([])
smoothing = 0.2
for tr_idx, oof_idx in StratifiedKFold(n_splits=5, random_state=1032, shuffle=True).split(X_train, y_train):
    ce_target_encoder = ce.TargetEncoder(cols=cat_feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(X_train.iloc[tr_idx, :], y_train.iloc[tr_idx])
    oof = oof.append(ce_target_encoder.transform(X_train.iloc[oof_idx, :]), ignore_index=False)
ce_target_encoder = ce.TargetEncoder(cols=cat_feat_to_encode, smoothing=smoothing)
ce_target_encoder.fit(X_train, y_train)
X_train = oof.sort_index()
X_test = ce_target_encoder.transform(X_test)

X_train = X_train.iloc[:,:].values
X_test = X_test.iloc[:,:].values

classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, 
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
classifier.fit(X_train, y_train)

y_pred = classifier.predict_proba(X_test)[:,1]

results = roc_auc_score(y_test, y_pred)
results

0.4788292607189502

In [70]:
X_test

array([[0.19470917, 0.19064322, 0.17123004, ..., 0.25261814, 0.21429413,
        0.20834894],
       [0.19470917, 0.19064322, 0.17123004, ..., 0.17333717, 0.21429413,
        0.19020164],
       [0.19470917, 0.19064322, 0.17123004, ..., 0.25529511, 0.21214974,
        0.25079334],
       ...,
       [0.19470917, 0.19064322, 0.17123004, ..., 0.13361712, 0.21429413,
        0.22516229],
       [0.19470917, 0.19064322, 0.22866206, ..., 0.16901408, 0.19913741,
        0.20834894],
       [0.19470917, 0.19064322, 0.17123004, ..., 0.21650055, 0.16388634,
        0.23030815]])

In [51]:
def evaluate_encoding():

    oof = pd.DataFrame([])
    smoothing = 0.2
    for tr_idx, oof_idx in StratifiedKFold(n_splits=5, random_state=1032, shuffle=True).split(X_train, y_train):
        ce_target_encoder = ce.TargetEncoder(cols=cat_feat_to_encode, smoothing=smoothing)
        ce_target_encoder.fit(X_train.iloc[tr_idx, :], y_train.iloc[tr_idx])
        oof = oof.append(ce_target_encoder.transform(X_train.iloc[oof_idx, :]), ignore_index=False)
    ce_target_encoder = ce.TargetEncoder(cols=cat_feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(X_train, y_train)
    X_train = oof.sort_index()
    X_test = ce_target_encoder.transform(X_test)
    
    X_train = X_train.iloc[:,:].values
    X_test = X_test.iloc[:,:].values
    
    classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                  colsample_bynode=1, colsample_bytree=1, 
                  learning_rate=0.1, max_delta_step=0, max_depth=7,
                  min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
                  nthread=None, objective='binary:logistic', random_state=0,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                  silent=None, subsample=1, verbosity=1)
    classifier.fit(X_train, y_train)
    
    y_pred = classifier.predict_proba(X_test)[:,1]
    
    return roc_auc_score(y_test, y_pred)

In [52]:
evaluate_encoding()

0.5308828683914547