In [67]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from sklearn.metrics import r2_score

In [15]:
# Read data in
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
test['y'] = -1
test_id = test['ID']
test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X376,X377,X378,X379,X380,X382,X383,X384,X385,y
0,1,az,v,n,f,d,t,a,w,0,...,0,0,1,0,0,0,0,0,0,-1
1,2,t,b,ai,a,d,b,g,y,0,...,0,1,0,0,0,0,0,0,0,-1
2,3,az,v,as,f,d,a,j,j,0,...,0,0,1,0,0,0,0,0,0,-1
3,4,az,l,n,f,d,z,l,n,0,...,0,0,1,0,0,0,0,0,0,-1
4,5,w,s,as,c,d,y,i,m,0,...,0,0,0,0,0,0,0,0,0,-1


In [16]:
# 1. Label Encoding for Categorical features
for col in train.columns:
    if train[col].dtype == 'object':
        alist = list(train[col].values) + list(test[col].values)
        lst_one = sorted([x for x in alist if len(x) == 1])
        lst_two = sorted([x for x in alist if len(x) > 1])
        lst_all = lst_one + lst_two
        encoded_dt, mapping_idx = pd.Series(lst_all).factorize()
        train[col] = train[col].apply(lambda x: mapping_idx.get_loc(x))
        test[col] = test[col].apply(lambda x: mapping_idx.get_loc(x))

test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X376,X377,X378,X379,X380,X382,X383,X384,X385,y
0,1,49,21,13,5,3,18,0,22,0,...,0,0,1,0,0,0,0,0,0,-1
1,2,19,1,33,0,3,1,6,24,0,...,0,1,0,0,0,0,0,0,0,-1
2,3,49,21,43,5,3,0,9,9,0,...,0,0,1,0,0,0,0,0,0,-1
3,4,49,11,13,5,3,24,11,13,0,...,0,0,1,0,0,0,0,0,0,-1
4,5,22,18,43,2,3,23,8,12,0,...,0,0,0,0,0,0,0,0,0,-1


In [17]:
# Combine train & test
all_dt = pd.concat([train, test])
# Standized
cat_col = [col for col in all_dt.columns if col not in ['y'] and all_dt[col].max()>1]
all_dt[cat_col] = all_dt[cat_col].apply(lambda x: x/(x.max()-x.min()))
all_dt.tail()

Unnamed: 0,ID,X0,X1,X10,X100,X101,X102,X103,X104,X105,...,X91,X92,X93,X94,X95,X96,X97,X98,X99,y
4204,0.999168,0.653846,0.269231,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,1,0,-1.0
4205,0.999287,0.365385,0.961538,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,1,0,-1.0
4206,0.999525,0.461538,0.807692,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,1,0,-1.0
4207,0.999644,0.673077,0.807692,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,1,0,-1.0
4208,0.999881,0.365385,0.961538,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,-1.0


In [18]:
# 2. PCA, ICA & SVD
n_comp = 100
# 2.1 PCA
pca = PCA(n_components=n_comp, random_state=890624)
pca_feat = pca.fit_transform(all_dt.drop(['y'], axis = 1))
# 2.2 ICA
ica = FastICA(n_components=n_comp, random_state=890624)
ica_feat = ica.fit_transform(all_dt.drop(['y'], axis = 1))
# 2.3 SVD
svd = TruncatedSVD(n_components=n_comp, random_state=890624)
svd_feat = svd.fit_transform(all_dt.drop(['y'], axis = 1))

In [19]:
# 3. Random Projection
# 3.1 SRP
srp = SparseRandomProjection(n_components=n_comp, eps=0.1, dense_output = True, random_state=890624)
srp_feat = srp.fit_transform(all_dt.drop(['y'], axis = 1))
# 3.2 GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=890624)
grp_feat = grp.fit_transform(all_dt.drop(['y'], axis = 1))

In [20]:
#pca.explained_variance_ratio_.cumsum()

In [44]:
# 4. Rename and merge features
# 4.1 PCA
pca_feat_dat = pd.DataFrame(pca_feat)
pca_feat_dat = pca_feat_dat.rename(columns=lambda x: 'pca_'+str(x))
# 4.2 ICA
ica_feat_dat = pd.DataFrame(ica_feat)
ica_feat_dat = ica_feat_dat.rename(columns=lambda x: 'ica_'+str(x))
# 4.3 SVD
svd_feat_dat = pd.DataFrame(svd_feat)
svd_feat_dat = svd_feat_dat.rename(columns=lambda x: 'svd_'+str(x))
# 4.4 SRP
srp_feat_dat = pd.DataFrame(srp_feat)
srp_feat_dat = srp_feat_dat.rename(columns=lambda x: 'srp_'+str(x))
# 4.5 GRP
grp_feat_dat = pd.DataFrame(grp_feat)
grp_feat_dat = grp_feat_dat.rename(columns=lambda x: 'grp_'+str(x))

# Merge
all_dt = pd.concat([all_dt.reset_index(drop=True), pca_feat_dat], axis=1)
all_dt = pd.concat([all_dt.reset_index(drop=True), ica_feat_dat], axis=1)
all_dt = pd.concat([all_dt.reset_index(drop=True), svd_feat_dat], axis=1)
all_dt = pd.concat([all_dt.reset_index(drop=True), srp_feat_dat], axis=1)
all_dt = pd.concat([all_dt.reset_index(drop=True), grp_feat_dat], axis=1)
all_dt.head()

Unnamed: 0,ID,X0,X1,X10,X100,X101,X102,X103,X104,X105,...,grp_90,grp_91,grp_92,grp_93,grp_94,grp_95,grp_96,grp_97,grp_98,grp_99
0,0.000000,0.192308,0.807692,0,0,0,0,0,0,0,...,-0.970262,-0.496967,-0.681124,-0.010562,0.116251,0.044945,-1.061215,0.341749,0.706311,-0.138409
1,0.000713,0.192308,0.730769,0,1,1,0,0,0,0,...,-0.550047,0.444355,-0.001771,0.622154,0.107568,0.096269,-1.284971,-0.818661,-0.865108,-0.136055
2,0.000832,0.942308,0.846154,0,0,1,0,0,0,0,...,-0.316343,0.214623,0.678189,-1.316431,0.405287,0.167584,-1.257158,0.531086,-0.515671,-1.289080
3,0.001069,0.942308,0.730769,0,0,1,0,0,0,0,...,-0.520725,0.218004,0.835926,-0.003224,0.494411,0.151871,-0.740399,-0.421942,-0.000998,-0.522564
4,0.001544,0.942308,0.807692,0,0,1,0,0,0,0,...,-0.528435,-0.070876,0.885832,-0.053953,0.313850,-0.035057,-0.827737,-0.243252,0.206263,-0.689661
5,0.002139,0.365385,0.038462,0,1,1,0,1,0,0,...,0.036610,-0.472503,0.666914,-0.046977,-0.822952,0.971467,-0.683638,-0.474585,-0.235888,-0.763061
6,0.002851,0.692308,0.653846,0,0,1,0,1,0,0,...,-0.352378,0.012041,0.852781,0.119606,-0.584012,-0.197263,-1.442873,-0.098372,-0.238465,-1.493471
7,0.002970,0.269231,0.423077,0,1,1,0,1,0,0,...,-0.537371,-0.609466,1.351939,0.177609,-0.361689,-0.243622,-1.001191,-1.005769,-0.958307,-0.987921
8,0.003208,0.423077,0.692308,0,1,1,0,1,0,0,...,-0.145001,-0.041567,1.168404,-0.131603,-0.214916,-0.343335,-0.951528,-0.963883,-0.801249,-1.467459
9,0.003564,0.173077,0.038462,0,1,1,0,1,0,0,...,-0.520324,-0.457254,1.171206,-0.717190,-0.317644,0.366918,-0.283777,-1.074531,0.020798,-1.607910


In [50]:
# 5. Modeling
train = all_dt[all_dt['y'] > 0]
test = all_dt[all_dt['y'] == -1]
y_train = train['y'].values
y_mean = np.mean(y_train)
features = [c for c in train.columns if c != 'y']

In [74]:
# 5.1 Xgboost
def the_metric(y_pred, y):
    y_true = y.get_label()
    return 'r2_score', r2_score(y_true, y_pred)

xgb_params = {
    'max_depth': 2 # 4
    ,'eta': 0.005 #0.0045,
    ,'objective': 'reg:linear'
    ,'eval_metric': 'rmse'
    ,'booster': 'gbtree'
    ,'gamma': 1
    ,'min_child_weight': 0
    ,'subsample': 0.93
    ,'colsample_bytree': 0.7
    ,'lambda': 2
    ,'alpha': 1
    ,'base_score': y_mean
}
dtrain = xgb.DMatrix(train[features], y_train)
dtest = xgb.DMatrix(test[features])

In [75]:
model = xgb.cv(dict(xgb_params), 
               maximize=True,
               feval=the_metric,
               verbose_eval=100, 
               stratified=True, 
               dtrain = dtrain, 
               num_boost_round=15000, 
               early_stopping_rounds=100, 
               nfold=5)



[0]	train-r2_score:0.0048782+0.00019724	test-r2_score:0.004504+0.00118837
[100]	train-r2_score:0.32416+0.0118267	test-r2_score:0.326192+0.0404038
[200]	train-r2_score:0.457378+0.01541	test-r2_score:0.460719+0.0564792
[300]	train-r2_score:0.515608+0.0172671	test-r2_score:0.518952+0.0632329
[400]	train-r2_score:0.542682+0.0178795	test-r2_score:0.544568+0.0660652
[500]	train-r2_score:0.557583+0.0177619	test-r2_score:0.555299+0.0658848
[600]	train-r2_score:0.56947+0.0167017	test-r2_score:0.561099+0.0656774
[700]	train-r2_score:0.579732+0.015428	test-r2_score:0.564238+0.0655163
[800]	train-r2_score:0.588785+0.0142287	test-r2_score:0.565627+0.0651459
[900]	train-r2_score:0.596712+0.0131996	test-r2_score:0.566209+0.0647412
[1000]	train-r2_score:0.603958+0.0123007	test-r2_score:0.566517+0.0643781
[1100]	train-r2_score:0.610502+0.0115973	test-r2_score:0.566558+0.0641374


In [76]:
model

Unnamed: 0,test-r2_score-mean,test-r2_score-std,train-r2_score-mean,train-r2_score-std
0,0.004504,0.001188,0.004878,0.000197
1,0.009066,0.001905,0.009407,0.000790
2,0.013964,0.002540,0.014299,0.000878
3,0.018751,0.003214,0.019072,0.000956
4,0.022920,0.003579,0.023339,0.001353
5,0.027668,0.004225,0.028083,0.001544
6,0.032348,0.004863,0.032804,0.001655
7,0.036861,0.005404,0.037372,0.001804
8,0.041421,0.006021,0.041966,0.001954
9,0.045969,0.006638,0.046524,0.002107
