In [1]:
import sys
oldsysstdout = sys.stdout
class flushfile():
    def __init__(self, f):
        self.f = f
    def __getattr__(self,name): 
        return object.__getattribute__(self.f, name)
    def write(self, x):
        self.f.write(x)
        self.f.flush()
    def flush(self):
        self.f.flush()
sys.stdout = flushfile(sys.stdout)

In [2]:
import pandas as pd

import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble
from itertools import product

def find_delimiter(df, col):
    """
    Function that trying to find an approximate delimiter used for scaling.
    So we can undo the feature scaling.
    """
    vals = df[col].dropna().sort_values().round(8)
    vals = pd.rolling_apply(vals, 2, lambda x: x[1] - x[0])
    vals = vals[vals > 0.000001]
    return vals.value_counts().idxmax() 

In [3]:
import xgboost as xgb
from hyperopt import hp, fmin, tpe
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split, StratifiedKFold

In [4]:
train = pd.read_csv("data/train.csv")
target = train['target'].values

In [5]:
newtrain, X_test, newtarget, y_test = train_test_split(train, target, test_size=.75, stratify=target,
                                                    random_state=42)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(newtrain, newtarget, test_size=.2, stratify=newtarget,
                                                    random_state=42)

In [7]:
def baseline(X_train, X_test, y_train, y_test):
    X_train = X_train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46',
                        'v51','v53','v54','v63','v73','v75','v79','v81','v82',
                        'v89','v92','v95','v105','v107','v108','v109','v110',
                        'v116','v117','v118','v119','v123','v124','v128'],axis=1)
    X_test = X_test.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51',
                          'v53','v54','v63','v73','v75','v79','v81','v82','v89',
                          'v92','v95','v105','v107','v108','v109','v110','v116',
                          'v117','v118','v119','v123','v124','v128'],axis=1)

    for (train_name, train_series), (test_name, test_series) in zip(X_train.iteritems(),X_test.iteritems()):
        if train_series.dtype == 'O':
            #for objects: factorize
            X_train[train_name], tmp_indexer = pd.factorize(X_train[train_name])
            X_test[test_name] = tmp_indexer.get_indexer(X_test[test_name])
            #but now we have -1 values (NaN)
        else:
            #for int or float: fill NaN
            tmp_len = len(X_train[train_series.isnull()])
            if tmp_len>0:
                #print "mean", train_series.mean()
                X_train.loc[train_series.isnull(), train_name] = -999
            #and Test
            tmp_len = len(X_test[test_series.isnull()])
            if tmp_len>0:
                X_test.loc[test_series.isnull(), test_name] = -999

    extc = ExtraTreesClassifier(n_estimators=850,max_features= 60,criterion= 'entropy',min_samples_split= 4,
                                max_depth= 40, min_samples_leaf= 2, n_jobs = -1)      

    extc.fit(X_train, y_train)
    y_pred = extc.predict_proba(X_test)
    return log_loss(y_test, y_pred[:, 1])

# Baseline ET

In [8]:
skf = StratifiedKFold(newtarget, n_folds=5, random_state=42)
res = []
for train_index, test_index in skf:
    X_train, X_test = newtrain.iloc[train_index].copy(), newtrain.iloc[test_index].copy()
    y_train, y_test = newtarget[train_index], newtarget[test_index]
    r = baseline(X_train, X_test, y_train, y_test)
    print(r)
    res.append(r)
print('mean', np.mean(res))

0.469913562479
0.473848904516
0.47572746921
0.467934125021
0.478691461967
mean 0.473223104639


In [18]:
X_train[['v1']].values - X_train[['v1', 'v2', 'v4', 'v5']].values

array([[ 0.        ,  0.15322361,  1.01088794, -6.2033558 ],
       [        nan,         nan,         nan,         nan],
       [        nan,         nan,         nan,         nan],
       ..., 
       [        nan,         nan,         nan,         nan],
       [ 0.        , -6.02684484, -3.09924293, -6.48420617],
       [ 0.        , -4.79081156, -1.19054852, -5.29813207]])

In [10]:
def xgbtest(X_train, X_test, y_train, y_test):
    X_train = X_train.drop(['ID','target','v8','v23','v25','v31','v36','v37',
                        'v46','v51','v53','v54','v63','v73','v75','v79',
                        'v81','v82','v89','v92','v95','v105','v107','v108',
                        'v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
    X_test = X_test.drop(['ID','target', 'v8','v23','v25','v31','v36','v37','v46','v51',
                      'v53','v54','v63','v73','v75','v79','v81','v82','v89',
                      'v92','v95','v105','v107','v108','v109','v110','v116',
                      'v117','v118','v119','v123','v124','v128'],axis=1)
    num_vars = ['v1', 'v2', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v11',
                'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19', 'v20',
                'v21', 'v26', 'v27', 'v28', 'v29', 'v32', 'v33', 'v34', 'v35', 'v38',
                'v39', 'v40', 'v41', 'v42', 'v43', 'v44', 'v45', 'v48', 'v49', 'v50',
                'v55', 'v57', 'v58', 'v59', 'v60', 'v61', 'v62', 'v64', 'v65', 'v67',
                'v68', 'v69', 'v70', 'v72', 'v76', 'v77', 'v78', 'v80', 'v83', 'v84', 
                'v85', 'v86', 'v87', 'v88', 'v90', 'v93', 'v94', 'v96', 'v97', 'v98', 
                'v99', 'v100', 'v101', 'v102', 'v103', 'v104', 'v106', 'v111', 'v114',
                'v115', 'v120', 'v121', 'v122', 'v126', 'v127', 'v129', 'v130', 'v131']

    dropcols = []
    floatcols = []
    traincat = np.zeros((X_train.shape[0], 1))
    testcat = np.zeros((X_test.shape[0], 1))

    skfi = StratifiedKFold(y_train, n_folds=5, random_state=42)

    vs = pd.concat([X_train, X_test])
    for c in num_vars:
        if c not in X_train.columns:
            continue

        X_train.loc[X_train[c].round(5) == 0, c] = 0
        X_test.loc[X_test[c].round(5) == 0, c] = 0

        delimiter = find_delimiter(vs, c)
        X_train[c] *= 1/delimiter
        X_test[c] *= 1/delimiter

    for (train_name, train_series), (test_name, test_series) in zip(X_train.iteritems(), X_test.iteritems()):
        if train_series.dtype == 'O' or train_series.dtype == 'int64':
            dropcols.append(train_name)
            if train_name != 'v22':
                X_train.loc[:, train_name+'c'] = np.zeros(X_train.shape[0])
                for train_index, test_index in skfi:
                    X_X_train = X_train.loc[train.index[train_index], [train_name, 'target']]
                    d = X_X_train.groupby(by=train_name).mean()
                    predvec = X_train.loc[X_train.index[test_index], train_name]
                    predvec = predvec.map(dict([(i, d.loc[i, 'target']) for i in d.index]))
                    X_train.loc[X_train.index[test_index], train_name+'c'] = predvec.values
                predvec = X_test.loc[:, train_name]
                predvec = predvec.map(dict([(i, d.loc[i, 'target']) for i in d.index]))
                X_test.loc[:, train_name+'c'] = predvec.values

                X_train[train_name], tmp_indexer = pd.factorize(X_train[train_name])
                X_test[test_name] = tmp_indexer.get_indexer(X_test[test_name])
                ohe = OneHotEncoder()
                tr = ohe.fit_transform(X_train[train_name].values.reshape(-1, 1) + 1).toarray()
                ts = ohe.transform(X_test[test_name].values.reshape(-1, 1) + 1).toarray()
            else:
                X_train[train_name], tmp_indexer = pd.factorize(X_train[train_name])
                X_test[test_name] = tmp_indexer.get_indexer(X_test[test_name])
                a, b = pd.factorize(X_train[train_name].value_counts()[X_train[train_name].value_counts() > 100].index)
                trv = b.get_indexer(X_train[train_name])
                tsv = b.get_indexer(X_test[test_name])
                ohe = OneHotEncoder()
                tr = ohe.fit_transform(trv.reshape(-1, 1) + 1).toarray()
                ts = ohe.transform(tsv.reshape(-1, 1) + 1).toarray()
                tr = np.hstack((X_train[train_name].values.reshape(-1, 1), tr))
                ts = np.hstack((X_test[test_name].values.reshape(-1, 1), ts))

            traincat = np.hstack((traincat, tr))
            testcat = np.hstack((testcat, ts))

        else:
            floatcols.append(train_name)
            #for int or float: fill NaN
            tmp_len = len(X_train[train_series.isnull()])
            if tmp_len>0:
                #print "mean", train_series.mean()
                X_train.loc[train_series.isnull(), train_name] = -1
            #and Test
            tmp_len = len(X_test[test_series.isnull()])
            if tmp_len>0:
                X_test.loc[test_series.isnull(), test_name] = -1

    pca = PCA(n_components=10)
    trainpca = pca.fit_transform(X_train[floatcols].values)
    testpca = pca.transform(X_test[floatcols].values)

    for i in range(len(floatcols)):
        for j in range(i, len(floatcols)):
            X_train.loc[:, floatcols[i]+'-'+floatcols[j]] = X_train.loc[:, floatcols[i]] - X_train.loc[:, floatcols[j]]
            X_train.loc[:, floatcols[i]+'*'+floatcols[j]] = X_train.loc[:, floatcols[i]] * X_train.loc[:, floatcols[j]]
            X_train.loc[:, floatcols[i]+'/'+floatcols[j]] = X_train.loc[:, floatcols[i]] / X_train.loc[:, floatcols[j]]
            X_test.loc[:, floatcols[i]+'-'+floatcols[j]] = X_test.loc[:, floatcols[i]] - X_test.loc[:, floatcols[j]]
            X_test.loc[:, floatcols[i]+'*'+floatcols[j]] = X_test.loc[:, floatcols[i]] * X_test.loc[:, floatcols[j]]
            X_test.loc[:, floatcols[i]+'/'+floatcols[j]] = X_test.loc[:, floatcols[i]] / X_test.loc[:, floatcols[j]]
        print(floatcols[i])
        
    X_train_f = np.hstack((X_train[X_train.columns.difference(dropcols)].values, traincat, trainpca))
    X_test_f = np.hstack((X_test[X_test.columns.difference(dropcols)].values, testcat, testpca))

    dtrain = xgb.DMatrix(X_train_f, label=y_train)
    dval = xgb.DMatrix(X_test_f, label=y_test)

    watchlist = ((dtrain, 'train'), (dval, 'val'))

    params = {
        'nthread': 12,
        'eta': .01,
        'max_depth': 12,
        'colsample_bytree': .4,
        'subsample': 1.,
        'min_child_weight': 4.,
        'lambda': 7.,
        'alpha': .75,
        'seed': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'silent': 1
    }

    gbm = xgb.train(params, dtrain, num_boost_round=3000, early_stopping_rounds=300, evals=watchlist,
              verbose_eval=True)
    pred = gbm.predict(dval, ntree_limit=gbm.best_iteration)
    return log_loss(y_test, pred)

In [11]:
xgbtest(X_train, X_test, y_train, y_test)

	Series.rolling(center=False,window=2).apply(func=<function>,args=<tuple>,kwargs=<dict>)


v1
v2
v4
v5
v6
v7
v9
v10
v11
v12
v13
v14
v15
v16
v17
v18
v19
v20
v21
v26


KeyboardInterrupt: 

In [14]:
X_train.columns

Index(['v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v11',
       ...
       'v106/v80', 'v9-v9', 'v9*v9', 'v9/v9', 'v9-v80', 'v9*v80', 'v9/v80',
       'v80-v80', 'v80*v80', 'v80/v80'],
      dtype='object', length=248)

In [29]:
X_train_f[np.isnan(X_train_f)] = -999

In [30]:
X_test_f[np.isnan(X_test_f)] = -999

In [15]:
from sklearn.metrics import log_loss

In [32]:
extc = ExtraTreesClassifier(n_estimators=2850,max_features= 100,criterion= 'entropy',min_samples_split= 4,
                            max_depth= 40, min_samples_leaf= 2, n_jobs = -1)      
 
extc.fit(X_train_f, y_train)
preds = extc.predict_proba(X_test_f)
print(log_loss(y_test, preds[:, 1]))

0.475724395496


In [41]:
z = X_train.columns.difference(dropcols)
zz = extc.feature_importances_

In [43]:
lst = [(z[i], zz[i]) for i in range(98)]

In [47]:
lst.sort(key=lambda x: x[1])

In [48]:
lst

[('v112c', 0.0),
 ('v113c', 0.0),
 ('v125c', 0.0),
 ('v24c', 0.0),
 ('v30c', 0.0),
 ('v3c', 0.0),
 ('v47c', 0.0),
 ('v52c', 0.0),
 ('v56c', 0.0),
 ('v66c', 0.0),
 ('v71c', 0.0),
 ('v74c', 0.0),
 ('v91c', 0.0),
 ('v38', 0.001900790083705275),
 ('v65', 0.0033821642288781126),
 ('v20', 0.003418688414782641),
 ('v67', 0.0034806137619644018),
 ('v29', 0.0034892562170432753),
 ('v41', 0.0034961417074067156),
 ('v48', 0.0034972356023415213),
 ('v96', 0.0035643468489207041),
 ('v49', 0.003573162943010951),
 ('v61', 0.0036088682663659648),
 ('v42', 0.0036223029859698331),
 ('v106', 0.0036223612460628799),
 ('v11', 0.0036262430938946381),
 ('v77', 0.0036761157102065587),
 ('v93', 0.0036868323844363384),
 ('v68', 0.0037163850259029447),
 ('v64', 0.0037400838643919742),
 ('v17', 0.0037459769886174958),
 ('v76', 0.0037626594425400912),
 ('v19', 0.0038269384288362762),
 ('v13', 0.0038358818575447567),
 ('v104', 0.0038526945640091894),
 ('v94', 0.0038735666128595922),
 ('v59', 0.0039376260799359377),

In [33]:
extc.feature_importances_

array([  5.24130812e-03,   2.02666641e-02,   5.34023601e-03,
         4.12361825e-03,   4.45859023e-03,   4.10393982e-03,
         3.85269456e-03,   3.62236125e-03,   3.62624309e-03,
         4.29954653e-03,   0.00000000e+00,   0.00000000e+00,
         1.59068164e-02,   4.34694299e-03,   2.08071187e-02,
         5.16787572e-03,   4.13412603e-03,   4.41913466e-03,
         0.00000000e+00,   4.01698592e-03,   4.78446836e-03,
         8.24894825e-03,   3.83588186e-03,   4.11335778e-03,
         5.12774902e-03,   1.61857890e-02,   4.24131332e-03,
         4.80448983e-03,   3.74597699e-03,   4.42892959e-03,
         3.82693843e-03,   4.45459140e-03,   3.41868841e-03,
         1.29313029e-02,   0.00000000e+00,   4.21841242e-03,
         4.28309787e-03,   4.88002454e-03,   3.48925622e-03,
         0.00000000e+00,   4.13061988e-03,   3.98067618e-03,
         1.46015962e-02,   4.08013545e-03,   1.90079008e-03,
         4.44101328e-03,   0.00000000e+00,   4.03120169e-03,
         1.64319820e-02,

In [9]:
X_train = X_train.drop(['ID','target','v8','v23','v25','v31','v36','v37',
                    'v46','v51','v53','v54','v63','v73','v75','v79',
                    'v81','v82','v89','v92','v95','v105','v107','v108',
                    'v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
X_test = X_test.drop(['ID','target', 'v8','v23','v25','v31','v36','v37','v46','v51',
                  'v53','v54','v63','v73','v75','v79','v81','v82','v89',
                  'v92','v95','v105','v107','v108','v109','v110','v116',
                  'v117','v118','v119','v123','v124','v128'],axis=1)
num_vars = ['v1', 'v2', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v11',
            'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19', 'v20',
            'v21', 'v26', 'v27', 'v28', 'v29', 'v32', 'v33', 'v34', 'v35', 'v38',
            'v39', 'v40', 'v41', 'v42', 'v43', 'v44', 'v45', 'v48', 'v49', 'v50',
            'v55', 'v57', 'v58', 'v59', 'v60', 'v61', 'v62', 'v64', 'v65', 'v67',
            'v68', 'v69', 'v70', 'v72', 'v76', 'v77', 'v78', 'v80', 'v83', 'v84', 
            'v85', 'v86', 'v87', 'v88', 'v90', 'v93', 'v94', 'v96', 'v97', 'v98', 
            'v99', 'v100', 'v101', 'v102', 'v103', 'v104', 'v106', 'v111', 'v114',
            'v115', 'v120', 'v121', 'v122', 'v126', 'v127', 'v129', 'v130', 'v131']

dropcols = []
traincat = np.zeros((X_train.shape[0], 1))
testcat = np.zeros((X_test.shape[0], 1))

vs = pd.concat([X_train, X_test])
for c in num_vars:
    if c not in X_train.columns:
        continue

    X_train.loc[X_train[c].round(5) == 0, c] = 0
    X_test.loc[X_test[c].round(5) == 0, c] = 0

    delimiter = find_delimiter(vs, c)
    X_train[c] *= 1/delimiter
    X_test[c] *= 1/delimiter

for (train_name, train_series), (test_name, test_series) in zip(X_train.iteritems(), X_test.iteritems()):
    if train_series.dtype == 'O' or train_series.dtype == 'int64':
        dropcols.append(train_name)
        if train_name != 'v22':
            X_train[train_name], tmp_indexer = pd.factorize(X_train[train_name])
            X_test[test_name] = tmp_indexer.get_indexer(X_test[test_name])
            ohe = OneHotEncoder()
            tr = ohe.fit_transform(X_train[train_name].values.reshape(-1, 1) + 1).toarray()
            ts = ohe.transform(X_test[test_name].values.reshape(-1, 1) + 1).toarray()
        else:
            X_train[train_name], tmp_indexer = pd.factorize(X_train[train_name])
            X_test[test_name] = tmp_indexer.get_indexer(X_test[test_name])
            a, b = pd.factorize(X_train[train_name].value_counts()[X_train[train_name].value_counts() > 100].index)
            trv = b.get_indexer(X_train[train_name])
            tsv = b.get_indexer(X_test[test_name])
            ohe = OneHotEncoder()
            tr = ohe.fit_transform(trv.reshape(-1, 1) + 1).toarray()
            ts = ohe.transform(tsv.reshape(-1, 1) + 1).toarray()
            tr = np.hstack((X_train[train_name].values.reshape(-1, 1), tr))
            ts = np.hstack((X_test[test_name].values.reshape(-1, 1), ts))

        traincat = np.hstack((traincat, tr))
        testcat = np.hstack((testcat, ts))

X_train_f = np.hstack((X_train.values, traincat))
X_test_f = np.hstack((X_test.values, testcat))

	Series.rolling(center=False,window=2).apply(args=<tuple>,kwargs=<dict>,func=<function>)


In [17]:
dtrain = xgb.DMatrix(X_train_f, label=y_train)
dval = xgb.DMatrix(X_test_f, label=y_test)

In [18]:
watchlist = ((dtrain, 'train'), (dval, 'val'))

In [19]:
from sklearn.metrics import log_loss

In [12]:
space = {'colsample_bytree': hp.uniform('colsample_bytree', .3, 1.),
         'subsample': hp.uniform('subsample', .3, 1.),
         'max_depth': hp.quniform('max_depth', 3, 15, 1),
         'min_child_weight': hp.quniform('min_samples_leaf', 1, 5, 1),
         'lambda': hp.quniform('lambda', 1., 10., 1.),
         'alpha': hp.quniform('alpha', 0., 1., .05)}

In [16]:
def objective(space):
    params = {
        'nthread': 12,
        'eta': .1,
        'max_depth': int(space['max_depth']),
        'colsample_bytree': space['colsample_bytree'],
        'subsample': space['subsample'],
        'min_child_weight': space['min_child_weight'],
        'lambda': space['lambda'],
        'alpha': space['alpha'],
        'seed': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'silent': 1
    }
    
    gbm = xgb.train(params, dtrain, num_boost_round=3000, early_stopping_rounds=300, evals=watchlist,
              verbose_eval=False)
    
    pred = gbm.predict(dval, ntree_limit=gbm.best_iteration)
    
    print(space)
    print(log_loss(y_test, pred))
    return(log_loss(y_test, pred))
#print y_pred

#pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees.csv',index=False)

In [None]:
best = fmin(objective, space, algo=tpe.suggest, max_evals=100)

{'alpha': 1.0, 'subsample': 0.46816746101562534, 'lambda': 2.0, 'min_child_weight': 4.0, 'max_depth': 15.0, 'colsample_bytree': 0.9838631668484263}
0.475060803236
{'alpha': 0.45, 'subsample': 0.39663558243906166, 'lambda': 8.0, 'min_child_weight': 4.0, 'max_depth': 9.0, 'colsample_bytree': 0.4660742289623673}
0.472894268857
{'alpha': 0.35000000000000003, 'subsample': 0.5544694555408947, 'lambda': 7.0, 'min_child_weight': 3.0, 'max_depth': 5.0, 'colsample_bytree': 0.618015664279335}
0.471138683313
{'alpha': 0.30000000000000004, 'subsample': 0.46058224453877133, 'lambda': 5.0, 'min_child_weight': 3.0, 'max_depth': 11.0, 'colsample_bytree': 0.6452748626167566}
0.472712248757
{'alpha': 0.6000000000000001, 'subsample': 0.6643561918573215, 'lambda': 9.0, 'min_child_weight': 5.0, 'max_depth': 15.0, 'colsample_bytree': 0.5555735144203602}
0.47211594184
{'alpha': 0.25, 'subsample': 0.37630382795705897, 'lambda': 6.0, 'min_child_weight': 5.0, 'max_depth': 12.0, 'colsample_bytree': 0.906086030506

In [18]:
best

{'alpha': 0.75,
 'colsample_bytree': 0.38560597281374503,
 'lambda': 7.0,
 'max_depth': 12.0,
 'min_samples_leaf': 4.0,
 'subsample': 0.9966200526403689}

In [21]:
best2 = best
best2['min_child_weight'] = best2['min_samples_leaf']

In [22]:
objective(best2)

{'min_samples_leaf': 4.0, 'alpha': 0.75, 'subsample': 0.9966200526403689, 'lambda': 7.0, 'min_child_weight': 4.0, 'max_depth': 12.0, 'colsample_bytree': 0.38560597281374503}
0.467628613154


0.46762861315378496

In [None]:
params = {
    'nthread': 12,
    'eta': .01,
    'max_depth': 12,
    'colsample_bytree': .4,
    'subsample': 1.,
    'min_child_weight': 4.,
    'lambda': 7.,
    'alpha': .75,
    'seed': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'silent': 1
}

gbm = xgb.train(params, dtrain, num_boost_round=3000, early_stopping_rounds=300, evals=watchlist,
          verbose_eval=True)

pred = gbm.predict(dval, ntree_limit=gbm.best_iteration)

#print(space)
print(log_loss(y_test, pred))
#return(log_loss(y_test, pred))

In [13]:
def create_feature_map(fmap_filename, features):
"""
features: enumerable of feature names
"""
    outfile = open(fmap_filename, 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

create_feature_map('xgb.fmap', features) 

AttributeError: 'Booster' object has no attribute 'feature_importances_'

In [27]:
train = pd.read_csv("data/train.csv")
target = train['target'].values
test = pd.read_csv("data/test.csv")

X_train = train.copy()
X_test = test.copy()

X_train = X_train.drop(['ID','target','v8','v23','v25','v31','v36','v37',
                    'v46','v51','v53','v54','v63','v73','v75','v79',
                    'v81','v82','v89','v92','v95','v105','v107','v108',
                    'v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
X_test = X_test.drop(['ID', 'v8','v23','v25','v31','v36','v37','v46','v51',
                  'v53','v54','v63','v73','v75','v79','v81','v82','v89',
                  'v92','v95','v105','v107','v108','v109','v110','v116',
                  'v117','v118','v119','v123','v124','v128'],axis=1)
num_vars = ['v1', 'v2', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v11',
            'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19', 'v20',
            'v21', 'v26', 'v27', 'v28', 'v29', 'v32', 'v33', 'v34', 'v35', 'v38',
            'v39', 'v40', 'v41', 'v42', 'v43', 'v44', 'v45', 'v48', 'v49', 'v50',
            'v55', 'v57', 'v58', 'v59', 'v60', 'v61', 'v62', 'v64', 'v65', 'v67',
            'v68', 'v69', 'v70', 'v72', 'v76', 'v77', 'v78', 'v80', 'v83', 'v84', 
            'v85', 'v86', 'v87', 'v88', 'v90', 'v93', 'v94', 'v96', 'v97', 'v98', 
            'v99', 'v100', 'v101', 'v102', 'v103', 'v104', 'v106', 'v111', 'v114',
            'v115', 'v120', 'v121', 'v122', 'v126', 'v127', 'v129', 'v130', 'v131']

dropcols = []
traincat = np.zeros((X_train.shape[0], 1))
testcat = np.zeros((X_test.shape[0], 1))

vs = pd.concat([X_train, X_test])
for c in num_vars:
    if c not in X_train.columns:
        continue

    X_train.loc[X_train[c].round(5) == 0, c] = 0
    X_test.loc[X_test[c].round(5) == 0, c] = 0

    delimiter = find_delimiter(vs, c)
    X_train[c] *= 1/delimiter
    X_test[c] *= 1/delimiter

for (train_name, train_series), (test_name, test_series) in zip(X_train.iteritems(), X_test.iteritems()):
    if train_series.dtype == 'O' or train_series.dtype == 'int64':
        dropcols.append(train_name)
        if train_name != 'v22':
            X_train[train_name], tmp_indexer = pd.factorize(X_train[train_name])
            X_test[test_name] = tmp_indexer.get_indexer(X_test[test_name])
            ohe = OneHotEncoder()
            tr = ohe.fit_transform(X_train[train_name].values.reshape(-1, 1) + 1).toarray()
            ts = ohe.transform(X_test[test_name].values.reshape(-1, 1) + 1).toarray()
        else:
            X_train[train_name], tmp_indexer = pd.factorize(X_train[train_name])
            X_test[test_name] = tmp_indexer.get_indexer(X_test[test_name])
            a, b = pd.factorize(X_train[train_name].value_counts()[X_train[train_name].value_counts() > 100].index)
            trv = b.get_indexer(X_train[train_name])
            tsv = b.get_indexer(X_test[test_name])
            ohe = OneHotEncoder()
            tr = ohe.fit_transform(trv.reshape(-1, 1) + 1).toarray()
            ts = ohe.transform(tsv.reshape(-1, 1) + 1).toarray()
            tr = np.hstack((X_train[train_name].values.reshape(-1, 1), tr))
            ts = np.hstack((X_test[test_name].values.reshape(-1, 1), ts))

        traincat = np.hstack((traincat, tr))
        testcat = np.hstack((testcat, ts))

X_train_f = np.hstack((X_train.values, traincat))
X_test_f = np.hstack((X_test.values, testcat))

X_train_n = np.hstack((X_train[X_train.columns.difference(dropcols)].values, traincat))
X_test_n = np.hstack((X_test[X_test.columns.difference(dropcols)].values, testcat))

	Series.rolling(center=False,window=2).apply(kwargs=<dict>,args=<tuple>,func=<function>)


In [28]:
dtrain = xgb.DMatrix(X_train_f, label=target)
dtest = xgb.DMatrix(X_test_f)

In [29]:
params = {
    'nthread': 12,
    'eta': .01,
    'max_depth': 12,
    'colsample_bytree': .4,
    'subsample': 1.,
    'min_child_weight': 4.,
    'lambda': 7.,
    'alpha': .75,
    'seed': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'silent': 1
}

gbm = xgb.train(params, dtrain, num_boost_round=1500, verbose_eval=False)

pred = gbm.predict(dtest)

sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
sub.PredictedProb = pred
sub.to_csv('submission/xgb_ohe_ndc.csv', index_label='ID')

In [30]:
dtrain_n = xgb.DMatrix(X_train_n, label=target)
dtest_n = xgb.DMatrix(X_test_n)

In [31]:
params = {
    'nthread': 12,
    'eta': .01,
    'max_depth': 12,
    'colsample_bytree': .4,
    'subsample': 1.,
    'min_child_weight': 4.,
    'lambda': 7.,
    'alpha': .75,
    'seed': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'silent': 1
}

gbm = xgb.train(params, dtrain_n, num_boost_round=1500, verbose_eval=False)

pred = gbm.predict(dtest_n)

sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
sub.PredictedProb = pred
sub.to_csv('submission/xgb_ohe_dc.csv', index_label='ID')

In [28]:
zx = np.array(res)

In [32]:
z = zx[:, :, 1]

In [34]:
z[0, :].shape

(114393,)

In [37]:
z.mean(axis=0).shape

(114393,)

In [35]:
sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
sub.PredictedProb = z[0, :]
sub.to_csv('submission/et_ch_one.csv', index_label='ID')

In [38]:
sub = pd.read_csv('data/sample_submission.csv', index_col='ID')
sub.PredictedProb = z.mean(axis=0)
sub.to_csv('submission/et_ch_bag.csv', index_label='ID')