In [1]:
import pickle


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score

from ml_toolbox.kaggle import KaggleResult

In [2]:
dir_out = 'model_xgboost_v1'

### Load data

In [None]:
train_feat_set = './data/train_features_V1.pickle'
test_feat_set = './data/test_features_V1.pickle'

cv_set_train = './data_ori/cv_train_V2.csv'
cv_set_val = './data_ori/cv_val_V2.csv'
cv_set_test = './data_ori/cv_test_V2.csv'

cv_set_kfold = './data_ori/cv_StratifiedKFold_V3.pickle'

In [4]:
# Load train data
with open(train_feat_set,'rb') as f:
    train = pickle.load(f)
# Load test data
with open(test_feat_set,'rb') as f:
    X_test = pickle.load(f)

In [57]:
# Load cv set
train_samples = pd.read_csv(cv_set_train)
val_samples = pd.read_csv(cv_set_val)

In [66]:
# Load kfold set
with open(cv_set_kfold, 'rb') as f:
    fold_data = pickle.load(f)
kfolds = fold_data['folds']
y = fold_data['y']

In [7]:
train_samples.head()

Unnamed: 0,sample_nr,activity_id,outcome
0,6,act2_1233489,1
1,7,act2_1623405,1
2,8,act2_1111598,1
3,9,act2_1177453,1
4,10,act2_133509,1


In [8]:
X_val = train.tocsr()[val_samples.sample_nr.values, :]
X_train = train.tocsr()[train_samples.sample_nr.values, :]

In [9]:
y_val = val_samples['outcome']
y_train = train_samples['outcome']

In [10]:
X_val.shape

(458982, 31143)

In [11]:
X_train.shape

(1738309, 31143)

### Train XGBoost model

In [18]:
y.value_counts()

0    1221794
1     975497
Name: outcome, dtype: int64

In [67]:
print("###########")
print("One Hot enconded Test Dataset Script")


param = {'max_depth':10, 'eta':0.05, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.7
param['colsample_bytree']= 0.7
param['min_child_weight'] = 0
param['lambda'] = 0
param['alpha'] = 0
param['booster'] = "gblinear"

X_train = train.tocsr()

y_pred_val = np.zeros(y.shape)
scores = []
y_pred_test = np.zeros((test.shape[0],len(kfolds)))

for i,(itrain, ival) in enumerate(kfolds):
    print i, len(kfolds), len(itrain)
    d_train = xgb.DMatrix(X_train[itrain,:],label=y[itrain])
    d_valid = xgb.DMatrix(X_train[ival,:],label=y[ival])

    watchlist  = [(d_train,'train'), (d_valid, 'eval')]
    num_round = 400
    early_stopping_rounds=10
    bst = xgb.train(param, d_train, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)
    
    y_pred_val[ival] = bst.predict(xgb.DMatrix(X_train[ival,:]))
    y_pred_test[:,i] = bst.predict(xgb.DMatrix(X_test))
    
    scores.append(roc_auc_score(y[ival], y_pred_val[ival]))



###########
One Hot enconded Test Dataset Script
1940811


Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.902173	eval-auc:0.908254
[1]	train-auc:0.922680	eval-auc:0.921745
[2]	train-auc:0.940317	eval-auc:0.934504
[3]	train-auc:0.955210	eval-auc:0.944822
[4]	train-auc:0.967237	eval-auc:0.952788
[5]	train-auc:0.976009	eval-auc:0.958354
[6]	train-auc:0.981927	eval-auc:0.962348
[7]	train-auc:0.985732	eval-auc:0.965346
[8]	train-auc:0.988179	eval-auc:0.967534
[9]	train-auc:0.989839	eval-auc:0.969338
[10]	train-auc:0.991054	eval-auc:0.970752
[11]	train-auc:0.991998	eval-auc:0.971798
[12]	train-auc:0.992751	eval-auc:0.972684
[13]	train-auc:0.993363	eval-auc:0.973439
[14]	train-auc:0.993866	eval-auc:0.974032
[15]	train-auc:0.994286	eval-auc:0.974482
[16]	train-auc:0.994640	eval-auc:0.974896
[17]	train-auc:0.994943	eval-auc:0.975227
[18]	train-auc:0.995204	eval-auc:0.975478
[19]	train-auc:0.995430	eval-auc:0.975694
[20]	train-auc:0.995626	eval-auc:0.975816
[21]	train-auc:0.995797	eval-auc:0.975907
[22]	train-auc:0.995946	eva

2000627


Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.904921	eval-auc:0.877323
[1]	train-auc:0.923286	eval-auc:0.892009
[2]	train-auc:0.940219	eval-auc:0.907307
[3]	train-auc:0.954594	eval-auc:0.921941
[4]	train-auc:0.966478	eval-auc:0.934391
[5]	train-auc:0.975387	eval-auc:0.944234
[6]	train-auc:0.981492	eval-auc:0.951720
[7]	train-auc:0.985479	eval-auc:0.957319
[8]	train-auc:0.988062	eval-auc:0.961461
[9]	train-auc:0.989799	eval-auc:0.964533
[10]	train-auc:0.991051	eval-auc:0.966873
[11]	train-auc:0.992009	eval-auc:0.968726
[12]	train-auc:0.992765	eval-auc:0.970196
[13]	train-auc:0.993378	eval-auc:0.971405
[14]	train-auc:0.993878	eval-auc:0.972416
[15]	train-auc:0.994294	eval-auc:0.973275
[16]	train-auc:0.994646	eval-auc:0.974001
[17]	train-auc:0.994946	eval-auc:0.974599
[18]	train-auc:0.995203	eval-auc:0.975089
[19]	train-auc:0.995427	eval-auc:0.975509
[20]	train-auc:0.995622	eval-auc:0.975866
[21]	train-auc:0.995793	eval-auc:0.976174
[22]	train-auc:0.995945	eva

1940838


Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.901490	eval-auc:0.915680
[1]	train-auc:0.921745	eval-auc:0.925645
[2]	train-auc:0.939682	eval-auc:0.935904
[3]	train-auc:0.954560	eval-auc:0.945806
[4]	train-auc:0.966626	eval-auc:0.954152
[5]	train-auc:0.975485	eval-auc:0.960509
[6]	train-auc:0.981476	eval-auc:0.965126
[7]	train-auc:0.985351	eval-auc:0.968569
[8]	train-auc:0.987859	eval-auc:0.971190
[9]	train-auc:0.989561	eval-auc:0.973277
[10]	train-auc:0.990804	eval-auc:0.974954
[11]	train-auc:0.991762	eval-auc:0.976322
[12]	train-auc:0.992523	eval-auc:0.977439
[13]	train-auc:0.993140	eval-auc:0.978372
[14]	train-auc:0.993647	eval-auc:0.979156
[15]	train-auc:0.994070	eval-auc:0.979811
[16]	train-auc:0.994429	eval-auc:0.980354
[17]	train-auc:0.994736	eval-auc:0.980812
[18]	train-auc:0.995002	eval-auc:0.981203
[19]	train-auc:0.995234	eval-auc:0.981529
[20]	train-auc:0.995436	eval-auc:0.981806
[21]	train-auc:0.995613	eval-auc:0.982035
[22]	train-auc:0.995770	eva

1999195


Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.905349	eval-auc:0.876818
[1]	train-auc:0.923564	eval-auc:0.890428
[2]	train-auc:0.940431	eval-auc:0.904840
[3]	train-auc:0.954777	eval-auc:0.919089
[4]	train-auc:0.966668	eval-auc:0.931347
[5]	train-auc:0.975589	eval-auc:0.940714
[6]	train-auc:0.981709	eval-auc:0.947615
[7]	train-auc:0.985702	eval-auc:0.952742
[8]	train-auc:0.988276	eval-auc:0.956696
[9]	train-auc:0.989995	eval-auc:0.959847
[10]	train-auc:0.991225	eval-auc:0.962415
[11]	train-auc:0.992157	eval-auc:0.964529
[12]	train-auc:0.992891	eval-auc:0.966337
[13]	train-auc:0.993483	eval-auc:0.967882
[14]	train-auc:0.993970	eval-auc:0.969200
[15]	train-auc:0.994377	eval-auc:0.970315
[16]	train-auc:0.994721	eval-auc:0.971231
[17]	train-auc:0.995018	eval-auc:0.972003
[18]	train-auc:0.995274	eval-auc:0.972665
[19]	train-auc:0.995497	eval-auc:0.973219
[20]	train-auc:0.995693	eval-auc:0.973683
[21]	train-auc:0.995865	eval-auc:0.974077
[22]	train-auc:0.996016	eva

1951258


Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.902478	eval-auc:0.918097
[1]	train-auc:0.922352	eval-auc:0.928793
[2]	train-auc:0.939941	eval-auc:0.939524
[3]	train-auc:0.954736	eval-auc:0.949446
[4]	train-auc:0.966762	eval-auc:0.957617
[5]	train-auc:0.975584	eval-auc:0.963762
[6]	train-auc:0.981523	eval-auc:0.968223
[7]	train-auc:0.985375	eval-auc:0.971433
[8]	train-auc:0.987883	eval-auc:0.973827
[9]	train-auc:0.989593	eval-auc:0.975686
[10]	train-auc:0.990835	eval-auc:0.977153
[11]	train-auc:0.991787	eval-auc:0.978368
[12]	train-auc:0.992541	eval-auc:0.979378
[13]	train-auc:0.993152	eval-auc:0.980208
[14]	train-auc:0.993657	eval-auc:0.980891
[15]	train-auc:0.994079	eval-auc:0.981463
[16]	train-auc:0.994438	eval-auc:0.981943
[17]	train-auc:0.994746	eval-auc:0.982344
[18]	train-auc:0.995012	eval-auc:0.982679
[19]	train-auc:0.995244	eval-auc:0.982965
[20]	train-auc:0.995446	eval-auc:0.983205
[21]	train-auc:0.995624	eval-auc:0.983408
[22]	train-auc:0.995781	eva

1993306


Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.904708	eval-auc:0.883030
[1]	train-auc:0.923207	eval-auc:0.896542
[2]	train-auc:0.940129	eval-auc:0.910486
[3]	train-auc:0.954557	eval-auc:0.923908
[4]	train-auc:0.966476	eval-auc:0.935389
[5]	train-auc:0.975408	eval-auc:0.944307
[6]	train-auc:0.981535	eval-auc:0.950948
[7]	train-auc:0.985547	eval-auc:0.955846
[8]	train-auc:0.988145	eval-auc:0.959534
[9]	train-auc:0.989891	eval-auc:0.962423
[10]	train-auc:0.991143	eval-auc:0.964678
[11]	train-auc:0.992096	eval-auc:0.966503
[12]	train-auc:0.992846	eval-auc:0.967995
[13]	train-auc:0.993452	eval-auc:0.969241
[14]	train-auc:0.993947	eval-auc:0.970301
[15]	train-auc:0.994358	eval-auc:0.971197
[16]	train-auc:0.994704	eval-auc:0.971955
[17]	train-auc:0.995000	eval-auc:0.972591
[18]	train-auc:0.995255	eval-auc:0.973130
[19]	train-auc:0.995476	eval-auc:0.973597
[20]	train-auc:0.995668	eval-auc:0.974011
[21]	train-auc:0.995837	eval-auc:0.974372
[22]	train-auc:0.995986	eva

1968726


Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.903208	eval-auc:0.801901
[1]	train-auc:0.917354	eval-auc:0.830249
[2]	train-auc:0.931840	eval-auc:0.870419
[3]	train-auc:0.946790	eval-auc:0.909443
[4]	train-auc:0.960800	eval-auc:0.933454
[5]	train-auc:0.971991	eval-auc:0.947484
[6]	train-auc:0.979693	eval-auc:0.956773
[7]	train-auc:0.984542	eval-auc:0.962812
[8]	train-auc:0.987543	eval-auc:0.966606
[9]	train-auc:0.989485	eval-auc:0.969296
[10]	train-auc:0.990842	eval-auc:0.971412
[11]	train-auc:0.991857	eval-auc:0.973135
[12]	train-auc:0.992650	eval-auc:0.974623
[13]	train-auc:0.993285	eval-auc:0.975853
[14]	train-auc:0.993802	eval-auc:0.976933
[15]	train-auc:0.994230	eval-auc:0.977820
[16]	train-auc:0.994590	eval-auc:0.978568
[17]	train-auc:0.994897	eval-auc:0.979211
[18]	train-auc:0.995161	eval-auc:0.979731
[19]	train-auc:0.995389	eval-auc:0.980161
[20]	train-auc:0.995588	eval-auc:0.980512
[21]	train-auc:0.995762	eval-auc:0.980799
[22]	train-auc:0.995914	eva

1994168


Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.905272	eval-auc:0.865211
[1]	train-auc:0.923470	eval-auc:0.880418
[2]	train-auc:0.940275	eval-auc:0.896090
[3]	train-auc:0.954618	eval-auc:0.911673
[4]	train-auc:0.966466	eval-auc:0.925693
[5]	train-auc:0.975334	eval-auc:0.937173
[6]	train-auc:0.981414	eval-auc:0.945837
[7]	train-auc:0.985397	eval-auc:0.952190
[8]	train-auc:0.987994	eval-auc:0.956793
[9]	train-auc:0.989747	eval-auc:0.960189
[10]	train-auc:0.991008	eval-auc:0.962794
[11]	train-auc:0.991968	eval-auc:0.964948
[12]	train-auc:0.992727	eval-auc:0.966722
[13]	train-auc:0.993341	eval-auc:0.968208
[14]	train-auc:0.993845	eval-auc:0.969475
[15]	train-auc:0.994266	eval-auc:0.970536
[16]	train-auc:0.994621	eval-auc:0.971411
[17]	train-auc:0.994925	eval-auc:0.972151
[18]	train-auc:0.995188	eval-auc:0.972794
[19]	train-auc:0.995415	eval-auc:0.973350
[20]	train-auc:0.995614	eval-auc:0.973816
[21]	train-auc:0.995788	eval-auc:0.974208
[22]	train-auc:0.995942	eva

1991749


Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.905000	eval-auc:0.866822
[1]	train-auc:0.923303	eval-auc:0.882222
[2]	train-auc:0.940148	eval-auc:0.897678
[3]	train-auc:0.954532	eval-auc:0.912323
[4]	train-auc:0.966426	eval-auc:0.924906
[5]	train-auc:0.975325	eval-auc:0.934824
[6]	train-auc:0.981412	eval-auc:0.942489
[7]	train-auc:0.985396	eval-auc:0.948288
[8]	train-auc:0.987988	eval-auc:0.952742
[9]	train-auc:0.989738	eval-auc:0.956292
[10]	train-auc:0.990995	eval-auc:0.959228
[11]	train-auc:0.991949	eval-auc:0.961660
[12]	train-auc:0.992703	eval-auc:0.963739
[13]	train-auc:0.993313	eval-auc:0.965509
[14]	train-auc:0.993815	eval-auc:0.967014
[15]	train-auc:0.994235	eval-auc:0.968324
[16]	train-auc:0.994590	eval-auc:0.969435
[17]	train-auc:0.994895	eval-auc:0.970379
[18]	train-auc:0.995158	eval-auc:0.971195
[19]	train-auc:0.995387	eval-auc:0.971897
[20]	train-auc:0.995588	eval-auc:0.972488
[21]	train-auc:0.995763	eval-auc:0.972991
[22]	train-auc:0.995917	eva

1994941


Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.905489	eval-auc:0.882100
[1]	train-auc:0.923888	eval-auc:0.894649
[2]	train-auc:0.940800	eval-auc:0.907153
[3]	train-auc:0.955279	eval-auc:0.919073
[4]	train-auc:0.967150	eval-auc:0.929489
[5]	train-auc:0.975953	eval-auc:0.937862
[6]	train-auc:0.981935	eval-auc:0.944365
[7]	train-auc:0.985824	eval-auc:0.949302
[8]	train-auc:0.988345	eval-auc:0.953122
[9]	train-auc:0.990039	eval-auc:0.956136
[10]	train-auc:0.991258	eval-auc:0.958582
[11]	train-auc:0.992186	eval-auc:0.960624
[12]	train-auc:0.992918	eval-auc:0.962335
[13]	train-auc:0.993510	eval-auc:0.963797
[14]	train-auc:0.993995	eval-auc:0.965032
[15]	train-auc:0.994400	eval-auc:0.966087
[16]	train-auc:0.994743	eval-auc:0.966999
[17]	train-auc:0.995036	eval-auc:0.967783
[18]	train-auc:0.995290	eval-auc:0.968441
[19]	train-auc:0.995510	eval-auc:0.968989
[20]	train-auc:0.995703	eval-auc:0.969463
[21]	train-auc:0.995872	eval-auc:0.969877
[22]	train-auc:0.996021	eva

In [68]:
cv_score = roc_auc_score(y, y_pred_val)
print scores
print cv_score

[0.97575685437609661, 0.97841132487980453, 0.98365345378765112, 0.97643218788840458, 0.98469188978061739, 0.97728595035590771, 0.98291012330022109, 0.97698526502478367, 0.97815203372628667, 0.97242927495200027]
0.978080452044


In [69]:
def ensemble_preds(preds, scores, w=None):
    # preds: numpy array (n, m, k), n: samples, m: classes, k: models
    # scores: numpy array
    # w: 0, None -> mean
    # w==1: weighted by score
    # w==2: weighted by rank
    if not w or w==0:
        return preds.sum(axis=1)/preds.shape[1]
    
    if w==1:
        tmp = np.zeros(preds.shape)
        
        for i in range(preds.shape[1]):
            tmp[:,i] = preds[:,i] * (1/scores[i])
            
        return tmp.sum(axis=1) / np.divide(1,scores).sum()
    
    if w==2:
        w = pd.Series(scores).rank(ascending=False)
        
        tmp = np.zeros(preds.shape)
        
        for i in range(preds.shape[1]):
            tmp[:,i] = preds[:,i] * w[i]
            
        return tmp.sum(axis=1) / w.sum()

In [84]:
print scores
keep = [2,4,6,1,8]
scores = [scores[i] for i in keep]
y_pred_test_bag = []

[0.97575685437609661, 0.97841132487980453, 0.98365345378765112, 0.97643218788840458, 0.98469188978061739, 0.97728595035590771, 0.98291012330022109, 0.97698526502478367, 0.97815203372628667, 0.97242927495200027]


[0.98365345378765112,
 0.98469188978061739,
 0.98291012330022109,
 0.97841132487980453,
 0.97815203372628667]

In [88]:
y_pred_test.shape

(498687, 10)

In [89]:
tmp = np.zeros((y_pred_test_bag.shape[0],5))
for i,j in enumerate(keep):
    tmp[:,i] = y_pred_test[:,j]

In [90]:
y_pred_test_bag = ensemble_preds(tmp, scores, 2)

In [91]:
y_pred_test_bag.shape

(498687,)

### Combine with predictions based on leak

In [92]:
# Load predictions based on data leak
not_leak_value = 0.123456
y_pred_leak = pd.read_csv('leak_abuse_V1_lb_0.987471.csv')
y_pred_leak.head()

Unnamed: 0,activity_id,outcome
0,act2_4043386,0.95
1,act2_220464,0.95
2,act2_2569785,0.95
3,act2_1930380,0.95
4,act2_2186609,0.95


In [93]:
# Add test estimates to test cv set
test_samples = pd.read_csv(cv_set_test)
test_samples['outcome_model']=y_pred_test_bag
test_samples.head()

Unnamed: 0,sample_nr,activity_id,outcome_model
0,0,act1_249281,0.095398
1,1,act2_230855,0.099914
2,2,act1_240724,0.435272
3,3,act1_83552,0.435375
4,4,act2_1043301,0.435198


In [94]:
y_pred_leak=y_pred_leak.merge(test_samples, how='left', on='activity_id')
y_pred_leak.head()

Unnamed: 0,activity_id,outcome,sample_nr,outcome_model
0,act2_4043386,0.95,262581,0.435225
1,act2_220464,0.95,262571,0.435144
2,act2_2569785,0.95,262575,0.435144
3,act2_1930380,0.95,262565,0.435132
4,act2_2186609,0.95,262569,0.435179


In [95]:
pred_no_leak_value = y_pred_leak.outcome==not_leak_value
y_pred_leak.loc[pred_no_leak_value,'outcome'] = y_pred_leak[pred_no_leak_value].outcome_model

In [96]:
y_pred_leak.sort_values('activity_id').head()

Unnamed: 0,activity_id,outcome,sample_nr,outcome_model
363626,act1_1,1.0,240682,0.432874
268409,act1_100006,0.0,79698,6.2e-05
378,act1_100050,0.203541,358220,0.203541
354234,act1_100065,0.0,59778,2.8e-05
301036,act1_100068,0.0,117803,3.6e-05


In [97]:
kag = KaggleResult(y_pred_leak[['activity_id','outcome']], None, cv_score, 'XGBoost model V0 - StratifiedKfold-n=10', dir_out)

In [98]:
print kag.validate()

(True, 'all_ok')


In [99]:
kag.upload()

0.989684