In [1]:
import pickle
import pandas as pd
import numpy as np
import gzip
import datetime
from itertools import product
from scipy import interpolate ## For other interpolation functions.
import time

import xgboost as xgb

import sklearn.metrics
import sklearn.utils

from sklearn.cross_validation import LabelKFold

import copy

import sklearn.linear_model
from sklearn.metrics import roc_auc_score

%matplotlib inline
import matplotlib.pyplot as plt



In [2]:
with gzip.open('merged7.pkl.gz', 'rb') as fd:
    data = pickle.load(fd)

with gzip.open('cvleak7-10fold.pkl.gz', 'rb') as fd:
    cvleak = pickle.load(fd)
    
data = pd.merge(data, cvleak, on='activity_id', how='left')

with gzip.open('dproc7.pkl.gz', 'rb') as fd:
    extra = pickle.load(fd)

data = pd.merge(data, extra, on='activity_id', how='left')

if True: # Wasteful to recompute a constant every time
    mindate = pd.Timestamp('2022-07-17 00:00:00')
    maxdate = pd.Timestamp('2023-08-31 00:00:00')
    minpdate = pd.Timestamp('2020-05-18 00:00:00')
else:
    mindate = min(data['date'])
    maxdate = max(data['date'])
    minpdate = min(data['pdate'])
    

In [3]:
df_gpreds = pickle.load(open('group3d-xgb-preds.pkl', 'rb'))
data = pd.merge(data, df_gpreds, on='group_1', how='left')

In [4]:
# data preproc patches

# convert adate_gap nan's to -1 for xgb
data.adate_gap.fillna(-1, inplace=True)

In [5]:
# get adate range maybefeature

ad = []
for g in data.groupby('achar_10'):
    ad.append([g[0], g[1].adate_daynum.min(), g[1].adate_daynum.max(), g[1].adate_daynum.max() - g[1].adate_daynum.min(), len(g[1]), g[1].outcome.sum(), g[1].outcome.mean()])

df_ad = pd.DataFrame(ad)

df_ad.columns = ['achar_10', 'achar_10_adate_min', 'achar_10_adate_max', 'achar_10_adate_range', 'achar_10_sum', 'achar_10_sum_outcome', 'achar_10_outcome_mean']

# Set achar_10 == -1 to -1 (fixme?)
df_ad.loc[0, ['achar_10_adate_min']] = -1
df_ad.loc[0, ['achar_10_adate_range']] += 1

df_adr = df_ad[['achar_10', 'achar_10_adate_range', 'achar_10_adate_min', 'achar_10_adate_max']]

data = pd.merge(data, df_adr, on='achar_10', how='left')

In [6]:
data_orig = data.copy()

In [7]:
procs = {}
procs['gavg_pc_4_eq24'] = []
procs['gavg_pc_2_eq0'] = []
for g in data.groupby('group_1'):
    procs['gavg_pc_2_eq0'].append([g[0], np.mean(g[1]['pchar_2'] == 0)])
    procs['gavg_pc_4_eq24'].append([g[0], np.mean(g[1]['pchar_4'] == 24)])

In [8]:
procs

{'gavg_pc_2_eq0': [[1, 1.0],
  [2, 0.24210526315789474],
  [3, 0.36170212765957449],
  [4, 0.20000000000000001],
  [5, 0.58396946564885499],
  [6, 0.041095890410958902],
  [7, 0.5],
  [8, 0.12796208530805686],
  [9, 0.125],
  [10, 1.0],
  [11, 0.22222222222222221],
  [12, 1.0],
  [13, 0.081081081081081086],
  [14, 1.0],
  [15, 1.0],
  [17, 0.11392405063291139],
  [18, 0.35950413223140498],
  [20, 0.5],
  [21, 0.44444444444444442],
  [24, 0.23999999999999999],
  [25, 0.20689655172413793],
  [26, 0.43783783783783786],
  [27, 0.72357723577235777],
  [28, 0.14035087719298245],
  [29, 0.52083333333333337],
  [30, 0.092105263157894732],
  [31, 0.18248175182481752],
  [32, 0.98936170212765961],
  [33, 0.066666666666666666],
  [34, 0.28770949720670391],
  [35, 0.4943820224719101],
  [36, 0.10897435897435898],
  [37, 0.32643678160919543],
  [38, 0.49230769230769234],
  [39, 0.51360174102285094],
  [40, 0.054545454545454543],
  [41, 0.012195121951219513],
  [42, 0.012500000000000001],
  [44, 0.5

In [9]:
for k in procs:
    print(k)
    df_tmp = pd.DataFrame(procs[k])
    df_tmp.columns = ['group_1', k]
    
    data = pd.merge(data, df_tmp, on='group_1', how='left')

gavg_pc_2_eq0
gavg_pc_4_eq24


In [10]:
data['achar_10_adate_pos'] = (data.adate_daynum - data.achar_10_adate_min) / data.achar_10_adate_range

In [11]:
data['leak_fillmask'] = data['outcome_filled'].isnull()
data.leak_fillmask.sum()

409834

In [12]:
write_dreduct = False

try:
    dreduct = pickle.load(open('dreduct7h.pkl', 'rb'))
except:
    dreduct = {}
    write_dreduct = True

In [13]:
# fixme?  onehot-values is still used as target fields
do_oneheat = ['adate_dayofweek', 'activity_category', 'group_1_bin']
dont_oneheat = ['pchar_38', 'achar_10', 'achar_10_reduced'] # *char* is heated unless listed here

onehot_values = {}
tot = 0

for k in data.keys():
    if k in dont_oneheat:
        continue
        
    if 'achar_10' in k: # block all achar_10 derivatives
        continue
        
    if 'char' in k or k in do_oneheat:
        onehot_values[k] = sorted(data[k].unique())
        tot += len(onehot_values[k])
        
print(tot, 'possible columns post-heating')

368 possible columns post-heating


In [14]:
# ALL test targets with 0.0/1.0 average are fully/accurately inferred in the test set... 

if 'grouplist' not in dreduct:

    dreduct['grouplist_all0'] = []
    dreduct['grouplist_all1'] = []
    dreduct['grouplist'] = []
    count = 0

    for g in data.groupby(['group_1'], sort=False):
        if len(g[1]) > 100:
            m = g[1].outcome.mean()
            if m != 0 and m != 1:
                #print(g[0], len(g[1]), g[1].outcome.mean())
                dreduct['grouplist'].append(g[0])

                count += len(g[1])
            elif m == 0:
                dreduct['grouplist_all0'].append(g[0])
            elif m == 1:
                dreduct['grouplist_all1'].append(g[0])
        else:
            dreduct['grouplist'].append(g[0])

            count += len(g[1])

    print(len(dreduct['grouplist']), len(dreduct['grouplist_all0']), len(dreduct['grouplist_all1']), count)

In [15]:
def split_traintest():
    testset = np.where(data['outcome'].isnull())
    trainset = np.where(~data['outcome'].isnull())

    return trainset, testset, data.iloc[trainset], data.iloc[testset]

trainset, testset, train, test = split_traintest()

print(len(train), len(test), len(data))

if 'trainmask' not in dreduct:
    mask = np.full(len(train), False, dtype=np.bool)
    for g in dreduct['grouplist_all0']:
        mask = np.logical_or(mask, train.group_1 == g)
    for g in dreduct['grouplist_all1']:
        mask = np.logical_or(mask, train.group_1 == g)
        
    dreduct['trainmask'] = mask
    
mask = dreduct['trainmask'].copy()
    
print(np.sum(mask))

# Need to copy since we're going to add another column
train_cut = train.iloc[np.where(np.logical_not(mask))].copy()

cols = train.columns.copy()
cols = cols.drop('activity_id')
train_dups = train_cut.duplicated(subset=cols)

train_cut_dedup = train_cut[~train_dups]
train_cut_dups = train_cut[train_dups]

train_cut_dedup_leaks = train_cut_dedup.iloc[np.where(train_cut_dedup.leak_fillmask.values)]
len(train_cut_dedup_leaks)

2197291 498687 2695978
1454634


158242

In [16]:
from operator import itemgetter

stratkey = 'people_id'

if 'pplbuckets' not in dreduct:
    balls = []
    for p in train_cut_dedup.groupby([stratkey]):
        balls.append([p[0], len(p[1]), p[1].outcome.mean()])

    dreduct['pplbuckets'] = sorted(balls, key=itemgetter(2), reverse=True)
else:
    balls = dreduct['pplbuckets'].copy()

In [17]:
if write_dreduct:
    pickle.dump(dreduct, open('dreduct7h.pkl', 'wb'))

In [18]:
# This assumes vc is sorted by whatever you want stratified
def dosplit_rr(df, vc, folds, fuzz = (43254, .5, 4)):
    if fuzz is not None:
        np.random.seed(fuzz[0])
    
    bcount = np.zeros(folds)
    
    buckets = []
    for f in range(folds):
        buckets.append([])
    
    ballpit = copy.deepcopy(balls)
    
    runs = 0
    
    tot = 0
    
    while len(ballpit):
        runs += 1
        sel = 0
        r = np.random.rand()
        if r < fuzz[1]:
            sel = int((fuzz[2] / fuzz[1]) * r)
            if sel >= len(ballpit):
                sel = len(ballpit) - 1

        tot += sel
                
        v = ballpit[sel]
        del ballpit[sel]
                
        selbucket = np.argsort(bcount)[0]
        
        buckets[selbucket].append(v[0])
        bcount[selbucket] += v[1]
    
    print(len(balls), runs, tot)
    
    return buckets
    

In [19]:
folds = 4

source = train_cut_dedup
pidsets_grouped = dosplit_rr(source, balls, folds, fuzz = (123456, 0.25, 4))

cv_train = []
cv_train_leak = []

cv_val = []
cv_val_leak = []
cv_val_dups = []

pu = []

for p in pidsets_grouped:
    cv_train.append(source[~source[stratkey].isin(p)])
    cv_val.append(source[source[stratkey].isin(p)].copy()) # copy val since we need to add a field for unique groups
    
    pu.append(list(cv_val[-1][stratkey].unique()))

    cv_train_leak.append(cv_train[-1].iloc[np.where(cv_train[-1].leak_fillmask.values)])
    cv_val_leak.append(cv_val[-1].iloc[np.where(cv_val[-1].leak_fillmask.values)].copy())
    
    # This is directly from train_cut, bypassing duplicate and leak (but not skewed-group) detection
    cv_val_dups.append(train_cut[train_cut[stratkey].isin(p)])


53036 53036 19603


In [20]:
# Optional test to compute which cv_val items have unique group_1's
if False:
    for fold in range(folds):

        groups_train = sorted(cv_train[fold].group_1.unique())
        groups_val = sorted(cv_val_leak[fold].group_1.unique())

        ugroup_mask = np.full(len(cv_val_leak[fold]), False, dtype=np.bool)

        cc = 0
        cl = []
        for g in cv_val[fold].groupby(['group_1']):
            if g[0] not in groups_train:
                cl.append(g[0])
                cc += len(g[1])

                ugroup_mask = np.logical_or(ugroup_mask, cv_val_leak[fold].group_1 == g[0])

        cv_val_leak[fold]['ugroup_mask'] = ugroup_mask
        
        print(fold, cv_val_leak[fold].ugroup_mask.sum(), len(cv_val_leak[fold]))

# 0 38542 39699
# 1 38630 40117
# 2 37681 39041
# 3 37402 38408
# ^ cv_val_leak results (xgb6 LB.992 run) - group_1 is almost ENTIRELY unique in xgb's eval set.  LEAKAGE!?

In [21]:
# ppl_370270/'group 27940' is 55k of *mostly* 0's, any bad prediction in it kills CV!

for f in range(0, folds):
    if (370270 in pu[f]):
        cv_val_dups[f] = cv_val_dups[f][cv_val_dups[f]['people_id'] != 370270]

    if 'group 27940' in cv_val_dups[f]['group_1'].unique():
        cv_val_dups[f] = cv_val_dups[f][cv_val_dups[f]['group_1'] != 'group 27940']
        
    print(len(cv_val[f]), cv_val[f].outcome.mean())
    print(f, len(cv_val_leak[f]), cv_val_leak[f].outcome.mean(), len(cv_val_dups[f]), cv_val_dups[f].outcome.mean())

108850 0.6027101515847496
0 40120 0.46061814556331004 170989 0.5993017094666908
108850 0.6026917776757005
1 37933 0.4507948224501094 171020 0.5941410361361245
108849 0.6026789405506711
2 39747 0.44247364580974663 176491 0.6191193885240607
108879 0.6025220657794432
3 40442 0.45398348251817416 170489 0.5993641818533747




In [24]:
from scipy.sparse import csr_matrix, hstack

noise = .095
q = 20

def buildmatrix(df, oneheat=False, linear=False):
    matrices = []    
    rows = []
    
    rows.append(df.pchar_38.values)
#    rows.append(np.busday_count(df.pdate.values.astype('datetime64[D]'), 
#                         df.adate.values.astype('datetime64[D]')))
 
    rows.append(df.business_days_delta.values)
    
    rows.append(df.people_per_group.values)
    
    if linear:
        #rows.append(df.people_per_group_adate.values / df.people_per_group.values)
        rows.append(df.events_per_group_adate.values / df.people_per_group_adate.values)
    
    if not linear:
        rows.append(df.group_1.values)
        rows.append(df.achar_10.values)
        
    rows.append(df.achar_10_adate_range.values)
    
    rows.append(df.gp_all0.values)
    rows[-1] += ((np.random.rand(len(rows[-1])) - .5) * noise)
    rows[-1] = ((rows[-1] * (100 * q)) + (q / 2)) // 100 / q
    
    rows.append(df.gp_all1.values)
    rows[-1] += ((np.random.rand(len(rows[-1])) - .5) * noise)
    rows[-1] = ((rows[-1] * (100 * q)) + (q / 2)) // 100 / q
    
    rows.append(df.gp_mixed.values)
    rows[-1] += ((np.random.rand(len(rows[-1])) - .5) * noise)
    rows[-1] = ((rows[-1] * (100 * q)) + (q / 2)) // 100 / q

    # Early linear analysis (xgb7h) said this has a .015 rcorr^2 when binned like this
    tmp = df.achar_10_adate_pos.values
    tmp = np.floor(tmp * 1000) / 1000
    tmp[np.isnan(tmp)] = -1
    rows.append(tmp)
        
    rows.append(df.adate_daynum.values)
    
    if not linear:
        tmp = df.adate_gap.values
        tmp[np.isnan(tmp)] = df.adate_daynum.values[np.isnan(tmp)]
        rows.append(tmp) # TODO: replace nan's for linear model
        
    rows.append(df.pdate_daynum.values)
    
    '''
    mask = df.people_per_group == 1
    mask = np.logical_and(mask, df.activity_category <= 4)
    mask = np.logical_and(mask, df.pchar_25 == 0)
    #flist.append('Xlowerprob_mask')
    rows.append(np.int8(mask))
    
    #mask = np.logical_and(mask, df.group_1 < 17871.5)
    mask2 = np.logical_and(mask, df.pchar_4 == 0)
    #flist.append('Xlowprob_mask')
    rows.append(np.int8(mask2))

    mask3 = np.logical_and(mask, df.group_1 < 17871.5)
    #flist.append('Xlowprob_mask2')
    rows.append(np.int8(mask3))
    '''
    
    rows.append(df.gavg_pc_2_eq0)
    rows.append(df.gavg_pc_4_eq24)


#    rows.append(df.outcome_filled_prevday.values)
#    rows.append(df.outcome_filled_nextday.values)

    matrices.append(csr_matrix(np.array(rows).T))
    rows = []

    if oneheat or linear:
        curitem = None
        for i in range(len(onehot_keys)):
            k = onehot_keys[i]

            if k[0] != curitem:
                usedmask = np.full(len(df), False, dtype=np.bool)
                curitem = k[0]

            if k[1] != None:
                rows.append(np.array(df[k[0]] == k[1]))
                usedmask[np.where(df[k[0]] == k[1])] = True
            else:
                rows.append(~usedmask)

            if len(rows) >= 128:
                matrices.append((np.array(rows).T))
                #print(len(matrices))
                rows = []

        if len(rows) > 0:
            matrices.append((np.array(rows).T))
            rows = []

        rv = hstack(matrices, format='csr')
        print(rv.shape)
        return rv
    else:
        for k in onehot_values.keys():
            tmp = df[k].values.astype(np.float64)
            #if np.sum(tmp[tmp == -1]):
                #print(k, np.min(tmp))
            #tmp[tmp == -1] = np.nan
            matrices.append(csr_matrix(tmp.reshape(len(df), 1)))
            
        #matrices = matrices.todense()
        #matrices.append(np.array(rows).T)
        return hstack(matrices)
        
        #output = np.hstack(matrices)
        #output[np.isnan(output)] = -1

        #return output

In [25]:
mat_train = {}
mat_val = {}
mat_val_dups = {}
dtrain = {}
dval = {}
dval_dups = {}

val_source = cv_val_leak

dooneheat = False

np.random.seed(0)

for f in range(folds):
    print(f)
    
    mat_train[f] = buildmatrix(cv_train_leak[f], dooneheat)
    dtrain[f] = xgb.DMatrix(mat_train[f], label=cv_train_leak[f].outcome.values, missing=-1)

    mat_val[f] = buildmatrix(val_source[f], dooneheat)
    dval[f] = xgb.DMatrix(mat_val[f], label=val_source[f].outcome.values, missing=-1)

    mat_val_dups[f] = buildmatrix(cv_val_dups[f], dooneheat)
    dval_dups[f] = xgb.DMatrix(mat_val_dups[f], label=cv_val_dups[f].outcome.values, missing=-1)
    
mat_test = buildmatrix(test, False)
dtest = xgb.DMatrix(mat_test, missing=np.nan)


0
1
2
3


In [26]:
# XXX: need way to pass this through?
curfold = 0

def feval_procleak(yhat, y):
    if (len(yhat) != len(cv_val_dups[curfold])):
        return "auc", sklearn.metrics.roc_auc_score(y.get_label(), yhat)
    
    yhat_f = yhat.copy()
    
    locs = np.where(~cv_val_dups[curfold].leak_fillmask)
    yhat_f[locs] = cv_val_dups[curfold].outcome_filled.values[locs]
    
    return "auc", sklearn.metrics.roc_auc_score(y.get_label(), yhat_f)
    #return "auc", get_leakpreds(curfold, yhat)


In [27]:
def build_dfpreds(preds, cv_df):
    cv_val_preds = []
    for f in range(folds):
        cv_val_preds.append(cv_df[f][['activity_id', 'outcome', 'outcome_filled', 'outcome_filled_nona', 'leak_fillmask']].copy())

        cv_val_preds[f]['pred_outcome'] = preds[f]
        cv_val_preds[f]['pred_outcomel'] = preds[f]
        
        mask = np.where(~cv_val_preds[f].leak_fillmask)
        cv_val_preds[f]['pred_outcomel'].values[mask] = cv_df[f]['outcome_filled'].values[mask]
        
        print(f,
              sklearn.metrics.roc_auc_score(cv_val_preds[f]['outcome'].values, cv_val_preds[f]['pred_outcome'].values),
              sklearn.metrics.roc_auc_score(cv_val_preds[f]['outcome'].values, cv_val_preds[f]['pred_outcomel'].values))
        
    output = pd.concat(cv_val_preds)
    
    print(sklearn.metrics.roc_auc_score(output['outcome'].values, output['pred_outcome'].values),
          sklearn.metrics.roc_auc_score(output['outcome'].values, output['pred_outcomel'].values))
    
    return output
    

def build_dfpreds_xgb(bst, cv_mat = dval_dups, cv_df = cv_val_dups):
    preds = []
    for f in range(folds):
        print(bst[f].attributes())
        try:
            preds.append(bst[f].predict(cv_mat[f], ntree_limit=bst[f].best_ntree_limit))
        except:
            preds.append(bst[f].predict(cv_mat[f]))
            preds[-1] = preds[-1].clip(0.001, .999)

    df_preds = build_dfpreds(preds, cv_df)
    return preds, df_preds

In [28]:

param = {'max_depth':10, 'eta':0.01, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.5
param['colsample_bytree']= 0.3
param['min_child_weight'] = 1
param['max_depth'] = 5
param['booster'] = "gbtree"
param['seed'] = 12345

bst_d5 = {}
#bst_linear = {} # optional

for curfold in range(folds):
#for curfold in [2]:
    #watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    num_round = 2000
    early_stopping_rounds=500
    bst_d5[curfold] = xgb.train(param, dtrain[curfold], num_round, watchlist,
                       feval = feval_procleak,
                       early_stopping_rounds=early_stopping_rounds, 
                       verbose_eval=50)

[0]	train-auc:0.733013	eval_dups-auc:0.939142	eval-auc:0.737786
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 500 rounds.
[50]	train-auc:0.767088	eval_dups-auc:0.941211	eval-auc:0.748131
[100]	train-auc:0.7723	eval_dups-auc:0.941378	eval-auc:0.749035
[150]	train-auc:0.777294	eval_dups-auc:0.941512	eval-auc:0.749614
[200]	train-auc:0.782055	eval_dups-auc:0.941617	eval-auc:0.750082
[250]	train-auc:0.785827	eval_dups-auc:0.942	eval-auc:0.7507
[300]	train-auc:0.79047	eval_dups-auc:0.942184	eval-auc:0.750578
[350]	train-auc:0.794643	eval_dups-auc:0.942261	eval-auc:0.750454
[400]	train-auc:0.799153	eval_dups-auc:0.942285	eval-auc:0.750547
[450]	train-auc:0.8039	eval_dups-auc:0.942189	eval-auc:0.74995
[500]	train-auc:0.807723	eval_dups-auc:0.942218	eval-auc:0.750023
[550]	train-auc:0.811819	eval_dups-auc:0.942228	eval-auc:0.750094
[600]	train-auc:0.815989	eval_dups-auc:0.942248	eval-auc:0.750313
[650]	train-au

In [39]:

param = {'max_depth':10, 'eta':0.01, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.5
param['colsample_bytree']= 0.3
param['min_child_weight'] = 2
param['max_depth'] = 6
param['booster'] = "gbtree"
param['seed'] = 12343

bst_d6 = {}
#bst_linear = {} # optional

for curfold in range(folds):
#for curfold in [2]:
    #watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    num_round = 2000
    early_stopping_rounds=200
    bst_d6[curfold] = xgb.train(param, dtrain[curfold], num_round, watchlist,
                       feval = feval_procleak,
                       early_stopping_rounds=early_stopping_rounds, 
                       verbose_eval=50)

[0]	train-auc:0.737526	eval_dups-auc:0.938145	eval-auc:0.731617
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[50]	train-auc:0.788406	eval_dups-auc:0.940761	eval-auc:0.74626
[100]	train-auc:0.795707	eval_dups-auc:0.9412	eval-auc:0.74806
[150]	train-auc:0.799123	eval_dups-auc:0.94154	eval-auc:0.749939
[200]	train-auc:0.804308	eval_dups-auc:0.941707	eval-auc:0.750647
[250]	train-auc:0.810384	eval_dups-auc:0.942025	eval-auc:0.751326
[300]	train-auc:0.81622	eval_dups-auc:0.942224	eval-auc:0.751077
[350]	train-auc:0.82219	eval_dups-auc:0.94229	eval-auc:0.75118
[400]	train-auc:0.828129	eval_dups-auc:0.9423	eval-auc:0.751063
[450]	train-auc:0.833679	eval_dups-auc:0.942325	eval-auc:0.750974
Stopping. Best iteration:
[253]	train-auc:0.810616	eval_dups-auc:0.942062	eval-auc:0.751395

[0]	train-auc:0.728005	eval_dups-auc:0.942793	eval-auc:0.726518
Multiple eval metrics have been passed: 'eval-auc' will

In [40]:
#m4 run r
preds_d6, df_preds_d6 = build_dfpreds_xgb(bst_d6)

{'best_msg': '[253]\ttrain-auc:0.810616\teval_dups-auc:0.942062\teval-auc:0.751395', 'best_score': '0.751395', 'best_iteration': '253'}
{'best_msg': '[713]\ttrain-auc:0.852334\teval_dups-auc:0.948172\teval-auc:0.759635', 'best_score': '0.759635', 'best_iteration': '713'}
{'best_msg': '[343]\ttrain-auc:0.835905\teval_dups-auc:0.950797\teval-auc:0.747266', 'best_score': '0.747266', 'best_iteration': '343'}
{'best_msg': '[288]\ttrain-auc:0.828604\teval_dups-auc:0.94481\teval-auc:0.737097', 'best_score': '0.737097', 'best_iteration': '288'}
0 0.752733843583 0.942062045934
1 0.752814005632 0.948171670651
2 0.759309729488 0.950797467125
3 0.757924213556 0.94480979979
0.753434939162 0.946454723934


In [41]:
#m4 run two
preds_d5, df_preds_d5 = build_dfpreds_xgb(bst_d5)

{'best_msg': '[276]\ttrain-auc:0.788269\teval_dups-auc:0.942166\teval-auc:0.750867', 'best_score': '0.750867', 'best_iteration': '276'}
{'best_msg': '[993]\ttrain-auc:0.837044\teval_dups-auc:0.948053\teval-auc:0.757864', 'best_score': '0.757864', 'best_iteration': '993'}
{'best_msg': '[226]\ttrain-auc:0.794891\teval_dups-auc:0.950793\teval-auc:0.747106', 'best_score': '0.747106', 'best_iteration': '226'}
{'best_msg': '[312]\ttrain-auc:0.803766\teval_dups-auc:0.944858\teval-auc:0.73771', 'best_score': '0.73771', 'best_iteration': '312'}
0 0.754007781301 0.942165788045
1 0.751606304823 0.948052563567
2 0.762290584249 0.950792528444
3 0.758382898324 0.944857671069
0.754353655316 0.946393075867


In [28]:
#m3
preds_d5, df_preds_d5 = build_dfpreds_xgb(bst_d5)

{'best_score': '0.753615', 'best_iteration': '752', 'best_msg': '[752]\ttrain-auc:0.845992\teval_dups-auc:0.94264\teval-auc:0.753615'}
{'best_score': '0.757821', 'best_iteration': '954', 'best_msg': '[954]\ttrain-auc:0.853251\teval_dups-auc:0.948177\teval-auc:0.757821'}
{'best_score': '0.743711', 'best_iteration': '141', 'best_msg': '[141]\ttrain-auc:0.797326\teval_dups-auc:0.950185\teval-auc:0.743711'}
{'best_score': '0.736786', 'best_iteration': '213', 'best_msg': '[213]\ttrain-auc:0.804843\teval_dups-auc:0.944807\teval-auc:0.736786'}
0 0.752279284087 0.942639702175
1 0.753547690921 0.948177084922
2 0.76067852258 0.950185196789
3 0.759599575665 0.944806969918
0.747293194592 0.946151591414


In [41]:
#m2
preds_d5, df_preds_d5 = build_dfpreds_xgb(bst_d5)

{'best_msg': '[574]\ttrain-auc:0.83139\teval_dups-auc:0.941867\teval-auc:0.748414', 'best_iteration': '574', 'best_score': '0.748414'}
{'best_msg': '[1474]\ttrain-auc:0.883595\teval_dups-auc:0.947402\teval-auc:0.753172', 'best_iteration': '1474', 'best_score': '0.753172'}
{'best_msg': '[457]\ttrain-auc:0.823708\teval_dups-auc:0.949297\teval-auc:0.740588', 'best_iteration': '457', 'best_score': '0.740588'}
{'best_msg': '[238]\ttrain-auc:0.805222\teval_dups-auc:0.944347\teval-auc:0.735669', 'best_iteration': '238', 'best_score': '0.735669'}
0 0.748735285604 0.941867143597
1 0.751936029323 0.947402056114
2 0.765906384995 0.949296834144
3 0.760626521775 0.944346827339
0.753020170928 0.945669475544


In [48]:
#m1
preds_d5, df_preds_d5 = build_dfpreds_xgb(bst_d5)

{'best_iteration': '1256', 'best_score': '0.745132', 'best_msg': '[1256]\ttrain-auc:0.894881\teval_dups-auc:0.941498\teval-auc:0.745132'}
{'best_iteration': '1999', 'best_score': '0.753396', 'best_msg': '[1999]\ttrain-auc:0.921602\teval_dups-auc:0.947338\teval-auc:0.753396'}
{'best_iteration': '811', 'best_score': '0.73237', 'best_msg': '[811]\ttrain-auc:0.865569\teval_dups-auc:0.948136\teval-auc:0.73237'}
{'best_iteration': '1043', 'best_score': '0.719873', 'best_msg': '[1043]\ttrain-auc:0.887005\teval_dups-auc:0.942731\teval-auc:0.719873'}
0 0.739538654098 0.941498369384
1 0.737301126407 0.947338234202
2 0.742346539954 0.948136152364
3 0.739452499588 0.942730770974
0.738588420596 0.944967708879


In [27]:
#m2
preds_d5, df_preds_d5 = build_dfpreds_xgb(bst_d5)

{'best_msg': '[746]\ttrain-auc:0.858042\teval_dups-auc:0.941484\teval-auc:0.746186', 'best_iteration': '746', 'best_score': '0.746186'}
{'best_msg': '[239]\ttrain-auc:0.805788\teval_dups-auc:0.947217\teval-auc:0.752327', 'best_iteration': '239', 'best_score': '0.752327'}
{'best_msg': '[3]\ttrain-auc:0.775678\teval_dups-auc:0.948996\teval-auc:0.740552', 'best_iteration': '3', 'best_score': '0.740552'}
{'best_msg': '[479]\ttrain-auc:0.831583\teval_dups-auc:0.944428\teval-auc:0.736211', 'best_iteration': '479', 'best_score': '0.736211'}
0 0.753149331827 0.941484422522
1 0.752985442728 0.947216783493
2 0.758381677143 0.948996381774
3 0.75686377369 0.944428114792
0.728084245327 0.942640812557


In [39]:

param = {'max_depth':10, 'eta':0.01, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.7
param['colsample_bytree']= 0.6
param['min_child_weight'] = 1
param['max_depth'] = 6
param['booster'] = "gbtree"
param['seed'] = 12345

bst_d6 = {}
#bst_linear = {} # optional

for curfold in range(folds):
#for curfold in [2]:
    #watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    num_round = 2000
    early_stopping_rounds=500
    bst_d6[curfold] = xgb.train(param, dtrain[curfold], num_round, watchlist,
                       feval = feval_procleak,
                       early_stopping_rounds=early_stopping_rounds, 
                       verbose_eval=50)

[0]	train-auc:0.731485	eval_dups-auc:0.934826	eval-auc:0.704509
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 500 rounds.
[50]	train-auc:0.802087	eval_dups-auc:0.939731	eval-auc:0.736787
[100]	train-auc:0.812199	eval_dups-auc:0.939783	eval-auc:0.736208
[150]	train-auc:0.822546	eval_dups-auc:0.939908	eval-auc:0.736826
[200]	train-auc:0.832695	eval_dups-auc:0.940276	eval-auc:0.73744
[250]	train-auc:0.842626	eval_dups-auc:0.94048	eval-auc:0.738799
[300]	train-auc:0.852093	eval_dups-auc:0.940775	eval-auc:0.73987
[350]	train-auc:0.860716	eval_dups-auc:0.940863	eval-auc:0.740573
[400]	train-auc:0.869673	eval_dups-auc:0.941086	eval-auc:0.742302
[450]	train-auc:0.876158	eval_dups-auc:0.941224	eval-auc:0.743061
[500]	train-auc:0.882002	eval_dups-auc:0.94132	eval-auc:0.743588
[550]	train-auc:0.887751	eval_dups-auc:0.941451	eval-auc:0.74454
[600]	train-auc:0.893511	eval_dups-auc:0.941513	eval-auc:0.745156
[650]	tr

In [53]:

param = {'max_depth':10, 'eta':0.01, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.7
param['colsample_bytree']= 0.6
param['min_child_weight'] = 1
param['max_depth'] = 5
param['booster'] = "gbtree"
param['seed'] = 12345

bst_d5 = {}
#bst_linear = {} # optional

for curfold in range(folds):
#for curfold in [2]:
    #watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    num_round = 2000
    early_stopping_rounds=500
    bst_d5[curfold] = xgb.train(param, dtrain[curfold], num_round, watchlist,
                       feval = feval_procleak,
                       early_stopping_rounds=early_stopping_rounds, 
                       verbose_eval=50)

[0]	train-auc:0.624567	eval_dups-auc:0.916111	eval-auc:0.574633
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 500 rounds.
[50]	train-auc:0.763341	eval_dups-auc:0.936094	eval-auc:0.715502
[100]	train-auc:0.771242	eval_dups-auc:0.9368	eval-auc:0.719805
[150]	train-auc:0.778353	eval_dups-auc:0.937369	eval-auc:0.723115
[200]	train-auc:0.785659	eval_dups-auc:0.937785	eval-auc:0.725018
[250]	train-auc:0.793319	eval_dups-auc:0.9382	eval-auc:0.72675
[300]	train-auc:0.799847	eval_dups-auc:0.938669	eval-auc:0.728383
[350]	train-auc:0.805494	eval_dups-auc:0.938929	eval-auc:0.729526


KeyboardInterrupt: 

In [48]:
#m1
preds_d5, df_preds_d5 = build_dfpreds_xgb(bst_d5)

{'best_iteration': '1256', 'best_score': '0.745132', 'best_msg': '[1256]\ttrain-auc:0.894881\teval_dups-auc:0.941498\teval-auc:0.745132'}
{'best_iteration': '1999', 'best_score': '0.753396', 'best_msg': '[1999]\ttrain-auc:0.921602\teval_dups-auc:0.947338\teval-auc:0.753396'}
{'best_iteration': '811', 'best_score': '0.73237', 'best_msg': '[811]\ttrain-auc:0.865569\teval_dups-auc:0.948136\teval-auc:0.73237'}
{'best_iteration': '1043', 'best_score': '0.719873', 'best_msg': '[1043]\ttrain-auc:0.887005\teval_dups-auc:0.942731\teval-auc:0.719873'}
0 0.739538654098 0.941498369384
1 0.737301126407 0.947338234202
2 0.742346539954 0.948136152364
3 0.739452499588 0.942730770974
0.738588420596 0.944967708879


In [27]:
#m2
preds_d5, df_preds_d5 = build_dfpreds_xgb(bst_d5)

{'best_msg': '[746]\ttrain-auc:0.858042\teval_dups-auc:0.941484\teval-auc:0.746186', 'best_iteration': '746', 'best_score': '0.746186'}
{'best_msg': '[239]\ttrain-auc:0.805788\teval_dups-auc:0.947217\teval-auc:0.752327', 'best_iteration': '239', 'best_score': '0.752327'}
{'best_msg': '[3]\ttrain-auc:0.775678\teval_dups-auc:0.948996\teval-auc:0.740552', 'best_iteration': '3', 'best_score': '0.740552'}
{'best_msg': '[479]\ttrain-auc:0.831583\teval_dups-auc:0.944428\teval-auc:0.736211', 'best_iteration': '479', 'best_score': '0.736211'}
0 0.753149331827 0.941484422522
1 0.752985442728 0.947216783493
2 0.758381677143 0.948996381774
3 0.75686377369 0.944428114792
0.728084245327 0.942640812557


In [40]:
preds_d6, df_preds_d6 = build_dfpreds_xgb(bst_d6)

{'best_iteration': '1651', 'best_score': '0.746809', 'best_msg': '[1651]\ttrain-auc:0.948672\teval_dups-auc:0.941312\teval-auc:0.746809'}
{'best_iteration': '1679', 'best_score': '0.757333', 'best_msg': '[1679]\ttrain-auc:0.948609\teval_dups-auc:0.947664\teval-auc:0.757333'}
{'best_iteration': '981', 'best_score': '0.731136', 'best_msg': '[981]\ttrain-auc:0.91844\teval_dups-auc:0.947471\teval-auc:0.731136'}
{'best_iteration': '1999', 'best_score': '0.720694', 'best_msg': '[1999]\ttrain-auc:0.958877\teval_dups-auc:0.94241\teval-auc:0.720694'}
0 0.738310041721 0.941312147355
1 0.738740267928 0.947664493458
2 0.738601366685 0.947470504149
3 0.733135302538 0.942410227846
0.736510286003 0.944720903886


In [30]:
df_preds_d5.to_pickle('predscv-7m4r2-d5.pkl')

In [123]:
# xgb7
#preds_d5, df_preds_d5 = build_dfpreds_xgb(bst_d5)

{'best_iteration': '1384', 'best_msg': '[1384]\ttrain-auc:0.878529\teval_dups-auc:0.938891\teval-auc:0.73028', 'best_score': '0.73028'}
{'best_iteration': '1718', 'best_msg': '[1718]\ttrain-auc:0.890728\teval_dups-auc:0.946822\teval-auc:0.747487', 'best_score': '0.747487'}
{'best_iteration': '1654', 'best_msg': '[1654]\ttrain-auc:0.887692\teval_dups-auc:0.947503\teval-auc:0.730663', 'best_score': '0.730663'}
{'best_iteration': '1091', 'best_msg': '[1091]\ttrain-auc:0.864189\teval_dups-auc:0.942432\teval-auc:0.721049', 'best_score': '0.721049'}
0 0.622079375672 0.938890647326
1 0.628981694176 0.946821876618
2 0.599805157372 0.947502742127
3 0.60200131103 0.942431994643
0.612522320556 0.943952068017


In [126]:
#preds_lin, df_preds_lin = build_dfpreds_xgb(bstlin, ldval_dups, cv_val_dups)

{'best_iteration': '338', 'best_msg': '[338]\ttrain-auc:0.723899\teval_dups-auc:0.706475\teval-auc:0.679699', 'best_score': '0.679699'}
{'best_iteration': '999', 'best_msg': '[999]\ttrain-auc:0.730359\teval_dups-auc:0.712608\teval-auc:0.68424', 'best_score': '0.68424'}
{'best_iteration': '503', 'best_msg': '[503]\ttrain-auc:0.723356\teval_dups-auc:0.729673\teval-auc:0.670418', 'best_score': '0.670418'}
{'best_iteration': '999', 'best_msg': '[999]\ttrain-auc:0.72717\teval_dups-auc:0.724861\teval-auc:0.671322', 'best_score': '0.671322'}
0 0.709370912512 0.93198077234
1 0.71259843731 0.937867897659
2 0.730689823242 0.940478300383
3 0.72486271743 0.934388239106
0.719427019502 0.936269345713


In [23]:
with gzip.open('xgbl7f-output.pkl.gz', 'rb') as fd:
    d_linpreds = pickle.load(fd)

preds_lin = d_linpreds['train']
df_preds_lin = d_linpreds['train_df']
preds_lin_test = d_linpreds['test']

In [42]:
def merge_preds(dfs, clf):
    
    tgt = dfs[0][dfs[0].leak_fillmask].outcome.values.copy()
    
    preds = []
    for df in dfs:
        preds.append(df[df.leak_fillmask].pred_outcomel.values.copy())
        
    print(roc_auc_score(tgt, preds[0]))
        
    #preds.append(np.ones_like(preds[-1]))
        
    X = np.vstack(preds).T

#    clf = sklearn.linear_model.Ridge()
    clf.fit(X, tgt)
    
    merged = clf.predict(X)
    merged = np.clip(merged, .001, .999)
    
    print(roc_auc_score(tgt, merged))

    df_merged = dfs[0][['activity_id', 'outcome', 'pred_outcome', 'pred_outcomel', 'leak_fillmask']]
    df_merged.pred_outcomel.values[np.where(dfs[0].leak_fillmask.values)] = merged
    df_merged.pred_outcome.values[np.where(dfs[0].leak_fillmask.values)] = merged
    
    print(roc_auc_score(dfs[0].outcome.values, df_merged.pred_outcomel))

    return df_merged, clf

In [43]:
import sklearn.ensemble

In [44]:
from sklearn import cross_validation

In [47]:
df_mergedout, clf = merge_preds([df_preds_d5, df_preds_d6], sklearn.linear_model.Ridge())


0.74955264133
0.75002324412
0.94643684052


In [46]:
df_mergedout = df_preds_d5

In [49]:
tm = pd.merge(train, df_mergedout, on='activity_id', how='left').copy()

#print(tm.pred_outcome.isnull().sum(), tm.pred_outcomel.isnull().sum())

tm.loc[tm.pred_outcome.isnull(), ['pred_outcome']] = tm[tm.pred_outcome.isnull()]['outcome_filled_nona']
tm.loc[tm.pred_outcomel.isnull(), ['pred_outcomel']] = tm[tm.pred_outcomel.isnull()]['outcome_filled_nona']

print(sklearn.metrics.roc_auc_score(tm.outcome_x.values, tm.pred_outcome.values), \
sklearn.metrics.roc_auc_score(tm.outcome_x.values, tm.pred_outcomel.values))

mask0 = np.full(len(tm), False, dtype=np.bool)
for g in dreduct['grouplist_all0']:
    mask0 = np.logical_or(mask0, tm.group_1 == g)

mask1 = np.full(len(tm), False, dtype=np.bool)
for g in dreduct['grouplist_all1']:
    mask1 = np.logical_or(mask1, tm.group_1 == g)

vals = (0, 1) # These groups are always 0/1 in test
tm.pred_outcome.values[np.where(mask0)] = vals[0]
tm.pred_outcomel.values[np.where(mask0)] = vals[0]

tm.pred_outcome.values[np.where(mask1)] = vals[1]
tm.pred_outcomel.values[np.where(mask1)] = vals[1]

print(sklearn.metrics.roc_auc_score(tm.outcome_x.values, tm.pred_outcome.values), \
sklearn.metrics.roc_auc_score(tm.outcome_x.values, tm.pred_outcomel.values))

#0.946852648526 0.989813329397
#0.954428644246 0.992619147622 - w/ adate10_r .001 quant .992885

#0.949455564305 0.989835569724
#0.956555615812 0.992636323141 - first FE attempt .992881

#0.955060286488 0.989917327586
#0.96143899082 0.99269895942 - second FE attempt

#0.967763129886 0.990062140532
#0.972240541802 0.992828869816 - 7m1 d5
#0.967298424983 0.989985499522
#0.971925205949 0.992778873978 - d6 (so d5 might be better)

#0.97000016041 0.990225258698
#0.974089176638 0.992948181038 - 7m2 d5

#0.969949811863 0.990401393144
#0.973875698791 0.993074997241 - 7m3 d5

#0.970564734627 0.990424342822
#0.974527494216 0.993108191472 - 7m4 d5

#0.970475042392 0.990426490824
#0.974420841262 0.993112858563 - 7m4r d5+d6

0.970475042392 0.990426490824
0.974420841262 0.993112858563


In [30]:
# Output section

In [31]:
#mat_test = buildmatrix(test, False)
#dtest = xgb.DMatrix(mat_test, missing=np.nan)

#lmat_test = buildmatrix(test, linear=True)
#ldtest = xgb.DMatrix(lmat_test, missing=np.nan)

In [50]:
def xgbmdl_to_test(mdl, dmat = dtest):
    output = np.zeros(len(test), dtype=np.float64)
    for f in range(folds):
        try:
            output += mdl[f].predict(dmat, ntree_limit=mdl[f].best_ntree_limit)
        except: # Linear model
            o = mdl[f].predict(dmat)
            o = o.clip(.001, .999)
            output += o
            
    return output / folds
            

In [51]:
outputs = []
#[df_preds_d4, df_preds_d5, df_preds_d6, df_preds_d7, df_preds_d8, df_preds_d9, df_preds_lin]
#outputs.append(xgbmdl_to_test(bst_d4))
outputs.append(xgbmdl_to_test(bst_d5))
outputs.append(xgbmdl_to_test(bst_d6))
#outputs.append(xgbmdl_to_test(bst_d7))
#outputs.append(xgbmdl_to_test(bst_d8))
#outputs.append(xgbmdl_to_test(bst_d9))
#outputs.append(preds_lin_test)


In [52]:
if len(outputs) > 1:
    moutputs = np.vstack(outputs).T

    outputa = clf.predict(moutputs)

    print(np.min(outputa), np.max(outputa))
    outputa = np.clip(outputa, .001, .999)
else:
    outputa = outputs[0]

-0.0097536049618 0.922685520993


In [53]:
output = test.outcome_filled.values.copy()

mask = np.where(output != output)
output[np.where(output != output)] = outputa[mask]

test_out =  test[['activity_id']].copy()
test_out['outcome'] = output

In [54]:
test_out.to_csv('Submission-7m4r-d5-d6-usethis.csv', index=False)

In [55]:
output = test.outcome_filled.values.copy()

imask = output == 1.0
imask = np.logical_or(imask, output == 0.0)

test_tgt = output[np.where(imask)]
test_preds = outputa[np.where(imask)]

#test_tgt[test_tgt < .5] = 0
#test_tgt[test_tgt >= .5] = 1

roc_auc_score(test_tgt, test_preds) #0.84179900687486708
#0.74025122141974342 .992881
#0.96740052157355261 - m1
#0.96176368357003805 - m2
#0.9693393882776864 - m3


0.96762736900893409