In [1]:
# loading code is copied from the xgb model code, new post-deadline code begins later...

import pickle
import pandas as pd
import numpy as np
import gzip
import datetime
from itertools import product
from scipy import interpolate ## For other interpolation functions.
import time

import xgboost as xgb

import sklearn.metrics
import sklearn.utils

from sklearn.cross_validation import LabelKFold

import copy

import sklearn.linear_model
from sklearn.metrics import roc_auc_score

%matplotlib inline
import matplotlib.pyplot as plt



In [2]:
def _load_gz_pickle(path):
    """Return the object stored in one gzip-compressed pickle file."""
    # NOTE: unpickling runs arbitrary code; only load files you trust.
    with gzip.open(path, 'rb') as fd:
        return pickle.load(fd)


# Base table first, then two side tables left-joined onto it by activity_id.
data = _load_gz_pickle('merged7.pkl.gz')

cvleak = _load_gz_pickle('cvleak7-10fold.pkl.gz')
data = pd.merge(data, cvleak, on='activity_id', how='left')

extra = _load_gz_pickle('dproc7.pkl.gz')
data = pd.merge(data, extra, on='activity_id', how='left')

# The date bounds never change for this dataset, so they are hard-coded
# instead of being rescanned on every run.  The else branch documents how
# the constants were derived.
if True:
    mindate = pd.Timestamp('2022-07-17 00:00:00')
    maxdate = pd.Timestamp('2023-08-31 00:00:00')
    minpdate = pd.Timestamp('2020-05-18 00:00:00')
else:
    mindate = min(data['date'])
    maxdate = max(data['date'])
    minpdate = min(data['pdate'])
    

In [3]:
# Per-group predictions, left-joined onto every row by group_1.
# The file is opened in a `with` block so the handle is closed right after
# loading (the bare open() previously leaked it until garbage collection).
# NOTE: unpickling runs arbitrary code; only load files you trust.
with open('group3d-xgb-preds.pkl', 'rb') as fd:
    df_gpreds = pickle.load(fd)
data = pd.merge(data, df_gpreds, on='group_1', how='left')

In [4]:
# data preproc patches

# Convert adate_gap NaNs to -1 for xgb.
# The filled column is assigned back explicitly.  Calling
# fillna(inplace=True) on `data.adate_gap` operates on a Series obtained from
# the frame, which under pandas Copy-on-Write does not update `data` at all.
data['adate_gap'] = data['adate_gap'].fillna(-1)

In [5]:
# Candidate feature: the activity-date window of every achar_10 value.
# One row per achar_10 key holding the earliest and latest adate_daynum,
# their span, the number of rows, and the sum and mean of outcome.

ad = []
for key, grp in data.groupby('achar_10'):
    day_lo = grp.adate_daynum.min()
    day_hi = grp.adate_daynum.max()
    ad.append([key, day_lo, day_hi, day_hi - day_lo, len(grp), grp.outcome.sum(), grp.outcome.mean()])

df_ad = pd.DataFrame(ad)

df_ad.columns = ['achar_10', 'achar_10_adate_min', 'achar_10_adate_max', 'achar_10_adate_range', 'achar_10_sum', 'achar_10_sum_outcome', 'achar_10_outcome_mean']

# Set achar_10 == -1 to -1 (fixme?)
# This patches row 0 of the summary, so it relies on the -1 key being the
# first group produced by groupby -- TODO confirm.
df_ad.loc[0, ['achar_10_adate_min']] = -1
df_ad.loc[0, ['achar_10_adate_range']] += 1

# Only the date-window columns are joined back; the outcome aggregates stay in df_ad.
df_adr = df_ad[['achar_10', 'achar_10_adate_range', 'achar_10_adate_min', 'achar_10_adate_max']]

data = pd.merge(data, df_adr, on='achar_10', how='left')

In [6]:
# Snapshot of the merged frame taken before the following cells add more columns to `data`.
data_orig = data.copy()

In [7]:
# Per-group_1 share of rows with pchar_2 == 0 and with pchar_4 == 24.
# Each entry is a [group_1, fraction] pair; the frames are built in a later cell.
procs = {'gavg_pc_4_eq24': [], 'gavg_pc_2_eq0': []}
for group_id, members in data.groupby('group_1'):
    share_pc2_zero = np.mean(members['pchar_2'] == 0)
    procs['gavg_pc_2_eq0'].append([group_id, share_pc2_zero])
    share_pc4_24 = np.mean(members['pchar_4'] == 24)
    procs['gavg_pc_4_eq24'].append([group_id, share_pc4_24])

In [8]:
procs

{'gavg_pc_2_eq0': [[1, 1.0],
  [2, 0.24210526315789474],
  [3, 0.36170212765957449],
  [4, 0.20000000000000001],
  [5, 0.58396946564885499],
  [6, 0.041095890410958902],
  [7, 0.5],
  [8, 0.12796208530805686],
  [9, 0.125],
  [10, 1.0],
  [11, 0.22222222222222221],
  [12, 1.0],
  [13, 0.081081081081081086],
  [14, 1.0],
  [15, 1.0],
  [17, 0.11392405063291139],
  [18, 0.35950413223140498],
  [20, 0.5],
  [21, 0.44444444444444442],
  [24, 0.23999999999999999],
  [25, 0.20689655172413793],
  [26, 0.43783783783783786],
  [27, 0.72357723577235777],
  [28, 0.14035087719298245],
  [29, 0.52083333333333337],
  [30, 0.092105263157894732],
  [31, 0.18248175182481752],
  [32, 0.98936170212765961],
  [33, 0.066666666666666666],
  [34, 0.28770949720670391],
  [35, 0.4943820224719101],
  [36, 0.10897435897435898],
  [37, 0.32643678160919543],
  [38, 0.49230769230769234],
  [39, 0.51360174102285094],
  [40, 0.054545454545454543],
  [41, 0.012195121951219513],
  [42, 0.012500000000000001],
  [44, 0.5

In [9]:
# Turn every [group_1, value] list in `procs` into a two-column frame and
# left-join it onto `data`, producing one new column per feature name.
for feature_name, pairs in procs.items():
    print(feature_name)
    df_tmp = pd.DataFrame(pairs)
    df_tmp.columns = ['group_1', feature_name]

    data = pd.merge(data, df_tmp, on='group_1', how='left')

gavg_pc_2_eq0
gavg_pc_4_eq24


In [10]:
# Offset of each activity date from its achar_10 group's minimum date, scaled by the
# group's date range.  The range is computed as max - min, so a group whose rows all
# share a single date has range 0 and the division yields NaN (0 / 0) for it.
data['achar_10_adate_pos'] = (data.adate_daynum - data.achar_10_adate_min) / data.achar_10_adate_range

In [11]:
# leak_fillmask is True for rows whose outcome_filled is missing; later cells use it to
# pick the rows that still need a model prediction.  The second line displays the count.
data['leak_fillmask'] = data['outcome_filled'].isnull()
data.leak_fillmask.sum()

409834

In [12]:
# Load the cache of derived group/person lists if it exists; otherwise start
# with an empty cache and remember to write it out once it has been filled.
write_dreduct = False

try:
    # `with` closes the handle even if unpickling fails.
    with open('dreduct7h.pkl', 'rb') as fd:
        dreduct = pickle.load(fd)
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, unreadable, truncated or corrupt cache file: rebuild it.
    # Anything else (e.g. KeyboardInterrupt) is no longer silently swallowed
    # as it was by the previous bare `except:`.
    dreduct = {}
    write_dreduct = True

In [13]:
# fixme?  onehot-values is still used as target fields
do_oneheat = ['adate_dayofweek', 'activity_category', 'group_1_bin']
dont_oneheat = ['pchar_38', 'achar_10', 'achar_10_reduced'] # *char* is heated unless listed here


def _is_heated(col):
    """Decide whether column `col` gets an entry in onehot_values."""
    # Explicit exclusions first, then every achar_10 derivative is blocked.
    if col in dont_oneheat or 'achar_10' in col:
        return False
    return 'char' in col or col in do_oneheat


# Sorted distinct values of every selected column.
onehot_values = {col: sorted(data[col].unique()) for col in data.keys() if _is_heated(col)}
tot = sum(len(values) for values in onehot_values.values())

print(tot, 'possible columns post-heating')

368 possible columns post-heating


In [14]:
# ALL test targets with 0.0/1.0 average are fully/accurately inferred in the test set... 

# Split group_1 values into three lists stored in the dreduct cache:
#   grouplist_all0 / grouplist_all1 - groups with more than 100 rows whose
#       mean outcome is exactly 0 / exactly 1
#   grouplist - every other group (small groups, mixed groups, and groups
#       whose mean outcome is NaN)
# `count` is the total number of rows belonging to `grouplist` groups.
if 'grouplist' not in dreduct:

    dreduct['grouplist_all0'] = []
    dreduct['grouplist_all1'] = []
    dreduct['grouplist'] = []
    count = 0

    # Group by the bare column name, not a one-element list: with a list key
    # pandas >= 2 yields 1-tuples such as (17,) as group keys, which would be
    # stored here and then never compare equal to a group_1 value later on.
    for group_id, members in data.groupby('group_1', sort=False):
        if len(members) > 100:
            m = members.outcome.mean()
            if m == 0:
                dreduct['grouplist_all0'].append(group_id)
            elif m == 1:
                dreduct['grouplist_all1'].append(group_id)
            else:
                # Mixed outcomes, or NaN mean (no known outcome in the group).
                dreduct['grouplist'].append(group_id)

                count += len(members)
        else:
            dreduct['grouplist'].append(group_id)

            count += len(members)

    print(len(dreduct['grouplist']), len(dreduct['grouplist_all0']), len(dreduct['grouplist_all1']), count)

In [15]:
def split_traintest():
    """Split the global `data` frame by whether `outcome` is known.

    Returns a 4-tuple ``(trainset, testset, train, test)``: the first two are
    the ``np.where`` position tuples of rows with / without an outcome, the
    last two are the matching ``data.iloc`` selections.
    """
    outcome_missing = data['outcome'].isnull()

    testset = np.where(outcome_missing)
    trainset = np.where(~outcome_missing)

    train_rows = data.iloc[trainset]
    test_rows = data.iloc[testset]
    return trainset, testset, train_rows, test_rows

trainset, testset, train, test = split_traintest()

print(len(train), len(test), len(data))

# trainmask marks the training rows that belong to a group classed as
# all-0 or all-1; those rows are removed from the modelling set below.
if 'trainmask' not in dreduct:
    # One isin() call replaces the former per-group logical_or loop (which
    # rescanned the whole column for every group) and removes the need for
    # the `np.bool` alias, which no longer exists in NumPy 1.24+.
    skewed_groups = list(dreduct['grouplist_all0']) + list(dreduct['grouplist_all1'])
    mask = train.group_1.isin(skewed_groups)

    dreduct['trainmask'] = mask

mask = dreduct['trainmask'].copy()

print(np.sum(mask))

# Need to copy since we're going to add another column
train_cut = train.iloc[np.where(np.logical_not(mask))].copy()

# Rows that agree on every column except activity_id count as duplicates;
# the first occurrence is kept in train_cut_dedup, the rest go to train_cut_dups.
cols = train.columns.copy()
cols = cols.drop('activity_id')
train_dups = train_cut.duplicated(subset=cols)

train_cut_dedup = train_cut[~train_dups]
train_cut_dups = train_cut[train_dups]

# De-duplicated rows whose outcome_filled is missing (leak_fillmask is True).
train_cut_dedup_leaks = train_cut_dedup.iloc[np.where(train_cut_dedup.leak_fillmask.values)]
len(train_cut_dedup_leaks)

2197291 498687 2695978
1454634


158242

In [16]:
from operator import itemgetter

stratkey = 'people_id'

# One [key, row count, mean outcome] entry per stratkey value, sorted by mean
# outcome (highest first) and cached in dreduct['pplbuckets'].
if 'pplbuckets' not in dreduct:
    balls = []
    # Group by the bare column name: a one-element list key makes pandas >= 2
    # yield 1-tuples as group keys, which would never match people_id values
    # in the later isin() fold selection.
    for person_id, rows in train_cut_dedup.groupby(stratkey):
        balls.append([person_id, len(rows), rows.outcome.mean()])

    dreduct['pplbuckets'] = sorted(balls, key=itemgetter(2), reverse=True)

# Always hand out a copy of the SORTED list.  Previously a cache miss left
# `balls` in unsorted groupby order while a cache hit returned the sorted
# list, so the folds built from `balls` differed between first and later runs.
balls = dreduct['pplbuckets'].copy()

In [17]:
# Persist the cache only when it was (re)built in this session.
if write_dreduct:
    # `with` flushes and closes the file deterministically; the bare open()
    # previously left that to garbage collection.
    with open('dreduct7h.pkl', 'wb') as fd:
        pickle.dump(dreduct, fd)

In [18]:
# This assumes vc is sorted by whatever you want stratified
def dosplit_rr(df, vc, folds, fuzz = (43254, .5, 4)):
    """Deal the entries of ``vc`` into ``folds`` size-balanced buckets.

    Entries are taken from the front of ``vc`` (so ``vc`` should already be
    sorted by whatever is to be stratified) and each one is placed in the
    bucket that currently holds the smallest total weight.

    Parameters
    ----------
    df : unused; kept so existing callers keep working.
    vc : sequence of entries indexable as ``[key, weight, ...]``.  It is not
        modified; the function works on a deep copy.
    folds : number of buckets to create.
    fuzz : ``(seed, prob, spread)`` or ``None``.  With a tuple, NumPy's global
        RNG is seeded with ``seed`` and, for each draw ``r < prob``, the entry
        at position ``int(spread / prob * r)`` (clipped to the last remaining
        entry) is taken instead of the first one.  With ``None`` the first
        remaining entry is always taken and the RNG is left untouched.

    Returns
    -------
    list of ``folds`` lists holding the keys assigned to each bucket.

    Prints ``len(vc)``, the number of draws and the sum of selected offsets.
    """
    if fuzz is not None:
        np.random.seed(fuzz[0])

    bcount = np.zeros(folds)
    buckets = [[] for _ in range(folds)]

    # Use the `vc` argument (the function used to read the global `balls`
    # here, silently ignoring what the caller passed in).
    ballpit = copy.deepcopy(list(vc))

    runs = 0
    tot = 0

    while len(ballpit):
        runs += 1
        sel = 0
        # Without fuzz there is nothing to randomise; previously fuzz=None
        # crashed on fuzz[1] despite the explicit None check above.
        if fuzz is not None:
            r = np.random.rand()
            if r < fuzz[1]:
                sel = int((fuzz[2] / fuzz[1]) * r)
                if sel >= len(ballpit):
                    sel = len(ballpit) - 1

        tot += sel

        v = ballpit.pop(sel)

        # Lightest bucket so far receives the entry.
        selbucket = np.argsort(bcount)[0]

        buckets[selbucket].append(v[0])
        bcount[selbucket] += v[1]

    print(len(vc), runs, tot)

    return buckets
    

### POST DEADLINE CODE BEGINS HERE

In [85]:
# Independent copy of train_cut; the first-level prediction columns are merged onto it below.
tc_forstacking = train_cut.copy()

In [72]:
# import 5 and 6 depth XGB models from ps-xgbmodel

In [88]:
# Load the saved predictions of the depth-5 model.  The `with` block closes
# the file handle that the bare open() call used to leak.
with open('predscv-7m4r2-d5.pkl', 'rb') as fd:
    m_d5 = pickle.load(fd)

In [87]:
# Load the saved predictions of the depth-6 model.  The `with` block closes
# the file handle that the bare open() call used to leak.
with open('predscv-7m4r2-d6.pkl', 'rb') as fd:
    m_d6 = pickle.load(fd)

In [89]:
# Start the first-level prediction table from m_d5's activity_id column.
m_xgb = m_d5[['activity_id']].copy()

In [90]:
# Both prediction columns are attached positionally (raw .values), not joined by id.
# NOTE(review): this assumes m_d6 has exactly the same rows in the same order as
# m_d5 -- confirm, otherwise preds_d6 is paired with the wrong activity_id.
m_xgb['preds_d5'] = m_d5.pred_outcome.values.copy()
m_xgb['preds_d6'] = m_d6.pred_outcome.values.copy()

In [91]:
# Left join: every tc_forstacking row is kept, and rows whose activity_id is absent
# from m_xgb get NaN in preds_d5 / preds_d6.
tc_forstacking = pd.merge(tc_forstacking, m_xgb, on='activity_id', how='left')


In [121]:
tc_forstacking.preds_d6.isnull().sum()

53668

In [95]:
# Drop duplicate rows from the stacking frame, keeping the first occurrence.
# The comparison columns come from `train` (minus activity_id), so columns that exist
# only in tc_forstacking, such as preds_d5 / preds_d6, do not take part in the
# duplicate test.
cols = train.columns.copy()
cols = cols.drop('activity_id')
#cols = cols.drop('preds_d5')
#cols = cols.drop('preds_d6')
tc_dups = tc_forstacking.duplicated(subset=cols)

tcd_forstacking = tc_forstacking[~tc_dups]


In [122]:
tcd_forstacking.preds_d6.isnull().sum()

22

In [96]:
# 

folds = 6

# Assign every stratkey value to one of `folds` buckets, then build the
# per-fold train / validation frames from the de-duplicated stacking frame.
source = tcd_forstacking
pidsets_grouped = dosplit_rr(source, balls, folds, fuzz = (12345678, 0.4, 4))

cv_train = []
cv_train_leak = []

cv_val = []
cv_val_leak = []
cv_val_dups = []

pu = []

for p in pidsets_grouped:
    in_fold = source[stratkey].isin(p)

    fold_train = source[~in_fold]
    fold_val = source[in_fold].copy() # copy val since we need to add a field for unique groups
    cv_train.append(fold_train)
    cv_val.append(fold_val)

    pu.append(list(fold_val[stratkey].unique()))

    # "leak" subsets: rows whose leak_fillmask is True.
    cv_train_leak.append(fold_train.iloc[np.where(fold_train.leak_fillmask.values)])
    cv_val_leak.append(fold_val.iloc[np.where(fold_val.leak_fillmask.values)].copy())

    # This is directly from train_cut, bypassing duplicate and leak (but not skewed-group) detection
    cv_val_dups.append(tc_forstacking[tc_forstacking[stratkey].isin(p)])


53036 53036 31916


In [97]:
mat_train = {}
mat_val = {}
mat_val_dups = {}
dtrain = {}
dval = {}
dval_dups = {}

np.random.seed(0)

# The stacker sees the two first-level predictions plus the three gp_* columns.
stack_cols = ['preds_d5', 'preds_d6', 'gp_all0', 'gp_all1', 'gp_mixed']

# (source frames, dense-matrix store, DMatrix store), processed in this order
# for every fold: leak-only training rows, validation rows, and validation
# rows including duplicates.
stack_targets = (
    (cv_train_leak, mat_train, dtrain),
    (cv_val, mat_val, dval),
    (cv_val_dups, mat_val_dups, dval_dups),
)

for f in range(folds):
    print(f)

    for frames, mats, dmats in stack_targets:
        mats[f] = np.array(frames[f][stack_cols])
        dmats[f] = xgb.DMatrix(mats[f], label=frames[f].outcome.values, missing=np.nan)



0
1
2
3
4
5


In [138]:

# Final stacker settings.  max_depth ends up as 3 (the initial 10 was always
# overwritten before training).  The booster is "gblinear", so the tree-related
# entries are presumably ignored by xgboost -- verify before tuning them.
param = {
    'max_depth': 3,
    'eta': 0.01,
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'subsample': 0.8,
    'colsample_bytree': 1,
    'min_child_weight': 1,
    'booster': "gblinear",
    'seed': 12343,
}
#param['nthread'] = 4

bst_stack = {}

# One model per fold, early-stopped on the last watchlist entry ('eval').
for curfold in range(folds):
    num_round = 2000
    early_stopping_rounds = 20
    watchlist = [(dtrain[curfold], 'train'), (dval[curfold], 'eval')]
    bst_stack[curfold] = xgb.train(
        param,
        dtrain[curfold],
        num_round,
        watchlist,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=20,
    )

[0]	train-auc:0.750937	eval-auc:0.753409
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 20 rounds.
[20]	train-auc:0.751123	eval-auc:0.753546
[40]	train-auc:0.751289	eval-auc:0.753671
[60]	train-auc:0.751441	eval-auc:0.753793
[80]	train-auc:0.751583	eval-auc:0.75391
[100]	train-auc:0.751716	eval-auc:0.754022
[120]	train-auc:0.75184	eval-auc:0.754127
[140]	train-auc:0.751956	eval-auc:0.754227
[160]	train-auc:0.752064	eval-auc:0.754319
[180]	train-auc:0.752167	eval-auc:0.754405
[200]	train-auc:0.752263	eval-auc:0.754485
[220]	train-auc:0.752353	eval-auc:0.754562
[240]	train-auc:0.752437	eval-auc:0.754633
[260]	train-auc:0.752515	eval-auc:0.754702
[280]	train-auc:0.752588	eval-auc:0.754767
[300]	train-auc:0.752658	eval-auc:0.754827
[320]	train-auc:0.752723	eval-auc:0.754882
[340]	train-auc:0.752785	eval-auc:0.754934
[360]	train-auc:0.752842	eval-auc:0.754983
[380]	train-auc:0.752897	eval-auc:0.755029
[400]	t

In [106]:
def build_dfpreds(preds, cv_df):
    """Combine per-fold predictions into one scored DataFrame.

    Parameters
    ----------
    preds : per-fold sequence of prediction arrays; ``preds[f]`` must have one
        value per row of ``cv_df[f]``.
    cv_df : per-fold sequence of DataFrames providing ``activity_id``,
        ``outcome``, ``outcome_filled``, ``outcome_filled_nona`` and
        ``leak_fillmask``.

    Returns
    -------
    The concatenation over all folds (global ``folds``) of those columns plus

    * ``pred_outcome``  - the raw model prediction, and
    * ``pred_outcomel`` - the model prediction where ``leak_fillmask`` is
      True, otherwise the row's ``outcome_filled`` value.

    The ROC AUC of both columns is printed for every fold and for the
    concatenated result.
    """
    cv_val_preds = []
    for f in range(folds):
        fold_preds = cv_df[f][['activity_id', 'outcome', 'outcome_filled', 'outcome_filled_nona', 'leak_fillmask']].copy()

        fold_preds['pred_outcome'] = preds[f]

        # Build the blended column in one vectorised step and assign it as a
        # whole.  The former write through `Series.values[mask]` was a chained
        # assignment that fails (read-only array) or is lost under pandas
        # Copy-on-Write.  The result is float64 rather than the dtype of preds[f].
        use_model = fold_preds['leak_fillmask'].values.astype(bool)
        fold_preds['pred_outcomel'] = np.where(use_model, preds[f], fold_preds['outcome_filled'].values)

        cv_val_preds.append(fold_preds)

        print(f,
              sklearn.metrics.roc_auc_score(fold_preds['outcome'].values, fold_preds['pred_outcome'].values),
              sklearn.metrics.roc_auc_score(fold_preds['outcome'].values, fold_preds['pred_outcomel'].values))

    output = pd.concat(cv_val_preds)

    print(sklearn.metrics.roc_auc_score(output['outcome'].values, output['pred_outcome'].values),
          sklearn.metrics.roc_auc_score(output['outcome'].values, output['pred_outcomel'].values))

    return output
    

def build_dfpreds_xgb(bst, cv_mat = dval_dups, cv_df = cv_val_dups):
    """Predict every fold with its own booster and score the result.

    Parameters
    ----------
    bst : per-fold mapping/sequence of trained boosters.
    cv_mat : per-fold DMatrix objects to predict on.  The default is bound to
        the ``dval_dups`` object that exists when this cell is run.
    cv_df : per-fold DataFrames matching ``cv_mat`` row for row.  The default
        is bound to ``cv_val_dups`` in the same way.

    Returns
    -------
    ``(preds, df_preds)``: the list of per-fold prediction arrays and the
    DataFrame produced by ``build_dfpreds``.
    """
    preds = []
    for f in range(folds):
        print(bst[f].attributes())
        try:
            preds.append(bst[f].predict(cv_mat[f], ntree_limit=bst[f].best_ntree_limit))
        except Exception:
            # Fallback for boosters that cannot be predicted with a tree
            # limit: predict with the full model and clip away exact 0/1.
            # `except Exception` (instead of a bare `except:`) lets
            # KeyboardInterrupt and SystemExit propagate.
            preds.append(bst[f].predict(cv_mat[f]))
            preds[-1] = preds[-1].clip(0.001, .999)

    df_preds = build_dfpreds(preds, cv_df)
    return preds, df_preds

In [139]:
preds, df_preds = build_dfpreds_xgb(bst_stack)

{'best_msg': '[1498]\ttrain-auc:0.753626\teval-auc:0.75568', 'best_iteration': '1498', 'best_score': '0.75568'}
{'best_msg': '[1998]\ttrain-auc:0.751689\teval-auc:0.75923', 'best_iteration': '1998', 'best_score': '0.75923'}
{'best_msg': '[1993]\ttrain-auc:0.751286\teval-auc:0.764933', 'best_iteration': '1993', 'best_score': '0.764933'}
{'best_msg': '[1670]\ttrain-auc:0.748852\teval-auc:0.769673', 'best_iteration': '1670', 'best_score': '0.769673'}
{'best_msg': '[1994]\ttrain-auc:0.748826\teval-auc:0.755613', 'best_iteration': '1994', 'best_score': '0.755613'}
{'best_msg': '[696]\ttrain-auc:0.753947\teval-auc:0.77076', 'best_iteration': '696', 'best_score': '0.77076'}
0 0.757646262351 0.945390615853
1 0.750122693424 0.95443773879
2 0.867202207202 0.952642271919
3 0.768661186381 0.945077037899
4 0.758149982403 0.944588739175
5 0.772948030836 0.941756530723
0.793897972045 0.948299230784


In [133]:
preds, df_preds = build_dfpreds_xgb(bst_stack)

{'best_msg': '[72]\ttrain-auc:0.75968\teval-auc:0.748555', 'best_iteration': '72', 'best_score': '0.748555'}
{'best_msg': '[197]\ttrain-auc:0.761602\teval-auc:0.753406', 'best_iteration': '197', 'best_score': '0.753406'}
{'best_msg': '[14]\ttrain-auc:0.753342\teval-auc:0.758851', 'best_iteration': '14', 'best_score': '0.758851'}
{'best_msg': '[130]\ttrain-auc:0.75626\teval-auc:0.769246', 'best_iteration': '130', 'best_score': '0.769246'}
{'best_msg': '[110]\ttrain-auc:0.757067\teval-auc:0.746127', 'best_iteration': '110', 'best_score': '0.746127'}
{'best_msg': '[68]\ttrain-auc:0.761096\teval-auc:0.759172', 'best_iteration': '68', 'best_score': '0.759172'}
0 0.75143846764 0.944361792665
1 0.74729860741 0.954070794705
2 0.795236276402 0.900506607292
3 0.768631113009 0.943758434164
4 0.749206506454 0.943358439691
5 0.762573726315 0.940958478484
0.760298106059 0.935292044005


In [140]:
roc_auc_score(df_preds.outcome.values, df_preds.pred_outcome.values)

0.79389797204512269

In [124]:
# Load the saved test-set predictions.  The `with` block closes the file
# handle that the bare open() call used to leak.
with open('ps-xgb-test.pkl', 'rb') as fd:
    xgb_test = pickle.load(fd)

In [126]:
# Left join of the loaded test predictions onto the test rows.
# NOTE(review): if xgb_test ever contains repeated activity_id values this join adds
# rows, and test_m would no longer line up row-for-row with `test`, which the later
# cells rely on -- confirm activity_id is unique in xgb_test.
test_m = pd.merge(test, xgb_test, on='activity_id', how='left')

In [128]:
# Test feature matrix with the same five columns, in the same order, as the fold
# matrices used to train the stacker; no label is attached.
mat_test = np.array(test_m[['preds_d5', 'preds_d6', 'gp_all0', 'gp_all1', 'gp_mixed']])
dtest = xgb.DMatrix(mat_test, missing=np.nan)


In [141]:
def xgbmdl_to_test(mdl, dmat = dtest):
    """Average the predictions of all fold models on ``dmat``.

    Parameters
    ----------
    mdl : per-fold mapping/sequence of trained boosters (indices
        ``0 .. folds-1`` are used).
    dmat : DMatrix to predict on.  The default is bound to the ``dtest``
        object that exists when this cell is run.  The accumulator is sized
        from the global ``test`` frame, so ``dmat`` must have exactly
        ``len(test)`` rows.

    Returns
    -------
    float64 array with the mean prediction over the ``folds`` models.
    """
    output = np.zeros(len(test), dtype=np.float64)
    for f in range(folds):
        try:
            output += mdl[f].predict(dmat, ntree_limit=mdl[f].best_ntree_limit)
        except Exception: # Linear model
            # `except Exception` (instead of a bare `except:`) lets
            # KeyboardInterrupt and SystemExit propagate.
            o = mdl[f].predict(dmat)
            # Keep fallback predictions away from exact 0 and 1.
            o = o.clip(.001, .999)
            output += o

    return output / folds

In [143]:
# List of test-set prediction vectors, one per stacked model; only the fold-averaged
# bst_stack prediction is added here.
outputs = []
outputs.append(xgbmdl_to_test(bst_stack))

In [144]:
# Collapse the collected model outputs into one prediction vector.
if len(outputs) <= 1:
    outputa = outputs[0]
else:
    # Several outputs are blended by `clf`, which must already exist in the
    # kernel; it is not defined in this part of the notebook.
    moutputs = np.vstack(outputs).T

    outputa = clf.predict(moutputs)

    print(np.min(outputa), np.max(outputa))
    outputa = np.clip(outputa, .001, .999)

# Submission values: keep outcome_filled where it is present and use the
# model prediction where it is NaN (NaN is the only value unequal to itself).
output = test.outcome_filled.values.copy()

mask = np.where(output != output)
output[mask] = outputa[mask]

test_out = test[['activity_id']].copy()
test_out['outcome'] = output

test_out.to_csv('Submission-stack.csv', index=False)

# Sanity check: score the model predictions against the test rows whose
# outcome_filled value is exactly 0.0 or 1.0.
output = test.outcome_filled.values.copy()

imask = np.logical_or(output == 1.0, output == 0.0)

test_tgt = output[imask]
test_preds = outputa[imask]

roc_auc_score(test_tgt, test_preds) #0.84179900687486708
#0.74025122141974342 .992881
#0.96740052157355261 - m1
#0.96176368357003805 - m2
#0.9693393882776864 - m3


0.95434210122959562