In [1]:
import pickle
import pandas as pd
import numpy as np
import gzip
import datetime
from itertools import product
from scipy import interpolate ## For other interpolation functions.
import time

import xgboost as xgb

import sklearn.metrics
import sklearn.utils

from sklearn.cross_validation import LabelKFold

import copy

import sklearn.linear_model
from sklearn.metrics import roc_auc_score

%matplotlib inline
import matplotlib.pyplot as plt



In [2]:
with gzip.open('merged7.pkl.gz', 'rb') as fd:
    data = pickle.load(fd)

with gzip.open('cvleak7-10fold.pkl.gz', 'rb') as fd:
    cvleak = pickle.load(fd)
    
data = pd.merge(data, cvleak, on='activity_id', how='left')

with gzip.open('dproc7.pkl.gz', 'rb') as fd:
    extra = pickle.load(fd)

data = pd.merge(data, extra, on='activity_id', how='left')

if True: # Wasteful to recompute a constant every time
    mindate = pd.Timestamp('2022-07-17 00:00:00')
    maxdate = pd.Timestamp('2023-08-31 00:00:00')
    minpdate = pd.Timestamp('2020-05-18 00:00:00')
else:
    mindate = min(data['date'])
    maxdate = max(data['date'])
    minpdate = min(data['pdate'])
    

In [3]:
df_gpreds = pickle.load(open('group3d-xgb-preds.pkl', 'rb'))
data = pd.merge(data, df_gpreds, on='group_1', how='left')

In [4]:
# data preproc patches

# convert adate_gap nan's to -1 for xgb
data.adate_gap.fillna(-1, inplace=True)

In [5]:
# get adate range maybefeature

ad = []
for g in data.groupby('achar_10'):
    ad.append([g[0], g[1].adate_daynum.min(), g[1].adate_daynum.max(), g[1].adate_daynum.max() - g[1].adate_daynum.min(), len(g[1]), g[1].outcome.sum(), g[1].outcome.mean()])

df_ad = pd.DataFrame(ad)

df_ad.columns = ['achar_10', 'achar_10_adate_min', 'achar_10_adate_max', 'achar_10_adate_range', 'achar_10_sum', 'achar_10_sum_outcome', 'achar_10_outcome_mean']

# Set achar_10 == -1 to -1 (fixme?)
df_ad.loc[0, ['achar_10_adate_min']] = -1
df_ad.loc[0, ['achar_10_adate_range']] += 1

df_adr = df_ad[['achar_10', 'achar_10_adate_range', 'achar_10_adate_min', 'achar_10_adate_max']]

data = pd.merge(data, df_adr, on='achar_10', how='left')

In [6]:
data_orig = data.copy()

In [7]:
procs = {}
procs['gavg_pc_4_eq24'] = []
procs['gavg_pc_2_eq0'] = []
for g in data.groupby('group_1'):
    procs['gavg_pc_2_eq0'].append([g[0], np.mean(g[1]['pchar_2'] == 0)])
    procs['gavg_pc_4_eq24'].append([g[0], np.mean(g[1]['pchar_4'] == 24)])

In [8]:
procs

{'gavg_pc_2_eq0': [[1, 1.0],
  [2, 0.24210526315789474],
  [3, 0.36170212765957449],
  [4, 0.20000000000000001],
  [5, 0.58396946564885499],
  [6, 0.041095890410958902],
  [7, 0.5],
  [8, 0.12796208530805686],
  [9, 0.125],
  [10, 1.0],
  [11, 0.22222222222222221],
  [12, 1.0],
  [13, 0.081081081081081086],
  [14, 1.0],
  [15, 1.0],
  [17, 0.11392405063291139],
  [18, 0.35950413223140498],
  [20, 0.5],
  [21, 0.44444444444444442],
  [24, 0.23999999999999999],
  [25, 0.20689655172413793],
  [26, 0.43783783783783786],
  [27, 0.72357723577235777],
  [28, 0.14035087719298245],
  [29, 0.52083333333333337],
  [30, 0.092105263157894732],
  [31, 0.18248175182481752],
  [32, 0.98936170212765961],
  [33, 0.066666666666666666],
  [34, 0.28770949720670391],
  [35, 0.4943820224719101],
  [36, 0.10897435897435898],
  [37, 0.32643678160919543],
  [38, 0.49230769230769234],
  [39, 0.51360174102285094],
  [40, 0.054545454545454543],
  [41, 0.012195121951219513],
  [42, 0.012500000000000001],
  [44, 0.5

In [9]:
for k in procs:
    print(k)
    df_tmp = pd.DataFrame(procs[k])
    df_tmp.columns = ['group_1', k]
    
    data = pd.merge(data, df_tmp, on='group_1', how='left')

gavg_pc_4_eq24
gavg_pc_2_eq0


In [10]:
data['achar_10_adate_pos'] = (data.adate_daynum - data.achar_10_adate_min) / data.achar_10_adate_range

In [11]:
data['leak_fillmask'] = data['outcome_filled'].isnull()
data.leak_fillmask.sum()

409834

In [12]:
write_dreduct = False

try:
    dreduct = pickle.load(open('dreduct7h.pkl', 'rb'))
except:
    dreduct = {}
    write_dreduct = True

In [13]:
# fixme?  onehot-values is still used as target fields
do_oneheat = ['adate_dayofweek', 'activity_category', 'group_1_bin']
dont_oneheat = ['pchar_38', 'achar_10', 'achar_10_reduced'] # *char* is heated unless listed here

onehot_values = {}
tot = 0

for k in data.keys():
    if k in dont_oneheat:
        continue
        
    if 'achar_10' in k: # block all achar_10 derivatives
        continue
        
    if 'char' in k or k in do_oneheat:
        onehot_values[k] = sorted(data[k].unique())
        tot += len(onehot_values[k])
        
print(tot, 'possible columns post-heating')

368 possible columns post-heating


In [None]:
# ALL test targets with 0.0/1.0 average are fully/accurately inferred in the test set... 

if 'grouplist' not in dreduct:

    dreduct['grouplist_all0'] = []
    dreduct['grouplist_all1'] = []
    dreduct['grouplist'] = []
    count = 0

    for g in data.groupby(['group_1'], sort=False):
        if len(g[1]) > 100:
            m = g[1].outcome.mean()
            if m != 0 and m != 1:
                #print(g[0], len(g[1]), g[1].outcome.mean())
                dreduct['grouplist'].append(g[0])

                count += len(g[1])
            elif m == 0:
                dreduct['grouplist_all0'].append(g[0])
            elif m == 1:
                dreduct['grouplist_all1'].append(g[0])
        else:
            dreduct['grouplist'].append(g[0])

            count += len(g[1])

    print(len(dreduct['grouplist']), len(dreduct['grouplist_all0']), len(dreduct['grouplist_all1']), count)

In [None]:
def split_traintest():
    testset = np.where(data['outcome'].isnull())
    trainset = np.where(~data['outcome'].isnull())

    return trainset, testset, data.iloc[trainset], data.iloc[testset]

trainset, testset, train, test = split_traintest()

print(len(train), len(test), len(data))

if 'trainmask' not in dreduct:
    mask = np.full(len(train), False, dtype=np.bool)
    for g in dreduct['grouplist_all0']:
        mask = np.logical_or(mask, train.group_1 == g)
    for g in dreduct['grouplist_all1']:
        mask = np.logical_or(mask, train.group_1 == g)
        
    dreduct['trainmask'] = mask
    
mask = dreduct['trainmask'].copy()
    
print(np.sum(mask))

# Need to copy since we're going to add another column
train_cut = train.iloc[np.where(np.logical_not(mask))].copy()

cols = train.columns.copy()
cols = cols.drop('activity_id')
train_dups = train_cut.duplicated(subset=cols)

train_cut_dedup = train_cut[~train_dups]
train_cut_dups = train_cut[train_dups]

train_cut_dedup_leaks = train_cut_dedup.iloc[np.where(train_cut_dedup.leak_fillmask.values)]
len(train_cut_dedup_leaks)

2197291 498687 2695978
1454634


In [None]:
from operator import itemgetter

stratkey = 'people_id'

if 'pplbuckets' not in dreduct:
    balls = []
    for p in train_cut_dedup.groupby([stratkey]):
        balls.append([p[0], len(p[1]), p[1].outcome.mean()])

    dreduct['pplbuckets'] = sorted(balls, key=itemgetter(2), reverse=True)
else:
    balls = dreduct['pplbuckets'].copy()

In [None]:
if write_dreduct:
    pickle.dump(dreduct, open('dreduct7h.pkl', 'wb'))

In [None]:
# This assumes vc is sorted by whatever you want stratified
def dosplit_rr(df, vc, folds, fuzz = (43254, .5, 4)):
    if fuzz is not None:
        np.random.seed(fuzz[0])
    
    bcount = np.zeros(folds)
    
    buckets = []
    for f in range(folds):
        buckets.append([])
    
    ballpit = copy.deepcopy(balls)
    
    runs = 0
    
    tot = 0
    
    while len(ballpit):
        runs += 1
        sel = 0
        r = np.random.rand()
        if r < fuzz[1]:
            sel = int((fuzz[2] / fuzz[1]) * r)
            if sel >= len(ballpit):
                sel = len(ballpit) - 1

        tot += sel
                
        v = ballpit[sel]
        del ballpit[sel]
                
        selbucket = np.argsort(bcount)[0]
        
        buckets[selbucket].append(v[0])
        bcount[selbucket] += v[1]
    
    print(len(balls), runs, tot)
    
    return buckets
    

In [None]:
folds = 4

source = train_cut_dedup
pidsets_grouped = dosplit_rr(source, balls, folds, fuzz = (123456, 0.25, 4))

cv_train = []
cv_train_leak = []

cv_val = []
cv_val_leak = []
cv_val_dups = []

pu = []

for p in pidsets_grouped:
    cv_train.append(source[~source[stratkey].isin(p)])
    cv_val.append(source[source[stratkey].isin(p)].copy()) # copy val since we need to add a field for unique groups
    
    pu.append(list(cv_val[-1][stratkey].unique()))

    cv_train_leak.append(cv_train[-1].iloc[np.where(cv_train[-1].leak_fillmask.values)])
    cv_val_leak.append(cv_val[-1].iloc[np.where(cv_val[-1].leak_fillmask.values)].copy())
    
    # This is directly from train_cut, bypassing duplicate and leak (but not skewed-group) detection
    cv_val_dups.append(train_cut[train_cut[stratkey].isin(p)])


In [None]:
# Optional test to compute which cv_val items have unique group_1's
if False:
    for fold in range(folds):

        groups_train = sorted(cv_train[fold].group_1.unique())
        groups_val = sorted(cv_val_leak[fold].group_1.unique())

        ugroup_mask = np.full(len(cv_val_leak[fold]), False, dtype=np.bool)

        cc = 0
        cl = []
        for g in cv_val[fold].groupby(['group_1']):
            if g[0] not in groups_train:
                cl.append(g[0])
                cc += len(g[1])

                ugroup_mask = np.logical_or(ugroup_mask, cv_val_leak[fold].group_1 == g[0])

        cv_val_leak[fold]['ugroup_mask'] = ugroup_mask
        
        print(fold, cv_val_leak[fold].ugroup_mask.sum(), len(cv_val_leak[fold]))

# 0 38542 39699
# 1 38630 40117
# 2 37681 39041
# 3 37402 38408
# ^ cv_val_leak results (xgb6 LB.992 run) - group_1 is almost ENTIRELY unique in xgb's eval set.  LEAKAGE!?

In [None]:
# ppl_370270/'group 27940' is 55k of *mostly* 0's, any bad prediction in it kills CV!

for f in range(0, folds):
    if (370270 in pu[f]):
        cv_val_dups[f] = cv_val_dups[f][cv_val_dups[f]['people_id'] != 370270]

    if 'group 27940' in cv_val_dups[f]['group_1'].unique():
        cv_val_dups[f] = cv_val_dups[f][cv_val_dups[f]['group_1'] != 'group 27940']
        
    print(len(cv_val[f]), cv_val[f].outcome.mean())
    print(f, len(cv_val_leak[f]), cv_val_leak[f].outcome.mean(), len(cv_val_dups[f]), cv_val_dups[f].outcome.mean())

In [28]:
from scipy.sparse import csr_matrix, hstack

noise = .095
q = 20

def buildmatrix(df, oneheat=False, linear=False):
    matrices = []    
    rows = []
    
    rows.append(df.pchar_38.values)
#    rows.append(np.busday_count(df.pdate.values.astype('datetime64[D]'), 
#                         df.adate.values.astype('datetime64[D]')))
 
    rows.append(df.business_days_delta.values)
    
    rows.append(df.people_per_group.values)
    
    if linear:
        #rows.append(df.people_per_group_adate.values / df.people_per_group.values)
        rows.append(df.events_per_group_adate.values / df.people_per_group_adate.values)
    
    if not linear:
        rows.append(df.group_1.values)
        rows.append(df.achar_10.values)
        
    rows.append(df.achar_10_adate_range.values)
    
    # POST SUBMISSION: remove the three group-based features for the stack later
    '''
    
    rows.append(df.gp_all0.values)
    rows[-1] += ((np.random.rand(len(rows[-1])) - .5) * noise)
    rows[-1] = ((rows[-1] * (100 * q)) + (q / 2)) // 100 / q
    
    rows.append(df.gp_all1.values)
    rows[-1] += ((np.random.rand(len(rows[-1])) - .5) * noise)
    rows[-1] = ((rows[-1] * (100 * q)) + (q / 2)) // 100 / q
    
    rows.append(df.gp_mixed.values)
    rows[-1] += ((np.random.rand(len(rows[-1])) - .5) * noise)
    rows[-1] = ((rows[-1] * (100 * q)) + (q / 2)) // 100 / q
    '''

    # Early linear analysis (xgb7h) said this has a .015 rcorr^2 when binned like this
    tmp = df.achar_10_adate_pos.values
    tmp = np.floor(tmp * 1000) / 1000
    tmp[np.isnan(tmp)] = -1
    rows.append(tmp)
        
    rows.append(df.adate_daynum.values)
    
    if not linear:
        tmp = df.adate_gap.values
        tmp[np.isnan(tmp)] = df.adate_daynum.values[np.isnan(tmp)]
        rows.append(tmp) # TODO: replace nan's for linear model
        
    rows.append(df.pdate_daynum.values)
    
    '''
    mask = df.people_per_group == 1
    mask = np.logical_and(mask, df.activity_category <= 4)
    mask = np.logical_and(mask, df.pchar_25 == 0)
    #flist.append('Xlowerprob_mask')
    rows.append(np.int8(mask))
    
    #mask = np.logical_and(mask, df.group_1 < 17871.5)
    mask2 = np.logical_and(mask, df.pchar_4 == 0)
    #flist.append('Xlowprob_mask')
    rows.append(np.int8(mask2))

    mask3 = np.logical_and(mask, df.group_1 < 17871.5)
    #flist.append('Xlowprob_mask2')
    rows.append(np.int8(mask3))
    '''
    
    rows.append(df.gavg_pc_2_eq0)
    rows.append(df.gavg_pc_4_eq24)


#    rows.append(df.outcome_filled_prevday.values)
#    rows.append(df.outcome_filled_nextday.values)

    matrices.append(csr_matrix(np.array(rows).T))
    rows = []

    if oneheat or linear:
        curitem = None
        for i in range(len(onehot_keys)):
            k = onehot_keys[i]

            if k[0] != curitem:
                usedmask = np.full(len(df), False, dtype=np.bool)
                curitem = k[0]

            if k[1] != None:
                rows.append(np.array(df[k[0]] == k[1]))
                usedmask[np.where(df[k[0]] == k[1])] = True
            else:
                rows.append(~usedmask)

            if len(rows) >= 128:
                matrices.append((np.array(rows).T))
                #print(len(matrices))
                rows = []

        if len(rows) > 0:
            matrices.append((np.array(rows).T))
            rows = []

        rv = hstack(matrices, format='csr')
        print(rv.shape)
        return rv
    else:
        for k in onehot_values.keys():
            tmp = df[k].values.astype(np.float64)
            #if np.sum(tmp[tmp == -1]):
                #print(k, np.min(tmp))
            #tmp[tmp == -1] = np.nan
            matrices.append(csr_matrix(tmp.reshape(len(df), 1)))
            
        #matrices = matrices.todense()
        #matrices.append(np.array(rows).T)
        return hstack(matrices)
        
        #output = np.hstack(matrices)
        #output[np.isnan(output)] = -1

        #return output

In [29]:
mat_train = {}
mat_val = {}
mat_val_dups = {}
dtrain = {}
dval = {}
dval_dups = {}

val_source = cv_val_leak

dooneheat = False

np.random.seed(0)

for f in range(folds):
    print(f)
    
    mat_train[f] = buildmatrix(cv_train_leak[f], dooneheat)
    dtrain[f] = xgb.DMatrix(mat_train[f], label=cv_train_leak[f].outcome.values, missing=-1)

    mat_val[f] = buildmatrix(val_source[f], dooneheat)
    dval[f] = xgb.DMatrix(mat_val[f], label=val_source[f].outcome.values, missing=-1)

    mat_val_dups[f] = buildmatrix(cv_val_dups[f], dooneheat)
    dval_dups[f] = xgb.DMatrix(mat_val_dups[f], label=cv_val_dups[f].outcome.values, missing=-1)
    
mat_test = buildmatrix(test, False)
dtest = xgb.DMatrix(mat_test, missing=np.nan)


0
1
2
3


In [30]:
# XXX: need way to pass this through?
curfold = 0

def feval_procleak(yhat, y):
    if (len(yhat) != len(cv_val_dups[curfold])):
        return "auc", sklearn.metrics.roc_auc_score(y.get_label(), yhat)
    
    yhat_f = yhat.copy()
    
    locs = np.where(~cv_val_dups[curfold].leak_fillmask)
    yhat_f[locs] = cv_val_dups[curfold].outcome_filled.values[locs]
    
    return "auc", sklearn.metrics.roc_auc_score(y.get_label(), yhat_f)
    #return "auc", get_leakpreds(curfold, yhat)


In [31]:
def build_dfpreds(preds, cv_df):
    cv_val_preds = []
    for f in range(folds):
        cv_val_preds.append(cv_df[f][['activity_id', 'outcome', 'outcome_filled', 'outcome_filled_nona', 'leak_fillmask']].copy())

        cv_val_preds[f]['pred_outcome'] = preds[f]
        cv_val_preds[f]['pred_outcomel'] = preds[f]
        
        mask = np.where(~cv_val_preds[f].leak_fillmask)
        cv_val_preds[f]['pred_outcomel'].values[mask] = cv_df[f]['outcome_filled'].values[mask]
        
        print(f,
              sklearn.metrics.roc_auc_score(cv_val_preds[f]['outcome'].values, cv_val_preds[f]['pred_outcome'].values),
              sklearn.metrics.roc_auc_score(cv_val_preds[f]['outcome'].values, cv_val_preds[f]['pred_outcomel'].values))
        
    output = pd.concat(cv_val_preds)
    
    print(sklearn.metrics.roc_auc_score(output['outcome'].values, output['pred_outcome'].values),
          sklearn.metrics.roc_auc_score(output['outcome'].values, output['pred_outcomel'].values))
    
    return output
    

def build_dfpreds_xgb(bst, cv_mat = dval_dups, cv_df = cv_val_dups):
    preds = []
    for f in range(folds):
        print(bst[f].attributes())
        try:
            preds.append(bst[f].predict(cv_mat[f], ntree_limit=bst[f].best_ntree_limit))
        except:
            preds.append(bst[f].predict(cv_mat[f]))
            preds[-1] = preds[-1].clip(0.001, .999)

    df_preds = build_dfpreds(preds, cv_df)
    return preds, df_preds

In [32]:

param = {'max_depth':10, 'eta':0.01, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.5
param['colsample_bytree']= 0.3
param['min_child_weight'] = 1
param['max_depth'] = 5
param['booster'] = "gbtree"
param['seed'] = 12345

bst_d5 = {}
#bst_linear = {} # optional

for curfold in range(folds):
#for curfold in [2]:
    #watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    num_round = 2000
    early_stopping_rounds=500
    bst_d5[curfold] = xgb.train(param, dtrain[curfold], num_round, watchlist,
                       feval = feval_procleak,
                       early_stopping_rounds=early_stopping_rounds, 
                       verbose_eval=50)

[0]	train-auc:0.596674	eval_dups-auc:0.917206	eval-auc:0.582719
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 500 rounds.
[50]	train-auc:0.763735	eval_dups-auc:0.935339	eval-auc:0.710838
[100]	train-auc:0.775314	eval_dups-auc:0.935847	eval-auc:0.713161
[150]	train-auc:0.779131	eval_dups-auc:0.93665	eval-auc:0.717117
[200]	train-auc:0.784935	eval_dups-auc:0.93714	eval-auc:0.720311
[250]	train-auc:0.789721	eval_dups-auc:0.937539	eval-auc:0.722035
[300]	train-auc:0.79457	eval_dups-auc:0.937963	eval-auc:0.724332
[350]	train-auc:0.799857	eval_dups-auc:0.938316	eval-auc:0.726233
[400]	train-auc:0.804982	eval_dups-auc:0.93863	eval-auc:0.72802
[450]	train-auc:0.809438	eval_dups-auc:0.938807	eval-auc:0.728854
[500]	train-auc:0.814342	eval_dups-auc:0.939067	eval-auc:0.730192
[550]	train-auc:0.817935	eval_dups-auc:0.939314	eval-auc:0.731423
[600]	train-auc:0.822055	eval_dups-auc:0.939533	eval-auc:0.732352
[650]	tr

In [33]:

param = {'max_depth':10, 'eta':0.01, 'silent':1, 'objective':'binary:logistic' }
#param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.5
param['colsample_bytree']= 0.3
param['min_child_weight'] = 2
param['max_depth'] = 6
param['booster'] = "gbtree"
param['seed'] = 12343

bst_d6 = {}
#bst_linear = {} # optional

for curfold in range(folds):
#for curfold in [2]:
    #watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    watchlist  = [(dtrain[curfold],'train'), (dval_dups[curfold], 'eval_dups'), (dval[curfold], 'eval')]
    num_round = 2000
    early_stopping_rounds=200
    bst_d6[curfold] = xgb.train(param, dtrain[curfold], num_round, watchlist,
                       feval = feval_procleak,
                       early_stopping_rounds=early_stopping_rounds, 
                       verbose_eval=50)

[0]	train-auc:0.633528	eval_dups-auc:0.919227	eval-auc:0.588893
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[50]	train-auc:0.784608	eval_dups-auc:0.936239	eval-auc:0.71359
[100]	train-auc:0.798157	eval_dups-auc:0.93766	eval-auc:0.722765
[150]	train-auc:0.805471	eval_dups-auc:0.937817	eval-auc:0.723779
[200]	train-auc:0.813556	eval_dups-auc:0.938009	eval-auc:0.724828
[250]	train-auc:0.82181	eval_dups-auc:0.938271	eval-auc:0.726585
[300]	train-auc:0.829831	eval_dups-auc:0.938593	eval-auc:0.728206
[350]	train-auc:0.836553	eval_dups-auc:0.938842	eval-auc:0.729605
[400]	train-auc:0.841148	eval_dups-auc:0.939105	eval-auc:0.731165
[450]	train-auc:0.846364	eval_dups-auc:0.939318	eval-auc:0.732036
[500]	train-auc:0.851435	eval_dups-auc:0.939562	eval-auc:0.733161
[550]	train-auc:0.856241	eval_dups-auc:0.939738	eval-auc:0.733898
[600]	train-auc:0.861099	eval_dups-auc:0.939909	eval-auc:0.734866
[650]	

In [36]:
#m4 run r
preds_d6, df_preds_d6 = build_dfpreds_xgb(bst_d6)

{'best_score': '0.737673', 'best_iteration': '1488', 'best_msg': '[1488]\ttrain-auc:0.912306\teval_dups-auc:0.940194\teval-auc:0.737673'}
{'best_score': '0.748243', 'best_iteration': '1966', 'best_msg': '[1966]\ttrain-auc:0.925686\teval_dups-auc:0.946754\teval-auc:0.748243'}
{'best_score': '0.731846', 'best_iteration': '1037', 'best_msg': '[1037]\ttrain-auc:0.890731\teval_dups-auc:0.948036\teval-auc:0.731846'}
{'best_score': '0.721424', 'best_iteration': '750', 'best_msg': '[750]\ttrain-auc:0.879465\teval_dups-auc:0.942663\teval-auc:0.721424'}
0 0.727189177735 0.940193883409
1 0.722664658003 0.946754137723
2 0.720562162727 0.948035938563
3 0.726875266138 0.942663436597
0.722959368019 0.94444711823


In [37]:
#m4 run two
preds_d5, df_preds_d5 = build_dfpreds_xgb(bst_d5)

{'best_score': '0.736923', 'best_iteration': '1796', 'best_msg': '[1796]\ttrain-auc:0.885599\teval_dups-auc:0.940177\teval-auc:0.736923'}
{'best_score': '0.745968', 'best_iteration': '1946', 'best_msg': '[1946]\ttrain-auc:0.887536\teval_dups-auc:0.946524\teval-auc:0.745968'}
{'best_score': '0.732762', 'best_iteration': '1019', 'best_msg': '[1019]\ttrain-auc:0.850004\teval_dups-auc:0.948224\teval-auc:0.732762'}
{'best_score': '0.723141', 'best_iteration': '1162', 'best_msg': '[1162]\ttrain-auc:0.862773\teval_dups-auc:0.94304\teval-auc:0.723141'}
0 0.726745879872 0.940176819615
1 0.721113274381 0.946523515841
2 0.714248287427 0.94822415574
3 0.725705705453 0.943040489625
0.721098812219 0.944525799537


In [38]:
df_preds_d5.to_pickle('predscv-7m4r2-d5.pkl')

In [39]:
df_preds_d6.to_pickle('predscv-7m4r2-d6.pkl')

In [23]:
with gzip.open('xgbl7f-output.pkl.gz', 'rb') as fd:
    d_linpreds = pickle.load(fd)

preds_lin = d_linpreds['train']
df_preds_lin = d_linpreds['train_df']
preds_lin_test = d_linpreds['test']

In [42]:
def merge_preds(dfs, clf):
    
    tgt = dfs[0][dfs[0].leak_fillmask].outcome.values.copy()
    
    preds = []
    for df in dfs:
        preds.append(df[df.leak_fillmask].pred_outcomel.values.copy())
        
    print(roc_auc_score(tgt, preds[0]))
        
    #preds.append(np.ones_like(preds[-1]))
        
    X = np.vstack(preds).T

#    clf = sklearn.linear_model.Ridge()
    clf.fit(X, tgt)
    
    merged = clf.predict(X)
    merged = np.clip(merged, .001, .999)
    
    print(roc_auc_score(tgt, merged))

    df_merged = dfs[0][['activity_id', 'outcome', 'pred_outcome', 'pred_outcomel', 'leak_fillmask']]
    df_merged.pred_outcomel.values[np.where(dfs[0].leak_fillmask.values)] = merged
    df_merged.pred_outcome.values[np.where(dfs[0].leak_fillmask.values)] = merged
    
    print(roc_auc_score(dfs[0].outcome.values, df_merged.pred_outcomel))

    return df_merged, clf

In [43]:
import sklearn.ensemble

In [44]:
from sklearn import cross_validation

In [47]:
df_mergedout, clf = merge_preds([df_preds_d5, df_preds_d6], sklearn.linear_model.Ridge())


0.74955264133
0.75002324412
0.94643684052


In [46]:
df_mergedout = df_preds_d5

In [49]:
tm = pd.merge(train, df_mergedout, on='activity_id', how='left').copy()

#print(tm.pred_outcome.isnull().sum(), tm.pred_outcomel.isnull().sum())

tm.loc[tm.pred_outcome.isnull(), ['pred_outcome']] = tm[tm.pred_outcome.isnull()]['outcome_filled_nona']
tm.loc[tm.pred_outcomel.isnull(), ['pred_outcomel']] = tm[tm.pred_outcomel.isnull()]['outcome_filled_nona']

print(sklearn.metrics.roc_auc_score(tm.outcome_x.values, tm.pred_outcome.values), \
sklearn.metrics.roc_auc_score(tm.outcome_x.values, tm.pred_outcomel.values))

mask0 = np.full(len(tm), False, dtype=np.bool)
for g in dreduct['grouplist_all0']:
    mask0 = np.logical_or(mask0, tm.group_1 == g)

mask1 = np.full(len(tm), False, dtype=np.bool)
for g in dreduct['grouplist_all1']:
    mask1 = np.logical_or(mask1, tm.group_1 == g)

vals = (0, 1) # These groups are always 0/1 in test
tm.pred_outcome.values[np.where(mask0)] = vals[0]
tm.pred_outcomel.values[np.where(mask0)] = vals[0]

tm.pred_outcome.values[np.where(mask1)] = vals[1]
tm.pred_outcomel.values[np.where(mask1)] = vals[1]

print(sklearn.metrics.roc_auc_score(tm.outcome_x.values, tm.pred_outcome.values), \
sklearn.metrics.roc_auc_score(tm.outcome_x.values, tm.pred_outcomel.values))

#0.946852648526 0.989813329397
#0.954428644246 0.992619147622 - w/ adate10_r .001 quant .992885

#0.949455564305 0.989835569724
#0.956555615812 0.992636323141 - first FE attempt .992881

#0.955060286488 0.989917327586
#0.96143899082 0.99269895942 - second FE attempt

#0.967763129886 0.990062140532
#0.972240541802 0.992828869816 - 7m1 d5
#0.967298424983 0.989985499522
#0.971925205949 0.992778873978 - d6 (so d5 might be better)

#0.97000016041 0.990225258698
#0.974089176638 0.992948181038 - 7m2 d5

#0.969949811863 0.990401393144
#0.973875698791 0.993074997241 - 7m3 d5

#0.970564734627 0.990424342822
#0.974527494216 0.993108191472 - 7m4 d5

#0.970475042392 0.990426490824
#0.974420841262 0.993112858563 - 7m4r d5+d6

0.970475042392 0.990426490824
0.974420841262 0.993112858563


In [30]:
# Output section

In [31]:
#mat_test = buildmatrix(test, False)
#dtest = xgb.DMatrix(mat_test, missing=np.nan)

#lmat_test = buildmatrix(test, linear=True)
#ldtest = xgb.DMatrix(lmat_test, missing=np.nan)

In [40]:
def xgbmdl_to_test(mdl, dmat = dtest):
    output = np.zeros(len(test), dtype=np.float64)
    for f in range(folds):
        try:
            output += mdl[f].predict(dmat, ntree_limit=mdl[f].best_ntree_limit)
        except: # Linear model
            o = mdl[f].predict(dmat)
            o = o.clip(.001, .999)
            output += o
            
    return output / folds
            

In [41]:
outputs = {}
outputs['preds_d5'] = xgbmdl_to_test(bst_d5)
outputs['preds_d6'] = xgbmdl_to_test(bst_d6)
outputs['activity_id'] = test.activity_id.values

In [42]:
df_outputs = pd.DataFrame(outputs)

In [44]:
df_outputs.to_pickle('ps-xgb-test.pkl')

In [51]:
outputs = []
#[df_preds_d4, df_preds_d5, df_preds_d6, df_preds_d7, df_preds_d8, df_preds_d9, df_preds_lin]
#outputs.append(xgbmdl_to_test(bst_d4))
outputs.append(xgbmdl_to_test(bst_d5))
outputs.append(xgbmdl_to_test(bst_d6))
#outputs.append(xgbmdl_to_test(bst_d7))
#outputs.append(xgbmdl_to_test(bst_d8))
#outputs.append(xgbmdl_to_test(bst_d9))
#outputs.append(preds_lin_test)


In [52]:
if len(outputs) > 1:
    moutputs = np.vstack(outputs).T

    outputa = clf.predict(moutputs)

    print(np.min(outputa), np.max(outputa))
    outputa = np.clip(outputa, .001, .999)
else:
    outputa = outputs[0]

output = test.outcome_filled.values.copy()

mask = np.where(output != output)
output[np.where(output != output)] = outputa[mask]

test_out =  test[['activity_id']].copy()
test_out['outcome'] = output

test_out.to_csv('Submission-7m4r-d5-d6-usethis.csv', index=False)

output = test.outcome_filled.values.copy()

imask = output == 1.0
imask = np.logical_or(imask, output == 0.0)

test_tgt = output[np.where(imask)]
test_preds = outputa[np.where(imask)]

roc_auc_score(test_tgt, test_preds) #0.84179900687486708
#0.74025122141974342 .992881
#0.96740052157355261 - m1
#0.96176368357003805 - m2
#0.9693393882776864 - m3


-0.0097536049618 0.922685520993
