In [1]:
import copy
import datetime
import gzip
import pickle
from itertools import product
import time

import pandas as pd
import numpy as np

from scipy import interpolate ## For other interpolation functions.

import sklearn.metrics
import sklearn.utils
import sklearn.linear_model

from sklearn.cross_validation import LabelKFold
from sklearn.metrics import roc_auc_score

import xgboost as xgb

%matplotlib inline
import matplotlib.pyplot as plt



In [2]:
with gzip.open('merged7.pkl.gz', 'rb') as fd:
    data = pickle.load(fd)

with gzip.open('cvleak7-10fold.pkl.gz', 'rb') as fd:
    cvleak = pickle.load(fd)
    
data = pd.merge(data, cvleak, on='activity_id', how='left')

with gzip.open('dproc7.pkl.gz', 'rb') as fd:
    extra = pickle.load(fd)

data = pd.merge(data, extra, on='activity_id', how='left')

if True: # Wasteful to recompute a constant every time
    mindate = pd.Timestamp('2022-07-17 00:00:00')
    maxdate = pd.Timestamp('2023-08-31 00:00:00')
    minpdate = pd.Timestamp('2020-05-18 00:00:00')
else:
    mindate = min(data['date'])
    maxdate = max(data['date'])
    minpdate = min(data['pdate'])
    

In [3]:
data.outcome.mean()

0.4439543965728709

In [4]:
# delete 17304, it's big and all zeros and may mess up models!

data = data[data.group_1 != 17304]

In [5]:
data.outcome.mean()  # yes, removing that one group skews the outcome THAT much.

0.6976975552259174

In [6]:
cols = data.columns.copy()
cols = cols.drop('activity_id')
data_dups = data.duplicated(subset=cols)

data_dedup = data[~data_dups]

In [7]:
data_dup = data[data_dups]

In [8]:
len(data_dedup), len(data)

(1023194, 1731249)

In [9]:
data_dedup.outcome.mean(), data_dup.outcome.mean()

(0.7204231620668224, 0.6655633998573868)

In [10]:
def split_traintest():
    testset = np.where(data['outcome'].isnull())
    trainset = np.where(~data['outcome'].isnull())

    return trainset, testset, data.iloc[trainset], data.iloc[testset]

trainset, testset, train, test = split_traintest()


In [None]:
# skip this on a rerun (or group3+)

In [11]:
def build_group_df(df, classes = True):
    procs = {'group_1': [],
             'num_people': [],
             'num_events': [],
             'pdate_mean': [],
             'pdate_first': [],
             'pdate_latest': [],
             'adate_first': [],
             'adate_latest': [],
             'num_adates': [],
             'otype': [],
             }

    group_class = []

    for g in df.groupby('group_1'):
        procs['group_1'].append(g[0])

        procs['pdate_mean'].append(g[1].pdate_daynum.mean())
        procs['pdate_first'].append(g[1].pdate_daynum.min())
        procs['pdate_latest'].append(g[1].pdate_daynum.max())

        procs['adate_first'].append(g[1].adate_daynum.min())
        procs['adate_latest'].append(g[1].adate_daynum.max())

        procs['num_people'].append(len(g[1].people_id.unique()))
        procs['num_events'].append(len(g[1]))
        procs['num_adates'].append(len(g[1].adate_daynum.unique()))

        if False:
            m = g[1].outcome.mean()
            if m == 0:
                procs['otype'].append(0)
            elif m == 1:
                procs['otype'].append(1)
            else:
                procs['otype'].append(2) # mixed
        else:
            procs['otype'].append(-1)

    df_procs = pd.DataFrame(procs)
    return df_procs

In [12]:
df_groups = build_group_df(data_dedup)

In [14]:
def build_weight_oh(df, key):
    weighted = []

    numv = df[key].max() + 1
    
    for g in df.groupby('group_1'):
        oh = np.zeros(numv, dtype=np.float64)

        vc = g[1][key].value_counts()

        wtot = 0
        for z in zip(vc.index.values, vc.values):
            oh[z[0]] += z[1]

        weighted.append(np.hstack([[g[0]], oh / len(g[1])]))

    df_wo = pd.DataFrame(weighted)

    wo_cols = ['group_1']
    for i in range(numv):
        wo_cols.append('{0}_group_onehotavg_{1}'.format(key, i))
        #print(wo_cols[-1])

    df_wo.columns = wo_cols

    #data = pd.merge(data, df_w2o, on='group_1', how='left')

    droplist = []
    for c in df_wo.keys():
        #print(c, df_w2o[c].mean())
        if df_wo[c].mean() < .001:
            droplist.append(c)

    for d in droplist:
        df_wo.drop(d, axis=1, inplace=True)
            
    return df_wo

In [22]:
oh_keys = []
alsouse = ['activity_category']
for k in data.keys():
    if 'achar_10' in k:
        continue
        
    if 'pchar_38' in k:
        continue
        
    if k in alsouse or 'char' in k:
        oh_keys.append(k)

In [24]:
for k in oh_keys:
    print(k)
    df_oh = build_weight_oh(data_dedup, k)
    df_groups = pd.merge(df_groups, df_oh, on='group_1', how='left')

activity_category
achar_1
achar_2
achar_3
achar_4
achar_5
achar_6
achar_7
achar_8
achar_9
pchar_10
pchar_11
pchar_12
pchar_13
pchar_14
pchar_15
pchar_16
pchar_17
pchar_18
pchar_19
pchar_20
pchar_21
pchar_22
pchar_23
pchar_24
pchar_25
pchar_26
pchar_27
pchar_28
pchar_29
pchar_30
pchar_31
pchar_32
pchar_33
pchar_34
pchar_35
pchar_36
pchar_37
pchar_1
pchar_2
pchar_3
pchar_4
pchar_5
pchar_6
pchar_7
pchar_8
pchar_9


In [26]:
df_groups.to_pickle('group2-dfprep1.pkl')

In [11]:
# pick up here
df_groups = pickle.load(open('group2-dfprep1.pkl', 'rb'))

In [12]:
# add pchar_38 mean and median

procs = {'group_1': [], 'p38_median': [], 'p38_mean': []}

for g in data_dedup.groupby('group_1'):
    procs['group_1'].append(g[0])

    procs['p38_median'].append(g[1].pchar_38.median())
    procs['p38_mean'].append(g[1].pchar_38.mean())


In [13]:
df_procs = pd.DataFrame(procs)

df_groups = pd.merge(df_groups, df_procs, on='group_1', how='left')

In [14]:
#train/test split

In [15]:
train_groups = train.group_1.unique()
test_groups = test.group_1.unique()

In [16]:
test_only_groups = []

for t in test_groups:
    if t not in train_groups:
        test_only_groups.append(t)

In [17]:
# Not pythonic, and slow, but eh
mask = np.full(len(df_groups), False, dtype=np.bool)
for i in range(len(df_groups)):
    if df_groups.iloc[i].group_1 in test_only_groups:
        mask[i] = True
        
np.sum(mask)

4325

In [18]:
df_groups.drop('otype', axis=1, inplace=True)

In [19]:
df_train = df_groups[~mask].copy()
df_test = df_groups[mask].copy()

In [20]:
procs = {'group_1': [],
         'otype': [],
         }

group_class = []

for g in data_dedup.groupby('group_1'):
    procs['group_1'].append(g[0])

    m = g[1].outcome.mean()
    if m == 0:
        procs['otype'].append(0)
    elif m == 1:
        procs['otype'].append(1)
    else:
        procs['otype'].append(2) # mixed


In [21]:
df_procs = pd.DataFrame(procs)

df_train = pd.merge(df_train, df_procs, on='group_1', how='left')

In [22]:
df_train.keys()

Index(['adate_first', 'adate_latest', 'group_1', 'num_adates', 'num_events',
       'num_people', 'pdate_first', 'pdate_latest', 'pdate_mean',
       'activity_category_group_onehotavg_1',
       ...
       'pchar_9_group_onehotavg_2', 'pchar_9_group_onehotavg_3',
       'pchar_9_group_onehotavg_4', 'pchar_9_group_onehotavg_5',
       'pchar_9_group_onehotavg_6', 'pchar_9_group_onehotavg_7',
       'pchar_9_group_onehotavg_8', 'p38_mean', 'p38_median', 'otype'],
      dtype='object', length=291)

In [23]:
for k in df_train.keys():
    print(k)

adate_first
adate_latest
group_1
num_adates
num_events
num_people
pdate_first
pdate_latest
pdate_mean
activity_category_group_onehotavg_1
activity_category_group_onehotavg_2
activity_category_group_onehotavg_3
activity_category_group_onehotavg_4
activity_category_group_onehotavg_5
activity_category_group_onehotavg_6
activity_category_group_onehotavg_7
achar_1_group_onehotavg_0
achar_1_group_onehotavg_1
achar_1_group_onehotavg_2
achar_1_group_onehotavg_3
achar_1_group_onehotavg_4
achar_1_group_onehotavg_5
achar_1_group_onehotavg_6
achar_1_group_onehotavg_7
achar_1_group_onehotavg_9
achar_1_group_onehotavg_10
achar_1_group_onehotavg_11
achar_1_group_onehotavg_12
achar_1_group_onehotavg_14
achar_1_group_onehotavg_16
achar_1_group_onehotavg_22
achar_1_group_onehotavg_24
achar_1_group_onehotavg_25
achar_1_group_onehotavg_28
achar_1_group_onehotavg_51
achar_2_group_onehotavg_0
achar_2_group_onehotavg_1
achar_2_group_onehotavg_2
achar_2_group_onehotavg_4
achar_2_group_onehotavg_5
achar_2_grou

In [23]:
# CV prep

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import KFold

gtrain = df_train.copy()

keys = list(gtrain.keys())
keys.remove('otype')
#keys.remove('p38_median')
#keys.remove('p38_mean')
X = gtrain[keys]
y = gtrain.otype.values.copy()

X_train, X_test = {}, {}
y_train, y_test = {}, {}

test_indexes = []

kf = KFold(len(gtrain), 5, shuffle=True, random_state=0)
i = 0
for train_index, test_index in kf:
    print("TRAIN:", train_index, "TEST:", test_index)
    np.random.seed(0)
    train_i = train_index
    test_i = test_index
    
    test_indexes.append(test_index)
    
    X_train[i], X_test[i] = X.iloc[train_i], X.iloc[test_i]
    y_train[i], y_test[i] = y[train_i], y[test_i]
    i += 1

TRAIN: [    0     1     2 ..., 29892 29894 29895] TEST: [    4     6     7 ..., 29893 29896 29897]
TRAIN: [    0     1     2 ..., 29893 29896 29897] TEST: [    8     9    12 ..., 29892 29894 29895]
TRAIN: [    4     6     7 ..., 29895 29896 29897] TEST: [    0     1     2 ..., 29880 29884 29886]
TRAIN: [    0     1     2 ..., 29895 29896 29897] TEST: [   11    21    23 ..., 29865 29870 29872]
TRAIN: [    0     1     2 ..., 29895 29896 29897] TEST: [   10    13    19 ..., 29889 29890 29891]


In [24]:
from sklearn.metrics import log_loss
from sklearn.ensemble import ExtraTreesClassifier

In [30]:
et = {}
for f in range(5):
    et[f] = ExtraTreesClassifier()
    et[f].fit(X_train[f], y_train[f])

    for i in range(20, 300, 10):
        et[f].set_params(warm_start = True, n_estimators = i)
        et[f].fit(X_train[f], y_train[f])
        p0 = et[f].predict_proba(X_test[f])
        print(f, i, log_loss(y_test[f], p0))

df_preds = []
for f in range(5):
    preds = et[f].predict_proba(X_test[f])
    df_preds.append(pd.DataFrame(preds))
    df_preds[-1].columns = ['gp_all0', 'gp_all1', 'gp_mixed']
    df_preds[-1]['group_1'] = gtrain.iloc[test_indexes[f]].group_1.values

df_preds_all = pd.concat(df_preds)

train_preds = np.array(df_preds_all.sort_values('group_1').drop('group_1', axis=1))

log_loss(gtrain.otype, train_preds)

0 20 1.02009535567
0 30 0.893147499619
0 40 0.865105387291
0 50 0.857230701889
0 60 0.851067679948
0 70 0.843657530544
0 80 0.841317778139
0 90 0.840018177112
0 100 0.840300685146
0 110 0.839538550077
0 120 0.838530078907
0 130 0.837719799988
0 140 0.832398852174
0 150 0.832716607947
0 160 0.822760199362
0 170 0.823025436424
0 180 0.822853190798
0 190 0.823031123443
0 200 0.822855743471
0 210 0.82215488242
0 220 0.822117304576
0 230 0.821616216866
0 240 0.821224979642
0 250 0.821107909003
0 260 0.821304140235
0 270 0.821403873332
0 280 0.821567923352
0 290 0.821425667321
1 20 1.02608482993
1 30 0.915669784322
1 40 0.866075417748
1 50 0.861710090329
1 60 0.847826657402
1 70 0.841788820994
1 80 0.834999083375
1 90 0.828881276187
1 100 0.827415126791
1 110 0.827087256368
1 120 0.826102111356
1 130 0.825524279755
1 140 0.825148429664
1 150 0.82482520955
1 160 0.819929730884
1 170 0.819514245168
1 180 0.814894234615
1 190 0.814226254969
1 200 0.81424253252
1 210 0.813879135564
1 220 0.81354

In [43]:
et = {}
for f in range(5):
    et[f] = ExtraTreesClassifier(min_samples_split=8,min_samples_leaf=2,n_jobs=8)
    et[f].fit(X_train[f], y_train[f])

    for i in range(20, 300, 10):
        et[f].set_params(warm_start = True, n_estimators = i)
        et[f].fit(X_train[f], y_train[f])
        p0 = et[f].predict_proba(X_test[f])
        print(f, i, log_loss(y_test[f], p0))

df_preds = []
for f in range(5):
    preds = et[f].predict_proba(X_test[f])
    df_preds.append(pd.DataFrame(preds))
    df_preds[-1].columns = ['gp_all0', 'gp_all1', 'gp_mixed']
    df_preds[-1]['group_1'] = gtrain.iloc[test_indexes[f]].group_1.values

df_preds_all = pd.concat(df_preds)

train_preds = np.array(df_preds_all.sort_values('group_1').drop('group_1', axis=1))

log_loss(gtrain.otype, train_preds)

0 20 0.847376410362
0 30 0.819595314966
0 40 0.816012243128
0 50 0.814010303687
0 60 0.812475050201
0 70 0.810474674305
0 80 0.808859015436
0 90 0.807923758624
0 100 0.807406899485
0 110 0.807247172995
0 120 0.807087980957
0 130 0.806910211261
0 140 0.806463733116
0 150 0.8063124586
0 160 0.806214808484
0 170 0.806180594055
0 180 0.8061106499
0 190 0.806025653759
0 200 0.805617413311
0 210 0.805675454695
0 220 0.805765676756
0 230 0.80592751344
0 240 0.80565474264
0 250 0.805559791837
0 260 0.805750638486
0 270 0.8056779792
0 280 0.805510981426
0 290 0.805495972054
1 20 0.837397460556
1 30 0.824801076006
1 40 0.820356492568
1 50 0.817919956256
1 60 0.815952676013
1 70 0.815898028802
1 80 0.815113600025
1 90 0.815217959331
1 100 0.814277242859
1 110 0.80901299416
1 120 0.808791649101
1 130 0.808900795123
1 140 0.809031627533
1 150 0.808991530429
1 160 0.808605375518
1 170 0.808234392885
1 180 0.808461897322
1 190 0.808142424181
1 200 0.808240326067
1 210 0.807892331445
1 220 0.808078045

0.80476117974112715

In [44]:
gtesta = df_test.copy()

test_preds = []
for f in range(5):
    test_preds.append(et[f].predict_proba(gtesta))
    
preds_comb = np.mean([test_preds[j] for j in range(1,5)], axis = 0)

df_test_preds = pd.DataFrame(preds_comb)
df_test_preds.columns = ['gp_all0', 'gp_all1', 'gp_mixed']
df_test_preds['group_1'] = gtesta.group_1.values

preds_out = pd.concat([df_test_preds, df_preds_all])

preds_out.to_pickle('group3a-preds.pkl')

In [77]:
et_df_test_preds = df_test_preds.copy()

In [39]:
len(preds_out), len(preds_out.group_1.unique()), len(df_preds_all.group_1.unique())

(34223, 34223, 29898)

In [40]:
preds_comb

array([[ 0.23275862,  0.70258621,  0.06465517],
       [ 0.17155172,  0.51982759,  0.30862069],
       [ 0.61034483,  0.33017241,  0.05948276],
       ..., 
       [ 0.58706897,  0.4112069 ,  0.00172414],
       [ 0.35      ,  0.62155172,  0.02844828],
       [ 0.12068966,  0.74137931,  0.13793103]])

In [90]:
preds_comb

array([[ 0.21710526,  0.72894737,  0.05394737],
       [ 0.18421053,  0.49736842,  0.31842105],
       [ 0.64078947,  0.31315789,  0.04605263],
       ..., 
       [ 0.52894737,  0.47105263,  0.        ],
       [ 0.35921053,  0.61447368,  0.02631579],
       [ 0.12631579,  0.74605263,  0.12763158]])

In [77]:
test_preds[3]

array([[ 0.23684211,  0.71578947,  0.04736842],
       [ 0.17894737,  0.51578947,  0.30526316],
       [ 0.55263158,  0.38947368,  0.05789474],
       ..., 
       [ 0.63684211,  0.35789474,  0.00526316],
       [ 0.44210526,  0.53157895,  0.02631579],
       [ 0.12105263,  0.74736842,  0.13157895]])

In [41]:
et_orig = et.copy()

In [42]:
ExtraTreesClassifier()

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [50]:
et = {}
for f in range(5):
    et[f] = ExtraTreesClassifier(min_samples_split=12,min_samples_leaf=2,n_jobs=8, random_state=12345)
    et[f].fit(X_train[f], y_train[f])

    for i in range(20, 300, 10):
        et[f].set_params(warm_start = True, n_estimators = i)
        et[f].fit(X_train[f], y_train[f])
        p0 = et[f].predict_proba(X_test[f])
        print(f, i, log_loss(y_test[f], p0))

df_preds = []
for f in range(5):
    preds = et[f].predict_proba(X_test[f])
    df_preds.append(pd.DataFrame(preds))
    df_preds[-1].columns = ['gp_all0', 'gp_all1', 'gp_mixed']
    df_preds[-1]['group_1'] = gtrain.iloc[test_indexes[f]].group_1.values

df_preds_all = pd.concat(df_preds)

train_preds = np.array(df_preds_all.sort_values('group_1').drop('group_1', axis=1))

log_loss(gtrain.otype, train_preds)

0 20 0.817974151135
0 30 0.813412032996
0 40 0.812392322132
0 50 0.811886842968
0 60 0.809457785794
0 70 0.809058028615
0 80 0.808636937309
0 90 0.808376464235
0 100 0.80831385745
0 110 0.807934592503
0 120 0.807713919093
0 130 0.807456409231
0 140 0.807130790297
0 150 0.806710266905
0 160 0.806860461825
0 170 0.806525323377
0 180 0.806298659648
0 190 0.806008761758
0 200 0.806089340607
0 210 0.806138551074
0 220 0.806123383588
0 230 0.806300985585
0 240 0.80640922257
0 250 0.806314019841
0 260 0.806537071777
0 270 0.806379445295
0 280 0.806248697607
0 290 0.806325880333
1 20 0.832092037406
1 30 0.819917183891
1 40 0.815927651964
1 50 0.815495186819
1 60 0.813603799737
1 70 0.812762207605
1 80 0.812290602924
1 90 0.81202425171
1 100 0.811563554483
1 110 0.81081866783
1 120 0.81038991869
1 130 0.810326852703
1 140 0.810297011447
1 150 0.809359824745
1 160 0.809217759314
1 170 0.808869542607
1 180 0.808689412785
1 190 0.808794857292
1 200 0.808674698395
1 210 0.808600899474
1 220 0.80860

0.8050373772743904

In [25]:
import xgboost as xgb

In [26]:
xg_train = {}
xg_test = {}
for f in range(5):
    xg_train[f] = xgb.DMatrix( X_train[f], label=y_train[f])
    xg_test[f] = xgb.DMatrix(X_test[f], label=y_test[f])

In [27]:
y_train[f]

array([0, 1, 1, ..., 1, 0, 1])

In [49]:
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softprob'
# scale weight of positive examples
param['eta'] = 0.01
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 3
param['subsample'] = 0.7
param['colsample_bytree'] = 0.7
param['eval_metric'] = 'mlogloss'

bst = {}
pred = {}
for f in range(5):
    
    watchlist = [ (xg_train[f],'train'), (xg_test[f], 'test') ]
    num_round = 2500
    bst[f] = xgb.train(param, xg_train[f], num_round, watchlist , verbose_eval = 10, early_stopping_rounds=30)
    # get prediction
#    pred[f] = bst.predict( xg_test[f] )
    


[0]	train-mlogloss:1.09498	test-mlogloss:1.09512
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 30 rounds.
[10]	train-mlogloss:1.06051	test-mlogloss:1.06254
[20]	train-mlogloss:1.03033	test-mlogloss:1.03423
[30]	train-mlogloss:1.00362	test-mlogloss:1.00929
[40]	train-mlogloss:0.979763	test-mlogloss:0.98712
[50]	train-mlogloss:0.958614	test-mlogloss:0.967649
[60]	train-mlogloss:0.939315	test-mlogloss:0.949892
[70]	train-mlogloss:0.922003	test-mlogloss:0.934218
[80]	train-mlogloss:0.906313	test-mlogloss:0.920217
[90]	train-mlogloss:0.892022	test-mlogloss:0.907591
[100]	train-mlogloss:0.879311	test-mlogloss:0.896292
[110]	train-mlogloss:0.866947	test-mlogloss:0.885586
[120]	train-mlogloss:0.855923	test-mlogloss:0.87614
[130]	train-mlogloss:0.845627	test-mlogloss:0.867311
[140]	train-mlogloss:0.836272	test-mlogloss:0.859323
[150]	train-mlogloss:0.827842	test-mlogloss:0.8523
[160]	train-mlogloss:0.8

In [50]:
xgb_cv_pred = {}
for f in range(5):
    xgb_cv_pred[f] = bst[f].predict( xg_test[f], ntree_limit=bst[f].best_ntree_limit)


from sklearn.metrics import log_loss

xgb_cv_preds = []
df_preds = []
for f in range(5):
    #preds = et[f].predict_proba(X_test[f])
    xgb_cv_pred[f] = bst[f].predict( xg_test[f], ntree_limit=bst[f].best_ntree_limit)
    df_preds.append(pd.DataFrame(xgb_cv_pred[f]))
    df_preds[-1].columns = ['gp_all0', 'gp_all1', 'gp_mixed']
    df_preds[-1]['group_1'] = gtrain.iloc[test_indexes[f]].group_1.values

df_preds_all = pd.concat(df_preds)

train_preds = np.array(df_preds_all.sort_values('group_1').drop('group_1', axis=1))

log_loss(gtrain.otype, train_preds)

0.73343842413354743

In [51]:
gtesta = df_test.copy()
xgb_output = xgb.DMatrix(gtesta)

In [52]:
test_preds = []
for f in range(5):
    test_preds.append(bst[f].predict(xgb_output, ntree_limit = bst[f].best_ntree_limit))
    
preds_comb = np.mean([test_preds[j] for j in range(1,5)], axis = 0)

df_test_preds = pd.DataFrame(preds_comb)
df_test_preds.columns = ['gp_all0', 'gp_all1', 'gp_mixed']
df_test_preds['group_1'] = gtesta.group_1.values

In [55]:
preds_out = pd.concat([df_test_preds, df_preds_all])

preds_out.to_pickle('group3d-xgb-preds.pkl')