In [1]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt

def read_hdf(name: str, data_dir: str = "../data"):
    """Load one object from an HDF5 file whose file stem and key are both ``name``.

    The file is looked up at ``<data_dir>/<name>.hdf`` and the object stored
    under the key ``name`` inside that file is returned.

    Parameters
    ----------
    name : str
        Used twice: as the file stem and as the key inside the HDF5 store.
    data_dir : str, optional
        Directory that contains the ``.hdf`` files. Defaults to ``"../data"``,
        which reproduces the previously hard-coded location.

    Returns
    -------
    Whatever ``pandas.read_hdf`` returns for that file and key.
    """
    path = "{}/{}.hdf".format(data_dir, name)
    return pd.read_hdf(path, key=name)



In [2]:
%%time
# Load the full click dataset through the read_hdf helper defined in the first cell;
# the same string is used as both the file stem and the HDF key.
merged_click_data = read_hdf("merged_click_data")

CPU times: user 72 ms, sys: 6.96 s, total: 7.04 s
Wall time: 7.05 s


In [3]:
%%time
# Remove click_id from the full frame first, so neither split carries it.
merged_click_data.drop("click_id", axis=1, inplace=True)

# Split on the "dow" column: values below 3 form the training frame and
# value 3 is held out. Rows with any other "dow" value land in neither frame.
train = merged_click_data.loc[merged_click_data["dow"].lt(3)]
test = merged_click_data.loc[merged_click_data["dow"].eq(3)]

CPU times: user 20.3 s, sys: 9.49 s, total: 29.8 s
Wall time: 29.7 s


In [4]:
# train/test were selected from this frame in the previous cell, so the name is
# no longer needed; unbind it and ask the garbage collector to run right away.
del merged_click_data
# Last expression of the cell: its return value is what the notebook displays.
gc.collect()

14

In [5]:
%%time
def _pop_label(frame, label="is_attributed"):
    """Return ``frame[label]`` cast to int and drop that column from ``frame`` in place."""
    target = frame[label].astype("int")
    frame.drop(columns=[label], inplace=True)
    return target


# Separate the label from the features for both splits; after this, train/test
# hold feature columns only and y_train/y_valid hold the integer labels.
y_train = _pop_label(train)
y_valid = _pop_label(test)
gc.collect()

CPU times: user 9.82 s, sys: 4.26 s, total: 14.1 s
Wall time: 14.1 s


In [6]:
# Wrap features and labels into xgboost's DMatrix container, one per split.
dtrain = xgb.DMatrix(data=train, label=y_train)
dvalid = xgb.DMatrix(data=test, label=y_valid)

In [7]:
# Only dtrain/dvalid are used from here on, so unbind the pandas objects they were
# built from. NOTE(review): presumably the DMatrix objects hold their own copy of
# the data, which is what makes this safe — confirm against the xgboost docs.
del train, y_train, test, y_valid
# Last expression of the cell: its return value is what the notebook displays.
gc.collect()

0

In [8]:
# Booster settings handed to xgb.train unchanged.
# NOTE(review): in the recorded run validation AUC peaked at round 11 and then
# declined while training continued — the learning rate / tree size may be worth
# revisiting.
params = dict(
    eta=0.6,
    tree_method="hist",
    grow_policy="lossguide",
    max_leaves=1400,
    max_depth=0,
    subsample=0.9,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    min_child_weight=0,
    alpha=4,
    objective='binary:logistic',
    scale_pos_weight=9,
    eval_metric='auc',
    nthread=40,
    random_state=99,
    silent=True,
)

# Evaluation sets reported during training. The order matters: the recorded log
# shows the 'valid' entry (the last one) being the one used for early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Train for at most 200 rounds, logging every 5th round and stopping once the
# monitored AUC has not improved for 25 rounds (AUC is maximised, not minimised).
model = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    evals=watchlist,
    maximize=True,
    early_stopping_rounds=25,
    verbose_eval=5,
)

[0]	train-auc:0.9628	valid-auc:0.957822
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 25 rounds.
[5]	train-auc:0.973968	valid-auc:0.966239
[10]	train-auc:0.979503	valid-auc:0.969907
[15]	train-auc:0.980409	valid-auc:0.965792
[20]	train-auc:0.979966	valid-auc:0.961555
[25]	train-auc:0.978609	valid-auc:0.959352
[30]	train-auc:0.976225	valid-auc:0.956477
[35]	train-auc:0.97595	valid-auc:0.957869
Stopping. Best iteration:
[11]	train-auc:0.980037	valid-auc:0.970026

