In [1]:
import gc
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
from logzero import logger

In [None]:
%%time
click_data = pd.concat([
    pd.read_feather("../data/basic_table"),
    pd.read_feather("../data/click_count_table"),
    pd.read_feather("../data/click_time_interval_ip_device_os_app_hash"),
    pd.read_feather("../data/intervals_table"),
    pd.read_feather("../data/misc_counts"),
    pd.read_feather("../data/multi_basic"),
    pd.read_feather("../data/next_prev_channels"),
    pd.read_feather("../data/ranks_table"),
    pd.read_feather("../data/time_of_day"),
    pd.read_feather("../data/time_table"),
], axis=1)

In [None]:
# Show the dtype of every column of the combined feature frame.
click_data.dtypes

In [None]:
# Load the table that supplies the `error` column; later cells attach it to
# click_data and use its magnitude to select training rows.
# NOTE(review): how `error` was computed is not visible in this notebook.
error = pd.read_feather("../data/down_sampling_train")

In [None]:
# Preview only the first rows instead of emitting the whole (very large)
# frame as cell output.
error.head()

In [None]:
# Keep only the leading rows of the combined frame (positional slice).
# NOTE(review): presumably 184,903,890 is the number of labelled training
# rows, with the remaining rows being the test set - confirm.
n_rows_kept = 184903890
click_data = click_data.iloc[:n_rows_kept]

In [None]:
# Attach the error values as a new column. pandas aligns the assigned Series
# on index, so this relies on `error` sharing click_data's row index -
# TODO confirm; rows without a matching index would silently become NaN.
click_data["error"] = error["error"]

In [None]:
# Order the error table by the signed error value (ascending). The original
# index labels travel with the rows.
error = error.sort_values(by=["error"])

In [14]:
# Magnitude of the error, stored next to the signed value.
error["abs"] = error["error"].abs()

In [16]:
# Order the error table by error magnitude. The original cell sorted by a
# column named "sq" that no visible cell creates (KeyError on a fresh
# kernel); "abs" is the magnitude column this notebook actually defines.
# NOTE(review): assumes "sq" was an earlier name for the same quantity - confirm.
error.sort_values(by=["abs"], inplace=True)

In [25]:
# Count the rows whose error magnitude exceeds 0.007287. The original cell
# filtered on a column named "sq" that no visible cell creates (KeyError on a
# fresh kernel); "abs" is the magnitude column this notebook actually defines.
# NOTE(review): assumes "sq" was an earlier name for the same quantity - confirm.
error[error["abs"] > 0.007287].shape

(18491169, 2)

In [27]:
%%time
logger.info("Splitting data...")
click_data.drop(columns=["click_id"], inplace=True)
train = click_data[click_data["dow"] < 3]
valid = click_data[click_data["dow"] == 3]

del click_data
gc.collect()

[I 180427 08:27:11 <timed exec>:1] Splitting data...


CPU times: user 1min 35s, sys: 3min 19s, total: 4min 55s
Wall time: 4min 55s


In [28]:
# Keep only the training rows whose error magnitude is above the threshold;
# the validation partition is left unfiltered.
threshold = 0.008
train = train.loc[train["error"].abs() > threshold]
train.shape

(11077958, 104)

In [29]:
%%time
y_train = train["is_attributed"].astype("int")
train.drop(columns=["is_attributed"], inplace=True)
y_valid = valid["is_attributed"].astype("int")
valid.drop(columns=["is_attributed"], inplace=True)
gc.collect()

CPU times: user 11.3 s, sys: 8.38 s, total: 19.7 s
Wall time: 19.6 s


In [None]:
%%time
logger.info("Generating matrix...")
dtrain = xgb.DMatrix(train, y_train)
del train, y_train
gc.collect()

dvalid = xgb.DMatrix(valid, y_valid)
del valid, y_valid
gc.collect()

[I 180427 08:34:48 <timed exec>:1] Generating matrix...


In [None]:
%%time
n_thread = 32
logger.info("Validating...")
params = {'eta': 0.6,
          'tree_method': "hist",
          'grow_policy': "lossguide",
          'max_leaves': 1400,
          'max_depth': 0,
          'subsample': 0.9,
          'colsample_bytree': 0.7,
          'colsample_bylevel': 0.7,
          'min_child_weight': 0,
          'alpha': 4,
          'objective': 'binary:logistic',
          'scale_pos_weight': 9,
          'eval_metric': 'auc',
          'nthread': n_thread,
          'random_state': 99,
          'silent': True}
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(params,
                  dtrain,
                  200,
                  watchlist,
                  maximize=True,
                  early_stopping_rounds=25,
                  verbose_eval=5)
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()