In [1]:
import gc
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from logzero import logger
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [2]:
%%time
# Pre-computed feature tables, each stored as its own feather file. They are
# read one by one and joined column-wise into a single wide frame.
# NOTE(review): axis=1 concatenation aligns rows on the index; presumably every
# table carries the same default row index -- confirm.
feature_table_paths = [
    "../data/basic_table",
    "../data/click_count_table",
    "../data/click_time_interval_ip_device_os_app_hash",
    "../data/intervals_table",
    "../data/misc_counts",
    "../data/multi_basic",
    "../data/next_prev_channels",
    "../data/ranks_table",
    "../data/time_table",
]
click_data = pd.concat(
    [pd.read_feather(table_path) for table_path in feature_table_paths],
    axis=1,
)

CPU times: user 39.7 s, sys: 51.8 s, total: 1min 31s
Wall time: 3min 4s


In [3]:
# Show the (rows, columns) dimensions of the concatenated feature frame.
click_data.shape

(203694359, 100)

In [4]:
%%time
logger.info("Splitting data...")
# click_id is removed before the split, so neither subset carries it.
click_data.drop(columns="click_id", inplace=True)

# Partition rows on the `dow` column: values below 3 form the training set and
# the value 3 is held out for validation. Rows with any other `dow` value end
# up in neither subset.
train = click_data.loc[click_data["dow"].lt(3)]
valid = click_data.loc[click_data["dow"].eq(3)]

# The combined frame is no longer needed once both subsets exist; drop the
# name and force a collection pass to release memory.
del click_data
gc.collect()

[I 180425 23:09:07 <timed exec>:1] Splitting data...


CPU times: user 1min 53s, sys: 2min 48s, total: 4min 41s
Wall time: 4min 42s


In [5]:
# List the column labels of the training subset.
train.columns

Index(['app', 'channel', 'device', 'ip', 'is_attributed', 'os',
       'click_count_by_ip', 'click_count_by_ip_os_device_dow',
       'click_count_by_ip_os_device_dow_hour',
       'click_time_interval_ip_device_os_app_hash',
       'first_click_interval_ip_device_os_app_hash',
       'click_time_interval_by_ip', 'click_time_interval_by_ip_os_device',
       'click_time_interval_by_ip_os_device_dow',
       'click_time_interval_by_ip_os_device_dow_hour', 'first_click_by_ip',
       'first_click_by_ip_os_device', 'first_click_by_ip_os_device_dow',
       'first_click_by_ip_os_device_dow_hour', 'ip_device_os_cumcount',
       'ip_cumcount', 'ip_channel_countuniq', 'ip_dow_hour_countuniq',
       'ip_app_countuniq', 'ip_app_os_countuniq', 'ip_device_countuniq',
       'app_channel_countuniq', 'ip_device_os_app_countuniq',
       'ip_dow_hour_count', 'ip_app_count', 'ip_app_os_count',
       'ip_dow_channel_var', 'ip_app_os_var', 'ip_app_channel_var',
       'ip_app_channel_mean', 'ip_os_d

In [12]:
# Columns that are handled as numeric features. The order is kept as
# originally written; the list is only used for membership checks below.
numeric_features = [
    "ip_device_os_cumcount",
    "ip_cumcount",
    "ip_channel_countuniq",
    "ip_dow_hour_countuniq",
    "ip_app_countuniq",
    "ip_app_os_countuniq",
    "ip_device_countuniq",
    "app_channel_countuniq",
    "ip_device_os_app_countuniq",
    "ip_dow_hour_count",
    "ip_app_count",
    "ip_app_os_count",
    "ip_dow_channel_var",
    "ip_app_os_var",
    "ip_app_channel_var",
    "ip_app_channel_mean",
    "dow",
    "hour",
    "minute",
    "second",
    "click_count_by_ip",
    "click_count_by_ip_os_device_dow",
    "click_count_by_ip_os_device_dow_hour",
    "click_time_interval_by_ip",
    "click_time_interval_by_ip_os_device",
    "click_time_interval_by_ip_os_device_dow",
    "click_time_interval_by_ip_os_device_dow_hour",
    "click_time_interval_ip_device_os_app_hash",
    "rank_by_ip",
    "rank_by_ip_os_device",
    "rank_by_ip_os_device_dow",
]

# Categorical features are derived by exclusion: every column of `train` that
# is neither listed above nor the target column.
# NOTE(review): because of the exclusion rule, any column missing from
# `numeric_features` (e.g. the first_click_* columns) is treated as
# categorical -- confirm that this is intended.
_non_categorical = set(numeric_features) | {"is_attributed"}
categorical_features = [col for col in train.columns if col not in _non_categorical]

In [13]:
# Display the derived categorical feature names in sorted order for review.
sorted(categorical_features)

['app',
 'app_channel',
 'channel',
 'device',
 'device_app',
 'device_app_channel',
 'device_channel',
 'first_click_by_ip',
 'first_click_by_ip_os_device',
 'first_click_by_ip_os_device_dow',
 'first_click_by_ip_os_device_dow_hour',
 'first_click_interval_ip_device_os_app_hash',
 'ip',
 'ip_app',
 'ip_app_channel',
 'ip_channel',
 'ip_device',
 'ip_device_app',
 'ip_device_app_channel',
 'ip_device_channel',
 'ip_os',
 'ip_os_app',
 'ip_os_app_channel',
 'ip_os_channel',
 'ip_os_device',
 'ip_os_device_app',
 'ip_os_device_app_channel',
 'ip_os_device_app_hash',
 'ip_os_device_channel',
 'next_channel_by_app',
 'next_channel_by_device',
 'next_channel_by_device_app',
 'next_channel_by_ip',
 'next_channel_by_ip_app',
 'next_channel_by_ip_device',
 'next_channel_by_ip_device_app',
 'next_channel_by_ip_os',
 'next_channel_by_ip_os_app',
 'next_channel_by_ip_os_device',
 'next_channel_by_ip_os_device_app',
 'next_channel_by_os',
 'next_channel_by_os_app',
 'next_channel_by_os_device',
 '

In [14]:
# --- Training-run settings ---------------------------------------------------
objective = "binary"
metrics = "auc"
early_stopping_rounds = 30
verbose_eval = True
num_boost_round = 1000
nthread = 32

# --- LightGBM hyper-parameters -----------------------------------------------
# `is_unbalance` is deliberately not set: class imbalance is handled through
# `scale_pos_weight` instead.
lgb_params = {
    "boosting_type": "gbdt",
    "objective": objective,
    "learning_rate": 0.1,
    # Tree shape: num_leaves follows 2^max_depth - 1; a max_depth of -1 would
    # mean "no limit".
    "num_leaves": 7,
    "max_depth": 3,
    # Minimum number of samples required in a child (min_data_in_leaf).
    "min_child_samples": 100,
    # Number of bins used to bucket feature values.
    "max_bin": 100,
    # Row subsampling: fraction of training instances used, and how often the
    # subsample is redrawn (a frequency <= 0 disables it).
    "subsample": 0.7,
    "subsample_freq": 1,
    # Fraction of columns sampled when constructing each tree.
    "colsample_bytree": 0.9,
    # Minimum sum of instance weight (hessian) required in a child (leaf).
    "min_child_weight": 0,
    # Up-weights the positive class because the training data is extremely
    # unbalanced.
    "scale_pos_weight": 200,
    # Number of samples used to construct the bins.
    "subsample_for_bin": 200000,
    # Regularization knobs: minimum split gain, L1 and L2 terms on weights.
    "min_split_gain": 0,
    "reg_alpha": 0,
    "reg_lambda": 0,
    "nthread": nthread,
    "verbose": 1,
    "metric": metrics,
}

In [15]:
# Model inputs: every column of the training frame except the target column.
predictors = [name for name in train.columns if name != "is_attributed"]

In [None]:
%%time
# Wrap the training rows in a LightGBM Dataset. Feature names and the
# categorical subset are passed explicitly alongside the raw array.
# NOTE(review): `.values` on a frame with mixed column dtypes yields one array
# with a single common dtype -- confirm the memory cost and integer precision
# are acceptable at this row count.
xgtrain = lgb.Dataset(
    train[predictors].values,
    label=train["is_attributed"].values,
    feature_name=predictors,
    categorical_feature=categorical_features,
)

In [None]:
%%time
# Wrap the validation rows in a LightGBM Dataset, using the same predictors,
# feature names and categorical subset as the training Dataset.
# `reference=xgtrain` ties this Dataset to the training one so both share the
# same binning, as the LightGBM API asks for validation data.
xgvalid = lgb.Dataset(
    valid[predictors].values,
    label=valid["is_attributed"].values,
    reference=xgtrain,
    feature_name=predictors,
    categorical_feature=categorical_features,
)