In [1]:
import numpy as np
import pandas as pd
import datetime
# Notebook display limits: up to 999 columns and up to 100 characters per cell.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 100)

import matplotlib.pyplot as plt
%matplotlib inline

import lightgbm as lgb

import sys, os, gc, types
import time
from subprocess import check_output

import sklearn
from sklearn.model_selection import train_test_split

In [3]:
# Candidate data directories, one per machine this notebook has been run on.
root_paths = [
    "/data/kaggle-wikipedia/data2/",
    "/Users/jiayou/Dropbox/JuanCode/Kaggle/Wikipedia/data2/",
    "/Users/jiayou/Dropbox/Documents/JuanCode/Kaggle/Wikipedia/data2/",
    '/Users/junxie/Dropbox/JuanCode/Kaggle/Wikipedia/data2/'
]
# First candidate that exists wins. `root` stays None when none of them
# exists, in which case the `root + ...` path expressions below will fail.
root = next(
    (candidate for candidate in root_paths if os.path.exists(candidate)),
    None,
)

In [5]:
# Raw traffic table. Missing values are left as NaN here (no fill is applied).
train_csv_path = root + 'train_2.csv'
train = pd.read_csv(train_csv_path)

In [7]:
# Lookup tables merged onto the training rows further down (keys: 'date' and 'Page').
# Pickles execute code on load: only read files you produced yourself.
date_df, page_df = (
    pd.read_pickle(root + pickle_name)
    for pickle_name in ('date_df.pkl', 'page_ohe.pkl')
)

In [None]:
# Remove the 49 columns at positions 1..49 (the first column is kept), in place.
# The slice is positional, so re-running this cell removes a further 49 columns.
# NOTE(review): presumably the earliest dates, which lack a full 49-day
# median window - confirm.
leading_cols = train.columns[1:50]
train.drop(leading_cols, axis=1, inplace=True)

In [None]:
# assign fold number before melt? 

In [None]:
# Wide -> long: one row per (Page, date); the cell values land in the default 'value' column.
train_df = pd.melt(train, id_vars=['Page'], var_name='date')

In [None]:
# Drop, in place, every row that has a NaN in any column
# (axis=0 / how='any' are the dropna defaults).
train_df.dropna(inplace=True)

# Construct the ABT (analytical base table)

In [None]:
# Optional thinning for quick experiments: with down_sample = k, only rows
# whose index label is a multiple of k are kept. None disables it.
down_sample = None
if down_sample is not None:
    train_df = train_df.loc[(train_df.index % down_sample) == 0]

In [None]:
# Attach the per-page columns, then the per-date columns. Left joins: rows of
# train_df with no match in the lookup table get NaN for that table's columns.
# Kept as two separate statements so the pre-merge frame can be released
# before the second merge runs.
train_df = train_df.merge(page_df, how='left', on='Page')
train_df = train_df.merge(date_df, how='left', on='date')

In [None]:
# Median families used as features. Each name X ends up as a `median_X`
# column and, for every X other than `base`, a `median_diff_X` column.
median_name = [
    '49', 'weekday_49', 'weekend_49', 
    'dow0', 'dow1', 'dow2', 'dow3', 'dow4', 'dow5', 'dow6',
    '28', 'weekday_28', 'weekend_28',
    '14', 'weekday_14', 'weekend_14',
    '21', 'weekday_21', 'weekend_21',
#     '35', 'weekday_35', 'weekend_35',
#     '42', 'weekday_42', 'weekend_42',
    '7',
]

# The pickle is selected by the bare names ('49', 'dow0', ...), but everything
# below addresses the columns as 'median_<name>'. Rename on load so those
# lookups resolve and no bare-named column is left behind as a stray feature.
melted_median = (
    pd.read_pickle(root + 'melted_median_val62_26med.pkl')[median_name + ['Page', 'date']]
    .rename(columns={mname: 'median_{}'.format(mname) for mname in median_name})
)

train_df = train_df.merge(
    melted_median, 
    how='left', 
    on=['Page','date']
)

# Reference median: the target and all median_diff_* features are expressed
# relative to it.
base = '49'
base_col = 'median_{}'.format(base)

# Move every median into log1p space (column by column to limit peak memory).
for mname in median_name:
    median_col = 'median_{}'.format(mname)
    train_df[median_col] = np.log1p(train_df[median_col])

# Differences to the base median, in log1p space.
for mname in median_name:
    if mname != base:
        train_df['median_diff_{}'.format(mname)] = train_df['median_{}'.format(mname)] - train_df[base_col]

# Target becomes the residual log1p(value) - log1p(base median). Add the base
# column back before comparing predictions with actual values.
train_df['value'] = np.log1p(train_df['value']) - train_df[base_col]

In [None]:
# val_days = 62

In [None]:
# Validation rows: year 2017 with month 7 or later. All other rows are training rows.
train_df['isval'] = train_df['year'].eq(2017) & train_df['month'].ge(7)

In [None]:
# Number of rows flagged as validation.
train_df['isval'].sum()

In [None]:
# Halve the numeric footprint: float64 -> float32 and int64 -> int32.
# Columns of any other dtype are left untouched.
for col_name, col_dtype in zip(train_df.columns, train_df.dtypes):
    if col_dtype == np.float64:
        narrow_dtype = np.float32
    elif col_dtype == np.int64:
        narrow_dtype = np.int32
    else:
        continue
    train_df[col_name] = train_df[col_name].astype(narrow_dtype)

In [None]:
# Optional cleanup, currently disabled: del median_df, page_df, date_df, train
# NOTE(review): `median_df` is not defined anywhere in the visible notebook, and
# `page_df` is still needed by the Prediction section, so do not enable as-is.
# Run a garbage-collection pass; as the cell's last expression, its return value is displayed.
gc.collect()

# Preparing data and hyperparams

In [None]:
# Quick look at the first five rows of the assembled feature table.
train_df.head(n=5)

In [None]:
# Sort the columns by name, in place, so the feature order is deterministic.
train_df.sort_index(axis=1, inplace=True)

# Split on the validation flag computed earlier.
train = train_df[~train_df['isval']]
val = train_df[train_df['isval']]

# The combined frame is no longer needed; release it.
del train_df
gc.collect()

In [None]:
def create_datasets(train, val):
    """Build the LightGBM training and validation datasets.

    Both frames must contain the columns 'value' (the target), 'isval',
    'Page' and 'date'; every remaining column is used as a feature.

    Side effect: the four non-feature columns are dropped from `train`
    IN PLACE, so the caller's frame holds only features afterwards.
    `val` is left unmodified.

    Returns:
        (lgb_train, lgb_val): two `lgb.Dataset` objects with float32 data
        and labels; `lgb_val` is created with `reference=lgb_train`.
    """
    non_feature_cols = ['value', 'isval', 'Page', 'date']

    # Pull the targets out before the 'value' column is dropped from `train`.
    y_train = train.value.values.astype(np.float32)
    y_val = val.value.values.astype(np.float32)

    train.drop(non_feature_cols, axis=1, inplace=True)
    feature_names = list(train.columns)

    lgb_train = lgb.Dataset(
        train.values.astype(np.float32),
        y_train,
        feature_name=feature_names,
    )

    x_val = val.drop(non_feature_cols, axis=1).values.astype(np.float32)
    lgb_val = lgb.Dataset(
        x_val,
        y_val,
        feature_name=feature_names,
        reference=lgb_train,
    )
    return lgb_train, lgb_val

# Build the LightGBM datasets. create_datasets() drops the non-feature columns
# from `train` in place, so `train` is only good for deletion afterwards.
lgb_train, lgb_val = create_datasets(train, val)

# `val` is kept because later cells still read its columns.
del train
gc.collect()

In [None]:
# Self-defined evaluation metrics.
# LightGBM feval signature:
#   f(preds: array, train_data: Dataset) -> (name: str, value: float, is_higher_better: bool)

def SMAPE_2(preds, true):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Both arguments are arrays of log1p(count) values. They are mapped back
    with expm1 and rounded to whole counts before scoring. Pairs where both
    counts are zero contribute 0 (the denominator is floored at 1e-6).

    In this notebook the model target is a residual relative to a base
    median, so callers must add that base back before calling this.
    """
    y_true = np.round(np.expm1(true))
    y_pred = np.round(np.expm1(preds))
    abs_error = np.abs(y_true - y_pred)
    scale = np.maximum(1e-6, (np.abs(y_true) + np.abs(y_pred)))
    return np.mean(abs_error / scale) * 200


def SMAPE(preds, train_data):
    """LightGBM `feval` wrapper around SMAPE_2.

    Reads the labels from `train_data.get_label()` and returns
    ('SMAPE', loss, False); False means lower is better. The same caveat as
    SMAPE_2 applies: labels and predictions must be log1p(count) values.
    """
    return 'SMAPE', SMAPE_2(preds, train_data.get_label()), False

In [None]:
# Early stopping patience, in boosting rounds.
stopping_rounds = 20
# Upper bound on boosting rounds per model.
boosting_rounds = 10000
# Number of models trained with different random seeds and then averaged.
num_searches = 5


In [None]:
# One summary dict per trained model (name, seeds, best iteration, val SMAPE).
results = []
val_abt = val.drop(['value', 'isval', 'Page', 'date'], axis=1)
val_pred_list = []

# The model predicts residuals relative to the base median, so the base is
# added back to both predictions and targets before scoring.
val_base = val['median_{}'.format(base)].values
val_true = val.value.values + val_base

for i in range(num_searches):
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression_l1',
        'metric': {'l1'},
        'num_leaves': 512,
    #     'min_sum_hessian_in_leaf': 20,
        'max_depth': 12,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.85,
        'bagging_freq': 3,
        'verbose': 1,
        # Fresh seeds per run so the averaged models differ. They are stored
        # in `results` below so that a run can be reproduced.
        'feature_fraction_seed':np.random.randint(0, 1000),
        'bagging_seed':np.random.randint(0, 1000),
        'data_random_seed':np.random.randint(0, 1000),
    #     'device' : 'gpu'
    }
    name = 'gb12-r{}'.format(i)
    print('Start LightGBM training...')
    # Early stopping goes through the callback API: the `early_stopping_rounds`
    # keyword of lgb.train() no longer exists in LightGBM >= 4.0, while the
    # callback is available in both old and new releases.
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=boosting_rounds,
#         feval=SMAPE,
        valid_sets=[lgb_train, lgb_val],
#         categorical_feature=[],
        callbacks=[lgb.early_stopping(stopping_rounds)],
    )

    print('Save model...')
    # Save only the trees up to the best iteration, so a Booster reloaded from
    # this file predicts with the same trees as the validation prediction below.
    gbm.save_model('model.{}.txt'.format(name), num_iteration=gbm.best_iteration)

    print('Plot feature importances...') 
    ax = lgb.plot_importance(gbm, max_num_features=100, importance_type='gain', title = 'gain', figsize=(20, 20))
    plt.show()
    ax = lgb.plot_importance(gbm, max_num_features=100, importance_type='split', title = 'split', figsize=(20, 20))
    plt.show()
    
    val_pred = gbm.predict(val_abt, num_iteration=gbm.best_iteration)
    val_pred_list.append(val_pred)
    val_smape = SMAPE_2(val_pred + val_base, val_true)
    print('val SMAPE: ', val_smape)

    results.append({
        'name': name,
        'best_iteration': gbm.best_iteration,
        'val_smape': val_smape,
        'feature_fraction_seed': params['feature_fraction_seed'],
        'bagging_seed': params['bagging_seed'],
        'data_random_seed': params['data_random_seed'],
    })

In [None]:
# Average the per-model validation predictions: one column per model,
# then the mean across columns.
val_pred = np.mean(np.stack(val_pred_list, axis=1), axis=1)

# Score in log1p space after adding the base median back to both sides.
val_base_median = val['median_{}'.format(base)].values
ensemble_smape = SMAPE_2(
    val_pred + val_base_median,
    val.value.values + val_base_median,
)
print('ensembled val SMAPE: ', ensemble_smape)

- https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36780
- https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/38274#215155

# Prediction

In [None]:
# name = 'gb4-r0'
# gbm = lgb.Booster(model_file='model.{}.txt'.format(name))

In [None]:
# Rows to predict, plus the per-date feature table for the test dates.
# page_df is reused from the training section rather than reloaded.
test_key_path = root + 'key_2_modified.pkl'
test_date_path = root + 'test_date_df.pkl'

test = pd.read_pickle(test_key_path)
test_date_df = pd.read_pickle(test_date_path)

In [None]:
# Must list exactly the median families the models were trained on. A shorter
# list leaves the test frame with fewer feature columns than the saved models
# expect.
median_name = [
    '49', 'weekday_49', 'weekend_49', 
    'dow0', 'dow1', 'dow2', 'dow3', 'dow4', 'dow5', 'dow6',
    '28', 'weekday_28', 'weekend_28',
    '14', 'weekday_14', 'weekend_14',
    '21', 'weekday_21', 'weekend_21',
    '7',
]

# One frame per median family, read from median_<name>.pkl.
median_data = [
    pd.read_pickle(root + 'median_{}.pkl'.format(mname))
    for mname in median_name
]

# log1p of the last column of each frame becomes the page-level feature.
# NOTE(review): the assignment aligns on index, so each frame is assumed to
# share page_df's index; the last column is presumably the most recent
# window - confirm both.
for mname, mdata in zip(median_name, median_data):
    page_df['median_{}'.format(mname)] = np.log1p(mdata.iloc[:, -1])

# Differences to the base median, keyed on `base` (as in the training cell)
# rather than on list position.
for mname in median_name:
    if mname != base:
        page_df['median_diff_{}'.format(mname)] = page_df['median_{}'.format(mname)] - page_df['median_{}'.format(base)]

In [None]:
# First five rows of the page table, now including the median columns.
page_df.head(n=5)

In [None]:
# Attach per-page and per-date features. Left joins: unmatched rows get NaN.
test = test.merge(page_df, how='left', on='Page')
test = test.merge(test_date_df, how='left', on='date')

# Same downcast as for the training frame: float64 -> float32, int64 -> int32.
for col_name, col_dtype in zip(test.columns, test.dtypes):
    if col_dtype == np.float64:
        narrow_dtype = np.float32
    elif col_dtype == np.int64:
        narrow_dtype = np.int32
    else:
        continue
    test[col_name] = test[col_name].astype(narrow_dtype)

In [None]:
# Missing values per column after the merges.
test.isnull().sum()

In [None]:
# Sort the columns by name (in place), as was done for the training frame,
# then strip the identifier columns to leave features only.
test.sort_index(axis=1, inplace=True)
non_feature_cols = ['Page', 'date', 'Id']
test_df = test.drop(non_feature_cols, axis=1)

In [None]:
# Reload each saved model and predict on the test features.
pred_list = []
for run_idx in range(num_searches):
    name = 'gb12-r{}'.format(run_idx)
    gbm = lgb.Booster(model_file='model.{}.txt'.format(name))
    # NOTE(review): confirm what best_iteration holds on a Booster loaded from file.
    pred_list.append(gbm.predict(test_df, num_iteration=gbm.best_iteration))

# Ensemble: one column per model, then the mean across columns.
pred = np.mean(np.stack(pred_list, axis=1), axis=1)

In [None]:
# Undo the target transform: add the base median back (log1p space), then expm1.
log_visits = pred + test_df['median_{}'.format(base)].values
# expm1 can go as low as -1, so rounding can produce -1. Visit counts cannot be
# negative, so clip at zero. NaN inputs stay NaN.
visit = np.maximum(np.round(np.expm1(log_visits)), 0)
pred_df = pd.DataFrame({'Id':test.Id,'Visits':visit})

In [None]:
# Write the submission next to the input data.
name = 'gb12-ensemble'
submission_path = os.path.join(root, 'test_prediction.{}.csv'.format(name))
pred_df.to_csv(submission_path, index=False)