In [None]:
import numpy as np
import pandas as pd
import datetime
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 100

import matplotlib.pyplot as plt
%matplotlib inline

import lightgbm as lgb

import sys, os, gc, types
import time
from subprocess import check_output

import sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Candidate data directories, tried in order; the first one that exists is used.
root_paths = [
    "/data/kaggle-wikipedia/data/",
    "/Users/jiayou/Dropbox/JuanCode/Kaggle/Wikipedia/data/",
    "/Users/jiayou/Dropbox/Documents/JuanCode/Kaggle/Wikipedia/data/"
]
root = None
for p in root_paths:
    if os.path.exists(p):
        root = p
        break
# Fail loudly here: with root left as None, the `ls` call below and every later
# `root + '...'` path would die with an unrelated-looking TypeError.
if root is None:
    raise IOError(
        "None of the candidate data directories exist: {}".format(root_paths))
# List the directory contents as a quick check of which files are available.
print(check_output(["ls", root]).decode("utf8"))

In [None]:
# Suffixes of the pre-computed median tables; each one is stored as
# median_<suffix>.pkl inside the data directory.
median_name = ['49_fix', 'weekday_fix', 'weekend_fix', 'dow0', 'dow1', 'dow2', 'dow3', 'dow4', 'dow5', 'dow6']
median_data = []
for name in median_name:
    pickle_path = root + 'median_{}.pkl'.format(name)
    median_data.append(pd.read_pickle(pickle_path))

# Per-date and per-page feature tables.
date_df = pd.read_csv(root + 'date_df.csv')
page_df = pd.read_pickle(root + 'page_ohe.pkl')


In [None]:
# Load the raw training table (train_1.csv) from the data directory.
train = pd.read_csv(root + 'train_1.csv')
# Missing values are left as NaN at this point; two imputation alternatives
# are kept below, disabled.
# train.fillna(0, inplace = True)
# train = train.where(train.notnull(), median_data[0])

In [None]:
# Remove the 49 columns at positions 1..49, keeping column 0 and everything
# from position 50 onwards.
# NOTE(review): not idempotent - running this cell a second time removes a
# further 49 columns.
train = train.drop(train.columns[1:50], axis=1)

In [None]:
# Reshape wide -> long: one row per (Page, date) pair, with the cell content
# landing in pandas' default 'value' column.
train_df = pd.melt(train, id_vars=['Page'], var_name='date')

In [None]:
# Discard every row that contains a missing entry in any column.
train_df = train_df.dropna(axis=0, how='any')

# construct ABT

In [None]:
# Attach the page-level features (keyed by 'Page'), then the date-level
# features (keyed by 'date'); the 'date_str' column of date_df is left out.
train_df = (
    train_df
    .merge(page_df, on='Page', how='left')
    .merge(date_df.drop('date_str', axis=1), on='date', how='left')
)

In [None]:
# Number of trailing columns treated as the validation window.
val_days = 62
if val_days:
    # In every median table, overwrite the last `val_days` columns with the
    # column just before that window.
    # NOTE(review): presumably so that validation rows only see medians known
    # before the validation period starts - confirm.
    for frame in median_data:
        frozen = frame.iloc[:, -val_days - 1].values.reshape((len(frame), 1))
        for offset in range(-val_days, 0):
            frame.iloc[:, offset] = frozen

In [None]:
# Melt each median table to long form with its own 'median_<name>' value column.
median_df = []
for idx, (label, frame) in enumerate(zip(median_name, median_data)):
    melted = frame.melt(
        id_vars=['Page'],
        var_name='date',
        value_name='median_{}'.format(label),
    )
    # Only the first table keeps the Page/date keys; the others contribute
    # just their value column.
    if idx > 0:
        melted = melted.drop(['Page', 'date'], axis=1)
    median_df.append(melted)

# The melted tables are glued together side by side, matched on row index.
# NOTE(review): this assumes every median table lists the same pages and dates
# in the same order - verify.
median_wide = pd.concat(median_df, axis=1)
train_df = train_df.merge(median_wide, on=['Page', 'date'], how='left')

In [None]:
# Flag validation rows: dates in 2016 whose day-of-year is greater than
# 366 - val_days (2016 is a leap year, so this is its last val_days days).
train_df['isval'] = (train_df.year == 2016) & (train_df.dayofyear > 366 - val_days)

In [None]:
# Preview the first rows of the assembled table.
train_df.head()

In [None]:
# Halve the memory footprint: 64-bit float/int columns become 32-bit ones.
# The dtypes are captured before any column is rewritten.
for col, col_dtype in zip(train_df.columns, train_df.dtypes):
    if col_dtype == np.float64:
        train_df[col] = train_df[col].astype(np.float32)
    elif col_dtype == np.int64:
        train_df[col] = train_df[col].astype(np.int32)

In [None]:
# del median_df, page_df, date_df, train
# Run a garbage-collection pass (the explicit `del` above is currently disabled).
gc.collect()

# Preparing data and hyperparams

In [None]:
# Run tag, embedded in the saved model file name and the submission file name.
name = 'sub5-r1'
# Number of iterations of the training loop.
num_searches = 1
# Upper bound on boosting iterations passed to lgb.train.
boosting_rounds = 10000
# Early-stopping patience passed to lgb.train.
stopping_rounds = 10
# Keep only rows whose index is a multiple of this value; None disables down-sampling.
down_sample = None

In [None]:
# Optional thinning: keep only the rows whose index is a multiple of down_sample.
if down_sample is not None:
    keep_mask = train_df.index % down_sample == 0
    train_df = train_df[keep_mask]

In [None]:
# Move every median column to log1p scale.
# NOTE(review): not idempotent - re-running this cell applies log1p again.
for mname in median_name:
    col = 'median_{}'.format(mname)
    train_df[col] = np.log1p(train_df[col])

# Add each median's difference from the '49_fix' median (on the log1p scale).
for mname in median_name:
    if mname == '49_fix':
        continue
    train_df['median_diff_{}'.format(mname)] = train_df['median_{}'.format(mname)] - train_df['median_49_fix']

# Keep only the rows whose target is at most 5.
train_df = train_df[train_df.value <= 5]

In [None]:
# Preview the first rows after the median transforms and the target filter.
train_df.head()

In [None]:
# LightGBM training parameters: a gradient-boosted multiclass classifier over
# 6 classes, evaluated with multi-class log loss. Disabled options are kept
# as comments.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_leaves': 512,
#     'min_sum_hessian_in_leaf': 20,
    'max_depth': 12,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.85,
    'bagging_freq': 3,
    'verbose': 1,
    'num_class': 6
#     'device' : 'gpu'
}

In [None]:
# Put the columns in sorted label order.
train_df = train_df.sort_index(axis=1)

# Split on the boolean isval flag. Note that `train` is rebound here from the
# raw table to the training partition.
train = train_df[~train_df.isval]
val = train_df[train_df.isval]

In [None]:
# Columns excluded from the feature matrix: the target, the split flag and
# the two row keys.
drop_list = ['value', 'isval', 'Page', 'date']

# Training set: every remaining column is a feature, 'value' is the label.
lgb_train = lgb.Dataset(
    data=train.drop(drop_list, axis=1),
    label=train.value,
)
# Validation set, built with the training set as its reference.
lgb_eval = lgb.Dataset(
    data=val.drop(drop_list, axis=1),
    label=val.value,
    reference=lgb_train,
)

# del train, val
gc.collect()

In [None]:
# Custom evaluation metric in LightGBM's feval form:
# f(preds: array, train_data: Dataset) -> name: string, value: float, is_higher_better: bool
def SMAPE(preds, train_data):
    """SMAPE (in percent, 0-200) between log1p-scale predictions and labels.

    Both `preds` and the labels read from `train_data.get_label()` are mapped
    back with expm1 and rounded before the error is computed.

    Returns the tuple ('SMAPE', value, False); False marks lower as better.
    """
    actual = np.round(np.expm1(train_data.get_label()))
    forecast = np.round(np.expm1(preds))
    abs_error = np.abs(actual - forecast)
    # The denominator is floored at 1e-6 so a 0-vs-0 pair contributes 0, not NaN.
    scale = np.maximum(1e-6, np.abs(actual) + np.abs(forecast))
    return 'SMAPE', np.mean(abs_error / scale) * 200, False

def SMAPE_2(preds, true):
    """SMAPE (in percent, 0-200) for inputs given on the log1p scale.

    `preds` and `true` are mapped back with expm1 and rounded before the
    symmetric absolute percentage error is averaged.
    """
    actual = np.round(np.expm1(true))
    forecast = np.round(np.expm1(preds))
    abs_error = np.abs(actual - forecast)
    # The denominator is floored at 1e-6 so a 0-vs-0 pair contributes 0, not NaN.
    scale = np.maximum(1e-6, np.abs(actual) + np.abs(forecast))
    return np.mean(abs_error / scale) * 200

def SMAPE_3(pred, true):
    """SMAPE (in percent, 0-200) for inputs already on the raw count scale.

    Parameters
    ----------
    pred, true : array-like of numbers with matching shapes.

    Returns
    -------
    float
        mean(|true - pred| / (|true| + |pred|)) * 200.
    """
    # Coerce to float arrays first: plain lists do not support subtraction,
    # and unsigned-integer arrays would wrap around on `true - pred`.
    y_true = np.asarray(true, dtype=float)
    y_pred = np.asarray(pred, dtype=float)
    # The denominator is floored at 1e-6 so a 0-vs-0 pair contributes 0, not NaN.
    loss = np.mean(np.abs(y_true - y_pred) / np.maximum(1e-6, (np.abs(y_true) + np.abs(y_pred)))) * 200
    return loss

In [None]:
# NOTE: `results` is never appended to in this cell, and each pass of the loop
# overwrites both `gbm` and the same model file.
results = []
for i in range(num_searches):
    print('Start LightGBM training...')
    # train
    # Early stopping is passed as a callback: the `early_stopping_rounds`
    # keyword of lgb.train() was removed in LightGBM 4.0, whereas the
    # lgb.early_stopping callback is accepted by old and new releases alike.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=boosting_rounds,
#                     feval=SMAPE,
                    valid_sets=[lgb_train, lgb_eval],
#                   categorical_feature=[],
                    callbacks=[lgb.early_stopping(stopping_rounds)])

    print('Save model...')
    # save model to file
    gbm.save_model('model.{}.txt'.format(name))

    print('Plot feature importances...') 
    # Top 100 features, ranked first by total gain and then by split count.
    ax = lgb.plot_importance(gbm, max_num_features=100, importance_type='gain', title = 'gain')
    plt.show()
    ax = lgb.plot_importance(gbm, max_num_features=100, importance_type='split', title = 'split')
    plt.show()

In [None]:
# Feature matrix for the validation rows (target, split flag and keys removed).
non_feature_cols = ['value', 'isval', 'Page', 'date']
val_abt = val.drop(non_feature_cols, axis=1)
# Per-class scores from the best iteration found during training.
val_pred = gbm.predict(val_abt, num_iteration=gbm.best_iteration)
# The predicted count is the index of the highest-scoring class.
val_visit = val_pred.argmax(axis=1)
val_smape = SMAPE_3(val_visit, val.value.values)
print('val SMAPE: ', val_smape)

- https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36780
- https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/38274#215155

# Prediction

In [None]:
# Rows to predict, read from key_1_modified.csv.
test = pd.read_csv(root + 'key_1_modified.csv')

# Date-level features for the test rows, plus a fresh copy of the page-level
# features (page_df is reloaded from disk here).
test_date_df = pd.read_csv(root + 'test_date_df.csv')
page_df = pd.read_pickle(root + 'page_ohe.pkl')

In [None]:
# Reload the median tables from disk; the in-memory copies had their trailing
# columns overwritten for validation earlier in the notebook.
median_name = ['49_fix', 'weekday_fix', 'weekend_fix', 'dow0', 'dow1', 'dow2', 'dow3', 'dow4', 'dow5', 'dow6']
median_data = []
for mname in median_name:
    median_data.append(pd.read_pickle(root + 'median_{}.pkl'.format(mname)))

# Test-time median features: the last column of each table, on the log1p scale.
# NOTE(review): the assignment aligns on row index, so page_df and the median
# tables are assumed to share the same index - verify.
for mname, frame in zip(median_name, median_data):
    page_df['median_{}'.format(mname)] = np.log1p(frame.iloc[:, -1])
# Differences from the '49_fix' median, for every table except the first.
for mname in median_name[1:]:
    page_df['median_diff_{}'.format(mname)] = page_df['median_{}'.format(mname)] - page_df['median_49_fix']

In [None]:
# Attach page-level features (keyed by 'Page'), then date-level features
# (keyed by 'date').
test = (
    test
    .merge(page_df, on='Page', how='left')
    .merge(test_date_df, on='date', how='left')
)

# Narrow 64-bit float/int columns to 32-bit, using dtypes captured up front.
for col, col_dtype in zip(test.columns, test.dtypes):
    if col_dtype == np.float64:
        test[col] = test[col].astype(np.float32)
    elif col_dtype == np.int64:
        test[col] = test[col].astype(np.int32)

In [None]:
# Put the columns in sorted label order, as was done for the training table.
# NOTE(review): after removing the key columns, the feature set must match the
# training features exactly - confirm test_date_df carries no extra columns.
test = test.sort_index(axis=1)

non_feature_cols = ['Page', 'date', 'Id']
test_df = test.drop(non_feature_cols, axis=1)
# Per-class scores from the best iteration found during training.
pred = gbm.predict(test_df, num_iteration=gbm.best_iteration)

In [None]:
# Preview the per-class scores of the first five test rows.
pred[:5]

In [None]:
# The predicted visit count is the index of the highest-scoring class.
visit = pred.argmax(axis=1)
# Submission table: one predicted count per test Id.
pred_df = pd.DataFrame({'Id': test.Id, 'Visits': visit})

In [None]:
# Preview the first rows of the submission table.
pred_df.head()

In [None]:
# Write the submission next to the input data, tagged with the run name and
# without the index column.
submission_path = os.path.join(root, 'sub5_prediction.{}.csv'.format(name))
pred_df.to_csv(submission_path, index=False)