In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import gc
from dateutil.relativedelta import relativedelta
import random
import lightgbm as lgb
from sklearn.linear_model import Ridge
from pathlib import Path
import math

In [6]:
path = Path('../data/web-traffic/')
list(path.iterdir())

[PosixPath('../data/web-traffic/key_1.csv'),
 PosixPath('../data/web-traffic/sample_submission_1.csv'),
 PosixPath('../data/web-traffic/sample_submission_2.csv'),
 PosixPath('../data/web-traffic/train_2.csv'),
 PosixPath('../data/web-traffic/key_2.csv'),
 PosixPath('../data/web-traffic/train_1.csv')]

In [8]:
train_df = pd.read_csv(path/'train_2.csv')
train_df.head()

Unnamed: 0,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,...,2017-09-01,2017-09-02,2017-09-03,2017-09-04,2017-09-05,2017-09-06,2017-09-07,2017-09-08,2017-09-09,2017-09-10
0,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,...,19.0,33.0,33.0,18.0,16.0,27.0,29.0,23.0,54.0,38.0
1,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,...,32.0,30.0,11.0,19.0,54.0,25.0,26.0,23.0,13.0,81.0
2,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,...,6.0,6.0,7.0,2.0,4.0,7.0,3.0,4.0,7.0,6.0
3,4minute_zh.wikipedia.org_all-access_spider,35.0,13.0,10.0,94.0,4.0,26.0,14.0,9.0,11.0,...,7.0,19.0,19.0,9.0,6.0,16.0,19.0,30.0,38.0,4.0
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,,,,,,,,,,...,16.0,16.0,19.0,9.0,20.0,23.0,28.0,14.0,8.0,7.0


In [9]:
train_df.shape

(145063, 804)

In [14]:
train_df = pd.melt(train_df, id_vars='Page', var_name='date', value_name='Visits')
train_df['date'] = train_df['date'].astype('datetime64[ns]')
train_df['Visits'] = train_df['Visits'].astype('float32')

In [17]:
temp = train_df.loc[(train_df["Visits"] > 0) & (train_df["date"] < '2016-03-01')].groupby(["Page"]).size()
pages = list(temp.index)
train_df = train_df.loc[(train_df["date"] > '2015-03-01') & (train_df["Page"].isin(pages))]

le = LabelEncoder()
train_df.loc[:,"Page"] = le.fit_transform(train_df["Page"])

random.seed(2)
random_pages = random.sample(sorted(train_df["Page"].unique()), 50000)
train_df = train_df.loc[train_df["Page"].isin(random_pages)]




In [18]:
def smape(y_true, y_pred):
    out = 0
    for i in range(y_true.shape[0]):
        a = y_true[i]
        b = y_pred[i]
        if b < 1:
            b = 0
        c = a+b
        if c == 0:
            continue
        out += math.fabs(a - b) / c
    out *= (200.0 / y_true.shape[0])
    return out


def lgb_smape(preds, df):
    labels = df.get_label()
    labels, preds = np.expm1(np.array(labels)), np.expm1(np.array(preds))
    return 'smape', smape(labels, preds), False

In [19]:
def create_time_features(data):
    data['weekday'] = data.date.dt.weekday
    data['is_weekend'] = ((data.date.dt.dayofweek) // 5 == 1).astype('int32')
    data['year'] = data.date.dt.year 
    data['month'] = data.date.dt.month
    data['day'] = data.date.dt.day
    data['dayCount'] = data['date'].apply(lambda x: x.toordinal())
    data['weekOfYear'] = data.date.dt.weekofyear
    return data

In [21]:
train_df.Visits.fillna(0, inplace=True)
train_df = create_time_features(train_df)

train_df['Visits_log'] = np.log1p(train_df['Visits'])
train_df['yearminusone'] = train_df['year'] - 1

train_df = pd.merge(train_df, train_df[['Page', 'year', 'month', 'day', 'Visits_log']],
                    left_on =['Page','yearminusone','month','day'], right_on = ['Page','year','month','day'],
                    how = 'left', suffixes=('', '_2015'), sort = False)

train_df.drop(['yearminusone', 'year_2015', 'year'], axis = 1, inplace = True)

train_df = train_df.loc[train_df.date >= '2015-11-11']

In [22]:
gc.collect()

46

In [23]:
def prepareData(data, train_start_date, train_end_date, test_start_date, test_end_date):
    data.sort_values(by = "dayCount", inplace = True)
    data = data.reset_index(drop = True)
    
    train_start_index = data.loc[data.date == train_start_date].index[0]
    train_end_index = data.loc[data.date == train_end_date].index[-1]
    test_start_index = data.loc[data.date == test_start_date].index[0]
    test_end_index = data.loc[data.date == test_end_date].index[-1]
    
    last60days_index = data.loc[data.date == (pd.to_datetime(train_end_date) - relativedelta(days = 60))].index[0]
    last30days_index = data.loc[data.date == (pd.to_datetime(train_end_date) - relativedelta(days = 30))].index[0]
    
    data.loc[((data.month == 11) & (data.index >=test_start_index) & (data.index <= test_end_index)),"month"] = 10 
    
    # just for simplify
    data.rename(columns={"Visits":"y", "Visits_log":"y_log"}, inplace=True)
    
    print("Calculate averages....")
    
    temp = pd.DataFrame(data.loc[train_start_index:train_end_index].groupby(['Page','weekday'])['y_log'].median())
    temp.columns = ['weekday_average']
    data = data.join(temp, on =['Page','weekday'], how = 'left', sort = False)
    
    temp = pd.DataFrame(data.loc[train_start_index:train_end_index].groupby(['Page'])['y_log'].median())
    temp.columns = ['Page_average']
    data = data.join(temp, on =['Page'], how = 'left', sort = False)
    
    temp = pd.DataFrame(data.loc[train_start_index:train_end_index].groupby(['Page'])['y_log'].quantile(0.95))
    temp.columns = ['quant_95']
    data = data.join(temp, on =['Page'], how = 'left', sort = False)  
    
    temp = pd.DataFrame(data.groupby(['Page','weekOfYear'])['y_log'].median())
    temp.columns = ['week_0']
    temp['week_10'] = temp.week_0.shift(10)
    temp['week_11'] = temp.week_0.shift(11)
    temp['week_12'] = temp.week_0.shift(12)    
    data = data.join(temp, on =['Page','weekOfYear'], how = 'left', sort = False)
    
    
    data['test_month_no'] = ((data.month % 2 == 0) + 1).astype('int32')
    
    temp = pd.DataFrame(data.groupby(['Page','month','test_month_no'])['y_log'].median())
    temp.columns = ['month_0']
    temp['month_1'] = temp.month_0.shift(1)
    temp['month_2'] = temp.month_0.shift(2)
    temp['month_3'] = temp.month_0.shift(3)
    
    temp = temp.reset_index()
    temp['month_1'] = temp.month_0.shift(1)
    temp['month_2'] = temp.month_0.shift(2)
    temp["last_month"] = 0   
    temp.loc[temp.test_month_no == 1,"last_month"] = temp.month_1
    temp.loc[temp.test_month_no == 2,"last_month"] = temp.month_2
    temp.set_index(['Page', 'month', 'test_month_no'], inplace=True)    
    data = data.join(temp, on =['Page','month','test_month_no'], how = 'left', sort = False)
    
    data.sort_values(by = "dayCount", inplace = True)
    data = data.reset_index(drop = True)
    
    features = ['Page','Page_average','date','is_weekend', 'dayCount','weekday_average',\
                'week_10','last_month','month','Visits_log_2015', 'quant_95','test_month_no', 'y_log']
    return data[features]

In [24]:
def train_test_split(data, train_start_date,train_end_date, test_start_date, test_end_date):
    data.sort_values(by = "dayCount", inplace = True)
    data = data.reset_index(drop = True)
    train_start_index = data.loc[data.date == train_start_date].index[0]
    train_end_index = data.loc[data.date == train_end_date].index[-1]
    test_start_index = data.loc[data.date == test_start_date].index[0]
    test_end_index = data.loc[data.date == test_end_date].index[-1]

    print("Splitting to train - test....")
    X_train = data.loc[train_start_index:train_end_index,:].copy()
    
    #remove outliers
    X_train.loc[(X_train.y_log > X_train.quant_95),"y_log"] = X_train.quant_95
       
    y_train = X_train.loc[:,"y_log"]
    X_train = X_train.drop(["y_log"], axis=1)
       
    X_test = data.loc[test_start_index:test_end_index,:].copy()
    y_test = X_test.loc[:,"y_log"]
    X_test = X_test.drop(["y_log"], axis=1)
   
    print("Splitting done")
    return X_train, y_train, X_test, y_test

In [26]:
%%time
#train_start_date = '2016-01-01'
train_start_date = '2016-01-01'
train_end_date  = '2016-08-31'
test_start_date  = '2016-09-10'
test_end_date  = '2016-11-10'
dates = [train_start_date, train_end_date,test_start_date, test_end_date]
dataForModel = prepareData(train_df.copy(), *dates)
gc.collect()

Calculate averages....
CPU times: user 2min 9s, sys: 55.4 s, total: 3min 4s
Wall time: 2min 5s


In [27]:
dataForModel.Visits_log_2015.fillna(dataForModel.last_month, inplace = True)

In [28]:
X_train,y_train,X_test,y_test = train_test_split(dataForModel, *dates)

Splitting to train - test....
Splitting done


In [30]:
param = {}
param['application'] = 'regression_l2'
param['learning_rate'] = 0.1
param['feature_fraction'] = 0.5
param['bagging_fraction'] = 0.5
param['bagging_freq'] = 1
param['max_depth'] = 5
param['num_threads'] = 4
param['verbose'] = 0

In [None]:
features = ['Page','is_weekend', 'dayCount','weekday_average', 'last_month','test_month_no','month','Visits_log_2015']

lgb_train = lgb.Dataset(X_train[features], label=y_train, free_raw_data=False)
lgb_val = lgb.Dataset(X_test[features], label=y_test, free_raw_data=False, reference=lgb_train)

model = lgb.train(param, lgb_train, 300, valid_sets=[lgb_train,lgb_val], feval=lgb_smape, early_stopping_rounds=50)

[1]	training's l2: 4.44493	training's smape: 123.483	valid_1's l2: 4.24046	valid_1's smape: 121.546
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 3.68932	training's smape: 117.349	valid_1's l2: 3.53818	valid_1's smape: 115.423
[3]	training's l2: 3.25709	training's smape: 112.773	valid_1's l2: 2.99258	valid_1's smape: 109.717
[4]	training's l2: 2.72673	training's smape: 106.69	valid_1's l2: 2.53785	valid_1's smape: 103.811
[5]	training's l2: 2.44263	training's smape: 102.342	valid_1's l2: 2.17857	valid_1's smape: 98.4137
[6]	training's l2: 2.06643	training's smape: 96.5605	valid_1's l2: 1.87988	valid_1's smape: 92.9755
[7]	training's l2: 1.87966	training's smape: 92.6116	valid_1's l2: 1.64236	valid_1's smape: 88.0611
[8]	training's l2: 1.61008	training's smape: 87.3168	valid_1's l2: 1.44505	valid_1's smape: 83.2512
[9]	training's l2: 1.48737	training's smape: 83.8923	valid_1's l2: 1.28692	valid_1's smape: 78.9589
[10]	training's l2: 1.29226	training's 