# Homework 1 - Wikipedia Web Traffic Time Series

У вас есть данные по посещению 1000 страниц  Википедии из разных стран и разных девайсов ( \*  * данные взяты из [Kaggle соревнования](https://www.kaggle.com/c/web-traffic-time-series-forecasting)* )

*wikipedia_train* и *wikipedia_test* - содержат данные о трафике. Это файлы csv, где каждая строка соответствует определенной статье, и каждый столбец соответствует конкретной дате. В некоторых записях отсутствуют данные. Названия страниц содержат проект Википедии (например, en.wikipedia.org), тип доступа (например, desktop) и тип агента (например, spider). Другими словами, каждое имя статьи имеет следующий формат: «name_project_access_agent» (например, «AKB48_zh.wikipedia.org_all-access_spider»).

Вам нужно ответить на [вопросы](https://docs.google.com/forms/d/e/1FAIpQLSfDjWeeZJw5EvmKn1x_6b9xicjn7ed3MF0rbNm4Cmwr7psSkQ/viewform?usp=sf_link) и попробовать сделать самую простую модель которая сможет предсказывать будущие посещения. 

Вот примеры временных рядов посещаемости страниц Википедии (*синие* - обучающая выборка, *зеленые* - предсказания модели победителя соревнования на Kaggle, *оранжевые* - реальные значения):
![Wikipedia Web Traffic Time Series](https://image.ibb.co/cUpEJa/predictions.png)

In [2]:
import pandas as pd 
import numpy as np 
import re 
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go

init_notebook_mode(connected = True)

In [3]:
def get_raw_data():
    train = pd.read_csv("../data/wikipedia_train.csv")
    test = pd.read_csv("../data/wikipedia_test.csv")
    return train, test

def get_transform_data(train, test):
    train_df = pd.melt(train, id_vars=['Page'], value_vars=train.columns[1: ], var_name='date', value_name='Visits')
    test_df = pd.melt(test, id_vars=['Page'], value_vars=test.columns[1: ], var_name='date', value_name='Visits' )
    return train_df, test_df

def pandas_smape(df):
    df.fillna(0, inplace=True)
    df["SMAPE"] = 200 * np.abs(df["Visits"] - df["pred_Visits"]) / (df["Visits"] + df["pred_Visits"])
    df["SMAPE"].fillna(0, inplace=True)
    return np.mean(df["SMAPE"])

def get_features(df):
    df['date'] = df['date'].astype('datetime64[ns]')
    df['dayofweek'] = df.date.apply(lambda x: x.dayofweek )
    df['weekend'] = df.date.apply(lambda x: x.dayofweek // 5 )
    return df

train, test  = get_raw_data()
train_df, test_df = get_transform_data(train, test)
train_df, test_df = get_features(train_df), get_features(test_df)

## Сколько страниц из русской Википедии в датасете? *


In [4]:
import re 
def get_language(page):
    res = re.search('[a-z][a-z].wikipedia.org',page)
    if res:
        return res.group(0)[0:2]
    return 'na'

train['country'] = train.Page.apply(get_language)
print(f"Number of RU Wikipedis pages is {train.country.value_counts()['ru']}" )

Number of RU Wikipedis pages is 102


## Какая самая популярная страница русской Википедии (в среднем)? 

In [7]:
train['average_visits'] = train.mean(axis=1)
top_pages = train[train.country == 'ru'].sort_values('average_visits', ascending=False)[:10]
print(f"The most visited page in Russia is {top_pages.Page.iloc[0]}")

The most visited page in Russia is Facebook_ru.wikipedia.org_desktop_all-agents


In [15]:
dates = train.set_index('Page').T
pages = dates.columns[5:6]
data = []
for page in pages :
    trace = go.Scatter(
            x = dates.index,
            y = dates[page],
            name = page)
    data.append(trace)

layout = go.Layout(title='Visits page ')
fig = dict(data = data, layout = layout )
iplot(fig, show_link=False)

## forecasting

In [16]:
train, test  = get_raw_data()
train_df, test_df = get_transform_data(train, test)
train_df, test_df = get_features(train_df), get_features(test_df)

## 1. last day predictions 

In [18]:
page2visits = {}
dates = train.set_index('Page').T
for page in dates.columns:
    last_valid = dates[page].last_valid_index()
    page2visits[page] = dates.loc[last_valid][page]
    
last_preds = test_df.copy()
last_preds['pred_Visits'] = test_df.Page.apply(lambda x: page2visits[x])

print(f"SMAPE for last day predictions is {pandas_smape(last_preds)}")    

SMAPE for last day predictions is 54.988221062619054


## 2. median of last 30 days 

In [19]:
last_30_medians = train[train.columns[-30:]].median(axis = 1)
last_30_median = pd.DataFrame({'Page': train.Page, 'pred_Visits': last_30_medians })
last_30_preds = test_df.merge(last_30_median, on='Page', how='left')
 
print(f"SMAPE for last day predictions is {pandas_smape(last_30_preds)}")

SMAPE for last day predictions is 52.46588329336902


## 3. moving average

In [20]:
def moving_average(series, n):
    return np.average(series[-n:])

dates = train.set_index('Page').T
def predict_average(dates, n_days):
    pages = dates.columns
    page2visits = {}
    for page in pages:
        page2visits[page] = moving_average(dates[page], n_days) 
    
    last_preds = test_df.copy()
    last_preds['pred_Visits'] = test_df.Page.apply(lambda x: page2visits[x])
    return pandas_smape(last_preds) 

for n_days in [1, 2, 3, 4, 5, 10, 20]:
    print(f"SMAPE for {n_days} days is {predict_average(dates, n_days)}")

SMAPE for 1 days is 54.16127748085736
SMAPE for 2 days is 52.579144886631624
SMAPE for 3 days is 52.10547892706432
SMAPE for 4 days is 52.95969511273052
SMAPE for 5 days is 53.15513031458176
SMAPE for 10 days is 54.03829161753088
SMAPE for 20 days is 56.195165523678696


## 4. median of weekends / workdays 

In [21]:
def predict_weekend_mean(train, test):
    agg_train_weekend = train.groupby(['Page', 'weekend']).median().reset_index()
    agg_train_weekend['pred_Visits'] = agg_train_weekend.Visits
    agg_train_weekend.drop(['Visits'], axis=1, inplace=True)

    predictions = test.merge(agg_train_weekend, on =['Page', 'weekend'])
    return predictions

print(f"SMAPE for {n_days} days is {pandas_smape(predict_weekend_mean(train_df, test_df))}")

SMAPE for 20 days is 50.86189697539343


## 5. adjusted medians  ~ 47.50

In [22]:
def predict_median_medians(Windows, train, test_df):
    train = train.fillna(0)
    n = train.shape[1] - 1 #  550
    Visits = np.zeros(train.shape[0])
    for i, row in train.iterrows():
        M = []
        start = row[1:].nonzero()[0]
        if len(start) == 0:
            continue
        if n - start[0] < Windows[0]:
            Visits[i] = row.iloc[start[0]+1:].median()
            continue
        for W in Windows:
            if W > n-start[0]:
                break
            M.append(row.iloc[-W:].median())
        Visits[i] = np.median(M)
        
    Visits[np.where(Visits < 1)] = 0.
    train['pred_Visits'] = Visits
    
    page2ldvisits = dict(zip(train.Page, train['pred_Visits']))
    last_preds = test_df.copy()
    last_preds['pred_Visits'] = test_df.Page.apply(lambda x: page2ldvisits[x])

    #print(f"SMAPE for last day predictions is {pandas_smape(last_preds)}")
    return last_preds 


In [23]:
Fibonacci  = [1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377]
pandas_smape( predict_median_medians(Fibonacci, train, test_df) )

47.55858146477792

In [77]:
pandas_smape( predict_median_medians([ 1, 2, 3, 3, 5, 8,  13, 21, 34, 55, 89, 144, 233, 377], train, test_df) )

47.616480226680046

## 6. median of medians depending on if a day is  a  workday /  weekend  - ~46.50

In [27]:
Fibonacci = [1, 2, 3, 5, 8, 55, 74, 88, 100, 120, 154, 181, 249, 273, 275, 277, 278]

In [28]:
def get_weekends_workends(train):
    dates = train.set_index('Page').columns[:-2]
    dates = dates.astype('datetime64[ns]')
    weekends = []
    workends = []
    for date in dates :
        if date.dayofweek//5 == 1 :
            weekends.append(date.date())
        else:
            workends.append(date.date())
    weekends = [weekend.strftime('%Y-%m-%d') for weekend in weekends]
    workends = [workend.strftime('%Y-%m-%d') for workend in workends]
    return weekends, workends

def split_df(df):
    weekends, workends = get_weekends_workends(df)
    columns_weekends = np.append('Page', weekends)
    columns_workends = np.append('Page', workends)
    return df[columns_weekends], df[columns_workends]

def get_transform_data2(train):
    train_df = pd.melt(train, id_vars=['Page'], value_vars=train.columns[1: ], var_name='date', value_name='Visits')
    return train_df 

train_weekends, train_workends = split_df(train)
test_weekends, test_workends = split_df(test)
test_df_weekends, test_df_workends = get_transform_data2(test_weekends), get_transform_data2(test_workends)
train_df_weekends, train_df_workends = get_transform_data2(train_weekends), get_transform_data2(train_workends)

t_wrk = predict_median_medians(Fibonacci, train_workends, test_df_workends)
t_week = predict_median_medians(Fibonacci, train_weekends, test_df_weekends)
n_d = pd.concat([t_week, t_wrk], axis=0)
pandas_smape(n_d)

46.05204881622072

In [29]:
train_weekends, train_workends = split_df(train)
test_weekends, test_workends = split_df(test)
test_df_weekends, test_df_workends = get_transform_data2(test_weekends), get_transform_data2(test_workends)
train_df_weekends, train_df_workends = get_transform_data2(train_weekends), get_transform_data2(train_workends)

t_wrk = predict_median_medians(Fibonacci, train_workends, test_df_workends)
pandas_smape(t_wrk)

46.2305914830357

In [30]:
t_wrk = predict_median_medians(FibonacciGeniya, train_workends, test_df_workends)
t_week = predict_median_medians(FibonacciGeniya, train_weekends, test_df_weekends)
n_d = pd.concat([t_week, t_wrk], axis=0)
pandas_smape(n_d)

46.05204881622072

### 7. median of medians depending on if a day is  a <br> 

### (Monday/Friday) / (Tuesday/Wednesday/Thursday) / (Saturday/Sunday)  - ~45.9

In [31]:
def split_df2(df):
    mf,twt,wends = get_days(df)
    columns_mf = np.append('Page', mf)
    columns_twt = np.append('Page', twt)
    columns_wends = np.append('Page', wends)
   
    return df[columns_mf], df[columns_twt], df[columns_wends]

def get_days(train):
    dates = train.set_index('Page').columns[:-2]
    dates = dates.astype('datetime64[ns]')
    mnfr = []
    twt= []
   
    wends = []
    
    for date in dates :
        if date.dayofweek == 0:
            mnfr.append(date.date())
        elif date.dayofweek == 1:
            twt.append(date.date())
        elif date.dayofweek == 2:
            twt.append(date.date()) 
        elif date.dayofweek == 3:
            twt.append(date.date())
        elif date.dayofweek == 4:
            mnfr.append(date.date())
        elif date.dayofweek == 5:
            wends.append(date.date())
        else:
            wends.append(date.date())
    mnfr = [weekend.strftime('%Y-%m-%d') for weekend in mnfr]
    twt = [weekend.strftime('%Y-%m-%d') for weekend in twt]
   
    wends = [weekend.strftime('%Y-%m-%d') for weekend in wends]
   
    return mnfr, twt,wends

In [32]:
tr_mnfr, tr_twt,  tr_wends = split_df2(train)
tst_mnfr, tst_twt, tst_wends = split_df2(test)

tr_df_mnfr, tr_df_twt,  tr_df_wends = get_transform_data2(tr_mnfr), get_transform_data2(tr_twt),  get_transform_data2(tr_wends)
tst_df_mnfr, tst_df_twt, tst_df_wends = get_transform_data2(tst_mnfr), get_transform_data2(tst_twt), get_transform_data2(tst_wends)

In [33]:
Fibonacci  = [1, 2, 3, 5, 8 ,58, 74, 85, 102,117, 154, 168, 179,180,182]

t_mnfr = predict_median_medians(Fibonacci, tr_mnfr, tst_df_mnfr)
t_twt = predict_median_medians(Fibonacci, tr_twt, tst_df_twt)

t_wends = predict_median_medians(Fibonacci, tr_wends, tst_df_wends)
n_d = pd.concat([t_mnfr, t_twt,  t_wends], axis=0)
pandas_smape(n_d)

45.903460527364985