In [12]:
import pandas as pd 
import numpy as np 

def get_raw_data():
    train = pd.read_csv("../data/wikipedia_train.csv")
    test = pd.read_csv("../data/wikipedia_test.csv")
    return train, test

def get_transform_data(train, test):
    train_df = pd.melt(train, id_vars=['Page'], value_vars=train.columns[1: ], var_name='date', value_name='Visits')
    test_df = pd.melt(test, id_vars=['Page'], value_vars=test.columns[1: ], var_name='date', value_name='Visits' )
    return train_df, test_df

def pandas_smape(df):
    df.fillna(0, inplace=True)
    df["SMAPE"] = 200 * np.abs(df["Visits"] - df["pred_Visits"]) / (df["Visits"] + df["pred_Visits"])
    df["SMAPE"].fillna(0, inplace=True)
    return np.mean(df["SMAPE"])

def get_features(df):
    df['date'] = df['date'].astype('datetime64[ns]')
    df['dayofweek'] = df.date.apply(lambda x: x.dayofweek )
    df['weekend'] = df.date.apply(lambda x: x.dayofweek // 5 )
    
    #df['weekend'] = (df.date.dayofweek // 5).astype(float)
    #df['shortweek'] = ((df.datex.dt.dayofweek) // 4 == 1).astype(float)
    return df

train, test  = get_raw_data()
train_df, test_df = get_transform_data(train, test)
train_df, test_df = get_features(train_df), get_features(test_df)

## Сколько страниц из русской Википедии в датасете? *


In [16]:
import re 
def get_language(page):
    res = re.search('[a-z][a-z].wikipedia.org',page)
    if res:
        return res.group(0)[0:2]
    return 'na'

train['country'] = train.Page.apply(get_language)
print(f"Number of RU Wikipedis pages is {train.country.value_counts()['ru']}" )

Number of RU Wikipedis pages is 102


## Какая самая популярная страница русской Википедии (в среднем)? 

In [17]:
train['average_visits'] = train.mean(axis=1)
top_pages = train[train.country == 'ru'].sort_values('average_visits', ascending=False)[:10]
print(f"The most visited page in Russia is {top_pages.Page.iloc[0]}")

The most visited page in Russia is Facebook_ru.wikipedia.org_desktop_all-agents


In [308]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go

init_notebook_mode(connected = True)

n_pages = 1
dates = train.set_index('Page').T
pages = dates.columns[:n_pages]
data = []
for page in pages :
    trace = go.Scatter(
            x = dates.index,
            y = dates[page],
            name = page)
    data.append(trace)

layout = go.Layout(title='Visits page ')
fig = dict(data = data, layout = layout )
iplot(fig, show_link=False)

In [311]:
dates[page]

2015-07-01    32.0
2015-07-02    26.0
2015-07-03    22.0
2015-07-04    22.0
2015-07-05    29.0
2015-07-06    49.0
2015-07-07    20.0
2015-07-08    27.0
2015-07-09    19.0
2015-07-10    21.0
2015-07-11    21.0
2015-07-12    64.0
2015-07-13    38.0
2015-07-14    21.0
2015-07-15    27.0
2015-07-16    20.0
2015-07-17    23.0
2015-07-18    23.0
2015-07-19    27.0
2015-07-20    64.0
2015-07-21    21.0
2015-07-22    40.0
2015-07-23    97.0
2015-07-24    32.0
2015-07-25    30.0
2015-07-26    21.0
2015-07-27    30.0
2015-07-28    30.0
2015-07-29    28.0
2015-07-30    18.0
              ... 
2016-08-02    19.0
2016-08-03    22.0
2016-08-04    28.0
2016-08-05    19.0
2016-08-06    17.0
2016-08-07    11.0
2016-08-08    22.0
2016-08-09    20.0
2016-08-10    22.0
2016-08-11    28.0
2016-08-12    42.0
2016-08-13    23.0
2016-08-14    16.0
2016-08-15    21.0
2016-08-16    28.0
2016-08-17    29.0
2016-08-18    59.0
2016-08-19    40.0
2016-08-20    25.0
2016-08-21    36.0
2016-08-22    29.0
2016-08-23  

In [306]:


n_pages = 1
dates = train.set_index('Page').T
pages = dates.columns[:n_pages]
data = []
for page in pages :
    trace = go.Scatter(
            x = dates.index,
            y = dates[page],
            name = page)
    data.append(trace)

layout = go.Layout(title='Visits page ')
fig = dict(data = data, layout = layout )
iplot(fig, show_link=False)

## forecasting

In [219]:
train, test  = get_raw_data()
train_df, test_df = get_transform_data(train, test)
train_df, test_df = get_features(train_df), get_features(test_df)

## 1. last day predictions 

In [19]:
page2visits = {}
dates = train.set_index('Page').T
for page in n_d.columns:
    last_valid = dates[page].last_valid_index()
    page2visits[page] = dates.loc[last_valid][page]
    
last_preds = test_df.copy()
last_preds['pred_Visits'] = test_df.Page.apply(lambda x: page2visits[x])

print(f"SMAPE for last day predictions is {pandas_smape(last_preds)}")    

SMAPE for last day predictions is 54.988221062619054


## 2. median of last 30 days 

In [20]:
last_30_medians = train[train.columns[-30:]].median(axis = 1)
last_30_median = pd.DataFrame({'Page': train.Page, 'pred_Visits': last_30_medians })
last_30_preds = test_df.merge(last_30_median, on='Page', how='left')
 
print(f"SMAPE for last day predictions is {pandas_smape(last_30_preds)}")

SMAPE for last day predictions is 52.46588329336902


## 3. moving average

In [21]:
def moving_average(series, n):
    return np.average(series[-n:])

dates = train.set_index('Page').T
def predict_average(dates, n_days):
    pages = dates.columns
    page2visits = {}
    for page in pages:
        page2visits[page] = moving_average(dates[page], n_days) 
    
    last_preds = test_df.copy()
    last_preds['pred_Visits'] = test_df.Page.apply(lambda x: page2visits[x])
    return pandas_smape(last_preds) 

for n_days in [1, 2, 3, 4, 5, 10, 20]:
    print(f"SMAPE for {n_days} days is {predict_average(dates, n_days)}")

SMAPE for 1 days is 54.16127748085736
SMAPE for 2 days is 52.579144886631624
SMAPE for 3 days is 52.10547892706432
SMAPE for 4 days is 52.95969511273052
SMAPE for 5 days is 53.15513031458176
SMAPE for 10 days is 54.03829161753088
SMAPE for 20 days is 56.195165523678696


## 4. median of weekends / workdays 

In [60]:
def predict_weekend_mean(train, test):
    agg_train_weekend = train.groupby(['Page', 'weekend']).median().reset_index()
    agg_train_weekend['pred_Visits'] = agg_train_weekend.Visits
    agg_train_weekend.drop(['Visits'], axis=1, inplace=True)

    predictions = test.merge(agg_train_weekend, on =['Page', 'weekend'])
    return predictions

print(f"SMAPE for {n_days} days is {pandas_smape(predict_weekend_mean(train_df, test_df))}")

SMAPE for 20 days is 50.86189697539343


## 5. adjusted medians 

In [34]:
def predict_median_medians(Windows, train, test_df):
    train = train.fillna(0)
    n = train.shape[1] - 1 #  550
    Visits = np.zeros(train.shape[0])
    for i, row in train.iterrows():
        M = []
        start = row[1:].nonzero()[0]
        if len(start) == 0:
            continue
        if n - start[0] < Windows[0]:
            Visits[i] = row.iloc[start[0]+1:].median()
            continue
        for W in Windows:
            if W > n-start[0]:
                break
            M.append(row.iloc[-W:].median())
        Visits[i] = np.median(M)
        
    Visits[np.where(Visits < 1)] = 0.
    train['pred_Visits'] = Visits
    
    page2ldvisits = dict(zip(train.Page, train['pred_Visits']))
    last_preds = test_df.copy()
    last_preds['pred_Visits'] = test_df.Page.apply(lambda x: page2ldvisits[x])

    #print(f"SMAPE for last day predictions is {pandas_smape(last_preds)}")
    return last_preds 


In [103]:
Fibonacci  = [1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377]
pandas_smape( predict_median_medians(Fibonacci, train, test_df) )

47.55858146477792

In [108]:
Fibonacci  = [1,  2,  3, 8, 12, 5, 8, 13, 21, 34, 66, 89, 144,200, 233, 377]
pandas_smape( predict_median_medians(Fibonacci, train, test_df) )

47.773616395286275

In [111]:
Fibonacci  = [1, 1, 2, 3, 5, 8, 13, 33, 55, 88, 120, 180, 260, 320, 400]
pandas_smape( predict_median_medians(Fibonacci, train, test_df) )

47.47501954684211

In [113]:
Fibonacci  = [1, 1, 2, 3, 5, 8, 13, 33, 55, 88, 118, 181, 258, 299 ,320, 400]
pandas_smape( predict_median_medians(Fibonacci, train, test_df) )


47.33029189250828

In [134]:
Fibonacci  = [1, 2, 3, 5, 8, 13, 33, 55, 88, 118, 181, 258, 270, 299, 310, 320, 356, 381, 400, 423]
pandas_smape( predict_median_medians(Fibonacci, train, test_df) )


47.007391810134756

In [136]:
FibonacciGeniya  = [1, 2, 3, 5, 8, 13, 33, 55, 88, 118, 181, 258, 270, 299, 310, 320, 356, 381, 400, 423]
pandas_smape( predict_median_medians(Fibonacci, train, test_df) )

47.007391810134756

In [71]:
Windows = [6, 12, 18, 30, 48, 78, 126, 203, 329]
pandas_smape( predict_median_medians(Windows, train, test_df) )

48.69378348831441

In [77]:
pandas_smape( predict_median_medians([ 1, 2, 3, 3, 5, 8,  13, 21, 34, 55, 89, 144, 233, 377], train, test_df) )

47.616480226680046

In [59]:
for r in [ 1.65, 1.68, 1.7, 1.75, 1.8, 1.9,]:
    Windows = np.round(r**np.arange(0,9) * 5).astype(int)
    print( r, pandas_smape( predict_median_medians(Windows, train, test_df) ) ) 

1.65 48.781690356940004
1.68 48.499873132778376
1.7 48.3866570009572
1.75 48.803238560926744
1.8 48.67064225523273
1.9 48.90337900800943


## 6. median of medians dividing into weekend / workday 

In [214]:
def get_weekends_workends(train):
    dates = train.set_index('Page').columns[:-2]
    dates = dates.astype('datetime64[ns]')
    weekends = []
    workends = []
    for date in dates :
        if date.dayofweek//5 == 1 :
            weekends.append(date.date())
        else:
            workends.append(date.date())
    weekends = [weekend.strftime('%Y-%m-%d') for weekend in weekends]
    workends = [workend.strftime('%Y-%m-%d') for workend in workends]
    return weekends, workends

def split_df(df):
    weekends, workends = get_weekends_workends(df)
    columns_weekends = np.append('Page', weekends)
    columns_workends = np.append('Page', workends)
    return df[columns_weekends], df[columns_workends]

def get_transform_data2(train):
    train_df = pd.melt(train, id_vars=['Page'], value_vars=train.columns[1: ], var_name='date', value_name='Visits')
    return train_df 

train_weekends, train_workends = split_df(train)
test_weekends, test_workends = split_df(test)
test_df_weekends, test_df_workends = get_transform_data2(test_weekends), get_transform_data2(test_workends)
train_df_weekends, train_df_workends = get_transform_data2(train_weekends), get_transform_data2(train_workends)

t_wrk = predict_median_medians(Fibonacci, train_workends, test_df_workends)
t_week = predict_median_medians(Fibonacci, train_weekends, test_df_weekends)
n_d = pd.concat([t_week, t_wrk], axis=0)
pandas_smape(n_d)

46.93838287620072

In [227]:
train_weekends, train_workends = split_df(train)
test_weekends, test_workends = split_df(test)
test_df_weekends, test_df_workends = get_transform_data2(test_weekends), get_transform_data2(test_workends)
train_df_weekends, train_df_workends = get_transform_data2(train_weekends), get_transform_data2(train_workends)

t_wrk = predict_median_medians(Fibonacci, train_workends, test_df_workends)
pandas_smape(t_wrk)

46.84789339070022

In [221]:
t_wrk = predict_median_medians(FibonacciGeniya, train_workends, test_df_workends)
t_week = predict_median_medians(FibonacciGeniya, train_weekends, test_df_weekends)
n_d = pd.concat([t_week, t_wrk], axis=0)
pandas_smape(n_d)

46.07515729812105

In [279]:
FibonacciGeniya  = [1, 2, 3, 5, 8, 55, 74, 88, 100, 120, 154, 181, 249, 273, 275, 277, 278]

t_wrk = predict_median_medians(FibonacciGeniya, train_workends, test_df_workends)
t_week = predict_median_medians(FibonacciGeniya, train_weekends, test_df_weekends)
n_d = pd.concat([t_week, t_wrk], axis=0)
pandas_smape(n_d)

46.05204881622072

In [285]:
train

Unnamed: 0,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,...,2016-08-22,2016-08-23,2016-08-24,2016-08-25,2016-08-26,2016-08-27,2016-08-28,2016-08-29,2016-08-30,2016-08-31
0,15._November_de.wikipedia.org_desktop_all-agents,32.0,26.0,22.0,22.0,29.0,49.0,20.0,27.0,19.0,...,29.0,23.0,31.0,25.0,27.0,23.0,17.0,26.0,23.0,37.0
1,2012_(film)_fr.wikipedia.org_all-access_spider,2.0,3.0,5.0,3.0,5.0,3.0,7.0,8.0,7.0,...,5.0,5.0,6.0,5.0,4.0,11.0,2.0,0.0,7.0,5.0
2,2016_FIFA_U-20女子ワールドカップ_ja.wikipedia.org_all-a...,1.0,3.0,2.0,2.0,1.0,10.0,2.0,1.0,4.0,...,4.0,3.0,2.0,3.0,2.0,4.0,2.0,0.0,5.0,4.0
3,2016_UEFA_Europa_League_Final_en.wikipedia.org...,3.0,3.0,3.0,8.0,12.0,12.0,8.0,12.0,23.0,...,10.0,14.0,26.0,5.0,29.0,23.0,17.0,16.0,12.0,14.0
4,2016_in_video_gaming_en.wikipedia.org_all-acce...,24.0,40.0,23.0,49.0,88.0,25.0,31.0,76.0,51.0,...,134.0,162.0,208.0,179.0,108.0,99.0,49.0,80.0,113.0,173.0
5,2018_FIFA_World_Cup_qualification_(CONMEBOL)_e...,48.0,14.0,14.0,17.0,58.0,10.0,49.0,23.0,33.0,...,18.0,22.0,20.0,19.0,21.0,34.0,36.0,18.0,21.0,41.0
6,22_апреля_ru.wikipedia.org_all-access_spider,18.0,13.0,29.0,28.0,25.0,5.0,21.0,24.0,35.0,...,8.0,10.0,16.0,28.0,12.0,6.0,13.0,7.0,19.0,6.0
7,30._März_de.wikipedia.org_desktop_all-agents,29.0,24.0,26.0,24.0,29.0,40.0,26.0,20.0,23.0,...,28.0,19.0,17.0,5.0,15.0,23.0,15.0,31.0,20.0,22.0
8,6-я_армия_(Третий_рейх)_ru.wikipedia.org_deskt...,16.0,43.0,28.0,33.0,43.0,34.0,21.0,57.0,21.0,...,49.0,50.0,39.0,69.0,66.0,34.0,47.0,53.0,40.0,51.0
9,AKB48_41stシングル選抜総選挙_ja.wikipedia.org_all-acces...,1086.0,946.0,1046.0,2939.0,1527.0,1074.0,1129.0,2012.0,1643.0,...,108.0,124.0,108.0,94.0,217.0,157.0,177.0,162.0,174.0,123.0


In [286]:
def get_weekends_workends(train):
    dates = train.set_index('Page').columns[:-2]
    dates = dates.astype('datetime64[ns]')
    sundays = []
    saturdays = []
    workends = []
    for date in dates :
        if date.dayofweek//6 == 1 :
            sundays.append(date.date())
        else:
            if date.dayofweek//5 == 1 :
                saturdays.append(date.date())
            else: 
                workends.append(date.date())
    sundays = [sunday.strftime('%Y-%m-%d') for sunday in sundays]
    saturdays = [saturday.strftime('%Y-%m-%d') for saturday in saturdays]
    workends = [workend.strftime('%Y-%m-%d') for workend in workends]
    return sundays, saturdays, workends

def split_df(df):
    sundays, saturdays, workends = get_weekends_workends(df)
    columns_sundays = np.append('Page', sundays)
    columns_saturdays = np.append('Page', saturdays)
    columns_workends = np.append('Page', workends)
    return df[columns_sundays], df[columns_saturdays], df[columns_workends]

def get_transform_data2(train):
    train_df = pd.melt(train, id_vars=['Page'], value_vars=train.columns[1: ], var_name='date', value_name='Visits')
    return train_df 

train_sundays, train_saturdays, train_workends = split_df(train)
test_sundays, test_saturdays, test_workends = split_df(test)
test_df_sundays, test_df_saturdays, test_df_workends = get_transform_data2(test_sundays), get_transform_data2(test_saturdays), get_transform_data2(test_workends)
train_df_sundays, train_df_saturdays, train_df_workends = get_transform_data2(train_sundays), get_transform_data2(train_sundays), get_transform_data2(train_workends)




In [287]:
Fibonacci = FibonacciGeniya
t_wrk = predict_median_medians(Fibonacci, train_workends, test_df_workends)
t_week = predict_median_medians(Fibonacci, train_weekends, test_df_weekends)
n_d = pd.concat([t_week, t_wrk], axis=0)
pandas_smape(n_d)

46.05204881622072

In [290]:
Fibonacci = FibonacciGeniya
t_wrk = predict_median_medians(Fibonacci, train_workends, test_df_workends)
t_sat = predict_median_medians(Fibonacci, train_saturdays, test_df_saturdays)
t_sun = predict_median_medians(Fibonacci, train_sundays, test_df_sundays)
n_d = pd.concat([t_sat, t_sun, t_wrk], axis=0)
pandas_smape(n_d)

47.130957180179976

In [319]:
def split_df2(df):
    mf,twt,wends = get_days(df)
    columns_mf = np.append('Page', mf)
    columns_twt = np.append('Page', twt)
    columns_wends = np.append('Page', wends)
   
    return df[columns_mf], df[columns_twt], df[columns_wends]

def get_days(train):
    dates = train.set_index('Page').columns[:-2]
    dates = dates.astype('datetime64[ns]')
    mnfr = []
    twt= []
   
    wends = []
    
    for date in dates :
        if date.dayofweek == 0:
            mnfr.append(date.date())
        elif date.dayofweek == 1:
            twt.append(date.date())
        elif date.dayofweek == 2:
            twt.append(date.date()) 
        elif date.dayofweek == 3:
            twt.append(date.date())
        elif date.dayofweek == 4:
            mnfr.append(date.date())
        elif date.dayofweek == 5:
            wends.append(date.date())
        else:
            wends.append(date.date())
    mnfr = [weekend.strftime('%Y-%m-%d') for weekend in mnfr]
    twt = [weekend.strftime('%Y-%m-%d') for weekend in twt]
   
    wends = [weekend.strftime('%Y-%m-%d') for weekend in wends]
   
    return mnfr, twt,wends

In [321]:
tr_mnfr, tr_twt,  tr_wends = split_df2(train)
tst_mnfr, tst_twt, tst_wends = split_df2(test)

tr_df_mnfr, tr_df_twt,  tr_df_wends = get_transform_data2(tr_mnfr), get_transform_data2(tr_twt),  get_transform_data2(tr_wends)
tst_df_mnfr, tst_df_twt, tst_df_wends = get_transform_data2(tst_mnfr), get_transform_data2(tst_twt), get_transform_data2(tst_wends)


In [393]:
Fibonacci  = [1, 2, 3, 5, 8 ,58, 74, 85, 102,117, 154, 168, 179,180,182]

t_mnfr = predict_median_medians(Fibonacci, tr_mnfr, tst_df_mnfr)
t_twt = predict_median_medians(Fibonacci, tr_twt, tst_df_twt)

t_wends = predict_median_medians(Fibonacci, tr_wends, tst_df_wends)
n_d = pd.concat([t_mnfr, t_twt,  t_wends], axis=0)
pandas_smape(n_d)

45.903460527364985

In [407]:
Fibonacci  = [1,  2, 3, 5, 8, 58, 74 ,85 ,102 ,117, 154, 168, 179,180,182]

t_mnfr = predict_median_medians(Fibonacci, tr_mnfr, tst_df_mnfr)
t_twt = predict_median_medians(Fibonacci, tr_twt, tst_df_twt)

t_wends = predict_median_medians(Fibonacci, tr_wends, tst_df_wends)
n_d = pd.concat([t_mnfr, t_twt,  t_wends], axis=0)
pandas_smape(n_d)

45.95914794213086

In [315]:
n_d = pd.concat([t_m, t_twt, t_f, t_wends], axis=0)
pandas_smape(n_d)

47.138309583856476

In [None]:
tr_mn, tr_tu, tr_wd, tr_th, tr_fr, tr_st, tr_sn = split2(train)
test_sundays, test_saturdays, test_workends = split_df(test)
test_df_sundays, test_df_saturdays, test_df_workends = get_transform_data2(test_sundays), get_transform_data2(test_saturdays), get_transform_data2(test_workends)
train_df_sundays, train_df_saturdays, train_df_workends = get_transform_data2(train_sundays), get_transform_data2(train_sundays), get_transform_data2(train_workends)


Fibonacci = FibonacciGeniya
t_wrk = predict_median_medians(Fibonacci, train_workends, test_df_workends)
t_week = predict_median_medians(Fibonacci, train_weekends, test_df_weekends)
n_d = pd.concat([t_week, t_wrk], axis=0)
pandas_smape(n_d)

# Чому так ???????????????????????????

In [32]:
# r = 1.61803398875
# Windows = np.round(r**np.arange(0,9) * 7)

In [None]:
Fibonacci  = [1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377]