In [2]:
import pandas as pd
import numpy as np
import os.path
import os

import numba
from typing import Tuple, Dict, Collection, List

In [3]:
def make_holidays(tagged, start, end,
                  langs=('de', 'en', 'es', 'fr', 'ja', 'ru', 'zh'),
                  data_dir='data/holidays'):
    """Build a per-row matrix of daily holiday counts.

    For every language code in ``langs`` the pickle ``<data_dir>/<lang>.pkl`` is
    loaded, rows whose ``dw`` flag is set are dropped, and the remaining rows
    are counted per calendar day. The per-language counts are then attached to
    each row of ``tagged`` by matching its ``country`` value against the
    language code.

    :param tagged: DataFrame with a ``country`` column; its index becomes the
        index of the result.
    :param start: first date label of the column slice taken from the counts.
    :param end: last date label of the column slice taken from the counts.
    :param langs: language codes to load; each must have a pickle in ``data_dir``.
    :param data_dir: directory holding the ``<lang>.pkl`` files.
    :return: ``int8`` DataFrame indexed like ``tagged`` with a ``DatetimeIndex``
        of columns; rows whose ``country`` has no holiday data are all zeros.
    """
    def read_df(lang):
        # NOTE(security): unpickling executes arbitrary code - only point
        # data_dir at files you produced yourself.
        result = pd.read_pickle(os.path.join(data_dir, '%s.pkl' % lang))
        # Keep rows where `dw` is False and count them per day; the Series is
        # named after the language so it becomes that language's row label.
        return result[~result.dw].resample('D').size().rename(lang)

    # One row per language, one column per date.
    holidays = pd.DataFrame([read_df(lang) for lang in langs])
    # Days missing for a language mean "no holidays", hence zero.
    holidays = holidays.loc[:, start:end].fillna(0)
    # Left join: countries without holiday data get NaN, which is zeroed too.
    result = tagged[['country']].join(holidays, on='country').drop('country', axis=1).fillna(0).astype(np.int8)
    result.columns = pd.DatetimeIndex(result.columns.values)
    return result

In [2]:
def single_autocorr(series, lag):
    """Correlation coefficient between a series and itself shifted by ``lag``.

    The two overlapping windows ``series[lag:]`` and ``series[:len - lag]`` are
    each centred on their own mean before being correlated.

    :param series: 1-D numeric array.
    :param lag: non-negative shift in samples; ``0`` correlates the series
        with itself.
    :return: the correlation coefficient, or ``0`` when either window has zero
        variance.
    """
    s1 = series[lag:]
    # Use an explicit stop index: `series[:-lag]` is empty for lag == 0, which
    # made the element-wise product below fail on mismatched shapes. The clamp
    # keeps the window empty (as before) when lag exceeds the series length.
    s2 = series[:max(len(series) - lag, 0)]
    ms1 = np.mean(s1)
    ms2 = np.mean(s2)
    ds1 = s1 - ms1
    ds2 = s2 - ms2
    divider = np.sqrt(np.sum(ds1 * ds1)) * np.sqrt(np.sum(ds2 * ds2))
    # A zero divider means at least one window is constant; report "no
    # correlation" instead of dividing by zero.
    return np.sum(ds1 * ds2) / divider if divider != 0 else 0

In [3]:
def batch_autocorr(data, lag, starts, ends, threshold, backoffset=0):
    """Compute a smoothed lag autocorrelation for every row of ``data``.

    Each row is cut to ``[starts[i], min(ends[i], n_days - backoffset))`` and
    its autocorrelation is taken at ``lag - 1``, ``lag`` and ``lag + 1``; the
    three values are blended with weights 0.25 / 0.5 / 0.25.

    :param data: 2-D array, one series per row.
    :param lag: central lag in samples.
    :param starts: per-row start index of the usable window.
    :param ends: per-row end index of the usable window (used as an exclusive
        slice bound).
    :param threshold: minimum ratio ``window_length / lag`` a row needs to be
        evaluated.
    :param backoffset: number of trailing columns that must never be used.
    :return: ``float64`` array with one value per row; ``NaN`` for rows whose
        window is too short relative to ``lag``.
    """
    n_series = data.shape[0]
    n_days = data.shape[1]
    max_end = n_days - backoffset
    corr = np.empty(n_series, dtype=np.float64)
    for i in range(n_series):
        end = min(ends[i], max_end)
        real_len = end - starts[i]
        # How many full lag periods the usable window covers.
        support = real_len / lag
        if support > threshold:
            series = data[i][starts[i]:end]
            c_mid = single_autocorr(series, lag)
            c_prev = single_autocorr(series, lag - 1)
            c_next = single_autocorr(series, lag + 1)
            # Blend neighbouring lags so an off-by-one-day shift still registers.
            corr[i] = 0.5 * c_mid + 0.25 * c_prev + 0.25 * c_next
        else:
            # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
            corr[i] = np.nan
    return corr

In [4]:
def find_start_end(data: np.ndarray):
    """Locate the first and last strictly positive entry of every row.

    NaN entries never count as positive.

    :param data: 2-D array, one row per page and one column per day.
    :return: tuple ``(start_idx, end_idx)`` of ``int32`` arrays with one entry
        per row; both hold ``-1`` for rows without any positive value.
    """
    page_count, day_count = data.shape[0], data.shape[1]
    first_hit = np.full(page_count, -1, dtype=np.int32)
    last_hit = np.full(page_count, -1, dtype=np.int32)
    if day_count == 0:
        # Nothing to scan; every row keeps the -1 sentinel.
        return first_hit, last_hit

    # NaN > 0 evaluates to False, so this single comparison covers both the
    # "not NaN" and the "greater than zero" conditions.
    with np.errstate(invalid='ignore'):
        positive = data > 0
    has_hit = positive.any(axis=1)
    hits = positive[has_hit]
    # argmax on a boolean row returns the position of its first True.
    first_hit[has_hit] = hits.argmax(axis=1)
    # Scanning the reversed row finds the last True; map it back to the
    # original column numbering.
    last_hit[has_hit] = day_count - 1 - hits[:, ::-1].argmax(axis=1)
    return first_hit, last_hit

In [5]:
def prepare_data(start, end, valid_threshold):
    """Load the frame for ``[start, end]``, drop short rows and log-transform it.

    A row is dropped when the distance between its first and last positive
    value, divided by the number of columns, is below ``valid_threshold``.

    :param start: passed through to ``read_x``.
    :param end: passed through to ``read_x``.
    :param valid_threshold: minimum fraction of the column range a row must span.
    :return: tuple ``(log1p of the kept rows with NaN replaced by 0,
        boolean NaN mask of the kept rows, start indices, end indices)``.
    """
    frame = read_x(start, end)
    first_idx, last_idx = find_start_end(frame.values)

    # True for rows whose active span is too short to be useful.
    too_short = (last_idx - first_idx) / frame.shape[1] < valid_threshold
    print("Masked %d pages from %d" % (too_short.sum(), len(frame)))

    keep = ~too_short
    kept = frame[keep]
    # Remember where values were missing before they are zero-filled.
    missing = pd.isnull(kept)
    log_values = np.log1p(kept.fillna(0))
    return log_values, missing, first_idx[keep], last_idx[keep]

In [6]:
def lag_indexes(begin, end):
    """Map every date in ``[begin, end]`` to the position of its lagged date.

    Four lags are produced: 3, 6, 9 and 12 calendar months back.

    :param begin: first date of the range (anything ``pd.date_range`` accepts).
    :param end: last date of the range.
    :return: list of four ``int16`` Series indexed by the daily date range; each
        value is the 0-based position of the lagged date inside that range, or
        ``-1`` when the lagged date falls outside it.
    """
    dr = pd.date_range(begin, end)
    # Position of every date inside the range.
    base_index = pd.Series(np.arange(0, len(dr)), index=dr)

    def lag(offset):
        dates = dr - offset
        # Lagged dates before `begin` are not in the index. `.loc` with missing
        # labels raises KeyError on pandas >= 1.0, whereas `reindex` yields NaN
        # for them, which is then turned into the -1 sentinel.
        positions = base_index.reindex(dates).fillna(-1).astype(np.int16)
        return pd.Series(data=positions.values, index=dr)

    return [lag(pd.DateOffset(months=m)) for m in (3, 6, 9, 12)]

In [7]:
def make_page_features(pages: np.ndarray):
    """Extract per-page features, indexed by page.

    ``extractor.extract`` is run on ``pages``; its ``page`` column becomes the
    index and the ``term`` and ``marker`` columns are discarded.

    :param pages: array of page identifiers handed to the extractor.
    :return: the remaining feature columns, indexed by ``page``.
    """
    by_page = extractor.extract(pages).set_index('page')
    return by_page.drop(['term', 'marker'], axis=1)

In [8]:
def uniq_page_map(pages):
    """Group consecutive per-agent entries that belong to the same page.

    Every entry must look like ``<page>_<agent>`` where ``<page>`` ends in one
    of the supported wiki domains. A new output row is started each time the
    page part differs from the previous entry, so entries of one page must be
    adjacent in ``pages`` to share a row.

    :param pages: sequence of ``<page>_<agent>`` strings.
    :return: ``int32`` array of shape ``(n_unique_pages, 4)``; column ``k`` holds
        the position in ``pages`` of the entry for agent ``k`` (see ``agents``
        below) or ``-1`` when the page has no entry for that agent.
    :raises AssertionError: if an entry does not match the expected format.
    :raises KeyError: if the agent part is not one of the four known agents.
    """
    import re
    result = np.full([len(pages), 4], -1, dtype=np.int32)
    # Raw string: `\.` in a regular literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    pat = re.compile(r'(.+(?:(?:wikipedia\.org)|(?:commons\.wikimedia\.org)|(?:www\.mediawiki\.org)))_([a-z_-]+?)')
    prev_page = None
    num_page = -1
    agents = {'all-access_spider': 0, 'desktop_all-agents': 1, 'mobile-web_all-agents': 2, 'all-access_all-agents': 3}
    for i, entity in enumerate(pages):
        match = pat.fullmatch(entity)
        assert match, "Unrecognized page entry: %r" % (entity,)
        page = match.group(1)
        agent = match.group(2)
        if page != prev_page:
            # First entry of a new page: move on to the next output row.
            prev_page = page
            num_page += 1
        result[num_page, agents[agent]] = i
    # The array was sized for the worst case (every entry a distinct page).
    return result[:num_page + 1]

In [10]:
def encode_page_features(df):
    """One-hot encode every column of ``df`` and standardise the indicators.

    :param df: DataFrame whose columns are each encoded independently.
    :return: dict mapping ``str(column label)`` to a DataFrame of indicator
        columns, each shifted by its mean and divided by its standard deviation.
    """
    encoded = {}
    for column in df:
        dummies = pd.get_dummies(df[column], drop_first=False)
        centered = dummies - dummies.mean()
        encoded[str(column)] = centered / dummies.std()
    return encoded

In [11]:
def normalize(values: np.ndarray):
    """Shift ``values`` by their mean and divide by their standard deviation.

    :param values: numeric array.
    :return: the standardised values, same shape as the input.
    """
    center = values.mean()
    spread = np.std(values)
    return (values - center) / spread
