In [7]:
import pandas as pd
import numpy as np
from aria.models.pooling import PoolingModel
import matplotlib.pyplot as plt
from fbprophet import Prophet
from datetime import datetime
%matplotlib inline
import os
def smape(a, b):
    return np.mean(np.abs(a - b) / ((np.abs(a) + np.abs(b)) / 2))

def mape(a, b):
    return np.mean(np.abs(a - b) / (np.abs(a)))

def mse(a,b):
    return np.square(np.subtract(a,b)).mean()

In [8]:
def sigmoid(x, min, max, size):
    # y = C / 1 + e ^-Bx- size/2) 
    # min, max를 가지며, (size/2, (min + max)/2)를 지남. B는 growth rate로 클수록 비균일증가(1설정시 shape커지면 직선 - >1/(max-min))
    # B로 적절한 값은? 가져야할 성질? (미분해서 B 설정)
    B = (max - min) / (max + min)**2
    return (max - min) / (1 + np.exp(-B * (x - size / 2))) + min

def info_gain(df, beta_min, beta_max):
    return [sigmoid(i, beta_min, beta_max, df.shape[0]) for i in range(df.shape[0])]

def cutoff_fit_q(df, i):
    df = df.copy()
#     print("cutoff: ",  df.loc[df['y'] > np.quantile(df.y,i), 'y'].shape[0])
    df.loc[df['y'] > np.quantile(df.y,i), 'y'] = None


    m = PoolingModel()
    m.add_seasonality(7, 3)

    m.fit(df)
    return m

def cutoff_fit(df, i):
    df = df.copy()

    u, s = np.mean(df['y']), np.std(df['y'])
#     print("cutoff: ", df.loc[df['y'] - u > i* s, 'y'].shape[0])
    df.loc[df['y'] - u > i* s, 'y'] = None

    m = PoolingModel()
    m.add_seasonality(7, 3)
    
    if df.shape[0] < 14:
        m = PoolingModel()
        m.add_seasonality(7, 3)
    elif df.shape[0] < 365: 
        m = PoolingModel()
        m.add_seasonality(7, 3)
        m.add_seasonality(30.4375, 5)
    else:
        m = PoolingModel()
        m.add_seasonality(7, 3)
        m.add_seasonality(30.4375, 5)
        m.add_seasonality(365.25, 10)
    m.fit(df)
    return m


In [9]:
def set_random(rn, df, dev_avg, beta_min, beta, beta_max , q_min, q_max):
    rng = np.random.RandomState(rn) # 0,1,2
    true_dev = rng.normal(dev_avg, 0.5) 
#     print("true_dev:", true_dev)
    
    ol_cnt = int(df.shape[0]*0.05)
    ol_date = rng.choice(df['ds'],ol_cnt)
    df['y'].iloc[np.where(df['ds'].isin(ol_date))[0]] *= true_dev
    
    yms = [(y,m) for y in np.unique(df.ds.dt.year) for m in np.unique(df.ds.dt.month)]
    
    errors = list()
    errors_cutoff = list()
    errors_adap_cutoff = list()
    errors_cutoff_q = list()
    errors_adap_cutoff_q = list()
    
    errors_lst = list()
    errors_cutoff_lst = list()
    errors_adap_cutoff_lst = list()
    errors_cutoff_q_lst  = list()
    errors_adap_cutoff_q_lst = list()
    
    gains = info_gain(df, beta_min, beta_max)

    history = df[(df['ds'].dt.year == df.ds.dt.year.iloc[0]) & (df['ds'].dt.month  ==  df.ds.dt.month.iloc[0])]
    
    for i, ym_i in enumerate(range(len(yms) - 1)):
        
        future = df[(df['ds'].dt.year == yms[ym_i + 1][0]) & (df['ds'].dt.month == yms[ym_i + 1][1])] 
        tf = pd.concat([history, future]).reset_index().copy()
        
        # 
        m = cutoff_fit(history, 100)
        tf['yhat_1'] = m.predict(tf)['yhat']
        error = smape(tf.y[-30:],tf.yhat_1[-30:])
        errors.append(error)

        #
        m = cutoff_fit(history, beta)
        tf['yhat_2'] = m.predict(tf)['yhat']
        error = smape(tf.y[-30:],tf.yhat_2[-30:])
        errors_cutoff.append(error)
        
        #
        beta = info_gain(df, beta_min, beta_max)[::-1][i * 30 - 1]
        m = cutoff_fit(history, beta)
        tf['yhat_3'] = m.predict(tf)['yhat']
        error = smape(tf.y[-30:],tf.yhat_3[-30:])
        errors_adap_cutoff.append(error)

        #
        m = cutoff_fit_q(history, 0.95)
        tf['yhat_4'] = m.predict(tf)['yhat']
        error = smape(tf.y[-30:],tf.yhat_4[-30:])
        errors_cutoff_q.append(error)
        
        #
        q = info_gain(df, q_min, q_max)[::-1][i * 30 - 1]
        m = cutoff_fit_q(history, q)
        tf['yhat_5'] = m.predict(tf)['yhat']
        error = smape(tf.y[-30:],tf.yhat_5[-30:])
        errors_adap_cutoff_q.append(error)
        
        history = pd.concat([history, future])
        
        
#         fig, ax = plt.subplots()
#         tf[['y', 'yhat_1', 'yhat_2',  'yhat_3','yhat_4',  'yhat_5',]].plot(ax = ax)
        
#         print("without cutoff: ", np.mean(errors))
#         print("fixed beta: ", np.mean(errors_cutoff))
#         print("adaptive: ", np.mean(errors_adap_cutoff))
#         print("q: ", np.mean(errors_cutoff_q))
#         print("adap_q: ", np.mean(errors_adap_cutoff_q))
        
        errors_lst.append(np.mean(errors))
        errors_cutoff_lst.append(np.mean(errors_cutoff))
        errors_adap_cutoff_lst.append(np.mean(errors_adap_cutoff))
        errors_cutoff_q_lst.append(np.mean(errors_cutoff_q))
        errors_adap_cutoff_q_lst.append(np.mean(errors_adap_cutoff_q))
        
    print("without cutoff: ", np.mean(errors_lst), np.std(errors_lst))
    print("fixed beta: ", np.mean(errors_cutoff_lst), np.std(errors_cutoff_lst))
    print("adaptive: ", np.mean(errors_adap_cutoff_lst), np.std(errors_adap_cutoff_lst))
    print("q", np.mean(errors_cutoff_q_lst), np.std(errors_cutoff_q_lst))
    print("adap_q", np.mean(errors_adap_cutoff_q_lst), np.std(errors_adap_cutoff_q_lst))
    return np.mean(errors_lst), np.mean(errors_cutoff_lst), np.mean(errors_adap_cutoff_lst), np.mean(errors_cutoff_q_lst), np.mean(errors_adap_cutoff_q_lst)

## Depending on sensitivity

In [10]:
DATA_PATH = './data/'
fnames = sorted(os.listdir(DATA_PATH ))

fnames = fnames[1:4]

no = list()
f1 = list()
f2 = list()
q1 = list()
q2 = list()

for fname in fnames:
    print(fname)

    df = pd.read_csv(os.path.join(DATA_PATH,fname), usecols = ['ds', 'y'])[-60:]
    df['y'] = pd.to_numeric(df['y'])
    df['ds'] = pd.to_datetime(df['ds'])

#     beta = df.loc[df['y'] > np.quantile(df.y,0.95), 'y'].mean()/df.loc[df['y'] < np.quantile(df.y,0.95), 'y'].mean()
    beta = 1.5
    
    rn = [1,2,3,4,5,6,7,8,9,10]
    for r in rn:
        a, b, c, d, e = set_random(r, df, 3, beta - 0.2, beta, beta + 0.2, 0.94, 0.96)
    
        no.append(a)
        f1.append(b)
        f2.append(c)
        q1.append(d)
        q2.append(e)
    
#     print(np.mean(a, axis = 1))

    print( "\n ====================================")
    #rn, dev_avg, beta_min, beta, beta_max , q_min, q_max

print("without cutoff: ", np.mean(no), np.std(no))
print("fixed beta: ", np.mean(f1), np.std(f1))
print("adaptive: ", np.mean(f2), np.std(f2))
print("q", np.mean(q1), np.std(q1))
print("adap_q", np.mean(q2), np.std(q2))

1milk.csv
without cutoff:  0.8792529730722618 0.0
fixed beta:  0.5449860696407706 0.0
adaptive:  0.5449860696407706 0.0
q 0.46327642297128463 0.0
adap_q 0.46327642297128463 0.0
without cutoff:  0.9020487442400493 0.0
fixed beta:  0.8516837976094486 0.0
adaptive:  0.8516837976094486 0.0
q 0.35551927003164 0.0
adap_q 0.35551927003164 0.0
without cutoff:  2.0 0.0
fixed beta:  0.8742985545953338 0.0
adaptive:  0.8742985545953338 0.0
q 0.3624543415004179 0.0
adap_q 0.3624543415004179 0.0
without cutoff:  1.6200278849251493 0.0
fixed beta:  1.6348497053521431 0.0
adaptive:  1.6348497053521431 0.0
q 0.5224334647117237 0.0
adap_q 0.5224334647117237 0.0
without cutoff:  1.7330650912964174 0.0
fixed beta:  2.0 0.0
adaptive:  2.0 0.0
q 0.5647671640374747 0.0
adap_q 0.5647671640374747 0.0
without cutoff:  2.0 0.0
fixed beta:  1.1670434423682694 0.0
adaptive:  1.1670434423682694 0.0
q 0.6406334670287882 0.0
adap_q 0.6406334670287882 0.0
without cutoff:  1.3847076062476267 0.0
fixed beta:  1.4991800

Fixing beta as 1.5 or 2 sometimes is better than setting beta depending on outlier_extremity 

## Depending on number of data

In [None]:
rn = [1,2,3,4,5,6,7,8,9,10]
for r in rn:
    df = pd.read_csv('data/2logistics.csv', usecols = ['ds', 'y'])[-90 * r:]
    df['y'] = pd.to_numeric(df['y'])
    df['ds'] = pd.to_datetime(df['ds'])
    beta = df.loc[df['y'] > np.quantile(df.y,0.95), 'y'].mean()/df.loc[df['y'] < np.quantile(df.y,0.95), 'y'].mean()
#         print(set_random(r, 3, 1.8, 2, 2.2, 0.94, 0.96))
#         a.append(list(*set_random(r, 3, 1.8, 2, 2.2, 0.94, 0.96)))
#     a, b, c, d, e = set_random(r, df, 3, 1.8, 2, 2.2, 0.94, 0.96)
    a, b, c, d, e = set_random(r, df, 3, beta - 0.2, beta, beta + 0.2, 0.94, 0.96)
    no.append(a)
    f1.append(b)
    f2.append(c)
    q1.append(d)
    q2.append(e)

#     print(np.mean(a, axis = 1))

print( "\n ====================================")
#rn, dev_avg, beta_min, beta, beta_max , q_min, q_max

print("without cutoff: ", np.mean(no), np.std(no))
print("fixed beta: ", np.mean(f1), np.std(f1))
print("adaptive: ", np.mean(f2), np.std(f2))
print("q", np.mean(q1), np.std(q1))
print("adap_q", np.mean(q2), np.std(q2))
    