In [None]:
import pandas as pd
import numpy as np
import pmdarima as pmd
import pickle

# Notebook display limits: show up to 9999 rows and 100 columns of a frame
# before pandas truncates the rendered output.
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_columns", 100)
    
def smape(A, F):
    """Symmetric mean absolute percentage error between actuals and forecasts.

    Computes ``100 / n * sum(2 * |F - A| / (|A| + |F|))``.

    Parameters
    ----------
    A : array-like
        Actual (observed) values.
    F : array-like
        Forecast values, same length as ``A``.

    Returns
    -------
    float
        The sMAPE in percent (0 for a perfect forecast).

    Notes
    -----
    * Inputs are converted to plain NumPy arrays so the comparison is
      positional. Subtracting two pandas Series with different indexes would
      align on labels and produce NaNs, which ``np.sum`` on a Series then
      silently skips, yielding a misleading score of 0.
    * A term where both the actual and the forecast are 0 is a perfect
      prediction; it contributes 0 instead of the NaN that 0/0 would give.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)

    numerator = 2 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)

    # Only divide where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with.
    terms = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator),
                      where=denominator != 0)
    return 100 / len(actual) * np.sum(terms)

def fit_arima(ts, multiplier, d, D, m, seasonal = True):
    """Run ``pmd.auto_arima`` on the ``t`` column of ``ts``.

    Parameters
    ----------
    ts : object with a ``t`` attribute
        Training data; ``ts.t`` is the series handed to ``auto_arima``.
    multiplier : int
        Factor applied to ``m`` to get the seasonal period passed to the
        search (``m * multiplier``).
    d, D : int
        Non-seasonal and seasonal differencing orders, forwarded unchanged.
    m : int
        Base seasonal period. A value of 1 forces ``seasonal`` to False.
    seasonal : bool, default True
        Whether to search seasonal terms (ignored when ``m == 1``).

    Returns
    -------
    The object returned by ``pmd.auto_arima``.
    """
    # A base period of 1 means "no seasonality", whatever the caller asked for.
    use_seasonal = False if m == 1 else seasonal

    search_options = dict(
        d=d,
        D=D,
        # Non-seasonal order search space.
        start_p=0, start_q=0, max_p=3, max_q=3,
        # Seasonal order search space.
        start_P=0, start_Q=0, max_P=3, max_Q=3,
        m=m * multiplier,
        seasonal=use_seasonal,
        test="adf",
        trace=True,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
    )
    return pmd.auto_arima(ts.t, **search_options)

def optimize_arima(ts, multiplier, val):
    """Fit one auto-ARIMA model per (d, D, m) candidate and score each on ``val``.

    For every candidate the model is fitted with ``fit_arima``, asked to
    forecast ``len(val)`` steps, and scored with ``smape`` against ``val.t``.

    Parameters
    ----------
    ts : object with a ``t`` attribute
        Training data forwarded to ``fit_arima``.
    multiplier : int
        Seasonal-period multiplier forwarded to ``fit_arima``.
    val : pandas.DataFrame
        Validation data; its ``t`` column holds the actual values.

    Returns
    -------
    list of (score, model)
        One entry per candidate, in candidate order.

    Side effects
    ------------
    Each ``(score, model)`` pair is also appended to the module-level list
    ``models``, which must exist before this function is called; existing
    callers read their results from there.
    """
    combinations = [(0,0,24),(0,1,24),(1,0,24),(1,1,24),(0,0,1),(1,0,1)]

    # Score positionally on plain arrays. If the forecast comes back as a
    # pandas Series its index need not match val's index, and label alignment
    # would turn every term into NaN.
    actual = np.asarray(val.t)

    scored = []
    for c in combinations:
        model = fit_arima(ts, multiplier, *c)
        pred = np.asarray(model.predict(len(val)))
        score = smape(actual, pred)
        print()
        entry = (score, model)
        models.append(entry)
        scored.append(entry)
    return scored

In [None]:
# Datasets to process; each is read from data/<name>_<sampling rate>.csv.
data_names = ["avazu","IoT","wiki_de","wiki_en","horton","retailrocket"]

# Sampling rates and, position by position, the factor that scales the
# 24-sample window used for 1h data (4 for 15min data).
sampling_rates = ["1h","15min"]
multipliers = [1,4]
train_test_split = 0.8
train_val_split = 0.75

for data_name in data_names:
    for sampling_rate, multiplier in zip(sampling_rates, multipliers):
        print()
        print()
        print(data_name, sampling_rate)
        df = pd.read_csv("data/"+data_name+"_"+sampling_rate+".csv", index_col=0, parse_dates=True)

        df["t"] = df.messages
        df = df.drop(["messages"], axis=1)
        df = df.dropna()
        # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
        df = df.astype(int)

        # Chronological split: first 80% train, last 20% test.
        split_at = int(len(df)*train_test_split)
        train = df.iloc[:split_at]
        test = df.iloc[split_at:]

        # The last 25% of train is the validation set; model selection is
        # fitted on only the final 150 samples that precede it.
        val_at = int(len(train)*train_val_split)
        val = train.iloc[val_at:]
        train_val = train.iloc[:val_at]
        train_val = train_val.iloc[-150:]

        print("Train shape:", train_val.shape)
        print("Val shape:", val.shape)

        # optimize_arima appends its (score, model) pairs to the global `models`.
        models = []
        optimize_arima(train_val, multiplier, val)
        # Lowest sMAPE first; NaN scores are pushed to the end so a failed
        # evaluation can never be picked as the best model.
        models = sorted(models, key=lambda x: (np.isnan(x[0]), x[0]))

        model = models[0][1]
        # Feed the validation observations to the selected model. Pass the
        # series itself (as for the test updates below), not the whole frame.
        model.update(val.t)

        # update SARIMA every 24 hours to new samples.
        step = multiplier*24
        n_test = len(test)
        # Forecasts are collected in a float buffer and assigned to the frame
        # once; chained `df[col].iloc[...] = ...` assignment does not reliably
        # write through to the frame.
        forecast = np.zeros(n_test, dtype=float)

        start = 0
        while start < n_test:
            if start > 0:
                # Reveal the window that was just forecast before predicting
                # the next one.
                model.update(test.t.iloc[start-step:start])
            # The final window may be shorter than `step`.
            horizon = min(step, n_test - start)
            forecast[start:start+horizon] = np.asarray(model.predict(horizon))
            start += step

        results_df = test.t.to_frame()
        results_df["SARIMA"] = forecast

        with open("models/"+data_name+"_"+sampling_rate+".pkl","wb") as model_file:
            pickle.dump(model, model_file)
        results_df.to_csv("results/"+data_name+"_"+sampling_rate+"_results.csv")