In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
from tqdm import tqdm
import numpy as np
from darts.models import AutoARIMA
from darts import TimeSeries
import torch
import json
import pickle
sys.path.append('..')
from metric import *
from utils import list_of_timeseries_to_tensor, normalize_list
print("success")

In [None]:
# When the kernel was started inside a directory whose path ends in
# "notebooks", step up one level so the relative "data/..." and
# "results/..." paths used below resolve.
current_dir = os.getcwd()
if current_dir.endswith("notebooks"):
    os.chdir(os.path.dirname(current_dir))
os.getcwd()

In [5]:
def plot(features, targets, prediction):
    """Plot the observed series (features followed by targets) with the prediction overlaid.

    Only the last 700 observed points are drawn. The prediction is left-padded
    with ``None`` so that it lines up with the target section of the plotted
    (possibly truncated) series.

    Args:
        features: series shown first; must provide ``pd_dataframe()``.
        targets: series appended after ``features``; must provide
            ``pd_dataframe()`` and ``len()``.
        prediction: forecast to overlay on the target section; must provide
            ``pd_dataframe()``.

    Note:
        All values are flattened to one dimension, so alignment assumes
        single-column series (TODO confirm for multivariate input).
    """
    joint = list(np.array(features.pd_dataframe()).flatten()) + list(np.array(targets.pd_dataframe()).flatten())
    shown = joint[-700:]
    predicted = list(np.array(prediction.pd_dataframe()).flatten())
    # Pad relative to what is actually drawn: padding with the untruncated
    # length would shift the prediction right whenever `joint` has more than
    # 700 points.
    padding = max(len(shown) - len(targets), 0)
    plt.plot(shown, label="features")
    plt.plot([None] * padding + predicted, label="pred")
    plt.legend()
    plt.show()

In [None]:
# Read every file in data/processed/test as a parquet table and keep each
# one as a TimeSeries in `test_dataset` (in os.listdir order).
test_dataset = []
files = os.listdir("data/processed/test")
for file in tqdm(files):
    parquet_path = os.path.join("data/processed/test", file)
    df = pd.read_parquet(parquet_path)
    test_dataset.append(TimeSeries.from_dataframe(df))

In [6]:
def get_features_targets(df, features_length, targets_length):
    """Cut a random contiguous (features, targets) pair out of ``df``.

    A start position for the targets is drawn uniformly from all positions
    that leave room for ``features_length`` items before it and
    ``targets_length`` items from it onwards. The features are the
    ``features_length`` items immediately preceding the targets.

    Args:
        df: any object supporting ``len()`` and slice indexing.
        features_length: number of items in the returned features slice.
        targets_length: number of items in the returned targets slice.

    Returns:
        Tuple ``(features, targets)`` of adjacent slices of ``df``.

    Raises:
        ValueError: if ``df`` is shorter than
            ``features_length + targets_length``.
    """
    if len(df) < features_length + targets_length:
        raise ValueError("features and targets can´t be longer than data")

    min_start_targets = features_length
    max_start_targets = len(df) - targets_length
    # np.random.randint excludes its upper bound. Without the +1 the last
    # valid start is never drawn, and an exactly-fitting input
    # (min_start_targets == max_start_targets) makes randint raise.
    targets_start = np.random.randint(min_start_targets, max_start_targets + 1)
    targets_end = targets_start + targets_length
    features_start = targets_start - features_length
    features, targets = df[features_start:targets_start], df[targets_start:targets_end]
    assert len(targets) == targets_length
    assert len(features) == features_length
    return features, targets

# Get predictions

In [8]:
#assert False # dont overwrite progress if not necessary
# NOTE: running this cell resets all accumulated results to empty lists.
WINDOW_SIZE = 504  # number of points per series used as model input
# Parallel accumulators: entry k of each list belongs to the same sample.
all_preds, all_targets, all_features, all_factors = [], [], [], []

In [None]:
# Fit one AutoARIMA model per test series on a random window of WINDOW_SIZE
# points and forecast the FORECAST_HORIZON points that follow it.
FORECAST_HORIZON = 14  # number of trailing points held out as targets
sample_length = WINDOW_SIZE + FORECAST_HORIZON

for i in tqdm(range(len(test_dataset))):
    # Resume support: skip indices below the number of stored results.
    # NOTE(review): too-short series are skipped without storing a result, so
    # after an interrupted run len(all_preds) can be smaller than the number
    # of series already visited, and some would be processed again. Confirm
    # whether that matters for the evaluation.
    if i < len(all_preds):
        continue
    series = test_dataset[i]
    if len(series) < sample_length:
        continue
    # np.random.randint excludes its upper bound, so +1 is required for the
    # last valid window to be selectable. An exactly-fitting series yields
    # start == 0.
    start = np.random.randint(0, len(series) - sample_length + 1)
    end = start + sample_length
    selection = series.values().flatten()[start:end]
    # normalize_list is a project helper; presumably it scales the window
    # using its first WINDOW_SIZE points and returns the factors needed to
    # undo the scaling (TODO confirm against utils).
    selection, factors = normalize_list(selection, WINDOW_SIZE)
    selection = TimeSeries.from_dataframe(pd.DataFrame(selection))
    features, targets = selection[:-FORECAST_HORIZON], selection[-FORECAST_HORIZON:]

    model = AutoARIMA(start_p=2,
                      max_p=12,
                      start_q=2,
                      max_q=12,
                      d=None,           # let model determine 'd'
                      # NOTE(review): no seasonal period `m` is passed. With a
                      # pmdarima backend the default m=1 turns seasonality
                      # off, so this may be plain ARIMA - confirm.
                      seasonal=True,
                      D=None,
                      trace=False,
                      error_action='ignore',
                      suppress_warnings=True,
                      max_order=np.inf,
                      stepwise=True,    # stepwise search instead of trying every (p, q) combination
                      maxiter=500,
                     )

    model.fit(features)
    pred = model.predict(len(targets))
    #plot(features, targets, pred)

    # Keep the four result lists aligned: one entry per processed series.
    all_preds.append(pred)
    all_targets.append(targets)
    all_features.append(features)
    all_factors.append(factors)

In [None]:
# Persist predictions, targets, features and normalisation factors together
# as one pickled tuple, keyed by the window size.
results_path = f"results/arima/arima_results_{WINDOW_SIZE}"
with open(results_path, "wb") as file:
    pickle.dump((all_preds, all_targets, all_features, all_factors), file)

In [9]:
# Restore the four result lists written by the save cell.
# pickle can execute arbitrary code on load; only open files produced by
# this notebook.
results_path = f"results/arima/arima_results_{WINDOW_SIZE}"
with open(results_path, "rb") as file:
    loaded_results = pickle.load(file)
all_preds, all_targets, all_features, all_factors = loaded_results

In [10]:
# Convert the per-sample predictions and targets with the project helper so
# the metrics can consume them; the cell displays how many samples exist.
_preds, _targets = (
    list_of_timeseries_to_tensor(series_list)
    for series_list in (all_preds, all_targets)
)
len(all_preds)

7624

In [None]:
# Visual sanity check: for the first 10 samples draw the observed series
# (features followed by targets) and overlay the forecast on the target part.
for sample_idx in range(10):
    features = list(all_features[sample_idx].values().flatten())
    targets = list(all_targets[sample_idx].values().flatten())
    preds = list(all_preds[sample_idx].values().flatten())
    observed = features + targets
    # None entries leave the feature section of the forecast line blank.
    forecast_line = [None] * len(features) + preds
    plt.plot(observed)
    plt.plot(forecast_line)
    plt.show()

In [11]:
# Report every metric in `all_metrices` on the stored predictions.
print("WINDOW_SIZE:", len(all_features[0]))

loss_functions = all_metrices

for metric in all_metrices:
    # Metrics carrying this marker attribute are also given the input
    # windows and the normalisation factors; all others receive only
    # predictions and targets.
    wants_context = hasattr(metric, "pass_features_and_normal_factors")
    if wants_context:
        loss = metric(_preds, _targets, all_features, all_factors)
    else:
        loss = metric(_preds, _targets)

    print(f"Loss for {metric}: {loss}")

WINDOW_SIZE: 504
Loss for L1Loss(): 0.05782635882496834
Loss for MedianAbsoluteError(): 0.042112261056900024
Loss for MSELoss(): 0.011655553244054317
Loss for MedianSquaredError(): 0.0024987126234918833
Loss for HuberLoss(): 0.004032130818814039
Loss for MeanLastValueError(): 0.08430555462837219
Loss for MedianLastValueError(): 0.05821645259857178
Loss for MeanTotalReturnError(): 0.03707563132047653
Loss for MedianTotalReturnError(): 0.02032783254981041
Loss for GeometricMeanDailyReturnError(): 0.006707353680447293
Loss for MeanFinalReturnError(): 0.053743549101764085
Loss for MedianFinalReturnError(): 0.025577385568290013
Market Outperformance Analysis (top 5.0%):
Market Benchmark: 0.004550777352089498
Portfolio Return: 0.025015971281624406
Outperformance: 0.020465193929534907
Loss for BackTestingProfitError(): 0.020465193929534907
