In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your data
data = pd.read_csv('data_clean.csv')

# Let's assume you want an 80-20 train-test split
train_data, test_data = train_test_split(data, test_size=0.01, shuffle=False)

# Save them if you want
train_data.to_csv('train.csv', index=False)
test_data.to_csv('test.csv', index=False)

print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

Train shape: (928988, 7)
Test shape: (9384, 7)


## ARIMA model

In [None]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score
from statsmodels.tsa.arima.model import ARIMA, ARIMAResults
import numpy as np
import pandas as pd
from tqdm import tqdm


def train_model(
    model: ARIMA,
    train_data: pd.DataFrame,
    order: tuple
) -> ARIMAResults:
    model = model(train_data, order=order)
    model_fit = model.fit()
    return model_fit


def evaluate_model(
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
    model: ARIMA,
    minutes: int = 4 * 60,
    order: tuple | None = None
) -> None:
    mses = []
    rmses = []
    r2s = []

    num_batch = len(test_data) // minutes - 1
    
    print("Training initial model...")
    model_fit = train_model(model, train_data, order)
    print("Trainig ended.")
    
    train_tqdm = tqdm(range(num_batch), total=num_batch)

    for batch in train_tqdm:
        
        forecast = model_fit.forecast(steps=minutes)
        
        y_true = test_data.iloc[batch * minutes : (batch + 1) * minutes]

        mse = mean_squared_error(y_true, forecast)
        rmse = root_mean_squared_error(y_true, forecast)
        r2 = r2_score(y_true, forecast)
        
        mses.append(mse)
        rmses.append(rmse)
        r2s.append(r2)
        
        model_fit = model_fit.append(test_data.iloc[batch * minutes : (batch + 1) * minutes], refit=False)
        
        train_tqdm.set_description(
            f"MSE: {np.mean(mses):0.3f}, RMSE: {np.mean(rmses):0.3f}, R2: {np.mean(r2s):0.3f}"
        )


In [9]:
train_data = pd.read_csv('train.csv', parse_dates=["Date"], index_col=0)["2013-01-01 00:00:00":]
train_data = train_data.asfreq(pd.infer_freq(train_data.index))

test_data = pd.read_csv('test.csv', parse_dates=["Date"], index_col=0)
test_data = test_data.asfreq(pd.infer_freq(test_data.index))

time_series = 2

evaluate_model(
    train_data=train_data.iloc[:, time_series],
    test_data=test_data.iloc[:, time_series],
    model=ARIMA,
    order=(2, 0, 2)
)

Training initial model...
Trainig ended.


MSE: 0.960, RMSE: 0.752, R2: -1.998: 100%|██████████| 38/38 [00:31<00:00,  1.20it/s]


# VAR model