In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned dataset.
data = pd.read_csv('data_clean.csv')

# Fraction of rows held out for testing (1%, i.e. a 99/1 split).
TEST_FRACTION = 0.01

# shuffle=False keeps the original row order, so the test set is the final
# TEST_FRACTION of the rows rather than a random sample.
train_data, test_data = train_test_split(data, test_size=TEST_FRACTION, shuffle=False)

# Persist both splits; a later cell reloads them from these CSV files.
train_data.to_csv('train.csv', index=False)
test_data.to_csv('test.csv', index=False)

print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

Train shape: (928988, 7)
Test shape: (9384, 7)


## ARIMA model

In [1]:
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA, ARIMAResults
import numpy as np
import pandas as pd
from tqdm import tqdm


def train_model(
    model: type[ARIMA],
    train_data: pd.DataFrame,
    order: tuple
) -> ARIMAResults:
    """Instantiate a model class on the training data and fit it.

    Args:
        model: The model *class* (not an instance). It is called as
            ``model(train_data, order=order)`` and the resulting object must
            expose a ``fit()`` method.
        train_data: Observations passed to the model constructor as its first
            positional argument.
        order: Model order forwarded to the constructor as the ``order``
            keyword.

    Returns:
        Whatever ``fit()`` returns for the constructed model.
    """
    # Use a separate name so the `model` parameter keeps referring to the class.
    estimator = model(train_data, order=order)
    return estimator.fit()


def evaluate_model(
    train_data: pd.DataFrame | pd.Series,
    test_data: pd.DataFrame | pd.Series,
    model: type[ARIMA],
    minutes: int = 4 * 60,
    order: tuple | None = None
) -> None:
    """Walk-forward evaluation of a model over consecutive test windows.

    The model is fitted once on ``train_data``. The test data is then consumed
    in consecutive windows of ``minutes`` rows: for each window the fitted
    model forecasts ``minutes`` steps ahead, the forecast is scored against
    the window with mean squared error, and the window's observations are
    appended to the fitted results (``refit=False``) before the next forecast.
    Per-window MSEs and their average are printed.

    Args:
        train_data: Observations used for the initial fit.
        test_data: Observations to forecast; indexed positionally with
            ``iloc``. Trailing rows that do not fill a complete window are
            ignored.
        model: Model class handed to ``train_model``.
        minutes: Number of rows per evaluation window and the forecast
            horizon. The name suggests one row per minute - TODO confirm the
            sampling rate of the data.
        order: Model order. ``None`` falls back to ``(0, 0, 0)``, the default
            documented for ARIMA.

    Raises:
        ValueError: If ``minutes`` is not positive.
    """
    if minutes <= 0:
        raise ValueError(f"minutes must be a positive integer, got {minutes}")

    # Substitute ARIMA's documented default explicitly rather than relying on
    # how the constructor treats order=None.
    if order is None:
        order = (0, 0, 0)

    # Every complete window is evaluated. (The previous `- 1` skipped the last
    # complete window.)
    num_batch = len(test_data) // minutes
    if num_batch == 0:
        print(
            f"Not enough test data for a single batch: "
            f"{len(test_data)} rows available, {minutes} required."
        )
        return

    model_fit = train_model(model, train_data, order)

    mses = []
    for batch in tqdm(range(num_batch), total=num_batch):
        # Observations for this window; reused for scoring and for the update.
        actual = test_data.iloc[batch * minutes : (batch + 1) * minutes]

        forecast = model_fit.forecast(steps=minutes)

        # Evaluate the forecast
        mse = mean_squared_error(actual, forecast)
        print(f'Batch ({batch}): Mean Squared Error on test data: {mse:.4f}')
        mses.append(mse)

        # Extend the fitted results with the observed window without
        # re-estimating parameters. Nothing is forecast after the final
        # window, so the last update is skipped.
        if batch < num_batch - 1:
            model_fit = model_fit.append(actual, refit=False)

    print(f"Average mean squared error on test data: {np.mean(mses)}")


In [None]:
# Zero-based position (among the non-index columns) of the series to model.
time_series = 2

# Reload the training split with the first CSV column as the index
# (presumably the "Date" column - verify against the CSV header), then keep
# only rows from 2013-01-01 onwards.
train_data = pd.read_csv('train.csv', parse_dates=["Date"], index_col=0)
train_data = train_data["2013-01-01 00:00:00":]
# Conform the index to its inferred frequency; asfreq can insert NaN rows for
# timestamps missing from the data.
train_freq = pd.infer_freq(train_data.index)
train_data = train_data.asfreq(train_freq)

# Same treatment for the test split, without the start-date cut.
test_data = pd.read_csv('test.csv', parse_dates=["Date"], index_col=0)
test_freq = pd.infer_freq(test_data.index)
test_data = test_data.asfreq(test_freq)

# Evaluate an ARIMA(2, 0, 2) on the selected column of each split.
train_series = train_data.iloc[:, time_series]
test_series = test_data.iloc[:, time_series]

evaluate_model(
    train_data=train_series,
    test_data=test_series,
    model=ARIMA,
    order=(2, 0, 2),
)