In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your data
data = pd.read_csv('data_clean.csv')

# Let's assume you want an 80-20 train-test split
train_data, test_data = train_test_split(data, test_size=0.01, shuffle=False)

# Save them if you want
train_data.to_csv('train.csv', index=False)
test_data.to_csv('test.csv', index=False)

print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

Train shape: (928988, 7)
Test shape: (9384, 7)


In [30]:
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
from tqdm import tqdm


def train_model(
    train_data: pd.DataFrame,
    lags: int
) -> ARIMA:
    model = ARIMA(
        train_data,
        order=(lags, 1, 2),                # <-- d=1 here
        enforce_stationarity=False,        # relax stationarity check
        enforce_invertibility=False        # relax invertibility check
    )
    return model.fit()


def evaluate_model(
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
    minutes: int = 4 * 60,
    lags: int = 2
) -> None:
    mses = []

    num_batch = len(test_data) // minutes - 1

    for batch in tqdm(range(num_batch), total=num_batch):
        
        model_fit = train_model(pd.concat([train_data, test_data[ : batch * minutes]]), lags)
        
        forecast = model_fit.forecast(steps=minutes)

        # Evaluate the forecast
        mse = mean_squared_error(test_data.iloc[batch * minutes : (batch + 1) * minutes], forecast)
        
        print(f'Batch ({batch}): Mean Squared Error on test data: {mse:.4f}')
        
        mses.append(mse)
        
    print(f"Average mean squared error on test data: {np.mean(mses)}")


In [31]:
train_data = pd.read_csv('train.csv', parse_dates=["Date"], index_col=0)["2013-01-01 00:00:00":]
train_data = train_data.asfreq(pd.infer_freq(train_data.index))

test_data = pd.read_csv('test.csv', parse_dates=["Date"], index_col=0)
test_data = test_data.asfreq(pd.infer_freq(test_data.index))

evaluate_model(
    train_data=train_data.iloc[:, 4],
    test_data=test_data.iloc[:, 4]
)

  3%|▎         | 1/38 [00:39<24:25, 39.61s/it]

Batch (0): Mean Squared Error on test data: 0.8315


  5%|▌         | 2/38 [01:05<18:57, 31.60s/it]

Batch (1): Mean Squared Error on test data: 0.1392


  8%|▊         | 3/38 [01:40<19:20, 33.17s/it]

Batch (2): Mean Squared Error on test data: 0.0714


 11%|█         | 4/38 [02:18<19:55, 35.16s/it]

Batch (3): Mean Squared Error on test data: 0.3052


 13%|█▎        | 5/38 [02:44<17:21, 31.56s/it]

Batch (4): Mean Squared Error on test data: 2.7935


 16%|█▌        | 6/38 [03:07<15:23, 28.86s/it]

Batch (5): Mean Squared Error on test data: 0.1758


 18%|█▊        | 7/38 [03:31<14:00, 27.10s/it]

Batch (6): Mean Squared Error on test data: 0.7703


 21%|██        | 8/38 [03:57<13:23, 26.79s/it]

Batch (7): Mean Squared Error on test data: 2.7398


 24%|██▎       | 9/38 [04:18<12:08, 25.13s/it]

Batch (8): Mean Squared Error on test data: 6.1772


 26%|██▋       | 10/38 [04:46<12:06, 25.94s/it]

Batch (9): Mean Squared Error on test data: 0.9189


 29%|██▉       | 11/38 [06:45<24:27, 54.34s/it]

Batch (10): Mean Squared Error on test data: 1.5558


 32%|███▏      | 12/38 [07:08<19:30, 45.02s/it]

Batch (11): Mean Squared Error on test data: 0.3241


 34%|███▍      | 13/38 [09:23<30:03, 72.14s/it]

Batch (12): Mean Squared Error on test data: 0.1320


 37%|███▋      | 14/38 [09:44<22:43, 56.81s/it]

Batch (13): Mean Squared Error on test data: 0.1245


 39%|███▉      | 15/38 [10:12<18:26, 48.10s/it]

Batch (14): Mean Squared Error on test data: 4.2893


 42%|████▏     | 16/38 [10:39<15:13, 41.53s/it]

Batch (15): Mean Squared Error on test data: 0.1781


 45%|████▍     | 17/38 [12:39<22:53, 65.41s/it]

Batch (16): Mean Squared Error on test data: 0.1573


 47%|████▋     | 18/38 [13:27<19:59, 59.99s/it]

Batch (17): Mean Squared Error on test data: 0.1048


 50%|█████     | 19/38 [14:04<16:48, 53.05s/it]

Batch (18): Mean Squared Error on test data: 0.1078


 53%|█████▎    | 20/38 [14:34<13:49, 46.06s/it]

Batch (19): Mean Squared Error on test data: 1.1330


 55%|█████▌    | 21/38 [15:11<12:18, 43.41s/it]

Batch (20): Mean Squared Error on test data: 2.3627


 58%|█████▊    | 22/38 [15:52<11:26, 42.90s/it]

Batch (21): Mean Squared Error on test data: 0.2466


 61%|██████    | 23/38 [16:20<09:34, 38.32s/it]

Batch (22): Mean Squared Error on test data: 0.0802


 63%|██████▎   | 24/38 [16:48<08:10, 35.05s/it]

Batch (23): Mean Squared Error on test data: 0.0126


 66%|██████▌   | 25/38 [17:28<07:56, 36.65s/it]

Batch (24): Mean Squared Error on test data: 0.2219


 68%|██████▊   | 26/38 [19:16<11:36, 58.03s/it]

Batch (25): Mean Squared Error on test data: 0.1467


 71%|███████   | 27/38 [20:12<10:33, 57.62s/it]

Batch (26): Mean Squared Error on test data: 0.0586


 74%|███████▎  | 28/38 [21:11<09:38, 57.81s/it]

Batch (27): Mean Squared Error on test data: 0.1651


 76%|███████▋  | 29/38 [22:11<08:48, 58.69s/it]

Batch (28): Mean Squared Error on test data: 0.7207


 79%|███████▉  | 30/38 [22:35<06:26, 48.27s/it]

Batch (29): Mean Squared Error on test data: 0.9252


 82%|████████▏ | 31/38 [23:02<04:52, 41.75s/it]

Batch (30): Mean Squared Error on test data: 0.2798


 84%|████████▍ | 32/38 [23:59<04:38, 46.38s/it]

Batch (31): Mean Squared Error on test data: 0.2520


 87%|████████▋ | 33/38 [24:22<03:16, 39.30s/it]

Batch (32): Mean Squared Error on test data: 1.7507


 89%|████████▉ | 34/38 [26:21<04:12, 63.20s/it]

Batch (33): Mean Squared Error on test data: 0.8272


 92%|█████████▏| 35/38 [26:49<02:37, 52.55s/it]

Batch (34): Mean Squared Error on test data: 0.7023


 95%|█████████▍| 36/38 [27:24<01:34, 47.44s/it]

Batch (35): Mean Squared Error on test data: 2.0982


 97%|█████████▋| 37/38 [28:20<00:49, 49.92s/it]

Batch (36): Mean Squared Error on test data: 0.6055


100%|██████████| 38/38 [28:44<00:00, 45.38s/it]

Batch (37): Mean Squared Error on test data: 3.7299
Average mean squared error on test data: 1.0056627875776178



