In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your data
data = pd.read_csv('data_clean.csv')

# Let's assume you want an 80-20 train-test split
train_data, test_data = train_test_split(data, test_size=0.01, shuffle=False)

# Save them if you want
train_data.to_csv('train.csv', index=False)
test_data.to_csv('test.csv', index=False)

print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

Train shape: (928988, 7)
Test shape: (9384, 7)


In [5]:
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
from tqdm import tqdm


def train_model(
    train_data: pd.DataFrame,
    lags: int
) -> ARIMA:
    model = ARIMA(train_data, order=(lags, 0, 2))
    model_fit = model.fit()
    return model_fit


def evaluate_model(
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
    minutes: int = 4 * 60,
    lags: int = 2
) -> None:
    mses = []

    num_batch = len(test_data) // minutes - 1

    for batch in tqdm(range(num_batch), total=num_batch):
        
        model_fit = train_model(pd.concat([train_data, test_data[ : batch * minutes]]), lags)
        
        forecast = model_fit.forecast(steps=minutes)

        # Evaluate the forecast
        mse = mean_squared_error(test_data.iloc[batch * minutes : (batch + 1) * minutes], forecast)
        
        print(f'Batch ({batch}): Mean Squared Error on test data: {mse:.4f}')
        
        mses.append(mse)
        
    print(f"Average mean squared error on test data: {np.mean(mses)}")


In [6]:
train_data = pd.read_csv('train.csv', parse_dates=["Date"], index_col=0)["2013-01-01 00:00:00":]
train_data = train_data.asfreq(pd.infer_freq(train_data.index))

test_data = pd.read_csv('test.csv', parse_dates=["Date"], index_col=0)
test_data = test_data.asfreq(pd.infer_freq(test_data.index))

evaluate_model(
    train_data=train_data.iloc[:, 0],
    test_data=test_data.iloc[:, 0]
)

  3%|▎         | 1/38 [00:27<16:55, 27.44s/it]

Batch (0): Mean Squared Error on test data: 0.3052


  5%|▌         | 2/38 [00:55<16:44, 27.89s/it]

Batch (1): Mean Squared Error on test data: 2.3403


  8%|▊         | 3/38 [01:21<15:37, 26.78s/it]

Batch (2): Mean Squared Error on test data: 0.1757


 11%|█         | 4/38 [01:46<14:56, 26.37s/it]

Batch (3): Mean Squared Error on test data: 0.1140


 13%|█▎        | 5/38 [02:14<14:42, 26.75s/it]

Batch (4): Mean Squared Error on test data: 0.1592


 16%|█▌        | 6/38 [02:41<14:26, 27.07s/it]

Batch (5): Mean Squared Error on test data: 0.6460


 18%|█▊        | 7/38 [03:09<14:01, 27.16s/it]

Batch (6): Mean Squared Error on test data: 0.7971


 21%|██        | 8/38 [03:37<13:42, 27.41s/it]

Batch (7): Mean Squared Error on test data: 0.1252


 24%|██▎       | 9/38 [04:04<13:10, 27.25s/it]

Batch (8): Mean Squared Error on test data: 0.7838


 26%|██▋       | 10/38 [04:30<12:33, 26.92s/it]

Batch (9): Mean Squared Error on test data: 0.8392


 29%|██▉       | 11/38 [04:55<11:54, 26.45s/it]

Batch (10): Mean Squared Error on test data: 0.1538


 32%|███▏      | 12/38 [05:22<11:29, 26.54s/it]

Batch (11): Mean Squared Error on test data: 0.1819


 34%|███▍      | 13/38 [05:49<11:05, 26.61s/it]

Batch (12): Mean Squared Error on test data: 0.0801


 37%|███▋      | 14/38 [06:15<10:36, 26.52s/it]

Batch (13): Mean Squared Error on test data: 0.1039


 39%|███▉      | 15/38 [06:41<10:09, 26.48s/it]

Batch (14): Mean Squared Error on test data: 0.3356


 42%|████▏     | 16/38 [07:07<09:37, 26.23s/it]

Batch (15): Mean Squared Error on test data: 0.1978


 45%|████▍     | 17/38 [07:36<09:26, 27.00s/it]

Batch (16): Mean Squared Error on test data: 0.2486


 47%|████▋     | 18/38 [08:02<08:57, 26.85s/it]

Batch (17): Mean Squared Error on test data: 0.2057


 50%|█████     | 19/38 [08:29<08:31, 26.92s/it]

Batch (18): Mean Squared Error on test data: 2.8297


 53%|█████▎    | 20/38 [08:57<08:06, 27.03s/it]

Batch (19): Mean Squared Error on test data: 0.3544


 55%|█████▌    | 21/38 [09:23<07:37, 26.89s/it]

Batch (20): Mean Squared Error on test data: 0.4705


 58%|█████▊    | 22/38 [09:50<07:08, 26.79s/it]

Batch (21): Mean Squared Error on test data: 0.0911


 61%|██████    | 23/38 [10:17<06:41, 26.76s/it]

Batch (22): Mean Squared Error on test data: 0.1359


 63%|██████▎   | 24/38 [10:43<06:14, 26.74s/it]

Batch (23): Mean Squared Error on test data: 0.0968


 66%|██████▌   | 25/38 [11:09<05:43, 26.45s/it]

Batch (24): Mean Squared Error on test data: 0.9102


 68%|██████▊   | 26/38 [11:35<05:13, 26.16s/it]

Batch (25): Mean Squared Error on test data: 0.3333


 71%|███████   | 27/38 [12:01<04:47, 26.18s/it]

Batch (26): Mean Squared Error on test data: 0.0217


 74%|███████▎  | 28/38 [12:28<04:26, 26.64s/it]

Batch (27): Mean Squared Error on test data: 0.0676


 76%|███████▋  | 29/38 [12:56<04:01, 26.84s/it]

Batch (28): Mean Squared Error on test data: 0.1162


 79%|███████▉  | 30/38 [13:22<03:33, 26.69s/it]

Batch (29): Mean Squared Error on test data: 0.1022


 82%|████████▏ | 31/38 [13:48<03:06, 26.60s/it]

Batch (30): Mean Squared Error on test data: 2.1922


 84%|████████▍ | 32/38 [14:16<02:40, 26.83s/it]

Batch (31): Mean Squared Error on test data: 0.0963


 87%|████████▋ | 33/38 [14:42<02:12, 26.48s/it]

Batch (32): Mean Squared Error on test data: 0.6475


 89%|████████▉ | 34/38 [15:07<01:44, 26.16s/it]

Batch (33): Mean Squared Error on test data: 0.8725


 92%|█████████▏| 35/38 [15:34<01:19, 26.52s/it]

Batch (34): Mean Squared Error on test data: 0.4645


 95%|█████████▍| 36/38 [16:03<00:54, 27.22s/it]

Batch (35): Mean Squared Error on test data: 0.8236


 97%|█████████▋| 37/38 [16:30<00:27, 27.23s/it]

Batch (36): Mean Squared Error on test data: 8.5445


100%|██████████| 38/38 [17:07<00:00, 27.04s/it]

Batch (37): Mean Squared Error on test data: 3.8154
Average mean squared error on test data: 0.8099835256034333



