In [1]:
cd ../

/Users/linafaik/Documents/projects/time-series-forecasting-models


In [2]:
import pandas as pd
import numpy as np
import os

from neuralforecast import NeuralForecast
from neuralforecast.models import TimeLLM
from neuralforecast.losses.pytorch import MAE
from mlforecast.utils import PredictionIntervals

from config import *
from src.data_processing import *
from src.metrics import *
from src.training import *
from src.viz import *

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
2025-05-19 19:22:47,247	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-05-19 19:22:47,354	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:
# When True, the training cell re-fits the model and overwrites the cached
# forecast CSV even if that file already exists (see the
# `if force or not os.path.exists(path)` guard). Set to False to reuse the cache.
force = True

## Data loading

In [4]:
# Load the processed dataset and convert the ISO-formatted `date` strings to
# datetimes in a single chained expression.
df = pd.read_csv(path_data_processed).assign(
    date=lambda frame: pd.to_datetime(frame["date"], format="%Y-%m-%d")
)

# Quick look at the first rows.
df.head()

Unnamed: 0,date,store_id,state_id,sold_quantity,sold_amount,event_type_1,event_type_2,event_sporting,event_cultural,event_national,event_religious
0,2011-01-29,CA_1,CA,4337,10933.16,,,0,0,0,0
1,2011-01-29,CA_2,CA,3494,9101.52,,,0,0,0,0
2,2011-01-29,CA_3,CA,4739,11679.83,,,0,0,0,0
3,2011-01-29,CA_4,CA,1625,4561.59,,,0,0,0,0
4,2011-01-29,TX_1,TX,2556,6586.68,,,0,0,0,0


## Model training

In [5]:
# Identifier of this experiment; it names both the output sub-folder and the
# cached forecast file (`output/<name_scenario>/forecasts_<name_scenario>.csv`).
name_scenario = "timellm_gpt2_v2"

In [6]:
# Location of the cached forecasts for this scenario. Training is only (re)run
# when `force` is set or when the cache file does not exist yet.
path = os.path.join("output", name_scenario, f"forecasts_{name_scenario}.csv")

if force or not os.path.exists(path):
    
    # Calendar features derived from the date column (written onto `df` in place).
    df['dow'] = pd.to_datetime(df[time_col]).dt.dayofweek  # 0 = Monday
    df['month'] = pd.to_datetime(df[time_col]).dt.month

    # Hold out the last `H` periods of every series as the test set.
    train_df, test_df = split_train_test(
        df=df, 
        horizon=H,
        column_date=time_col,
        column_id=id_col,
        )

    print(f"{len(train_df)} rows for train")
    print(f"{len(test_df)} rows for test")
    
    # Extra columns handed to `nf.fit` alongside the target.
    # NOTE(review): no `hist_exog_list` / `futr_exog_list` is passed to TimeLLM
    # below, so these columns are presumably not consumed as model inputs and
    # only the prompt text describes them — confirm this is intended.
    columns_features = [
        'dow', 'month',
        'event_sporting', 'event_cultural', 
        'event_national', 'event_religious',
    ]
    
    input_size=3*H  # look-back window: three forecast horizons
    max_steps = 40
    val_check_steps = max_steps // 5  # every 20% of training

    
    # Natural-language context given to the LLM via `prompt_prefix`.
    prompt_prefix = f"""
The dataset contains daily sales data for Walmart stores. Each time series corresponds to a unique store. The available features include:
- dow: day of the week (0 = Monday, ..., 6 = Sunday)
- month: calendar month (1 = January, ..., 12 = December)
- event_sporting, event_cultural, event_national, event_religious: binary indicators for specific events

Sales typically increase on weekends (Saturday = 5, Sunday = 6) and during major events such as national holidays or cultural and sporting occasions.

Use these patterns to forecast sales for the next {H} days.
"""

    
    # Define the TimeLLM model
    timellm = TimeLLM(
        h=H,  # Forecast horizon
        input_size=input_size,  # Number of past observations used for forecasting
        patch_len=7, # Length of patch
        # NOTE(review): stride (8) is larger than patch_len (7), which looks like
        # it leaves a one-step gap between consecutive patches — confirm intended.
        stride=8, # Stride of patch
        d_ff=128, # Dimension of fcn
        top_k=10, # Top tokens to consider
        d_llm=768, # Hidden dimension of LLM. 768 for gpt2 and 1024 for gpt2-medium
        d_model=128, #32 # Dimension of model
        n_heads=8, # Number of heads in attention layer
        enc_in=8, # Encoder input size
        dec_in=8, # Decoder input size
        llm='openai-community/gpt2', # Pre-trained LLM to use
        # 'openai-community/gpt2' 'google/flan-t5-small','openai-community/gpt2-medium'
        dropout=0.1, # Dropout rate
        prompt_prefix=prompt_prefix,
        batch_size=4,  #16 # Number of training samples processed together in one forward/backward pass.
        valid_batch_size=8, # Number of samples used per batch during validation (not training).
        windows_batch_size=32, # Number of windows extracted from the dataset at a time to construct training/validation batches
        max_steps=max_steps, #  Maximum number of training steps
        val_check_steps=val_check_steps, # Number of training steps between every validation loss check
        loss=MAE()
    )

    # Initialize NeuralForecast with the TimeLLM model
    nf = NeuralForecast(
        models=[timellm],
        freq=freq  
    )

    # Fit the model to the data
    nf.fit(
        df=train_df[[id_col, time_col, target_col]+columns_features],
        id_col=id_col,
        time_col=time_col,
        target_col=target_col,
        
        prediction_intervals=PredictionIntervals(
            n_windows=10, # Number of past rolling windows used for calibrating intervals
            h=H, # Forecast horizon; tied to H so it cannot drift from the model's prediction length
            method="conformal_distribution" # Nonparametric method for interval estimation using residuals
        )
    )

    # Generate forecasts together with a 90% prediction interval
    # (columns `TimeLLM`, `TimeLLM-lo-90`, `TimeLLM-hi-90`).
    forecasts_df = nf.predict(level=[90])
    
    # Attach forecasts to the held-out rows; the left join keeps every test row
    # even if no forecast was produced for it.
    forecasts_enr_df = ( 
        test_df
        .merge(forecasts_df, on=[id_col, time_col], how="left")
    )
    
    # Stack the training history on top: training rows get NaN in the forecast
    # columns, which downstream cells use to tell history from forecasts.
    forecasts_enr_df = pd.concat([train_df, forecasts_enr_df], axis=0).reset_index(drop=True)

    # Persist the result so later runs can skip training.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    forecasts_enr_df.to_csv(path, index=False)

else:
    # Parse the date column on load so the cached frame has the same datetime
    # dtype as a freshly computed one (a plain read_csv would leave it as strings).
    forecasts_enr_df = pd.read_csv(path, parse_dates=[time_col])

forecasts_enr_df.tail()

Seed set to 1


19270 rows for train
140 rows for test
Successfully loaded model: openai-community/gpt2


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | loss                | MAE                | 0      | train
1 | padder_train        | ConstantPad1d      | 0      | train
2 | scaler              | TemporalNorm       | 0      | train
3 | llm                 | GPT2Model          | 124 M  | eval 
4 | patch_embedding     | PatchEmbedding     | 2.7 K  | train
5 | mapping_layer       | Linear             | 51.5 M | train
6 | reprogramming_layer | ReprogrammingLayer | 2.5 M  | train
7 | output_projection   | FlattenHead        | 10.8 K | train
8 | normalize_layers    | RevIN              | 0      | train
-------------------------------------------------------------------
54.0 M    Trainable params
124 M     Non-trainable params
178 M     Total params
713.647   Total estimated model params size (MB

Epoch 13:  33%|███▎      | 1/3 [00:02<00:05,  0.37it/s, v_num=107, train_loss_step=1.4e+3, train_loss_epoch=1.4e+3]  

`Trainer.fit` stopped: `max_steps=40` reached.


Epoch 13:  33%|███▎      | 1/3 [00:02<00:05,  0.37it/s, v_num=107, train_loss_step=1.4e+3, train_loss_epoch=1.4e+3]


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting DataLoader 0: 100%|██████████| 2/2 [00:48<00:00,  0.04it/s]


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | loss                | MAE                | 0      | train
1 | padder_train        | ConstantPad1d      | 0      | train
2 | scaler              | TemporalNorm       | 0      | train
3 | llm                 | GPT2Model          | 124 M  | eval 
4 | patch_embedding     | PatchEmbedding     | 2.7 K  | train
5 | mapping_layer       | Linear             | 51.5 M | train
6 | reprogramming_layer | ReprogrammingLayer | 2.5 M  | train
7 | output_projection   | FlattenHead        | 10.8 K | train
8 | normalize_layers    | RevIN              | 0      | train
-------------------------------------------------------------------
54.0 M    Trainable params
124 M     Non-trainable params
178 M     Total params
713.647   Total estimated model params size (MB

Epoch 13:  33%|███▎      | 1/3 [02:44<05:28,  0.01it/s, v_num=109, train_loss_step=1.36e+3, train_loss_epoch=1.36e+3]

`Trainer.fit` stopped: `max_steps=40` reached.


Epoch 13:  33%|███▎      | 1/3 [02:44<05:29,  0.01it/s, v_num=109, train_loss_step=1.36e+3, train_loss_epoch=1.36e+3]


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting DataLoader 0: 100%|██████████| 2/2 [00:15<00:00,  0.13it/s]


Unnamed: 0,date,store_id,state_id,sold_quantity,sold_amount,event_type_1,event_type_2,event_sporting,event_cultural,event_national,event_religious,dow,month,TimeLLM,TimeLLM-lo-90,TimeLLM-hi-90
19405,2016-05-18,WI_3,WI,3268,9163.29,,,0,0,0,0,2,5,11084.703125,8544.048871,13625.357379
19406,2016-05-19,WI_3,WI,3398,9660.13,,,0,0,0,0,3,5,10878.086914,8277.56358,13478.610248
19407,2016-05-20,WI_3,WI,4126,11982.37,,,0,0,0,0,4,5,11230.549805,8309.235346,14151.864264
19408,2016-05-21,WI_3,WI,4519,12370.23,,,0,0,0,0,5,5,10879.316406,7770.471166,13988.161646
19409,2016-05-22,WI_3,WI,4757,13432.85,,,0,0,0,0,6,5,10843.922852,6807.609266,14880.236437


## Results analysis

In [None]:
# Result holders:
# - `scores`: forecast column name -> metrics computed over all scored rows
# - `scores_per_ts`: one DataFrame of per-series metrics for each forecast column
scores = {}
scores_per_ts = []

# Score every forecast column (only 'TimeLLM' in this notebook).
for column in ['TimeLLM']:

    # Rows without a forecast (NaN in the forecast column) cannot be scored.
    has_forecast = forecasts_enr_df[column].notna()
    forecasts_filtered_df = forecasts_enr_df[has_forecast]

    # Metrics over all scored rows: ground truth first, forecast second.
    scores[column] = evaluate(
        forecasts_filtered_df[target_col],
        forecasts_filtered_df[column],
    )

    # Same metrics, computed separately for each series id.
    per_series_metrics = forecasts_filtered_df.groupby(id_col).apply(
        lambda series_df: pd.Series(evaluate(series_df[target_col], series_df[column]))
    )
    scores_per_ts_model_df = per_series_metrics.reset_index()

    # Tag the rows with the forecast column they belong to.
    scores_per_ts_model_df["model"] = column

    # Store the frame with "model" moved to the first position.
    remaining_columns = [c for c in scores_per_ts_model_df.columns if c != "model"]
    scores_per_ts.append(scores_per_ts_model_df[["model"] + remaining_columns])

# One row per forecast column, with its name exposed as a "model" column.
scores_wide_df = pd.DataFrame(scores).T
scores_df = scores_wide_df.reset_index().rename(columns={"index": "model"})

# Single table holding the per-series metrics of every forecast column.
scores_per_ts_df = pd.concat(scores_per_ts, axis=0).reset_index(drop=True)






In [9]:
# Pick one series to inspect. A locally seeded generator makes the selection
# reproducible across kernel restarts (the global NumPy state differs depending
# on whether training ran or the cache was used); change the seed to look at a
# different series.
rng = np.random.default_rng(42)
rnd_id = rng.choice(scores_per_ts_df[id_col].unique())

# Plot the forecast and its 90% interval for the selected series, preceded by
# the last 30 training observations.
plot_forecast_with_ci(
    forecasts_enr_df, 
    column_id=id_col,
    column_date=time_col,
    column_target=target_col,
    column_forecast="TimeLLM",
    uid=rnd_id, 
    model_name="TimeLLM",
    level=90, 
    train_tail=30
    )


  fc = forecast_uid_df[forecast_df[lower_q].notna()]
  test = forecast_uid_df[forecast_df[lower_q].notna()]
  train = forecast_uid_df[forecast_df[lower_q].isna()]
