# Time Series Forecasting: Electricity, Traffic, Exchange Rate
This notebook loads `.txt` datasets and trains Linear, DLinear and NLinear models using PyTorch.

In [1]:
# Import essential libraries for data handling and numerical operations
import pandas as pd              # Used for handling time series data in DataFrames
import numpy as np               # Core numerical library, especially for arrays and math functions
import gzip                      # To handle compressed .gz dataset files (if needed)

# PyTorch libraries for building and training neural network models
import torch
import torch.nn as nn

# Metrics to evaluate model performance
from sklearn.metrics import mean_squared_error, mean_absolute_error  # Standard regression metrics
from scipy.stats import pearsonr                                     # Used for computing Pearson correlation

# For visualizing model outputs and results
import matplotlib.pyplot as plt

# For interacting with the file system (e.g., loading files)
import os

# Redundant but harmless: re-importing core PyTorch modules
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer     # Required to build Transformer-based models

# Define the computation device: use GPU if available, otherwise fall back to CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [2]:
# Define a function to load a plain .txt file as a pandas DataFrame
# Assumes the file has no header row
def load_data(filepath):
    return pd.read_csv(filepath, header=None)

# Load the three benchmark datasets used for time series forecasting
# Each dataset is stored as a key-value pair in a dictionary for easy access
datasets = {
    'electricity': load_data('electricity.txt'),       # Hourly electricity consumption
    'traffic': load_data('traffic.txt'),               # Hourly traffic volume across sensors
    'exchange_rate': load_data('exchange_rate.txt')    # Daily exchange rate of currencies
}


In [3]:
# Normalize each dataset using Min-Max scaling to bring values to the [0, 1] range
# This is essential to ensure that features with larger numerical ranges don't dominate the model

for name in datasets:
    df = pd.DataFrame(datasets[name])  # Convert the dataset to a pandas DataFrame for easier processing
    # Apply Min-Max normalization: (value - min) / (max - min)
    datasets[name] = ((df - df.min()) / (df.max() - df.min())).values.astype(np.float32)
    # Convert the result back to a NumPy array of type float32, which is optimal for PyTorch


In [4]:
# This function transforms a time series into input/output sequences suitable for supervised learning
# It slides a window over the data to generate many (input, output) pairs

def create_sequences(data, input_len, output_len):
    X, y = [], []

    # Loop through the data to generate input/output pairs
    for i in range(len(data) - input_len - output_len):
        # Input: a sequence of 'input_len' steps
        X.append(data[i:i+input_len])
        # Output: the next 'output_len' steps immediately after the input
        y.append(data[i+input_len:i+input_len+output_len])

    # Convert lists to NumPy arrays for efficiency
    X = np.array(X)
    y = np.array(y)

    # Return the input/output sequences as PyTorch tensors
    return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


In [5]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

# 1. Linear
class LinearModel(nn.Module):
    def __init__(self, input_len, output_len, features):
        super().__init__()
        # Simple fully connected layer that maps input_len to output_len for each feature
        self.linear = nn.Linear(input_len, output_len)
    
    def forward(self, x):
        # Transpose to [batch, features, seq_len] to apply linear mapping per feature
        return self.linear(x.transpose(1, 2)).transpose(1, 2)

# 2. DLinear (Decomposition Linear)
class DLinear(nn.Module):
    def __init__(self, input_len, output_len, features):
        super().__init__()
        # First layer learns a decomposition of the signal (seasonal component)
        self.decomp = nn.Linear(input_len, input_len)
        # Second layer projects the decomposed sequence to the output length
        self.proj = nn.Linear(input_len, output_len)
    
    def forward(self, x):
        x = x.transpose(1, 2)  # [batch, features, seq_len]
        seasonal = self.decomp(x)  # Decomposition
        out = self.proj(seasonal)  # Forecast
        return out.transpose(1, 2)

# 3. NLinear (Normalized Linear)
class NLinear(nn.Module):
    def __init__(self, input_len, output_len, features):
        super().__init__()
        # Normalize across features to stabilize training
        self.norm = nn.LayerNorm(features)
        self.linear = nn.Linear(input_len, output_len)
    
    def forward(self, x):
        x = self.norm(x)  # Apply feature normalization
        return self.linear(x.transpose(1, 2)).transpose(1, 2)

# 4. TransformerModel (basic Transformer encoder for time series)
class TransformerModel(nn.Module):
    def __init__(self, input_len, output_len, features, d_model=512, nhead=8, num_layers=2):
        super().__init__()
        # Project input features to d_model dimension for the transformer
        self.input_proj = nn.Linear(features, d_model)
        encoder_layer = TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer = TransformerEncoder(encoder_layer, num_layers=num_layers)
        # Project back to original feature dimension
        self.output_proj = nn.Linear(d_model, features)
        self.output_len = output_len

    def forward(self, x):
        # Input: [batch, seq_len, features]
        x = self.input_proj(x)         # -> [batch, seq_len, d_model]
        x = x.transpose(0, 1)          # -> [seq_len, batch, d_model] (for transformer)
        x = self.transformer(x)        # -> [seq_len, batch, d_model]
        x = x.transpose(0, 1)          # -> [batch, seq_len, d_model]
        x = self.output_proj(x)        # -> [batch, seq_len, features]

        # Ensure output has the correct prediction length
        if x.shape[1] > self.output_len:
            x = x[:, -self.output_len:, :]
        elif x.shape[1] < self.output_len:
            # Upsample if needed using interpolation
            x = nn.functional.interpolate(x.transpose(1, 2), size=self.output_len, mode='linear', align_corners=False)
            x = x.transpose(1, 2)

        return x  # Final shape: [batch, output_len, features]

# 5. Autoformer (seasonal-trend decomposition model)
class Autoformer(nn.Module):
    def __init__(self, input_len, output_len, features):
        super().__init__()
        # Estimate trend component
        self.encoder = nn.Linear(features, features)
        # Decompose seasonal component
        self.decomp = nn.Linear(features, features)
        # Project over time to get final output length
        self.output_proj = nn.Linear(input_len, output_len)

    def forward(self, x):
        trend = self.encoder(x)  # Trend estimation
        seasonal = x - trend     # Subtract trend to get seasonal
        seasonal_decomp = self.decomp(seasonal)
        seasonal_decomp = seasonal_decomp.transpose(1, 2)
        out = self.output_proj(seasonal_decomp)
        return out.transpose(1, 2)

# 6. Informer (sparse attention model)
class Informer(nn.Module):
    def __init__(self, input_len, output_len, features):
        super().__init__()
        # Input projection for attention
        self.input_proj = nn.Linear(features, features)
        # Standard multi-head attention with only 1 head (simplified)
        self.attn = nn.MultiheadAttention(embed_dim=features, num_heads=1, batch_first=True)
        # Project the attention output to the output length
        self.output_proj = nn.Linear(input_len, output_len)

    def forward(self, x):
        x = self.input_proj(x)               # [batch, seq_len, features]
        attn_output, _ = self.attn(x, x, x)  # Self-attention
        attn_output = attn_output.transpose(1, 2)  # [batch, features, seq_len]
        out = self.output_proj(attn_output)       # [batch, features, output_len]
        return out.transpose(1, 2)                # [batch, output_len, features]

# 7. Reformer (simplified version)
class Reformer(nn.Module):
    def __init__(self, input_len, output_len, features):
        super().__init__()
        # Very simple version using linear mapping; original reformer is more complex
        self.linear = nn.Linear(input_len, output_len)

    def forward(self, x):
        return self.linear(x.transpose(1, 2)).transpose(1, 2)

# 8. FEDformer (frequency-enhanced decomposition transformer)
class FEDformer(nn.Module):
    def __init__(self, input_len, output_len, features):
        super().__init__()
        # Frequency filter: learnable transformation
        self.filter = nn.Linear(input_len, input_len)
        self.output_proj = nn.Linear(input_len, output_len)

    def forward(self, x):
        # Apply FFT to move to frequency domain
        freq = torch.fft.rfft(x, dim=1)
        # Inverse FFT to reconstruct
        filtered = torch.fft.irfft(freq, n=x.size(1), dim=1)
        filtered = filtered.transpose(1, 2)
        out = self.output_proj(self.filter(filtered))
        return out.transpose(1, 2)


In [6]:
# Function to train any given model on the training data
def train_model(model, X_train, y_train, epochs=10, lr=1e-3, batch_size=64):
    # Choose GPU if available, otherwise use CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)  # Move model to the selected device

    # Set up the optimizer and loss function (Mean Squared Error)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    # Wrap the training data into a PyTorch dataset and loader for batching
    dataset = torch.utils.data.TensorDataset(X_train, y_train)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    print(f"\n--- Training {model.__class__.__name__} ---")
    print(f"Input shape: {X_train.shape}, Target shape: {y_train.shape}")
    print(f"Device: {device}")

    # Main training loop
    for epoch in range(epochs):
        model.train()  # Set model to training mode
        epoch_loss = 0.0  # Track total loss for this epoch

        # Iterate through all batches
        for batch_idx, (x_batch, y_batch) in enumerate(loader):
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()  # Reset gradients from previous step
            output = model(x_batch)  # Forward pass

            # If the model output shape doesn't match the target, skip this batch
            if output.shape != y_batch.shape:
                print(f"[!] Shape mismatch at batch {batch_idx}: output {output.shape}, target {y_batch.shape}")
                continue

            loss = loss_fn(output, y_batch)  # Compute loss
            loss.backward()  # Backpropagate gradients
            optimizer.step()  # Update weights

            epoch_loss += loss.item()  # Accumulate batch loss

        # Print average loss for this epoch
        print(f"Epoch {epoch+1}/{epochs} | Loss: {epoch_loss:.4f}")

    print(f"Training complete for {model.__class__.__name__} ✅\n")
    return model  # Return the trained model


In [7]:
# Function to evaluate a trained model on test data, with detailed debug information
def evaluate_model(model, X_test, y_test):
    # Set the model to evaluation mode
    # This disables layers like dropout and batch normalization which behave differently during training
    model.eval()
    
    # Print which model is being evaluated to keep track when running multiple models
    print(f"\n--- Evaluating {model.__class__.__name__} ---")
    
    # Print the shapes of the input test features and true labels
    # This helps verify that the data dimensions are as expected before prediction
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
    
    # Disable gradient calculation since we are only doing inference (no training or backpropagation)
    with torch.no_grad():
        # Run the model on the test data, moving inputs to the correct device (GPU or CPU)
        output = model(X_test.to(device))
        
        # Move the model's output predictions back to CPU and convert them to NumPy arrays for easy metric calculation
        pred = output.cpu().numpy()
        
        # Similarly, convert the ground truth labels to NumPy arrays
        true = y_test.cpu().numpy()
    
    # Print the shapes of the predictions and ground truth arrays to confirm they match
    print(f"Prediction shape: {pred.shape}, Ground truth shape: {true.shape}")
    
    # Display the first 5 predictions and their corresponding true values, flattened into 1D arrays
    # This quick visual check helps ensure predictions look reasonable
    print("\nFirst 5 predictions vs. true values (flattened):")
    for i in range(5):
        print(f"Pred: {pred[i].flatten()[:5]} | True: {true[i].flatten()[:5]}")
    
    # Calculate evaluation metrics on the flattened arrays, treating all test samples and time steps as one sequence
    
    # Mean Squared Error (MSE):
    # Average of squared differences between predicted and true values
    # Penalizes larger errors more than smaller ones, making it sensitive to outliers
    mse = mean_squared_error(true.flatten(), pred.flatten())
    
    # Mean Absolute Error (MAE):
    # Average of absolute differences between predicted and true values
    # Gives a linear penalty, easier to interpret as average error magnitude
    mae = mean_absolute_error(true.flatten(), pred.flatten())
    
    # Relative Squared Error (RSE):
    # Ratio of the root mean squared error (RMSE) to the standard deviation of the true values
    # This normalizes the error relative to the natural variability of the data, enabling comparison across datasets or scales
    rse = np.sqrt(mse) / np.std(true.flatten())
    
    # Pearson Correlation Coefficient:
    # Measures the linear relationship between predicted and true values
    # Its value ranges between -1 and 1, where:
    #   - 1 means perfect positive linear correlation (they move exactly together)
    #   - 0 means no linear correlation
    #   - -1 means perfect negative linear correlation (they move exactly opposite)
    # It does NOT range between 0 and 1; misunderstanding this interval was a mistake.
    corr = np.corrcoef(true.flatten(), pred.flatten())[0, 1]
    
    # Print all calculated metrics with precision up to 6 decimal places
    print(f"\nMetrics:")
    print(f" - MSE:  {mse:.6f}")
    print(f" - MAE:  {mae:.6f}")
    print(f" - RSE:  {rse:.6f}")
    print(f" - Corr: {corr:.6f}")
    
    # Return the metrics for further use (logging, saving results, comparison, etc.)
    return mse, mae, rse, corr


In [8]:
# Run all models on all datasets with detailed debug info
input_len, output_len = 96, 96  # Fixed input and output lengths (e.g., predict the next 96 steps given the past 96)
results = []  # List to store evaluation results for each model and dataset

# Complete list of model classes we want to evaluate
model_classes = [
    LinearModel,
    DLinear,
    NLinear,
    TransformerModel,
    Autoformer,
    Informer,
    Reformer,
    FEDformer
]

# Loop through all available datasets (e.g., electricity, traffic, exchange_rate...)
for name, data in datasets.items():
    print(f"\n=== Dataset: {name} ===")  # Show which dataset is currently being processed
    print(f"Original data shape: {data.shape}")  # Show original shape (e.g., [26304, 321])

    # Generate input-output sequences from the raw dataset
    # For example, if input_len = 96 and output_len = 96, this creates sliding windows
    X, y = create_sequences(data, input_len, output_len)
    print(f"Created sequences -> X: {X.shape}, y: {y.shape}")  # Show the shape of the generated sequences

    # Split the sequences into training and testing sets
    # We'll use the first 500 sequences for training and the last 100 for testing
    X_train, y_train = X[:500], y[:500]
    X_test, y_test = X[-100:], y[-100:]
    print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

    # Train and evaluate each model
    for Model in model_classes:
        print(f"\n--- Training model: {Model.__name__} ---")  # Announce which model we're training

        # Initialize the model with the appropriate input/output lengths and number of features
        model = Model(input_len, output_len, data.shape[1])

        # Train the model using the training data (500 samples)
        model = train_model(model, X_train, y_train, epochs=10)  # You can adjust the number of epochs if needed
        print(f"Finished training {Model.__name__}, now evaluating...")  # Confirm training is complete

        # Evaluate the model on the test set (100 samples)
        # Collect metrics: MSE, MAE, RSE, and correlation
        mse, mae, rse, corr = evaluate_model(model, X_test, y_test)

        # Store the results for this model and dataset in the 'results' list
        results.append([name, Model.__name__, mse, mae, rse, corr])



=== Dataset: electricity ===
Original data shape: (26304, 321)
Created sequences -> X: torch.Size([26112, 96, 321]), y: torch.Size([26112, 96, 321])
Train size: 500, Test size: 100

--- Training model: LinearModel ---

--- Training LinearModel ---
Input shape: torch.Size([500, 96, 321]), Target shape: torch.Size([500, 96, 321])
Device: cpu
Epoch 1/10 | Loss: 2.1044
Epoch 2/10 | Loss: 0.7376
Epoch 3/10 | Loss: 0.3885
Epoch 4/10 | Loss: 0.3070
Epoch 5/10 | Loss: 0.2618
Epoch 6/10 | Loss: 0.2295
Epoch 7/10 | Loss: 0.2057
Epoch 8/10 | Loss: 0.1868
Epoch 9/10 | Loss: 0.1724
Epoch 10/10 | Loss: 0.1608
Training complete for LinearModel ✅

Finished training LinearModel, now evaluating...

--- Evaluating LinearModel ---
X_test shape: torch.Size([100, 96, 321]), y_test shape: torch.Size([100, 96, 321])
Prediction shape: (100, 96, 321), Ground truth shape: (100, 96, 321)

First 5 predictions vs. true values (flattened):
Pred: [ 0.00055385  0.24706827 -0.05837923  0.5029023   0.5132025 ] | True: 


Metrics:
 - MSE:  0.032880
 - MAE:  0.147033
 - RSE:  0.946217
 - Corr: 0.392152

--- Training model: Reformer ---

--- Training Reformer ---
Input shape: torch.Size([500, 96, 321]), Target shape: torch.Size([500, 96, 321])
Device: cpu
Epoch 1/10 | Loss: 1.4385
Epoch 2/10 | Loss: 0.4441
Epoch 3/10 | Loss: 0.3060
Epoch 4/10 | Loss: 0.2630
Epoch 5/10 | Loss: 0.2193
Epoch 6/10 | Loss: 0.1906
Epoch 7/10 | Loss: 0.1723
Epoch 8/10 | Loss: 0.1581
Epoch 9/10 | Loss: 0.1457
Epoch 10/10 | Loss: 0.1357
Training complete for Reformer ✅

Finished training Reformer, now evaluating...

--- Evaluating Reformer ---
X_test shape: torch.Size([100, 96, 321]), y_test shape: torch.Size([100, 96, 321])
Prediction shape: (100, 96, 321), Ground truth shape: (100, 96, 321)

First 5 predictions vs. true values (flattened):
Pred: [0.12938127 0.34013513 0.04436641 0.61898905 0.63771653] | True: [0.08571429 0.35135135 0.01331115 0.85128206 0.68007314]
Pred: [0.14050841 0.33541763 0.04277487 0.56014276 0.6159613 ] 

Prediction shape: (100, 96, 862), Ground truth shape: (100, 96, 862)

First 5 predictions vs. true values (flattened):
Pred: [0.0874192  0.1436529  0.24897738 0.12545237 0.06456556] | True: [0.067411   0.06266667 0.13068317 0.09116636 0.02482182]
Pred: [0.08559494 0.1345781  0.21547866 0.10950273 0.06325421] | True: [0.05940594 0.05733333 0.11283028 0.08712351 0.02482182]
Pred: [0.08560594 0.1198816  0.17028487 0.09066305 0.05154743] | True: [0.0471877  0.05177778 0.1061652  0.08954922 0.02113541]
Pred: [0.08303066 0.10107291 0.11832613 0.06902745 0.03706681] | True: [0.01558879 0.03711111 0.07736253 0.0667071  0.01474564]
Pred: [0.07424872 0.0794502  0.08199592 0.05423563 0.04039336] | True: [0.01200758 0.02488889 0.05236848 0.04386497 0.0093389 ]

Metrics:
 - MSE:  0.008100
 - MAE:  0.061722
 - RSE:  0.923366
 - Corr: 0.571025

--- Training model: Autoformer ---

--- Training Autoformer ---
Input shape: torch.Size([500, 96, 862]), Target shape: torch.Size([500, 96, 862])
Device: cpu


Epoch 4/10 | Loss: 0.0800
Epoch 5/10 | Loss: 0.0626
Epoch 6/10 | Loss: 0.0549
Epoch 7/10 | Loss: 0.0510
Epoch 8/10 | Loss: 0.0487
Epoch 9/10 | Loss: 0.0472
Epoch 10/10 | Loss: 0.0458
Training complete for DLinear ✅

Finished training DLinear, now evaluating...

--- Evaluating DLinear ---
X_test shape: torch.Size([100, 96, 8]), y_test shape: torch.Size([100, 96, 8])
Prediction shape: (100, 96, 8), Ground truth shape: (100, 96, 8)

First 5 predictions vs. true values (flattened):
Pred: [0.40933695 0.24774656 0.32359254 0.58088493 0.33599606] | True: [0.41392905 0.2498323  0.33197728 0.59765685 0.33024514]
Pred: [0.41471303 0.24743652 0.32998306 0.5818665  0.3372069 ] | True: [0.42368132 0.28414446 0.3417586  0.5950704  0.33227372]
Pred: [0.41540676 0.2506859  0.33126184 0.58384395 0.3374126 ] | True: [0.42507336 0.2849122  0.3430463  0.59630364 0.32990316]
Pred: [0.41473696 0.25111976 0.3308758  0.58476967 0.33672962] | True: [0.429545   0.28207418 0.33937988 0.5985521  0.332196  ]
Pred:

Epoch 1/10 | Loss: 1.0043
Epoch 2/10 | Loss: 0.2053
Epoch 3/10 | Loss: 0.1034
Epoch 4/10 | Loss: 0.0786
Epoch 5/10 | Loss: 0.0620
Epoch 6/10 | Loss: 0.0543
Epoch 7/10 | Loss: 0.0514
Epoch 8/10 | Loss: 0.0485
Epoch 9/10 | Loss: 0.0468
Epoch 10/10 | Loss: 0.0456
Training complete for FEDformer ✅

Finished training FEDformer, now evaluating...

--- Evaluating FEDformer ---
X_test shape: torch.Size([100, 96, 8]), y_test shape: torch.Size([100, 96, 8])
Prediction shape: (100, 96, 8), Ground truth shape: (100, 96, 8)

First 5 predictions vs. true values (flattened):
Pred: [0.40737045 0.2534953  0.3238963  0.57326674 0.3346743 ] | True: [0.41392905 0.2498323  0.33197728 0.59765685 0.33024514]
Pred: [0.4117322  0.2517777  0.3263044  0.5735339  0.33454272] | True: [0.42368132 0.28414446 0.3417586  0.5950704  0.33227372]
Pred: [0.40899724 0.2561911  0.32466066 0.5713967  0.33376223] | True: [0.42507336 0.2849122  0.3430463  0.59630364 0.32990316]
Pred: [0.40448144 0.25290176 0.32084852 0.5709798

In [9]:
# Show results
df_results = pd.DataFrame(results, columns=['Dataset', 'Model', 'MSE', 'MAE', 'RSE', 'Corr'])
df_results

Unnamed: 0,Dataset,Model,MSE,MAE,RSE,Corr
0,electricity,LinearModel,0.021637,0.111452,0.767574,0.654573
1,electricity,DLinear,0.015566,0.09111,0.65105,0.759485
2,electricity,NLinear,0.14557,0.328919,1.990954,0.509452
3,electricity,TransformerModel,0.035053,0.148322,0.976986,0.435207
4,electricity,Autoformer,0.032325,0.142765,0.938201,0.440458
5,electricity,Informer,0.03288,0.147033,0.946217,0.392152
6,electricity,Reformer,0.019448,0.104599,0.727711,0.693198
7,electricity,FEDformer,0.015707,0.091756,0.653987,0.757427
8,traffic,LinearModel,0.00508,0.047088,0.731293,0.706634
9,traffic,DLinear,0.004074,0.040586,0.654827,0.765963
