# Cell 1: Setup and Imports

## Description
Installs and imports Transformers, PyTorch, and the other necessary libraries. Sets up the device for GPU acceleration if available.

In [1]:
# --- Step 1: Install and Import Necessary Libraries ---
!pip install transformers accelerate torch scikit-learn pandas numpy matplotlib tqdm -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
from tqdm.notebook import trange, tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import TimeSeriesTransformerForPrediction, TimeSeriesTransformerConfig
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# --- Reproducibility ---
# A single seed drives both NumPy's and PyTorch's global random generators.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
print("Libraries imported and seed set.")

# --- Device Configuration ---
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Cell 2: Configure Model and Training Hyperparameters

All key parameters are defined here for easy tuning and experimentation.



In [2]:
# --- Step 2: Configure Model and Training Hyperparameters ---
# Every tunable value lives in this cell so that an experiment only needs edits here.

# --- Data and Model Architecture ---
# Lookback window in time steps (values tried previously: 60, 288, 30).
CONTEXT_LENGTH = 120
# Forecast horizon in time steps.
PREDICTION_LENGTH = 1
# Lag offsets 1 through 7 handed to the model.
LAGS_SEQUENCE = list(range(1, 8))

# --- Model Size & Regularization ---
# Width of the transformer layers.
D_MODEL = 32
# Depth of the encoder and decoder stacks.
ENCODER_LAYERS = 2
DECODER_LAYERS = 2
# Attention heads per encoder / decoder layer.
ENCODER_ATTENTION_HEADS = 4
DECODER_ATTENTION_HEADS = 4
# Dropout probability used for regularization.
DROPOUT = 0.1

# --- Training Schedule ---
# Upper bound on the number of training epochs.
EPOCHS = 5
# Initial learning rate passed to AdamW.
LEARNING_RATE = 1e-4
# Samples per mini-batch.
BATCH_SIZE = 64
# Epochs without validation improvement tolerated before stopping.
# NOTE(review): PATIENCE is larger than EPOCHS, so early stopping cannot
# trigger with the values currently set here.
PATIENCE = 10
# Output distribution of the model: "normal" or "student_t".
DISTRIBUTION_OUTPUT = "normal"

# Cell 3: Data Loading and Preprocessing

## Description:
Loads the EURUSD CSV file and correctly parses the Date and Time columns into a single Timestamp index for proper time-series handling.

In [3]:
# --- Step 3: Load and Preprocess Data ---
file_path = 'EURUSD_5m_10Yea.csv' #/content/EURUSD_5m_10Yea.csv
df_raw = pd.read_csv(file_path)

# Join the separate Date and Time columns into one string per row and parse
# it with an explicit format into a single Timestamp column.
date_text = df_raw['Date'].astype(str)
time_text = df_raw['Time'].astype(str)
df_raw['Timestamp'] = pd.to_datetime(date_text + ' ' + time_text, format='%Y%m%d %H:%M:%S')

# Keep only the closing price, indexed by timestamp.
df = df_raw[['Timestamp', 'Close']].set_index('Timestamp')

# --- Chronological Data Splitting (Identical to LSTM Notebook) ---
# First 60% of rows for training, next 20% for validation, remainder for test.
train_size = int(len(df) * 0.60)
val_size = int(len(df) * 0.20)
val_end = train_size + val_size

train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:val_end]
test_df = df.iloc[val_end:]

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Training set size: 448986
Validation set size: 149662
Test set size: 149663


# Cell 4: Feature Engineering and Data Splitting

## Description:
Creates time-based features required by the Transformer for positional information. The data is then split chronologically into training, validation, and testing sets (60/20/20).


In [4]:
# --- Step 4: Feature Engineering & Data Splitting ---

def create_time_features(df):
    """Derive four calendar features from the frame's datetime index.

    Each feature is the raw calendar field shifted to start at zero, divided
    by its largest possible value, and then offset by -0.5 so that it lies in
    [-0.5, 0.5].

    Args:
        df: DataFrame whose index exposes `hour`, `dayofweek`, `day` and
            `month` (i.e. a datetime index). It is not modified.

    Returns:
        A DataFrame on the same index with the columns `hour`,
        `day_of_week`, `day_of_month` and `month`, in that order.
    """
    stamps = df.index
    # (output column, raw calendar field, smallest raw value, span of values)
    specs = (
        ('hour', stamps.hour, 0, 23.0),
        ('day_of_week', stamps.dayofweek, 0, 6.0),
        ('day_of_month', stamps.day, 1, 30.0),
        ('month', stamps.month, 1, 11.0),
    )

    df_feat = df.copy()
    for column, raw, lowest, span in specs:
        df_feat[column] = (raw - lowest) / span - 0.5

    return df_feat[[column for column, *_ in specs]]

# --- Chronological Split ---
# 60% / 20% / remainder, taken in row order; each part is an independent copy
# so that later cells can add columns to it.
train_size = int(len(df) * 0.60)
val_size = int(len(df) * 0.20)
split_bounds = (
    (None, train_size),
    (train_size, train_size + val_size),
    (train_size + val_size, None),
)
train_df, val_df, test_df = (df.iloc[start:stop].copy() for start, stop in split_bounds)

# --- Create Features for each split ---
train_features, val_features, test_features = (
    create_time_features(part) for part in (train_df, val_df, test_df)
)

print("Data split and time features created.")
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Data split and time features created.
Train: 448986, Val: 149662, Test: 149663


# Step 5: Data Scaling

## Description:
Scales the 'Close' price using StandardScaler. The scaler is fitted only on the training data to prevent lookahead bias.

In [5]:
# --- Step 5: Scale the Data ---
# The scaler learns its statistics from the training split ONLY; the same
# fitted scaler is then applied to every split.
target_scaler = StandardScaler().fit(train_df[['Close']])

for split_df in (train_df, val_df, test_df):
    split_df['Close_scaled'] = target_scaler.transform(split_df[['Close']])

print("Target data ('Close' price) scaled successfully.")

Target data ('Close' price) scaled successfully.


# Step 6: PyTorch Dataset and DataLoader Creation

## Description:
Defines a custom Dataset class to create input/output windows for the model. This class provides the necessary history_length for the model's lag calculations. DataLoaders are then created to manage batching.

In [6]:
# --- Step 6: Create PyTorch Datasets and DataLoaders ---
# Each sample's history window must cover the context window plus the largest
# lag in LAGS_SEQUENCE, so the two are added to get the window length.
HISTORY_LENGTH = CONTEXT_LENGTH + max(LAGS_SEQUENCE)

class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over a scaled target series and its time features.

    Window `i` uses rows `[i, i + history_length)` as the history and rows
    `[i + history_length, i + history_length + prediction_length)` as the
    values to predict.

    Args:
        target_df: DataFrame with a `Close_scaled` column.
        feature_df: DataFrame of time features, one row per row of `target_df`.
        history_length: Number of past steps in every window.
        prediction_length: Number of future steps in every window.

    Raises:
        ValueError: If `target_df` and `feature_df` have different lengths.
    """

    def __init__(self, target_df, feature_df, history_length, prediction_length):
        if len(target_df) != len(feature_df):
            raise ValueError(
                f"target_df has {len(target_df)} rows but feature_df has {len(feature_df)}; "
                "they must be aligned row for row."
            )
        self.history_length = history_length
        self.prediction_length = prediction_length
        self.target = torch.from_numpy(target_df['Close_scaled'].values).float()
        self.features = torch.from_numpy(feature_df.values).float()

    def __len__(self):
        # Clamp at zero: a series shorter than one full window has no samples,
        # and a negative value would make len() raise.
        return max(0, len(self.target) - self.history_length - self.prediction_length + 1)

    def __getitem__(self, idx):
        n_windows = len(self)
        if idx < 0:
            # Support Python-style negative indexing.
            idx += n_windows
        if not 0 <= idx < n_windows:
            # Without this, slicing past the end would silently return empty
            # tensors and plain iteration over the dataset would never stop.
            raise IndexError(f"window index {idx} out of range for {n_windows} windows")

        start_idx = idx
        end_idx_history = start_idx + self.history_length
        end_idx_prediction = end_idx_history + self.prediction_length

        return {
            'past_values': self.target[start_idx:end_idx_history],
            'past_time_features': self.features[start_idx:end_idx_history],
            'future_values': self.target[end_idx_history:end_idx_prediction],
            'future_time_features': self.features[end_idx_history:end_idx_prediction],
        }

# Create dataset instances: one windowed dataset per split, all sharing the
# same history and prediction lengths.
train_dataset, val_dataset, test_dataset = (
    TimeSeriesDataset(targets, feats, HISTORY_LENGTH, PREDICTION_LENGTH)
    for targets, feats in (
        (train_df, train_features),
        (val_df, val_features),
        (test_df, test_features),
    )
)

# Create DataLoader instances: only the training loader shuffles its samples.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)


print("PyTorch Datasets and DataLoaders are ready.")

PyTorch Datasets and DataLoaders are ready.


# Step 7: Model Definition

## Description:
Instantiates the TimeSeriesTransformerForPrediction model using a config object populated with our hyperparameters from Step 2.

In [7]:
# --- Step 7: Define the Transformer Model ---
# Size and regularization settings come straight from the hyperparameter cell.
architecture_kwargs = dict(
    encoder_layers=ENCODER_LAYERS,
    decoder_layers=DECODER_LAYERS,
    d_model=D_MODEL,
    encoder_attention_heads=ENCODER_ATTENTION_HEADS,
    decoder_attention_heads=DECODER_ATTENTION_HEADS,
    dropout=DROPOUT,
)

config = TimeSeriesTransformerConfig(
    prediction_length=PREDICTION_LENGTH,
    context_length=CONTEXT_LENGTH,
    lags_sequence=LAGS_SEQUENCE,
    # One time feature per column produced by create_time_features.
    num_time_features=train_features.shape[1],
    # No static or additional dynamic covariates are supplied.
    num_static_categorical_features=0,
    num_static_real_features=0,
    num_dynamic_real_features=0,
    distribution_output=DISTRIBUTION_OUTPUT,
    # Negative log-likelihood of the chosen output distribution. This is a
    # likelihood, not a squared error, which is why the logged loss values
    # can be negative.
    loss="nll",
    **architecture_kwargs,
)

model = TimeSeriesTransformerForPrediction(config).to(device)

print("TimeSeriesTransformer model created.")
print(f"Total model parameters: {model.num_parameters():,}")

TimeSeriesTransformer model created.
Total model parameters: 43,202


# Step 8: Training Loop with Validation

## Description:
The complete training and validation loop. It monitors validation loss and implements early stopping to prevent overfitting, saving the best model state.

In [None]:
# --- Step 8: Train the Model with Validation and Early Stopping ---
BEST_MODEL_PATH = "best_transformer_model.pth"


def _batch_loss(batch):
    """Run one forward pass of `model` on `batch` and return its loss tensor.

    Args:
        batch: Dict with the keys `past_values`, `past_time_features`,
            `future_values` and `future_time_features`, as produced by the
            DataLoaders.

    Returns:
        The `loss` attribute of the model output.
    """
    past_values = batch['past_values'].to(device)
    future_values = batch['future_values'].to(device)
    outputs = model(
        past_values=past_values,
        past_time_features=batch['past_time_features'].to(device),
        # Every point is treated as observed, so both masks are all ones
        # (ones_like already places them on the same device as the values).
        past_observed_mask=torch.ones_like(past_values),
        future_values=future_values,
        future_time_features=batch['future_time_features'].to(device),
        future_observed_mask=torch.ones_like(future_values),
    )
    return outputs.loss


optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
best_val_loss = float('inf')
patience_counter = 0
# Tracks whether THIS run wrote a checkpoint, so that a stale file left over
# from an earlier run is never loaded by mistake after the loop.
best_checkpoint_saved = False

for epoch in trange(EPOCHS, desc="Epoch"):
    # --- Training Phase ---
    model.train()
    train_loss_total = 0
    progress_bar_train = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}", leave=False)
    for batch in progress_bar_train:
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        loss.backward()
        optimizer.step()
        train_loss_total += loss.item()
        progress_bar_train.set_postfix({"loss": f"{loss.item():.6f}"})

    avg_train_loss = train_loss_total / len(train_dataloader)

    # --- Validation Phase ---
    model.eval()
    val_loss_total = 0
    progress_bar_val = tqdm(val_dataloader, desc=f"Validating Epoch {epoch+1}", leave=False)
    with torch.no_grad():
        for batch in progress_bar_val:
            val_loss_total += _batch_loss(batch).item()

    avg_val_loss = val_loss_total / len(val_dataloader)
    print(f"Epoch {epoch+1:02d}/{EPOCHS} -> Train Loss: {avg_train_loss:.8f}, Val Loss: {avg_val_loss:.8f}")

    # --- Early Stopping and Best Model Saving ---
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        print(f"New best validation loss: {best_val_loss:.8f}. Saving model...")
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        best_checkpoint_saved = True
        patience_counter = 0
    else:
        patience_counter += 1
        print(f"Validation loss did not improve. Patience: {patience_counter}/{PATIENCE}")

    if patience_counter >= PATIENCE:
        print("Early stopping triggered.")
        break

# Load the best performing model for final evaluation
if best_checkpoint_saved:
    # map_location keeps the load working when the checkpoint was written on a
    # different device than the one in use now.
    model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))
    print("\nBest model loaded for final evaluation.")
else:
    # The validation loss never improved (for example it was NaN, or no epoch
    # ran), so no checkpoint from this run exists.
    print("\nWarning: no checkpoint was saved during this run; keeping the current model weights.")

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Training Epoch 1:   0%|          | 0/7014 [00:00<?, ?it/s]

Validating Epoch 1:   0%|          | 0/2337 [00:00<?, ?it/s]

Epoch 01/5 -> Train Loss: -1.22088630, Val Loss: -1.52779890
New best validation loss: -1.52779890. Saving model...


Training Epoch 2:   0%|          | 0/7014 [00:00<?, ?it/s]

Validating Epoch 2:   0%|          | 0/2337 [00:00<?, ?it/s]

Epoch 02/5 -> Train Loss: -2.09936859, Val Loss: -2.13239461
New best validation loss: -2.13239461. Saving model...


Training Epoch 3:   0%|          | 0/7014 [00:00<?, ?it/s]

Validating Epoch 3:   0%|          | 0/2337 [00:00<?, ?it/s]

# Step 9: Final Evaluation on All Datasets

## Description:
This comprehensive evaluation step generates predictions for the train, validation, and test sets using the best model saved from the training loop. It then calculates and displays the final RMSE and MAE metrics for all three datasets.


In [None]:
# --- Step 9: Final Evaluation and Comparison ---
def get_predictions_and_actuals(loader, data_df):
    """Generate predictions for every window of a loader's dataset, in time order.

    The loader passed in may shuffle (the training loader does), which would
    return predictions in random order while `dates` is sequential. The
    dataset is therefore re-wrapped in a non-shuffling loader before
    predicting, so that element `i` of every returned array belongs to
    window `i`.

    Args:
        loader: DataLoader whose `dataset` yields the window dicts.
        data_df: The DataFrame the dataset was built from; its index supplies
            the dates and its `attrs['name']` labels the progress bar.

    Returns:
        Tuple `(predictions, actuals, dates)`: two 1-D NumPy arrays on the
        original price scale and the slice of `data_df.index` for the windows.
    """
    model.eval()
    predictions, actuals = [], []

    # Same data and batching as the caller's loader, but always in dataset order.
    eval_loader = DataLoader(
        loader.dataset,
        batch_size=loader.batch_size or BATCH_SIZE,
        shuffle=False,
        num_workers=loader.num_workers,
    )

    # Get the dates that correspond to the predictions
    # NOTE(review): this yields one date per window. With PREDICTION_LENGTH > 1
    # each window contributes several values, so the arrays would be longer
    # than `dates` — confirm before raising the horizon.
    start_idx = HISTORY_LENGTH + PREDICTION_LENGTH - 1
    dates = data_df.index[start_idx : start_idx + len(loader.dataset)]

    with torch.no_grad():
        for batch in tqdm(eval_loader, desc=f"Predicting on {data_df.attrs.get('name', 'dataset')}"):
            past_values = batch['past_values'].to(device)
            past_time_features = batch['past_time_features'].to(device)
            future_time_features = batch['future_time_features'].to(device)
            past_observed_mask = torch.ones_like(past_values)

            outputs = model.generate(
                past_values=past_values,
                past_time_features=past_time_features,
                past_observed_mask=past_observed_mask,
                future_time_features=future_time_features,
            )

            # Average over the sampled trajectories (dim 1) for a point forecast.
            preds_scaled = outputs.sequences.mean(dim=1).cpu().numpy()
            # The scaler was fitted on a single column, so feed it one column.
            preds_unscaled = target_scaler.inverse_transform(preds_scaled.reshape(-1, 1))
            actuals_unscaled = target_scaler.inverse_transform(
                batch['future_values'].numpy().reshape(-1, 1)
            )

            predictions.extend(preds_unscaled.ravel())
            actuals.extend(actuals_unscaled.ravel())

    return np.array(predictions), np.array(actuals), dates

# Label each split so the prediction progress bars can show which one is running.
for frame, label in (
    (train_df, 'Train Set'),
    (val_df, 'Validation Set'),
    (test_df, 'Test Set'),
):
    frame.attrs['name'] = label

# Predict on every split, in the order train, validation, test.
y_pred_train, y_true_train, train_dates = get_predictions_and_actuals(
    train_dataloader, train_df
)
y_pred_val, y_true_val, val_dates = get_predictions_and_actuals(
    val_dataloader, val_df
)
y_pred_test, y_true_test, test_dates = get_predictions_and_actuals(
    test_dataloader, test_df
)

# --- Calculate final metrics on the original scale ---
def calculate_metrics(y_true, y_pred):
    """Return `(rmse, mae)` for the given true and predicted values.

    RMSE is the square root of scikit-learn's mean squared error; MAE is
    scikit-learn's mean absolute error.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    return np.sqrt(squared_error), absolute_error

rmse_train, mae_train = calculate_metrics(y_true_train, y_pred_train)
rmse_val, mae_val = calculate_metrics(y_true_val, y_pred_val)
rmse_test, mae_test = calculate_metrics(y_true_test, y_pred_test)

# --- Display Results Table ---
# Two rows (RMSE then MAE) per split, listed test first, training last.
metrics_by_split = (
    ('Test', rmse_test, mae_test),
    ('Validation', rmse_val, mae_val),
    ('Training', rmse_train, mae_train),
)
results_data = {'Dataset': [], 'Metric': [], 'Value (EURUSD)': []}
for split_name, rmse_value, mae_value in metrics_by_split:
    for metric_name, metric_value in (('RMSE', rmse_value), ('MAE', mae_value)):
        results_data['Dataset'].append(split_name)
        results_data['Metric'].append(metric_name)
        results_data['Value (EURUSD)'].append(f"{metric_value:.6f}")
df_results = pd.DataFrame(results_data)

banner = "=" * 50
print("\n" + banner)
print("     Final Transformer Model Performance")
print(banner)
print(df_results.to_string(index=False))
print(banner)

# Step 10: Visualization of Results

## Description:
This cell generates the four key plots that visually compare the model's predictions against the actual values for each dataset, matching the format of your LSTM project.

In [None]:
# --- Step 10: Visualize Predictions (Corrected for Non-Continuous Time Series) ---
# To avoid plotting gaps (like weekends), we plot against a simple integer sequence
# and then format the x-axis ticks with the corresponding dates. This matches the
# "Gaps Collapsed" approach from the LSTM project.

import matplotlib.dates as mdates

def plot_subset_collapsed(dates, y_true, y_pred, title, subset_size=1000,
                          true_color='royalblue', pred_color='skyblue'):
    """Plot the last N actual and predicted values against an integer x-axis.

    Using a plain integer sequence on the x-axis collapses time gaps in the
    data; a handful of ticks are then labelled with the matching dates.

    Args:
        dates: Indexable sequence of datetime-like values (each needs
            `strftime`), aligned with `y_true` and `y_pred`.
        y_true: Actual values.
        y_pred: Predicted values.
        title: Plot title; its first word is also used in the legend labels.
        subset_size: Maximum number of trailing points to draw.
        true_color: Line colour for the actual values.
        pred_color: Line colour for the predicted values.
    """
    # Never plot more points than any of the three inputs can supply.
    plot_size = min(subset_size, len(dates), len(y_true), len(y_pred))
    if plot_size <= 0:
        # A `[-0:]` slice would select the WHOLE array instead of nothing and
        # break the plot, so stop here.
        print(f"Nothing to plot for {title}.")
        return

    recent_dates = dates[-plot_size:]
    legend_name = title.split(" ")[0]

    fig, ax = plt.subplots(figsize=(20, 7))

    # Use an integer sequence for the x-axis
    x_axis_index = np.arange(plot_size)

    ax.plot(x_axis_index, y_true[-plot_size:], color=true_color,
            label=f'Actual {legend_name} Price', marker='.', markersize=2, alpha=0.7)
    ax.plot(x_axis_index, y_pred[-plot_size:], color=pred_color,
            label=f'Predicted {legend_name} Price', linestyle='--')

    ax.set_title(f'{title}: Actual vs. Predicted (Last {plot_size} Points) - Gaps Collapsed', fontsize=16)
    ax.set_xlabel('Trading Sequence Point (Time Gaps Collapsed)', fontsize=12)
    ax.set_ylabel('EURUSD Price', fontsize=12)
    ax.legend()
    ax.grid(True)

    # Label a few evenly spaced positions with the date they correspond to.
    num_ticks = 7
    tick_indices = np.linspace(0, plot_size - 1, num_ticks, dtype=int)
    tick_labels = [recent_dates[i].strftime('%Y-%m-%d\n%H:%M') for i in tick_indices]

    ax.set_xticks(tick_indices)
    ax.set_xticklabels(tick_labels, rotation=30, ha='right')
    fig.tight_layout()
    plt.show()

# --- Plot subsets for clarity ---
# One gap-collapsed figure per split, each with its own colour pair.
plot_subset_collapsed(
    train_dates, y_true_train, y_pred_train,
    title='Training Set', true_color='royalblue', pred_color='skyblue',
)
plot_subset_collapsed(
    val_dates, y_true_val, y_pred_val,
    title='Validation Set', true_color='forestgreen', pred_color='lightgreen',
)
plot_subset_collapsed(
    test_dates, y_true_test, y_pred_test,
    title='Test Set', true_color='red', pred_color='darkorange',
)


# --- Plot 4: Test Set - Scaled (no date axis is needed for this one) ---
# Map the test predictions and actuals back onto the scaler's standardized scale.
y_pred_test_scaled = target_scaler.transform(y_pred_test.reshape(-1, 1))
y_true_test_scaled = target_scaler.transform(y_true_test.reshape(-1, 1))

fig_scaled, ax_scaled = plt.subplots(figsize=(20, 7))
ax_scaled.plot(y_true_test_scaled, color='blue', label='Actual Test Price (Scaled)')
ax_scaled.plot(y_pred_test_scaled, color='lime', label='Predicted Test Price (Scaled)', linestyle='--')
ax_scaled.set_title('Test Set: Actual vs. Predicted EURUSD Price (Scaled)', fontsize=16)
ax_scaled.set_xlabel('Time Step (Windowed Test Set)')
ax_scaled.set_ylabel('Scaled Price')
ax_scaled.legend()
ax_scaled.grid(True)
plt.show()