In [1]:
# torch is imported first so that its version string can be interpolated
# into the wheel index URLs of the pip commands below.
import torch

# Remove any already-installed PyG packages before reinstalling them.
!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
# Install the compiled PyG extensions from the wheel index page that matches
# the running torch version ({torch.__version__} is expanded by IPython).
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# PyTorch Geometric itself is installed from the GitHub repository (no version pin).
!pip install git+https://github.com/pyg-team/pytorch_geometric.git
# Provides the A3TGCN recurrent layer imported later in the notebook.
!pip install torch-geometric-temporal

[0mLooking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.6.0%2Bcu124/torch_scatter-2.1.2%2Bpt26cu124-cp311-cp311-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt26cu124
Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.6.0%2Bcu124/torch_sparse-0.6.18%2Bpt26cu124-cp311-cp311-linux_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt26cu124
Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting torch-cluster
  Download

### Custom dataset class: `SpatioTemporalDataset`

In [2]:
import torch 
import torch_geometric
from torch_geometric.data import Dataset, Data 
class SpatioTemporalDataset(Dataset):
    """
    Sliding-window torch_geometric dataset for spatio-temporal node features.

    Sample ``i`` uses the time steps ``[i, i + lookback)`` as the input ``x``
    and the following ``[i + lookback, i + lookback + horizon)`` steps as the
    target ``y``. The graph structure is shared by every sample.

    Args:
        data_array (np.ndarray): 3D data in form (nodes, features, timestamps)
        edge_index (np.ndarray): array of shape (2, num_edges); row 0 holds the
            source nodes and row 1 the destination nodes. Cast to int64.
        edge_attr (np.ndarray, optional): edge features, cast to float32
        edge_weight (np.ndarray, optional): one weight per edge, cast to float32
        transform (callable, optional): applied to every ``Data`` object
            before it is returned
        lookback (int): length of the past window used to predict the next steps
        horizon (int): number of future steps that need to be predicted
    """

    def __init__(self, data_array, edge_index, edge_attr=None, edge_weight=None, transform=None, lookback=90, horizon=30):
        super().__init__()
        self.lookback = lookback
        self.horizon = horizon
        # Set after super().__init__() so the value passed by the caller wins.
        self.transform = transform

        self.x_data = torch.from_numpy(data_array).float()
        self.edge_index = torch.from_numpy(edge_index).long()

        self.edge_attr = None if edge_attr is None else torch.from_numpy(edge_attr).float()
        self.edge_weight = None if edge_weight is None else torch.from_numpy(edge_weight).float()

        self._num_timestamps = data_array.shape[2]

    def __len__(self):
        """
        Return the number of complete (lookback, horizon) windows.

        The count is clamped at 0: a series shorter than ``lookback + horizon``
        yields an empty dataset instead of a negative length, which ``len()``
        would reject with a ValueError.
        """
        return max(0, self._num_timestamps - self.lookback - self.horizon + 1)

    def __getitem__(self, idx):
        """
        Build the ``Data`` object for window ``idx``.

        Negative indices count from the end. An index outside the valid range
        raises IndexError; without it, slicing past the end would silently
        return truncated windows and plain iteration would never stop.
        """
        num_samples = len(self)
        if idx < 0:
            idx += num_samples
        if not 0 <= idx < num_samples:
            raise IndexError(
                f"index {idx} is out of range for a dataset with {num_samples} samples"
            )

        start_x = idx
        end_x = start_x + self.lookback

        # The target window starts right where the input window ends.
        start_y = end_x
        end_y = start_y + self.horizon

        x_window = self.x_data[:, :, start_x:end_x]
        y_window = self.x_data[:, :, start_y:end_y]

        data = Data(
            x=x_window,
            edge_index=self.edge_index,
            edge_attr=self.edge_attr,
            edge_weight=self.edge_weight,
            y=y_window
        )

        if self.transform:
            data = self.transform(data)

        return data

### Load the data into the custom dataset and split it into time-series K-folds for training and evaluation

In [3]:
# Data parameters
# Length of the past window (in time steps) used as model input.
LOOKBACK = 90
# Number of future time steps to predict after the input window.
HORIZON = 30

In [4]:
import pandas as pd 
import numpy as np
from torch_geometric.transforms import GCNNorm
from sklearn.preprocessing import StandardScaler

def convert3dtensor(nodes, df: pd.DataFrame):
    """
    Convert the long-format sp500 dataframe into a 3D array
    of shape (n_nodes, n_timestamps, n_features).

    The input dataframe is left untouched: indexing happens on a copy, so the
    function can safely be called several times on the same frame.

    Args:
        - nodes (np.ndarray): stock labels; they define the order of the
          first axis of the result
        - df (pd.DataFrame): dataframe with 'Symbol' and 'Date' columns; every
          other column is treated as a feature. Every symbol must have the
          same number of rows, otherwise the arrays cannot be stacked.
    Return:
        (np.ndarray): Data returned in 3D shape
    """
    # set_index without inplace returns a new frame, so the caller's df is not mutated.
    indexed = df.set_index(['Symbol', 'Date'])

    # Each per-symbol block is already (n_timestamps, n_features), so it can be
    # stacked directly along a new leading node axis.
    return np.stack(
        [indexed.loc[node].values for node in nodes],
        axis=0
    )


def load_data(train=True, csv_path='/kaggle/input/s-and-p500-stock-market/clear_sp500.csv'):
    """
    Split the initial data into train/validation folds or a train/test pair
    using a time-series (expanding window) split.

    All feature columns (every column except 'Symbol' and 'Date') are
    log1p-transformed. No standard scaling is applied. The module-level
    LOOKBACK constant sizes the history kept in front of each
    validation/test period.

    Args:
        - train (bool): If True, return 4 folds with cut-offs on the first day of
          April, May, June and July 2025. Each fold pairs the rows before the
          cut-off with a validation set covering 2*LOOKBACK days before the
          cut-off up to 30 days after it.
          Otherwise, return a single pair: the rows before 2025-08-01 and a
          test set starting 3*LOOKBACK days before that date.
        - csv_path (str): Path to the csv file holding the stock data.
    Return:
        (list): List of tuples; each tuple holds two 3D arrays of shape
        (n_nodes, n_timestamps, n_features), the second following the first in time
    """
    df = pd.read_csv(csv_path).drop(columns=['Dividends', 'Stock Splits'])
    # utc=True gives a uniform tz-aware datetime column that compares cleanly
    # with the UTC cut-offs below, even if the file mixes UTC offsets.
    df['Date'] = pd.to_datetime(df['Date'], utc=True)
    nodes = df['Symbol'].unique()

    # Cut-off dates: 2025-04-01 ... 2025-08-01 (the last one separates the test set).
    timelines = [
        pd.Timestamp(year=2025, month=month, day=1, tz='UTC')
        for month in range(4, 9)
    ]
    predicted_range = pd.Timedelta(days=30)

    feature_columns = [col for col in df.columns if col not in ('Symbol', 'Date')]
    df[feature_columns] = np.log1p(df[feature_columns])

    def _subset(mask):
        # Independent copy with a fresh index, safe to hand to convert3dtensor.
        return df[mask].copy().reset_index(drop=True)

    folds = []

    if train:
        for cutoff in timelines[:4]:
            train_df = _subset(df['Date'] < cutoff)
            # The validation rows reach back before the cut-off so that the
            # first validation windows have a full lookback history.
            valid_df = _subset(
                (df['Date'] >= cutoff - pd.Timedelta(days=2 * LOOKBACK))
                & (df['Date'] <= cutoff + predicted_range)
            )

            folds.append((convert3dtensor(nodes, train_df),
                          convert3dtensor(nodes, valid_df)))
    else:
        cutoff = timelines[4]
        train_df = _subset(df['Date'] < cutoff)
        test_df = _subset(df['Date'] >= cutoff - pd.Timedelta(days=3 * LOOKBACK))

        folds.append((convert3dtensor(nodes, train_df),
                      convert3dtensor(nodes, test_df)))

    return folds
        
def load_edge(adj_path="/kaggle/input/adjacencymatricies/adj_correlation.npy"):
    """
    Build the edge list and edge weights from a pre-defined adjacency matrix.

    Every non-zero entry ``adj[i, j]`` becomes one directed edge ``i -> j``.
    Edges are emitted in row-major order (sorted by source, then destination).

    Args:
        adj_path (str): Path to the corresponding adjacency matrix, which is
        either correlation adjacency matrix or AE combined BERT adjacency matrix

    Return:
        (np.ndarray, np.ndarray): edge_index, an int64 array of shape
        (2, num_edges) whose rows are (source, destination), and edge_weight,
        a float64 array of shape (num_edges,) with the weight of each edge

    Raises:
        ValueError: if the loaded array is not a square 2D matrix
    """
    adj_matrix = np.load(adj_path)

    if adj_matrix.ndim != 2 or adj_matrix.shape[0] != adj_matrix.shape[1]:
        raise ValueError(
            f"adjacency matrix must be square and 2D, got shape {adj_matrix.shape}"
        )

    # np.nonzero scans in row-major order, i.e. the same order as a nested
    # "for i ... for j ..." loop over the matrix.
    sources, destinations = np.nonzero(adj_matrix)

    # Node indices are integers; storing them as floats would be misleading.
    edge_index = np.stack([sources, destinations]).astype(np.int64)
    edge_weight = adj_matrix[sources, destinations].astype(np.float64)

    return edge_index, edge_weight

def load_torchgeometric_data(train=True, LOOKBACK = 90, HORIZON=30, adj_path="/kaggle/input/adjacencymatricies/adj_correlation.npy"):
    """
    Load the time-series folds and wrap every split in a SpatioTemporalDataset.

    The folds come from load_data(train) and the graph from load_edge(adj_path).
    Every dataset shares the same edges and a GCNNorm transform.

    Args:
        - train (bool): If True, return the training/validation folds.
        Otherwise, return a single pair made of the entire training set and the test set.
        - LOOKBACK (int): the length of past window used to predict the next days
        - HORIZON (int): number of next days needs to be predicted
        - adj_path (str): Path to corresponding adjacency matrix
    Return:
        (list): List of (earlier_dataset, later_dataset) tuples, the second
        following the first by time
    """
    edge_index, edge_weight = load_edge(adj_path)
    normalizer = GCNNorm()

    def _as_dataset(array):
        # load_data yields (nodes, timestamps, features); the dataset expects
        # (nodes, features, timestamps), hence the axis swap.
        return SpatioTemporalDataset(
            data_array=array.transpose(0, 2, 1),
            edge_index=edge_index,
            edge_weight=edge_weight,
            lookback=LOOKBACK,
            horizon=HORIZON,
            transform=normalizer
        )

    return [
        (_as_dataset(earlier), _as_dataset(later))
        for earlier, later in load_data(train)
    ]

### Training & Evaluation phase

In [5]:
# Data parameters
# Number of future time steps predicted per node (size of the model's output layer).
HORIZON = 30
# Index along the feature axis of x / y that is used as the prediction target.
TARGET_IDX = 4
# Adjacency matrix used to build the graph edges.
ADJ_PATH = '/kaggle/input/adjacencymatricies/adj_ae-bert.npy'
# Label for the adjacency variant; only used in the saved model's file name.
ADJ_TYPE = "AE-BERT"

# Model hyperparameters
# Adam learning rate and weight decay.
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-5
# Number of passes over the training set and samples per mini-batch.
NUM_EPOCHS = 10
BATCH_SIZE = 32

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch_geometric.nn import GCNConv, GATv2Conv 
from torch_geometric.loader import DataLoader   
from torch_geometric_temporal.nn.recurrent import A3TGCN, A3TGCN2
from tqdm import tqdm
import numpy as np

class TemporalGNN(torch.nn.Module):
    """
    A3TGCN encoder followed by a ReLU and a linear head that forecasts
    HORIZON steps for every node.

    The network predicts offsets relative to the last observed value of the
    target feature (TARGET_IDX) rather than absolute values. It reads the
    module-level LOOKBACK, HORIZON and TARGET_IDX constants.

    Args:
        in_channels (int): number of input features per node
        hidden_size (int): size of the A3TGCN output embedding
    """

    def __init__(self, in_channels, hidden_size):
        super().__init__()
        self.tgnn = A3TGCN(in_channels=in_channels, out_channels=hidden_size, periods=LOOKBACK)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(hidden_size, HORIZON)

    def forward(self, x, edge_index, edge_weight):
        """
        Args:
            x: node features of shape [batch_size * num_nodes, num_features, lookback]
            edge_index: graph connectivity
            edge_weight: one weight per edge
        Return:
            Forecast with one row per node and HORIZON columns.
        """
        # Last observed value of the target feature, kept as a column so it
        # broadcasts over the horizon axis.
        anchor = x[:, TARGET_IDX, -1][:, None]
        hidden = self.tgnn(x, edge_index, edge_weight)
        offsets = self.linear(self.relu(hidden))
        return anchor + offsets

In [7]:
# Train/validation folds (train=True) built on the graph stored at ADJ_PATH.
train_folds = load_torchgeometric_data(
    train=True,
    LOOKBACK=LOOKBACK,
    HORIZON=HORIZON,
    adj_path=ADJ_PATH
)

# Single (full training set, test set) pair (train=False) on the same graph.
test_folds = load_torchgeometric_data(
    train=False,
    LOOKBACK=LOOKBACK,
    HORIZON=HORIZON, 
    adj_path=ADJ_PATH
)

In [8]:
# First sample of the first fold's training set; its x is laid out as
# (nodes, features, lookback), so the second-to-last axis is the feature count.
in_channels = train_folds[0][0][0].x.shape[-2]
# Size of the embedding produced by the temporal GNN layer.
hidden_size = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
# all_folds_train_losses = []
# all_folds_valid_losses = []

# for fold_idx, (trainset, validset) in enumerate(train_folds):
#     print(f"=============== FOLD {fold_idx + 1}/{len(train_folds)} ================")
    
#     trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
#     validloader = DataLoader(validset, batch_size=BATCH_SIZE, shuffle=False)
    
#     model = TemporalGNN(
#         in_channels=in_channels,
#         hidden_sie=hidden_size,
#         out_channels=out_channels
#     ).to(device)
    
#     criterion = nn.L1Loss()
#     optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    
#     for epoch in range(NUM_EPOCHS):
#         model.train()
#         train_loop = tqdm(trainloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Train]")
#         for data in train_loop:
#             data = data.to(device)
#             optimizer.zero_grad()
#             out = model(data.x, data.edge_index, data.edge_weight)
#             target = data.y[:, TARGET_IDX]
#             loss = criterion(out, target)
#             loss.backward()
#             optimizer.step()
#             train_loop.set_postfix(loss=loss.item())
#     torch.save(model, f'/kaggle/working/A3TGCN_{ADJ_TYPE}_FOLD{fold_idx}.pt')    
#     final_train_loss = 0
#     final_valid_loss = 0
#     with torch.no_grad():
#         for data in trainloader:
#             data = data.to(device)
#             out = model(data.x, data.edge_index, data.edge_weight)
#             target = data.y[:, TARGET_IDX, :]
#             final_train_loss += criterion(out, target).item()
        
#         for data in validloader:
#             data = data.to(device)
#             out = model(data.x, data.edge_index, data.edge_weight)
#             target = data.y[:, TARGET_IDX]
#             final_valid_loss += criterion(out, target).item()
            
#     avg_train_loss = final_train_loss / len(trainloader)
#     avg_valid_loss = final_valid_loss / len(validloader)
    
#     all_folds_train_losses.append(avg_train_loss)
#     all_folds_valid_losses.append(avg_valid_loss)
#     print(f"Fold {fold_idx + 1} - Final Train Loss: {avg_train_loss:.6f}, Final Valid Loss: {avg_valid_loss:.6f}\n")


# print("=============== K-FOLD SUMMARY ===============")
# print(f"Average Train Loss across {len(train_folds)} folds: {np.mean(all_folds_train_losses):.6f}")
# print(f"Average Valid Loss across {len(train_folds)} folds: {np.mean(all_folds_valid_losses):.6f}")
# print("============================================\n")

In [10]:
# Final training run: fit on the whole training set and monitor the loss on
# the last available test window after every epoch.
final_trainset, testset = test_folds[0]
final_trainloader = DataLoader(final_trainset, batch_size=BATCH_SIZE, shuffle=True)
# Only the last test sample is evaluated, i.e. the most recent
# (lookback, horizon) window in the test period.
testloader = DataLoader([testset[len(testset) - 1]], batch_size=1, shuffle=False)

print("Re-initializing model for final training...")
final_model = TemporalGNN(
    in_channels=in_channels,
    hidden_size=hidden_size,
).to(device)

criterion = nn.L1Loss(reduction='mean')
optimizer = torch.optim.Adam(final_model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    final_model.train()
    train_loop = tqdm(final_trainloader, desc=f"Final Epoch {epoch+1}/{NUM_EPOCHS}")
    total_train_loss = 0
    for data in train_loop:
        data = data.to(device)
        optimizer.zero_grad()
        out = final_model(data.x, data.edge_index, data.edge_weight)
        # Only the target feature is scored: shape [batch * nodes, horizon].
        target = data.y[:, TARGET_IDX, :]
        loss = criterion(out, target)
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_loop.set_postfix(loss=loss.item())
    # Average of the per-batch mean losses.
    total_train_loss /= len(final_trainloader)

    # ---- evaluation pass ----
    final_model.eval()
    total_test_loss = 0
    with torch.no_grad():
        for data in testloader:
            data = data.to(device)
            out = final_model(data.x, data.edge_index, data.edge_weight)
            target = data.y[:, TARGET_IDX, :]
            loss = criterion(out, target)
            total_test_loss += loss.item()
        total_test_loss /= len(testloader)

    # Separators keep the three fields readable; adjacent f-strings are
    # concatenated without any whitespace.
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} | "
          f"Train: {total_train_loss} | "
          f"Test: {total_test_loss}")

# NOTE: this pickles the whole module object, so the TemporalGNN class must be
# importable/defined wherever the file is loaded again.
torch.save(final_model, f'/kaggle/working/A3TGCN_{ADJ_TYPE}_FINAL_MODEL.pt')

# 4. Final evaluation on the test set


Re-initializing model for final training...


Final Epoch 1/10: 100%|██████████| 112/112 [05:11<00:00,  2.78s/it, loss=0.0409]


Epoch 1/10Train: 0.06938498334160872Test: 0.0527024008333683


Final Epoch 2/10: 100%|██████████| 112/112 [05:24<00:00,  2.90s/it, loss=0.0728]


Epoch 2/10Train: 0.04743845082287278Test: 0.051272910088300705


Final Epoch 3/10: 100%|██████████| 112/112 [05:24<00:00,  2.90s/it, loss=0.044]


Epoch 3/10Train: 0.04701093737302082Test: 0.05130486562848091


Final Epoch 4/10: 100%|██████████| 112/112 [05:24<00:00,  2.89s/it, loss=0.0572]


Epoch 4/10Train: 0.047065980333302705Test: 0.05136145278811455


Final Epoch 5/10: 100%|██████████| 112/112 [05:24<00:00,  2.90s/it, loss=0.044]


Epoch 5/10Train: 0.046953364774318676Test: 0.0512859970331192


Final Epoch 6/10: 100%|██████████| 112/112 [05:25<00:00,  2.90s/it, loss=0.0491]


Epoch 6/10Train: 0.04696633071372552Test: 0.05115729570388794


Final Epoch 7/10: 100%|██████████| 112/112 [05:24<00:00,  2.90s/it, loss=0.0494]


Epoch 7/10Train: 0.046955127002937455Test: 0.05134030431509018


Final Epoch 8/10: 100%|██████████| 112/112 [05:25<00:00,  2.90s/it, loss=0.0416]


Epoch 8/10Train: 0.04689195974996047Test: 0.05129013583064079


Final Epoch 9/10: 100%|██████████| 112/112 [05:25<00:00,  2.90s/it, loss=0.0495]


Epoch 9/10Train: 0.04694147245027125Test: 0.05132409930229187


Final Epoch 10/10: 100%|██████████| 112/112 [05:24<00:00,  2.90s/it, loss=0.0468]


Epoch 10/10Train: 0.046933961200660894Test: 0.05116827040910721


In [11]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

final_model.eval()

# Collect the results of every batch instead of keeping only the last one, so
# the metrics stay correct if the loader ever holds more than a single batch.
batch_outputs = []
batch_targets = []

with torch.no_grad():
    for data in testloader:
        data = data.to(device)
        out = final_model(data.x, data.edge_index, data.edge_weight)
        target = data.y[:, TARGET_IDX, :]
        # expm1 undoes the log1p transform applied when the data was loaded,
        # bringing predictions and targets back to the original scale.
        batch_outputs.append(np.expm1(out.cpu().numpy().flatten()))
        batch_targets.append(np.expm1(target.cpu().numpy().flatten()))

restored_output = np.concatenate(batch_outputs)
restored_target = np.concatenate(batch_targets)

# All metrics are computed over the flattened (node, horizon step) values.
r2 = r2_score(restored_target, restored_output)
mae = mean_absolute_error(restored_target, restored_output)
mse = mean_squared_error(restored_target, restored_output)
rmse = np.sqrt(mse)

print("\n===========================================")
print("  PERFORMANCE ON LAST TEST SAMPLE (ORIGINAL SCALE) ")
print("===========================================")
print(f"-> MAE (Mean Absolute Error):      {mae:.6f}")
print(f"-> MSE (Mean Squared Error):       {mse:.6f}")
print(f"-> RMSE (Root Mean Squared Error): {rmse:.6f}")
print(f"-> R² Score:                       {r2:.6f}")
print("===========================================")


  PERFORMANCE ON LAST TEST SAMPLE (ORIGINAL SCALE) 
-> MAE (Mean Absolute Error):      11.647688
-> MSE (Mean Squared Error):       869.468742
-> RMSE (Root Mean Squared Error): 29.486755
-> R² Score:                       0.996945
