In [None]:
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from datetime import datetime
import time
import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.utils import resample

import random, os, json
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import torch
from torch import nn
from collections import defaultdict

import sys
sys.path.append("../")
import utils

from joblib import Parallel, delayed
import multiprocessing

import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from einops import rearrange, repeat
from torch.utils.data import DataLoader, TensorDataset

from torch.optim import Adam
from torch.nn import BCELoss
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn
from sklearn.metrics import recall_score, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.utils import resample
import datetime
import matplotlib.pyplot as plt
from tqdm import tqdm


# Default compute device: first visible GPU when CUDA is available, otherwise CPU,
# so the notebook can still be imported/run on a machine without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
def calculate_metrics(y_true, y_pred_probs):
    """
    Computes binary classification metrics from true labels and predicted probabilities.

    Args:
        - y_true: Array-like of ground-truth binary labels (0/1).
        - y_pred_probs: Array-like of predicted probabilities; rounded to obtain hard labels.
          Column-shaped inputs such as (N, 1) are flattened.

    Returns:
        - Tuple (accuracy, sensitivity, specificity, roc_auc, f1, tn, fp, fn, tp).
          roc_auc is NaN when y_true contains a single class (ROC AUC is undefined there).
    """
    y_true = np.asarray(y_true).ravel()
    y_pred_probs = np.asarray(y_pred_probs).ravel()
    y_pred = np.round(y_pred_probs).astype(int)

    # Force a 2x2 matrix: without explicit labels, a split where only one class
    # shows up yields a 1x1 matrix and the 4-way unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    accuracy = accuracy_score(y_true, y_pred)
    sensitivity = recall_score(y_true, y_pred)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # roc_auc_score raises ValueError when only one class is present in y_true.
    if np.unique(y_true).size > 1:
        roc_auc = roc_auc_score(y_true, y_pred_probs)
    else:
        roc_auc = float("nan")
    f1 = f1_score(y_true, y_pred)

    return accuracy, sensitivity, specificity, roc_auc, f1, tn, fp, fn, tp

### HYPERPARAMETERS 

- **seeds**: Seed values to ensure reproducibility.
- **batch_size**: Number of samples per batch used during training.
- **dropout**: Dropout rate applied to prevent overfitting.
- **weight_decay**: Weight decay for the optimizer to apply additional L2 regularization.
- **lr**: Learning rate assigned to the optimizer.
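- **n_epochs_max**: Maximum number of training epochs (upper bound reached only if early stopping is not triggered).
- **patience**: Number of epochs with no improvement before early stopping is triggered.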
- **max_tokens**: Maximum number of tokens allowed per block when tokenizing the input time series.

In [None]:
# One seed per data split (split i uses seeds[i-1]).
seeds = [9, 76, 227]
batch_size = 32
n_epochs_max = 100

# Default training/model settings; the tunable ones are overridden by the Optuna search.
hyperparameters = dict(
    batch_size=batch_size,
    n_epochs_max=n_epochs_max,
    lr=1e-5,
    weight_decay=1e-4,
    max_tokens=1024,
    patience=15,
    dropout=0.1,
)

### FUNCTIONS OF THE MODEL

In [None]:
def split_series_to_blocks(X, tokenizer, max_tokens=1024):
    """
    Serialises every time series into text and greedily packs time steps into blocks.

    Each time step becomes the string "f0:<v> f1:<v> ..." (values with two decimals).
    Consecutive time steps are appended to the current block while the running sum of
    their individually tokenised lengths stays within `max_tokens`; otherwise the block
    is closed and a new one is started with that time step.

    Args:
        - X: Array indexable as X[sample, step, feature] with a 3-element `.shape`.
        - tokenizer: Callable returning a mapping with an "input_ids" sequence.
        - max_tokens: Token budget per block.

    Returns:
        - List with one entry per sample; each entry is a list of block strings.
    """
    n_samples, n_steps, n_features = X.shape
    per_sample_blocks = []

    for sample_idx in range(n_samples):
        finished = []
        pending = []
        pending_len = 0

        for step_idx in range(n_steps):
            pieces = [f"f{k}:{X[sample_idx, step_idx, k]:.2f}" for k in range(n_features)]
            step_text = " ".join(pieces)
            step_len = len(tokenizer(step_text, truncation=False)["input_ids"])

            if pending_len + step_len > max_tokens:
                # Budget exceeded: close the running block (if any) and restart with this step.
                if pending:
                    finished.append(" ".join(pending))
                pending = [step_text]
                pending_len = step_len
                continue

            pending.append(step_text)
            pending_len += step_len

        # Flush whatever is left after the last time step.
        if pending:
            finished.append(" ".join(pending))

        per_sample_blocks.append(finished)

    return per_sample_blocks

class TimeSeriesGPT2WithBlocks(nn.Module):
    """
    Binary classifier on top of a pretrained transformer, fed with text blocks.

    Every block of a sample is encoded independently; the last-layer hidden state at
    the final sequence position is taken as the block representation, the block
    representations are averaged, and a dropout + linear + sigmoid head produces
    the output probability.
    """

    def __init__(self, model_name, num_classes=1, dropout=0.1):
        """
        Args:
            - model_name: Name/path passed to AutoTokenizer / AutoModel.from_pretrained.
            - num_classes: Output size of the linear head.
            - dropout: Dropout probability applied before the head.
        """
        super().__init__()
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Reuse the EOS token for padding when the tokenizer defines no pad token.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer = tokenizer
        self.model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.model.config.hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, blocks, max_tokens=1024):
        """
        Args:
            - blocks: Iterable of text blocks belonging to one sample.
            - max_tokens: Truncation length handed to the tokenizer.

        Returns:
            - Sigmoid output of the head applied to the block-averaged embedding.
        """
        target_device = next(self.parameters()).device
        per_block = []

        for text in blocks:
            encoded = self.tokenizer(
                text, return_tensors="pt", padding=True, truncation=True, max_length=max_tokens
            )
            encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}
            last_layer = self.model(**encoded).hidden_states[-1]
            # Representation of the block = hidden state at the last position.
            per_block.append(last_layer[:, -1, :])

        pooled = torch.stack(per_block).mean(dim=0)
        return self.sigmoid(self.fc(self.dropout(pooled)))

In [None]:
class EarlyStopping:
    """
    Early stops the training if validation loss does not improve after a given patience.

    The model weights are checkpointed to disk every time the validation loss improves,
    so the best weights can be restored after training.
    """

    def __init__(self, patience=5, delta=0, verbose=False, path='checkpoint.pt'):
        """
        Args:
            - patience: Number of epochs to wait for an improvement before stopping the training.
            - delta: Minimum change in the monitored metric to qualify as an improvement.
            - verbose: If True, prints detailed messages each time the model improves and
              when the patience counter increases.
            - path: File where the best model state_dict is saved (default 'checkpoint.pt').
        """
        self.patience = patience
        self.delta = delta
        self.verbose = verbose
        self.path = path
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_loss = float('inf')

    def __call__(self, val_loss, model):
        """
        Registers the validation loss of one epoch.

        Saves a checkpoint when the loss improved; otherwise increases the patience
        counter and sets `early_stop` once `patience` is reached.
        """
        score = -val_loss  # Inverting to track improvement (higher score == lower loss)

        if self.best_score is None:
            # First epoch: always becomes the reference and is checkpointed.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Saves the model when the validation loss decreases."""
        if self.verbose:
            print(f"Validation loss decreased ({self.best_loss:.4f} --> {val_loss:.4f}). Saving model ...")
        self.best_loss = val_loss
        torch.save(model.state_dict(), self.path)


In [None]:
def set_seed(seed):
    """
    Seeds Python, NumPy and PyTorch (CPU and all CUDA devices) and switches cuDNN
    to deterministic, non-benchmarking mode.

    Args:
        - seed: Integer seed shared by all random number generators.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def run_network(X_train, X_val, y_train, y_val, hyperparameters, seed, log_file="training_log.txt"):
    """
    Trains and evaluates the built model based on the provided data and hyperparameters.

    Samples are processed one at a time (one optimizer step per training sample);
    hyperparameters['batch_size'] is not used by this function. The weights with the
    best validation loss (checkpointed by EarlyStopping) are restored before returning.

    Args:
        - X_train, X_val: numpy.ndarray. Training and validation time series.
        - y_train, y_val: numpy.ndarray. Training and validation binary labels.
        - hyperparameters: Dictionary containing training and model hyperparameters
          ('dropout', 'lr', 'weight_decay', 'max_tokens', 'patience', 'n_epochs_max').
        - seed: Random seed for reproducibility.
        - log_file: File path to which the per-epoch training log is appended.

    Returns:
        - model: The trained PyTorch model (best checkpoint loaded).
        - history: Dict with the per-epoch lists 'loss' (training) and 'val_loss' (validation).
    """
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)

    # Fall back to CPU so the function does not crash on machines without a GPU.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)
    set_seed(seed)

    model = TimeSeriesGPT2WithBlocks("distilgpt2", dropout=hyperparameters['dropout']).to(device)
    loss_fn = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=hyperparameters['lr'], weight_decay=hyperparameters['weight_decay'])

    tokenizer = model.tokenizer
    train_blocks = split_series_to_blocks(X_train, tokenizer, max_tokens=hyperparameters['max_tokens'])
    val_blocks = split_series_to_blocks(X_val, tokenizer, max_tokens=hyperparameters['max_tokens'])

    y_val = torch.tensor(y_val, dtype=torch.float32).to(device)

    train_loss_history = []
    val_loss_history = []
    early_stopping = EarlyStopping(patience=hyperparameters['patience'], delta=0.001, verbose=True)

    with open(log_file, "a") as log:

        def _report(message):
            # Mirror every progress line to stdout and to the log file.
            print(message)
            log.write(message + "\n")

        log.write(f"=== run_network seed={seed} hyperparameters={hyperparameters} ===\n")

        for epoch in range(hyperparameters['n_epochs_max']):
            # ---- training pass: one optimizer step per sample ----
            model.train()
            train_loss = 0.0

            for blocks, label in zip(train_blocks, y_train):
                optimizer.zero_grad()
                label_tensor = label.reshape(1).to(device)
                outputs = model(blocks)
                loss = loss_fn(outputs.flatten(), label_tensor)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            train_loss /= len(train_blocks)
            train_loss_history.append(train_loss)

            # ---- validation pass ----
            model.eval()
            val_loss = 0.0
            y_val_preds = []
            y_val_true = []
            with torch.no_grad():
                for blocks, label in zip(val_blocks, y_val):
                    label_tensor = label.reshape(1).to(device)
                    outputs = model(blocks)
                    loss = loss_fn(outputs.flatten(), label_tensor)
                    val_loss += loss.item()

                    y_val_preds.append(outputs.flatten().cpu().numpy())
                    y_val_true.append(label.cpu().numpy())

            val_loss /= len(val_blocks)
            val_loss_history.append(val_loss)

            y_val_preds = np.concatenate(y_val_preds)
            y_val_true = np.array(y_val_true)
            accuracy, sensitivity, specificity, roc_auc, f1, tn, fp, fn, tp = calculate_metrics(y_val_true, y_val_preds)

            _report(f"Epoch {epoch + 1}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
            _report(f"Validation Metrics - Accuracy: {accuracy:.4f}, f1: {f1:.4f}, sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}, ROC AUC: {roc_auc:.4f}")
            _report(f"Confusion Matrix - TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}")

            early_stopping(val_loss, model)
            if early_stopping.early_stop:
                _report("Early stopping triggered.")
                break

    # Restore the best weights saved by EarlyStopping (written to 'checkpoint.pt').
    model.load_state_dict(torch.load('checkpoint.pt', map_location=device))

    history = {'loss': train_loss_history, 'val_loss': val_loss_history}
    return model, history

In [None]:
def objective(trial, hyperparameters, seed, X_train, y_train, X_val, y_val):
    """
    Objective function for hyperparameter optimization using Optuna.

    Samples lr, weight_decay, dropout and patience for this trial, trains the model
    with them and reports the lowest validation loss reached.

    Args:
        - trial (optuna.trial.Trial): Optuna trial object.
        - hyperparameters: Dictionary containing training and model hyperparameters
          (copied, never modified in place).
        - seed: Base random seed; the trial number is added so each trial differs.
        - X_train, X_val: numpy.ndarray. Training and validation time series.
        - y_train, y_val: numpy.ndarray. Training and validation labels.

    Returns:
        - metric_dev: Best (minimum) validation loss achieved during training.
    """
    trial_seed = seed + trial.number
    set_seed(trial_seed)

    hyperparameters_copy = hyperparameters.copy()

    hyperparameters_copy['lr'] = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    hyperparameters_copy['weight_decay'] = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    hyperparameters_copy['dropout'] = trial.suggest_float("dropout", 0.0, 0.3)
    hyperparameters_copy['patience'] = trial.suggest_int("patience", 3, 20)

    # The trained model itself is not needed here, only its loss history.
    _, history = run_network(
        X_train, X_val,
        y_train,
        y_val,
        hyperparameters_copy,
        trial_seed,
    )

    # run_network returns a plain dict ('loss' / 'val_loss'), not a Keras-style History object.
    metric_dev = float(np.min(history["val_loss"]))
    return metric_dev

def optuna_study(hyperparameters, seed, X_train, y_train, X_val, y_val, n_trials=20):
    """
    Runs an Optuna study to optimize hyperparameters for the model.

    Args:
        - hyperparameters: Dictionary containing training and model hyperparameters.
        - seed: Random seed for reproducibility (also seeds the TPE sampler).
        - X_train, X_val: numpy.ndarray. Training and validation time series.
        - y_train, y_val: numpy.ndarray. Training and validation labels.
        - n_trials: Number of Optuna trials to run (default 20).

    Returns:
        - best_hyperparameters: Dictionary containing the best hyperparameters found
          after the optimization process ('lr', 'weight_decay', 'dropout', 'patience').
    """
    set_seed(seed)

    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='minimize', sampler=sampler)

    study.optimize(
        lambda trial: objective(trial, hyperparameters, seed, X_train, y_train, X_val, y_val),
        n_trials=n_trials,
        n_jobs=1  # sequential trials: every trial trains on the same device
    )

    best_params = study.best_params
    best_metric = study.best_value

    tuned_keys = ('lr', 'weight_decay', 'dropout', 'patience')
    best_hyperparameters = {key: best_params[key] for key in tuned_keys}

    print(f"Best Hyperparameters: {best_params}")
    print(f"Best Validation Metric: {best_metric}")

    return best_hyperparameters

### PREDICTIONS

In [None]:
import time
import os
import pickle
import torch
import numpy as np
import pandas as pd

# Full pipeline per split: load data -> Optuna search -> final training -> test predictions.
# Set run_model = False to skip the (expensive) training and only load saved predictions below.
run_model = True
results = []
if run_model:
    loss_train = []
    loss_dev = []
    v_models = []

    y_pred_by_split = {}
    bestHyperparameters_bySplit = {}

    for i in [1, 2, 3]:
        init = time.time()
        data_dir = f"../../../DATA/w14days/s{i}"

        X_test = np.load(f"{data_dir}/X_test_tensor_standardScaler.npy")
        y_test = pd.read_csv(f"{data_dir}/y_test_tensor_standardScaler.csv")["individualMRGerm_stac"].values.astype(int)

        X_train = np.load(f"{data_dir}/X_train_tensor_standardScaler.npy")
        y_train = pd.read_csv(f"{data_dir}/y_train_tensor_standardScaler.csv")["individualMRGerm_stac"].values.astype(int)

        X_val = np.load(f"{data_dir}/X_val_tensor_standardScaler.npy")
        y_val = pd.read_csv(f"{data_dir}/y_val_tensor_standardScaler.csv")["individualMRGerm_stac"].values.astype(int)

        # 666 is replaced by 0 in every partition
        # (presumably a padding/missing-value sentinel -- TODO confirm with the data pipeline).
        X_train = np.where(X_train == 666, 0, X_train)
        X_val = np.where(X_val == 666, 0, X_val)
        X_test = np.where(X_test == 666, 0, X_test)
        device = torch.device('cuda:0')
        print(device)

        bestHyperparameters = optuna_study(
            hyperparameters,
            seeds[i-1],
            X_train, y_train,
            X_val, y_val
        )

        bestHyperparameters_bySplit[str(i)] = bestHyperparameters

        split_directory = f'./Results_GPT2/split_{i}'
        os.makedirs(split_directory, exist_ok=True)

        with open(os.path.join(split_directory, f"bestHyperparameters_split_{i}.pkl"), 'wb') as f:
            pickle.dump(bestHyperparameters, f)

        # Per-split copy: the shared defaults are left untouched so that re-running
        # this cell (or the next split) always starts from the same configuration.
        split_hyperparameters = {
            **hyperparameters,
            'lr': bestHyperparameters['lr'],
            'weight_decay': bestHyperparameters['weight_decay'],
            'dropout': bestHyperparameters['dropout'],
            'patience': bestHyperparameters['patience'],
        }

        model, history = run_network(
            X_train, X_val,
            y_train,
            y_val,
            split_hyperparameters,
            seeds[i-1]
        )

        v_models.append(model)
        loss_train.append(history['loss'])
        loss_dev.append(history['val_loss'])

        tokenizer = model.tokenizer
        test_blocks = split_series_to_blocks(X_test, tokenizer, max_tokens=split_hyperparameters['max_tokens'])

        # Make sure dropout is disabled for inference, whatever state training left the model in.
        model.eval()
        y_pred = []
        with torch.no_grad():
            for blocks in test_blocks:
                outputs = model(blocks).detach().cpu().numpy()
                y_pred.append(outputs)
        y_pred = np.concatenate(y_pred)

        accuracy, sensitivity, specificity, roc_auc, f1, tn, fp, fn, tp = calculate_metrics(y_test, y_pred.ravel())
        print(f"Test Metrics for Split {i} - Accuracy: {accuracy:.4f}, f1: {f1:.4f}, sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}, ROC AUC: {roc_auc:.4f}")
        print(f"Confusion Matrix - TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}")

        results.append([accuracy, sensitivity, specificity, roc_auc, f1])

        y_pred_by_split[str(i)] = y_pred
        print(f"for split {i}:")
        print(y_pred_by_split[str(i)])

        y_pred_path = os.path.join(split_directory, f"y_pred_split_{i}.pkl")
        with open(y_pred_path, 'wb') as f:
            pickle.dump(y_pred, f)

        print(f"Split {i} finished in {time.time() - init:.1f} s")
    

### RESULTS (PERFORMANCE)

#### Step 1. Load model and best results

In [None]:
# Root folder that holds one sub-folder of results per split.
directory = './Results_GPT2'


def load_from_pickle(filename):
    """
    Reads and returns the object stored in a pickle file.

    Only use on files produced by this notebook: unpickling untrusted data can
    execute arbitrary code.
    """
    with open(filename, mode='rb') as stream:
        payload = pickle.load(stream)
    return payload

# Reload the test predictions saved for each split, keyed by the split number as a string.
y_pred_by_split = {
    str(split_id): load_from_pickle(
        os.path.join(f'./Results_GPT2/split_{split_id}', f"y_pred_split_{split_id}.pkl")
    )
    for split_id in (1, 2, 3)
}

#### Step 2. Analysis of results

In [None]:
# Per-split evaluation: compute the metrics table, plot it together with the ROC curve,
# and collect the tables for the aggregated summary.
all_metrics = []

for i in (1, 2, 3):
    labels_csv = f"../../../DATA/w14days/s{i}/y_test_tensor_standardScaler.csv"
    y_test = pd.read_csv(labels_csv)["individualMRGerm_stac"].values.astype(int)

    # Both labels and predictions are flattened before being handed to the helpers.
    y_test_single = y_test.flatten()
    y_test_pred = y_pred_by_split[str(i)].flatten()

    df_metrics = utils.get_metrics_(y_test_single, y_test_pred)
    print(df_metrics)

    utils.plot_metrics(df_metrics)
    utils.plot_roc_curve(y_test_single, y_test_pred)

    all_metrics.append(df_metrics)

print(all_metrics)

In [None]:
# Stack the per-split metric tables and persist them (row index is not written).
metrics_GPT2 = pd.concat(objs=all_metrics)
metrics_GPT2.to_csv(path_or_buf='./Results_GPT2/metrics_GPT2.csv', index=False)

metrics_GPT2.head()

In [None]:
# Column-wise mean and standard deviation of the metrics across splits.
metrics_mean, metrics_std = metrics_GPT2.mean(), metrics_GPT2.std()

summary_columns = {}
summary_columns["Metric"] = metrics_mean.index
summary_columns["Mean"] = metrics_mean.values
summary_columns["Standard Deviation"] = metrics_std.values
summary_df = pd.DataFrame(summary_columns)

#summary_df.to_csv('./Results_GPT2/metrics_summary_GPT2.csv', index=False)

print("\nMean and Standard Deviation of the Splits:")
print(summary_df)


In [None]:
# Reload the saved per-split metrics and format every column as "mean ± std" in percent.
metrics_GPT2 = pd.read_csv('./Results_GPT2/metrics_GPT2.csv')

stats_GPT2 = metrics_GPT2.agg(["mean", "std"])


def _as_percent_mean_std(column):
    """Formats one column of the mean/std table as 'mean ± std' (both scaled by 100)."""
    return f"{column['mean']*100:.2f} ± {column['std']*100:.2f}"


formatted_metrics = stats_GPT2.apply(_as_percent_mean_std, axis=0)
formatted_metrics_df = formatted_metrics.to_frame("Metrics (Mean ± Std)")
formatted_metrics_df.to_csv('./Results_GPT2/metrics_GPT2_formatted.csv', index=True)
print(formatted_metrics_df)
