# Testing the Python Script

Write Python code and collaborate in real time. Your code runs in Modal's
**serverless cloud**, and anyone in the same workspace can join.

This notebook comes with some common Python libraries installed. Run
cells with `Shift+Enter`.

In [1]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

raw_data = pd.read_csv("input_data_modal.csv")
CACHE_DIR = "cache/"

In [2]:
def test_train_split_by_date(df, date_col="adjusted_date", train_frac=0.75, val_frac=0.12):
    """
    Splits the DataFrame into train, validation, and test sets based on unique dates.
    
    Parameters:
    - df: pandas DataFrame containing the data.
    - date_col: str, name of the column containing date information.
    - train_frac: float, fraction of data to be used for training.
    - val_frac: float, fraction of data to be used for validation.
    
    Returns:
    - train_df: DataFrame for training set.
    - val_df: DataFrame for validation set.
    - test_df: DataFrame for test set.
    """
    # Ensure the DataFrame is sorted by date
    df = df.sort_values(date_col).reset_index(drop=True)
    
    # Get unique sorted dates
    dates = np.array(sorted(df[date_col].unique()))
    n_dates = len(dates)
    
    # Calculate split indices
    train_end = int(train_frac * n_dates)
    val_end   = int((train_frac + val_frac) * n_dates)
    
    # Split dates
    train_dates = dates[:train_end]
    val_dates   = dates[train_end:val_end]
    test_dates  = dates[val_end:]
    
    # Create DataFrames for each set
    train_df = df[df[date_col].isin(train_dates)].reset_index(drop=True)
    val_df   = df[df[date_col].isin(val_dates)].reset_index(drop=True)
    test_df  = df[df[date_col].isin(test_dates)].reset_index(drop=True)


    return train_df, val_df, test_df


def scale_features(train_df, val_df, test_df, feature_cols):
    """
    Scales the specified feature columns to have mean 0 and standard deviation 1.
    
    Parameters:
    - train_df: DataFrame for training set.
    - val_df: DataFrame for validation set.
    - test_df: DataFrame for test set.
    - feature_cols: list of str, names of the columns to be scaled.
    
    Returns:
    - features_train: numpy array of scaled features for training set.
    - features_val: numpy array of scaled features for validation set.
    - features_test: numpy array of scaled features for test set.
    """
    scaler = StandardScaler()
    
    # Fit scaler on training data and transform
    features_train = scaler.fit_transform(train_df[feature_cols])
    
    # Transform validation and test data
    features_val   = scaler.transform(val_df[feature_cols])
    features_test  = scaler.transform(test_df[feature_cols])
    
    return features_train, features_val, features_test

In [3]:
import os, glob, hashlib, shutil
import torch
from torch.utils.data import Dataset
from transformers import AutoModel, AutoTokenizer

def setup_finbert(model_name = "yiyanghkust/finbert-tone"):
    """
    Sets up the FinBERT model and tokenizer for embedding financial text.

    Returns:
        model: The FinBERT model.
        tokenizer: The FinBERT tokenizer.
        device: torch.device
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder.to(device)
    encoder.eval()

    for param in encoder.parameters():  # freeze FinBERT weights
        param.requires_grad = False

    return encoder, tokenizer, device


def chunks(text, tokenizer, max_tokens=512, overlap=50):
    """
    Splits the input text into chunks of tokens with specified maximum length (512) and overlap.

    Args:
        text (str): The input text to be chunked.
        tokenizer: The tokenizer to convert text to token IDs.
        max_tokens (int): Maximum number of tokens per chunk.
        overlap (int): Number of overlapping tokens between consecutive chunks.

    Returns:
        List of chunks, where each chunk is a list of token IDs.        
    """
    tokens = tokenizer(
        text,
        add_special_tokens=False,
        truncation=False,
        return_attention_mask=False
    )["input_ids"]

    out = []
    start = 0
    while start < len(tokens):
        out.append(tokens[start:start + max_tokens])
        start += max_tokens - overlap
    return out


def chunk_to_vector(chunk_id_list, encoder, tokenizer, device, batch_size=16):
    """
    Takes in a list of chunks (each chunk is a list of token IDs), uses FinBERT to compute
    CLS vector for each chunk.

    Args:
        chunk_id_list: List of chunks, where each chunk is a list of token IDs.
        encoder: FinBERT model.
        tokenizer: FinBERT tokenizer.
        device: cpu or gpu device.
        batch_size: Number of chunks to process in a batch.

    Returns:
        torch.Tensor of shape (num_chunks, hidden_dim)
    """
    vecs = []

    with torch.no_grad():
        # process in batches and prepare inputs by padding/truncating
        for i in range(0, len(chunk_id_list), batch_size):
            batch = chunk_id_list[i:i + batch_size]

            inputs = [
                tokenizer.prepare_for_model(
                    ch,
                    add_special_tokens=True,
                    max_length=512,
                    truncation=True,
                    return_attention_mask=True
                )
                for ch in batch
            ]

            enc = tokenizer.pad(
                inputs,
                padding="max_length",
                max_length=512,
                return_tensors="pt"
            )

            # ensure batch dimension
            if enc["input_ids"].dim() == 1:
                enc["input_ids"] = enc["input_ids"].unsqueeze(0)
            if enc["attention_mask"].dim() == 1:
                enc["attention_mask"] = enc["attention_mask"].unsqueeze(0)
            if "token_type_ids" in enc and enc["token_type_ids"].dim() == 1:
                enc["token_type_ids"] = enc["token_type_ids"].unsqueeze(0)

            enc = {k: v.to(device) for k, v in enc.items()}

            out = encoder(**enc).last_hidden_state   # (B,512,768)
            vec = out[:, 0, :]                       # (B,768) CLS embedding
            vecs.append(vec)

    vec = torch.cat(vecs, dim=0)
    return vec  # (C,768)


def transcript_id(text):
    """
    Generates a unique identifier for a given transcript using its MD5 hash.
    """
    return hashlib.md5(text.encode("utf-8")).hexdigest()


def build_cache(data, cache_dir, encoder, tokenizer, device, overlap=0):
    """
    Takes in a dataset of transcripts, labels, and financial features, computes FinBERT
    embeddings, and caches to disk.

    Args:
        data: List of tuples (transcript, label, fin_features).
        cache_dir: Directory to store cached embeddings.
        encoder: FinBERT model.
        tokenizer: FinBERT tokenizer.
        device: cpu or gpu device.
        overlap: Number of overlapping tokens between consecutive chunks.

    """
    os.makedirs(cache_dir, exist_ok=True)

    for i, (transcript, y, fin_features) in enumerate(data):
        cid = transcript_id(transcript)
        path = os.path.join(cache_dir, f"{cid}.pt")
        if os.path.exists(path):
            continue

        chunk_id_list = chunks(transcript, tokenizer, overlap=overlap)
        Z = chunk_to_vector(chunk_id_list, encoder, tokenizer, device, batch_size=8)

        f = torch.tensor(fin_features, dtype=torch.float16)
        torch.save(
            {"Z": Z.to(torch.float16), "fin_features": f, "y": int(y)},
            path
        )

        if (i + 1) % 50 == 0:
            print(f"[{i+1}/{len(data)}] cached | files={len(glob.glob(cache_dir+'/*.pt'))}")

    print(f"Cached {len(data)} transcripts → {cache_dir}")


def zip_cache(cache_dir, zip_path):
    """
    Zips the entire cache directory into a single .zip file.
    """
    assert os.path.exists(cache_dir), f"{cache_dir} does not exist"
    shutil.make_archive(
        base_name=zip_path.replace(".zip", ""),
        format="zip",
        root_dir=cache_dir
    )
    print(f"Created zip file: {zip_path}")



def create_finbert_cache(raw_data,cache_dir=CACHE_DIR,return_days=1,overlap=50):
    """
    High-level function to create FinBERT cache from raw data.
    First split data by date into train/val/test, scale financial features, zip them and then input to create finbert cache.

    Args:
        data: List of tuples (transcript, label, fin_features).
        cache_dir: Directory to store cached embeddings.
        overlap: Number of overlapping tokens between consecutive chunks.
    """
    encoder, tokenizer, device = setup_finbert()

    train_data, val_data, test_data = test_train_split_by_date(raw_data)
    train_features,val_features, test_features = scale_features(train_data, val_data, test_data,["abvol_20d", "abcallday_r1", "abcallday_r5", "abcallday_r20"])
    train_transcripts,val_transcripts,test_transcripts=train_data["transcript"].tolist(),val_data["transcript"].tolist(),test_data["transcript"].tolist()
    
    if return_days==1:
        y_train_1d,y_val_1d,y_tet_1d=train_data["r1d_direction"],val_data["r1d_direction"],test_data["r1d_direction"]
    else:
        y_train_1d,y_val_1d,y_tet_1d=train_data["r5d_direction"],val_data["r5d_direction"],test_data["r5d_direction"]
    
    train_data, val_data, test_data = list(zip(train_transcripts, y_train_1d, train_features)), list(zip(val_transcripts, y_val_1d, val_features)), list(zip(test_transcripts, y_tet_1d, test_features))
    
    for data, split in zip([train_data, val_data, test_data], ["train", "val", "test"]):
        split_cache_dir = os.path.join(cache_dir, split,str(return_days))
        zip_path=os.path.join(cache_dir, f"cache_{split}_{return_days}"+".zip")    
        build_cache(data, split_cache_dir, encoder, tokenizer, device, overlap=overlap)
        zip_cache(split_cache_dir, zip_path)

# if __name__ == "__main__":
#     raw_data = prepare_earnings_data()
#     create_finbert_cache(raw_data,cache_dir=CACHE_DIR,return_days=1,overlap=50)
#     create_finbert_cache(raw_data,cache_dir=CACHE_DIR,return_days=5,overlap=50)


In [4]:
create_finbert_cache(raw_data,cache_dir=CACHE_DIR,return_days=1,overlap=50)

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

[50/2316] cached | files=43
[100/2316] cached | files=90
[150/2316] cached | files=138
[200/2316] cached | files=182
[250/2316] cached | files=209
[300/2316] cached | files=257
[400/2316] cached | files=313
[450/2316] cached | files=355
[550/2316] cached | files=409
[650/2316] cached | files=456
[700/2316] cached | files=504
[750/2316] cached | files=554
[800/2316] cached | files=600
[900/2316] cached | files=693
[1000/2316] cached | files=777
[1050/2316] cached | files=824
[1100/2316] cached | files=866
[1150/2316] cached | files=911
[1200/2316] cached | files=957
[1250/2316] cached | files=1004
[1350/2316] cached | files=1099
[1400/2316] cached | files=1146
[1450/2316] cached | files=1190
[1500/2316] cached | files=1238
[1550/2316] cached | files=1283
[1600/2316] cached | files=1329
[1650/2316] cached | files=1366
[1700/2316] cached | files=1412
[1750/2316] cached | files=1460
[1800/2316] cached | files=1508
[1850/2316] cached | files=1557
[1950/2316] cached | files=1641
[2000/2316] 

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import roc_auc_score
import os
from torch.utils.data import Dataset
import glob

class CachedDataset(Dataset):
    """
    Dataset for loading cached FinBERT embeddings and labels.
    """
    def __init__(self, cache_dir):
        self.paths = glob.glob(os.path.join(cache_dir, "*.pt"))

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        obj = torch.load(self.paths[idx], map_location="cpu")
        return obj["Z"].float(), torch.tensor(obj["y"], dtype=torch.float32)


class CachedZFinDataset(Dataset):
    """
    Dataset for loading cached FinBERT embeddings, financial features, and labels.
    """
    def __init__(self, cache_dir):
        self.paths = sorted(glob.glob(os.path.join(cache_dir, "*.pt")))

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        obj = torch.load(self.paths[idx], map_location="cpu")
        Z = obj["Z"].float()  # (C,768)
        y = torch.tensor(obj["y"], dtype=torch.float32)
        fin = obj["fin_features"].float().view(-1)
        return Z, fin, y
    

def collate_pad(batch):
    """Pads variable-length sequences in the batch with zeros to help with batching.""" 
    
    # batch = [(Z1, y1), (Z2, y2), ...]
    Z_list, y_list = zip(*batch)

    B = len(Z_list)
    dim = Z_list[0].shape[1]
    C_max = max(z.shape[0] for z in Z_list)

    Z_pad = torch.zeros(B, C_max, dim)
    mask  = torch.zeros(B, C_max)

    for i, Z in enumerate(Z_list):
        C = Z.shape[0]
        Z_pad[i, :C] = Z
        mask[i, :C] = 1.0

    y = torch.tensor(y_list, dtype=torch.float32)
    return Z_pad, mask, y


def collate_pad_chunks_with_fin(batch):
    """Pads variable-length sequences in the batch with zeros to help with batching. Also stacks financial features."""
    # batch: [(Z, fin, y), ...]
    Z_list, fin_list, y_list = zip(*batch)

    B = len(Z_list)
    dim = Z_list[0].shape[1]
    C_max = max(z.shape[0] for z in Z_list)

    Z_pad = torch.zeros(B, C_max, dim, dtype=torch.float32)
    mask  = torch.zeros(B, C_max, dtype=torch.float32)

    for i, Z in enumerate(Z_list):
        C = Z.shape[0]
        Z_pad[i, :C] = Z
        mask[i, :C] = 1.0

    fin = torch.stack([f.view(-1) for f in fin_list]).float()  # (B,K)
    y = torch.tensor(y_list, dtype=torch.float32)              # (B,)

    return Z_pad, mask, fin, y



class MeanPoolClassifier(nn.Module):
    """A simple mean-pooling classifier. Takes the mean of all chunk vectors to return one embedding vector per transcript
    
    Args:
        dim (int): Dimension of input features.
        hidden (int): Dimension of hidden layer.
        dropout (float): Dropout rate

    Returns:
        torch.Tensor: Output logits of shape (B,). 
    """

    def __init__(self, dim=768, hidden=256, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden)
        self.fc2 = nn.Linear(hidden, 1)
        self.drop = nn.Dropout(dropout)

    def forward(self, Z, mask):
        # Z: (B,C,768), mask: (B,C)
        mask3 = mask.unsqueeze(-1)  # (B,C,1)
        doc = (Z * mask3).sum(dim=1) / mask3.sum(dim=1).clamp(min=1e-9)
        x = F.relu(self.fc1(doc))
        x = self.drop(x)
        return self.fc2(x).squeeze(-1)  # (B,)


class AttnPoolClassifier(nn.Module):
    """
    A simple attention-pooling classifier. Learns attention weights over chunk vectors to return one embedding vector per transcript.
    Args:
        dim (int): Dimension of input features.
        hidden (int): Dimension of hidden layer.
        dropout (float): Dropout rate

    Returns:
        torch.Tensor: Output logits of shape (B,).        

    """
    def __init__(self, dim=768, hidden=256, dropout=0.2):
        super().__init__()
        self.attn = nn.Parameter(torch.randn(dim) * 0.02)
        self.fc1 = nn.Linear(dim, hidden)
        self.fc2 = nn.Linear(hidden, 1)
        self.drop = nn.Dropout(dropout)

    def forward(self, Z, mask):
        # Z: (B,C,768)
        scores = torch.einsum("bcd,d->bc", Z, self.attn)  # (B,C)
        scores = scores.masked_fill(mask == 0, -1e9)
        alpha = torch.softmax(scores, dim=1)
        doc = torch.einsum("bc,bcd->bd", alpha, Z)        # (B,768)
        x = F.relu(self.fc1(doc))
        x = self.drop(x)
        return self.fc2(x).squeeze(-1)

class AttnMLPPoolClassifier(nn.Module):
    """
    An attention-pooling classifier with non-linearity applied to attention. Learns attention weights over chunk vectors to return one embedding vector per transcript.

    Args:
        dim (int): Dimension of input features. 
        attn_hidden (int): Dimension of attention hidden layer.
        hidden (int): Dimension of hidden layer.
        dropout (float): Dropout rate

    Returns:
        torch.Tensor: Output logits of shape (B,).        
    """
    def __init__(self, dim=768, attn_hidden=256, hidden=256, dropout=0.2):
        super().__init__()
        self.W = nn.Linear(dim, attn_hidden)
        self.v = nn.Linear(attn_hidden, 1, bias=False)

        self.fc1 = nn.Linear(dim, hidden)
        self.fc2 = nn.Linear(hidden, 1)
        self.drop = nn.Dropout(dropout)

    def forward(self, Z, mask):
        # Z: (B,C,768), mask: (B,C)
        h = torch.tanh(self.W(Z))              # (B,C,H)
        scores = self.v(h).squeeze(-1)         # (B,C)
        scores = scores.masked_fill(mask == 0, -1e9)
        alpha = torch.softmax(scores, dim=1)   # (B,C)

        doc = torch.einsum("bc,bcd->bd", alpha, Z)  # (B,768)

        x = F.relu(self.fc1(doc))
        x = self.drop(x)
        return self.fc2(x).squeeze(-1)
      

class AttnPoolTwoTower(nn.Module):
    """
    An attention-pooling two-tower classifier. Learns attention weights over chunk vectors to return one embedding vector per transcript, and combines it with financial features.
    Uses two separate projection heads for document and financial features before combining them for final classification.

    Args:
        dim (int): Dimension of input features. 
        fin_dim (int): Dimension of financial features.
        hidden (int): Dimension of hidden layer.
        dropout (float): Dropout rate

    Returns:
        torch.Tensor: Output logits of shape (B,).        

    """
    def __init__(self, dim=768, fin_dim=4, hidden=256, dropout=0.2):
        super().__init__()
        self.attn = nn.Parameter(torch.randn(dim) * 0.02)

        self.doc_proj = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        self.fin_proj = nn.Sequential(
            nn.LayerNorm(fin_dim),          # optional
            nn.Linear(fin_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        self.out = nn.Linear(2 * hidden, 1)

    def forward(self, Z, mask, fin):
        scores = torch.einsum("bcd,d->bc", Z, self.attn)
        scores = scores.masked_fill(mask == 0, -1e9)
        alpha = torch.softmax(scores, dim=1)
        doc = torch.einsum("bc,bcd->bd", alpha, Z)  # (B,768)

        a = self.doc_proj(doc)   # (B,hidden)
        b = self.fin_proj(fin)   # (B,hidden)

        x = torch.cat([a, b], dim=1)
        return self.out(x).squeeze(-1)


loss_fn = nn.BCEWithLogitsLoss()

@torch.no_grad()
def eval_loop_auc(model, loader, device):
    """
    Evaluation loop that computes average loss and AUC over the dataset.

    Args:
        model (nn.Module): The model to evaluate.
        loader (DataLoader): DataLoader for the evaluation dataset.
        device (torch.device): Device to run the evaluation on.

    Returns:
        tuple: Logits,Average loss and AUC score.        
    """
    model.eval()
    total_loss, n = 0.0, 0

    all_logits = []
    all_labels = []

    for Z, mask, y in loader:
        Z = Z.to(device, non_blocking=True)
        mask = mask.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        logit = model(Z, mask)              # (B,)
        loss = loss_fn(logit, y)

        total_loss += loss.item() * y.size(0)
        n += y.size(0)

        all_logits.append(logit.cpu())
        all_labels.append(y.cpu())

    avg_loss = total_loss / max(1, n)

    logits = torch.cat(all_logits).numpy()
    labels = torch.cat(all_labels).numpy()

    auc = roc_auc_score(labels, logits)

    return logits,avg_loss, auc


@torch.no_grad()
def eval_loop_auc_fin(model, loader, device):
    """
Evaluation loop that computes y_scores, average loss and AUC over the dataset, for models that take financial features and transcript embeddings.

    Args:
        model (nn.Module): The model to evaluate.   
        loader (DataLoader): DataLoader for the evaluation dataset.
        device: Device to run the evaluation on.

    Returns:
        tuple: y_scores, Average loss and AUC score.        
    """
    model.eval()
    total_loss, n = 0.0, 0

    all_logits = []
    all_labels = []

    for Z, mask,fin, y in loader:
        Z = Z.to(device, non_blocking=True)
        mask = mask.to(device, non_blocking=True)
        fin=fin.to(device,non_blocking=True)
        y = y.to(device, non_blocking=True)

        logit = model(Z, mask,fin)              # (B,)
        loss = loss_fn(logit, y)

        total_loss += loss.item() * y.size(0)
        n += y.size(0)

        all_logits.append(logit.cpu())
        all_labels.append(y.cpu())

    avg_loss = total_loss / max(1, n)

    logits = torch.cat(all_logits).numpy()
    labels = torch.cat(all_labels).numpy()

    auc = roc_auc_score(labels, logits)

    return logits,avg_loss, auc


def train_with_early_stopping(
    model,
    train_loader,
    val_loader,
    device,
    max_epochs=50,
    patience=7,
    lr=1e-3,
    weight_decay=1e-2,
    save_path="best.pt",
):
    """
Training loop with early stopping based on validation AUC. Uses AdamW optimizer. 
Stops training if validation AUC does not improve for a specified number of epochs (patience).
    
    Args:
        model (nn.Module): The model to train.
        train_loader (DataLoader): DataLoader for the training dataset.
        val_loader (DataLoader): DataLoader for the validation dataset.
        device (torch.device): Device to run the training on.
        max_epochs (int): Maximum number of epochs to train.
        patience (int): Number of epochs to wait for improvement before stopping.
        lr (float): Learning rate for the optimizer.
        weight_decay (float): Weight decay for the optimizer.
        save_path (str): Path to save the best model weights.

    Returns:
        nn.Module: The trained model with the best weights loaded.        
    """
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val = float("inf")
    bad_epochs = 0

    best_auc = -float("inf")
    patience = 7
    bad_epochs = 0

    for epoch in range(1, max_epochs + 1):
        model.train()
        total_loss, n = 0.0, 0

        for Z, mask, y in train_loader:
            Z = Z.to(device, non_blocking=True)
            mask = mask.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

            logit = model(Z, mask)
            loss = loss_fn(logit, y)
    
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        total_loss += loss.item() * y.size(0)
        n += y.size(0)

        train_loss = total_loss / n
        val_logits,val_loss, val_auc = eval_loop_auc(model, val_loader, device)

        print(
            f"epoch {epoch:02d} | "
            f"train_loss={train_loss:.4f} | "
            f"val_loss={val_loss:.4f} | "
            f"val_auc={val_auc:.3f}"
        )
    
        if val_auc > best_auc + 1e-4:
            best_auc = val_auc
            bad_epochs = 0
            torch.save(model.state_dict(), save_path)
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                print("Early stopping on AUC.")
                break

    # load best weights before returning
    model.load_state_dict(torch.load(save_path, map_location=device))
    return model


def train_with_early_stopping_fin(
    model,
    train_loader,
    val_loader,
    device,
    max_epochs=50,
    patience=7,
    lr=1e-3,
    weight_decay=1e-2,
    save_path="best.pt",
):
    """
Training loop with early stopping based on validation AUC, for models that take financial features and transcript embeddings. Uses AdamW optimizer. 
Stops training if validation AUC does not improve for a specified number of epochs (patience).

    Args:
        model (nn.Module): The model to train.
        train_loader (DataLoader): DataLoader for the training dataset.
        val_loader (DataLoader): DataLoader for the validation dataset. 
        device (torch.device): Device to run the training on.
        max_epochs (int): Maximum number of epochs to train.
        patience (int): Number of epochs to wait for improvement before stopping.
        lr (float): Learning rate for the optimizer.
        weight_decay (float): Weight decay for the optimizer.
        save_path (str): Path to save the best model weights.

    Returns:
        nn.Module: The trained model with the best weights loaded.
    """
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val = float("inf")
    bad_epochs = 0

    best_auc = -float("inf")
    patience = 7
    bad_epochs = 0

    for epoch in range(1, max_epochs + 1):
        model.train()
        total_loss, n = 0.0, 0

        for Z, mask,fin, y in train_loader:
            Z = Z.to(device, non_blocking=True)
            mask = mask.to(device, non_blocking=True)
            fin=fin.to(device,non_blocking=True)
            y = y.to(device, non_blocking=True)

            logit = model(Z, mask,fin)
            loss = loss_fn(logit, y)
    
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        total_loss += loss.item() * y.size(0)
        n += y.size(0)

        train_loss = total_loss / n
        val_logits,val_loss, val_auc = eval_loop_auc_fin(model, val_loader, device)

        print(
            f"epoch {epoch:02d} | "
            f"train_loss={train_loss:.4f} | "
            f"val_loss={val_loss:.4f} | "
            f"val_auc={val_auc:.3f}"
        )
    
        if val_auc > best_auc + 1e-4:
            best_auc = val_auc
            bad_epochs = 0
            torch.save(model.state_dict(), save_path)
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                print("Early stopping on AUC.")
                break

    # load best weights before returning
    model.load_state_dict(torch.load(save_path, map_location=device))
    return model


def load_cached_finbert_dataset(split,return_days=1,cache_dir=CACHE_DIR,batch_size=16,shuffle=True):
    """
    Loads a cached FinBERT dataset from disk.

    Args:
        cache_dir: Directory where cached embeddings are stored.
        split: One of "train", "val", or "test".
        batch_size: Batch size for DataLoader.
        shuffle: Whether to shuffle the data.
        collate_fn: Function to collate batches 
    Returns:
        DataLoader for the specified split.
    """
    from torch.utils.data import DataLoader

    split_cache_dir = os.path.join(cache_dir,split,str(return_days))
    dataset = CachedDataset(split_cache_dir)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_pad)
    return dataloader

def load_cached_finbert_fin_dataset(split,return_days=1,cache_dir=CACHE_DIR,batch_size=16,shuffle=True):
    """
    Loads a cached FinBERT dataset with financial features from disk.

    Args:
        cache_dir: Directory where cached embeddings are stored.
        split: One of "train", "val", or "test".
        batch_size: Batch size for DataLoader.
        shuffle: Whether to shuffle the data.
        collate_fn: Function to collate batches 
    Returns:
        DataLoader for the specified split.
    """
    from torch.utils.data import DataLoader

    split_cache_dir = os.path.join(cache_dir, split,str(return_days))
    dataset = CachedZFinDataset(split_cache_dir)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_pad_chunks_with_fin)
    return dataloader


def bootstrap_auc_se(y_true, y_scores, n_bootstraps=1000, random_seed=42):
    """
    Computes bootstrap confidence intervals and standard error for AUC.

    Args:
        y_true (array-like): True binary labels.
        y_scores (array-like): Predicted scores.
        n_bootstraps (int): Number of bootstrap samples.
        random_seed (int): Random seed for reproducibility.
    Returns:
        tuple: (lower_bound, upper_bound) of 95% confidence interval for AUC
        int: standard error of AUC
    """
    import numpy as np

    rng = np.random.RandomState(random_seed)
    bootstrapped_scores = []

    y_true = np.array(y_true)
    y_scores = np.array(y_scores)

    for i in range(n_bootstraps):
        indices = rng.randint(0, len(y_scores), len(y_scores))
        if len(np.unique(y_true[indices])) < 2:
            continue
        score = roc_auc_score(y_true[indices], y_scores[indices])
        bootstrapped_scores.append(score)

    sorted_scores = np.array(bootstrapped_scores)
    sorted_scores.sort()

    lower_bound = sorted_scores[int(0.025 * len(sorted_scores))]
    upper_bound = sorted_scores[int(0.975 * len(sorted_scores))]
    se = np.std(bootstrapped_scores)

    return (lower_bound, upper_bound), se

def call_model(Model="AttnMLPPoolClassifier",dim=768, attn_hidden=256, hidden=256, dropout=0.2,return_period=1):
    """
    trains the specifed model at the given return period and returns the trained model, test loss and test AUC confidence intervals.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader = load_cached_finbert_dataset(
        cache_dir=CACHE_DIR, split="train",return_days=1 ,batch_size=16, shuffle=True
    )
    val_loader = load_cached_finbert_dataset(
        cache_dir=CACHE_DIR, split="val",return_days=1,batch_size=16, shuffle=False
    )
    test_loader = load_cached_finbert_dataset(
        cache_dir=CACHE_DIR, split="test",return_days=1,batch_size=16, shuffle=False
    )

    if Model=="MeanPoolClassifier":
        model = MeanPoolClassifier(dim=dim, hidden=hidden, dropout=dropout).to(device)
    elif Model=="AttnPoolClassifier":
        model = AttnPoolClassifier(dim=dim, hidden=hidden, dropout=dropout).to(device)
    elif Model=="AttnMLPPoolClassifier":
        model = AttnMLPPoolClassifier(dim=dim, attn_hidden=attn_hidden, hidden=hidden, dropout=dropout).to(device)

    model = train_with_early_stopping(
        model,
        train_loader,
        val_loader,
        device,
        max_epochs=50,
        patience=7,
        lr=1e-3,
        weight_decay=1e-2,
        save_path=f"best_model_{return_period}r.pt",
    )

    test_logits,test_loss, test_auc = eval_loop_auc(model, test_loader, device)

    test_auc_ci,test_se = bootstrap_auc_se(
        y_true=[y for _, y in test_loader.dataset],
        y_scores=test_logits,
        n_bootstraps=1000,
        random_seed=42,
    )

    return model, test_loss, test_auc, test_auc_ci,test_se

def call_model_fin(Model="AttnPoolTwoTower",dim=768, fin_dim=4, hidden=256, dropout=0.2,return_period=1):
    """
    trains the specifed model at the given return period and returns the trained model, test loss and test AUC.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader = load_cached_finbert_fin_dataset(
        cache_dir=CACHE_DIR, split="train",return_days=return_period ,batch_size=16, shuffle=True
    )
    val_loader = load_cached_finbert_fin_dataset(
        cache_dir=CACHE_DIR, split="val",return_days=return_period,batch_size=16, shuffle=False
    )
    test_loader = load_cached_finbert_fin_dataset(
        cache_dir=CACHE_DIR, split="test",return_days=return_period,batch_size=16, shuffle=False
    )
    if Model=="AttnPoolTwoTower":
        model = AttnPoolTwoTower(dim=768, fin_dim=4, hidden=256, dropout=0.2).to(device)

    model = train_with_early_stopping_fin(
        model,
        train_loader,
        val_loader,
        device,
        max_epochs=50,
        patience=7,
        lr=1e-3,
        weight_decay=1e-2,
        save_path=f"best_model_fin_{return_period}r.pt",
    )

    test_logits,test_loss, test_auc = eval_loop_auc_fin(model, test_loader, device)

    test_auc_ci,test_se = bootstrap_auc_se(
        y_true=[y for _, _, y in test_loader.dataset],
        y_scores=test_logits,
        n_bootstraps=1000,
        random_seed=42,
    )

    return model, test_loss, test_auc, test_auc_ci,test_se




In [23]:
model, test_loss, test_auc, test_auc_ci,test_se = call_model()

epoch 01 | train_loss=0.7111 | val_loss=0.7000 | val_auc=0.448
epoch 02 | train_loss=0.6803 | val_loss=0.7114 | val_auc=0.449
epoch 03 | train_loss=0.6952 | val_loss=0.7018 | val_auc=0.446
epoch 04 | train_loss=0.7677 | val_loss=0.7076 | val_auc=0.447
epoch 05 | train_loss=0.6844 | val_loss=0.7006 | val_auc=0.461
epoch 06 | train_loss=0.6850 | val_loss=0.6963 | val_auc=0.453
epoch 07 | train_loss=0.7227 | val_loss=0.7091 | val_auc=0.447
epoch 08 | train_loss=0.6679 | val_loss=0.6944 | val_auc=0.456
epoch 09 | train_loss=0.6831 | val_loss=0.7108 | val_auc=0.462
epoch 10 | train_loss=0.7083 | val_loss=0.6967 | val_auc=0.463
epoch 11 | train_loss=0.6960 | val_loss=0.7039 | val_auc=0.480
epoch 12 | train_loss=0.7070 | val_loss=0.7018 | val_auc=0.474
epoch 13 | train_loss=0.7095 | val_loss=0.7128 | val_auc=0.479
epoch 14 | train_loss=0.6869 | val_loss=0.7226 | val_auc=0.473
epoch 15 | train_loss=0.6671 | val_loss=0.7247 | val_auc=0.477
epoch 16 | train_loss=0.7232 | val_loss=0.7287 | val_au

In [24]:
test_loss, test_auc, test_auc_ci,test_se

(0.7124160014976889,
 0.47144735606274074,
 (np.float64(0.40213294106830927), np.float64(0.5353311135775065)),
 np.float64(0.03381056244048584))

In [19]:
model, test_loss, test_auc, test_auc_ci,test_se = call_model_fin()

epoch 01 | train_loss=0.6900 | val_loss=0.6921 | val_auc=0.485
epoch 02 | train_loss=0.7701 | val_loss=0.7001 | val_auc=0.533
epoch 03 | train_loss=0.6255 | val_loss=0.8209 | val_auc=0.526
epoch 04 | train_loss=0.7149 | val_loss=0.7168 | val_auc=0.498
epoch 05 | train_loss=0.6817 | val_loss=0.7028 | val_auc=0.511
epoch 06 | train_loss=0.5825 | val_loss=0.7079 | val_auc=0.505
epoch 07 | train_loss=0.6441 | val_loss=0.7294 | val_auc=0.465
epoch 08 | train_loss=0.6140 | val_loss=0.7367 | val_auc=0.486
epoch 09 | train_loss=0.5509 | val_loss=0.7478 | val_auc=0.497
Early stopping on AUC.


In [22]:
test_loss, test_auc, test_auc_ci,test_se

(0.7000399997678853,
 0.4517704517704518,
 (np.float64(0.38404781918371156), np.float64(0.5198935275987672)),
 np.float64(0.03475443969530353))