# CLS - 1DayReturns

### Use CLS pooling at the token level (i.e. get one vector per chunk) and attention pooling at the chunk level (i.e. one vector per transcript)

In [1]:
import os, glob, hashlib
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np

MODEL_NAME = "yiyanghkust/finbert-pretrain"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
encoder = AutoModel.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = encoder.to(device)
encoder.eval()

for p in encoder.parameters():
    p.requires_grad = False

def chunks(text, max_tokens=512, overlap=50):
    tokens = tokenizer(
        text,
        add_special_tokens=False,
        truncation=False,
        return_attention_mask=False
    )["input_ids"]

    out = []
    start = 0
    while start < len(tokens):
        out.append(tokens[start:start + max_tokens])
        start += max_tokens - overlap
    return out


@torch.no_grad()
def chunk_to_vector(chunk_id_list,batch_size=16):
    vecs=[]
    for i in range(0,len(chunk_id_list),batch_size):
        batch=chunk_id_list[i:i+batch_size]
        inputs = [(tokenizer.prepare_for_model(ch,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_attention_mask=True)) for ch in batch]
    
        enc = tokenizer.pad(
            inputs,
            padding="max_length",
            max_length=512,
            return_tensors="pt"
        )

    #  FIX: ensure batch dimension
    if enc["input_ids"].dim() == 1:
        enc["input_ids"] = enc["input_ids"].unsqueeze(0)
    if enc["attention_mask"].dim() == 1:
        enc["attention_mask"] = enc["attention_mask"].unsqueeze(0)
    if "token_type_ids" in enc and enc["token_type_ids"].dim() == 1:
        enc["token_type_ids"] = enc["token_type_ids"].unsqueeze(0)

    enc = {k: v.to(device) for k, v in enc.items()}

    out = encoder(**enc).last_hidden_state          # (B,512,768)
    vec = out[:, 0, :]                              # (B,768)  CLS embedding

    vecs.append(vec)
    vec=torch.cat(vecs,dim=0)
    return vec                      # (C,768)

def transcript_id(text):
    return hashlib.md5(text.encode("utf-8")).hexdigest()

@torch.no_grad()
def build_cache(data, cache_dir, overlap=0):
    os.makedirs(cache_dir, exist_ok=True)

    for i, (transcript, y, fin_features) in enumerate(data):
        cid = transcript_id(transcript)
        path = os.path.join(cache_dir, f"{cid}.pt")
        if os.path.exists(path):
            continue

        chunk_to_vector_list = chunks(transcript, overlap=overlap)
        Z=chunk_to_vector(chunk_to_vector_list,batch_size=8)
        f=torch.tensor(fin_features,dtype=torch.float16)        
        torch.save(
            {"Z": Z.to(torch.float16), "fin_features": f, "y": int(y)},
            path
        )
        if (i + 1) % 50 == 0:
            print(f"[{i+1}/{len(data)}] cached | files={len(glob.glob(cache_dir+'/*.pt'))}")

    print(f"Cached {len(data)} transcripts → {cache_dir}")

class CachedDataset(Dataset):
    def __init__(self, cache_dir):
        self.paths = glob.glob(os.path.join(cache_dir, "*.pt"))

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        obj = torch.load(self.paths[idx], map_location="cpu")
        return obj["Z"].float(), torch.tensor(obj["y"], dtype=torch.float32)



config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [2]:
import os
os.getcwd()

'/root'

In [4]:
df = pd.read_csv("model_input_data_v2.csv")
df = df.sort_values("adjusted_date").reset_index(drop=True)

dates = np.array(sorted(df["adjusted_date"].unique()))
train_end = int(0.75 * len(dates))
val_end   = int(0.87 * len(dates))

train_df = df[df["adjusted_date"].isin(dates[:train_end])]
val_df   = df[df["adjusted_date"].isin(dates[train_end:val_end])]
test_df  = df[df["adjusted_date"].isin(dates[val_end:])]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
y_train_1d = train_df["r1d_direction"]
y_val_1d   = val_df["r1d_direction"]
y_test_1d  = test_df["r1d_direction"]

y_train_5d = train_df["r5d_direction"]
y_val_5d   = val_df["r5d_direction"]
y_test_5d  = test_df["r5d_direction"]

features_base_train = scaler.fit_transform(train_df[["abvol_20d", "abcallday_r1", "abcallday_r5", "abcallday_r20"]])
features_base_val   = scaler.transform(val_df[["abvol_20d", "abcallday_r1", "abcallday_r5", "abcallday_r20"]])
features_base_test  = scaler.transform(test_df[["abvol_20d", "abcallday_r1", "abcallday_r5", "abcallday_r20"]])

train_transcripts = train_df["transcript"].tolist()
val_transcripts   = val_df["transcript"].tolist()
test_transcripts  = test_df["transcript"].tolist()

train_data_5d = list(zip(train_transcripts, y_train_5d, features_base_train))
val_data_5d   = list(zip(val_transcripts, y_val_5d, features_base_val))
test_data_5d  = list(zip(test_transcripts, y_test_5d, features_base_test))

train_data_1d = list(zip(train_transcripts, y_train_1d, features_base_train))
val_data_1d   = list(zip(val_transcripts, y_val_1d, features_base_val))
test_data_1d  = list(zip(test_transcripts, y_test_1d, features_base_test))

In [5]:
build_cache(train_data_1d, "cache/train_CLS_1d")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[50/2823] cached | files=43
[100/2823] cached | files=90
[150/2823] cached | files=138
[200/2823] cached | files=182
[250/2823] cached | files=209
[300/2823] cached | files=257
[400/2823] cached | files=313
[450/2823] cached | files=355
[550/2823] cached | files=408
[600/2823] cached | files=430
[650/2823] cached | files=456
[700/2823] cached | files=504
[750/2823] cached | files=554
[800/2823] cached | files=600
[850/2823] cached | files=646
[900/2823] cached | files=692
[950/2823] cached | files=738
[1000/2823] cached | files=777
[1050/2823] cached | files=824
[1100/2823] cached | files=865
[1150/2823] cached | files=912
[1200/2823] cached | files=958
[1250/2823] cached | files=1005
[1300/2823] cached | files=1053
[1350/2823] cached | files=1100
[1400/2823] cached | files=1147
[1450/2823] cached | files=1191
[1500/2823] cached | files=1239
[1550/2823] cached | files=1284
[1600/2823] cached | files=1332
[1650/2823] cached | files=1366
[1700/2823] cached | files=1414
[1750/2823] cached

In [6]:
build_cache(test_data_1d, "cache/test_CLS_1d")
build_cache(val_data_1d, "cache/val_CLS_1d")

[50/402] cached | files=47
[100/402] cached | files=96
[150/402] cached | files=141
[200/402] cached | files=190
[250/402] cached | files=234
[300/402] cached | files=282
[350/402] cached | files=327
[400/402] cached | files=369
Cached 402 transcripts → cache/test_CLS_1d
[50/319] cached | files=48
[100/319] cached | files=95
[150/319] cached | files=143
[200/319] cached | files=191
[250/319] cached | files=239
[300/319] cached | files=286
Cached 319 transcripts → cache/val_CLS_1d


In [8]:
import shutil
import os

def zip_cache(cache_dir, zip_path):
    """
    Zips the entire cache directory into a single .zip file.
    """
    assert os.path.exists(cache_dir), f"{cache_dir} does not exist"
    shutil.make_archive(
        base_name=zip_path.replace(".zip", ""),
        format="zip",
        root_dir=cache_dir
    )
    print(f"Created zip file: {zip_path}")

# Create zip files for each cache directory
zip_cache("cache/train_CLS_1d", "cache_train_CLS_1d.zip")
zip_cache("cache/val_CLS_1d", "cache_val_CLS_1d.zip")
zip_cache("cache/test_CLS_1d", "cache_test_CLS_1d.zip")

Created zip file: cache_train_CLS_1d.zip
Created zip file: cache_val_CLS_1d.zip
Created zip file: cache_test_CLS_1d.zip


# Training Using Embeddings

In [9]:
import glob, os, torch
from torch.utils.data import Dataset

class CachedZDataset(Dataset):
    def __init__(self, cache_dir):
        self.paths = sorted(glob.glob(os.path.join(cache_dir, "*.pt")))

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        obj = torch.load(self.paths[idx], map_location="cpu")
        Z = obj["Z"].float()  # (C,768)
        y = torch.tensor(obj["y"], dtype=torch.float32)
        return Z, y


In [10]:
import os
os.getcwd()

'/root'

In [11]:
check_cache = CachedZDataset("cache/train_CLS_1d")
print(f"Training samples: {len(check_cache)}")

Training samples: 2488


In [12]:
def collate_pad(batch):
    # batch = [(Z1, y1), (Z2, y2), ...]
    Z_list, y_list = zip(*batch)

    B = len(Z_list)
    dim = Z_list[0].shape[1]
    C_max = max(z.shape[0] for z in Z_list)

    Z_pad = torch.zeros(B, C_max, dim)
    mask  = torch.zeros(B, C_max)

    for i, Z in enumerate(Z_list):
        C = Z.shape[0]
        Z_pad[i, :C] = Z
        mask[i, :C] = 1.0

    y = torch.tensor(y_list, dtype=torch.float32)
    return Z_pad, mask, y


In [28]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    CachedZDataset("cache/train_CLS_1d"),
    batch_size=16,
    shuffle=True,
    collate_fn=collate_pad
)

val_loader = DataLoader(
    CachedZDataset("cache/val_CLS_1d"),
    batch_size=16,
    shuffle=False,
    collate_fn=collate_pad
)

test_loader=DataLoader(
    CachedZDataset("cache/test_CLS_1d"),
    batch_size=16,
    shuffle=False,
    collate_fn=collate_pad
)

In [29]:
import torch.nn as nn
import torch.nn.functional as F

class MeanPoolClassifier(nn.Module):
    def __init__(self, dim=768, hidden=256, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden)
        self.fc2 = nn.Linear(hidden, 1)
        self.drop = nn.Dropout(dropout)

    def forward(self, Z, mask):
        # Z: (B,C,768), mask: (B,C)
        mask3 = mask.unsqueeze(-1)  # (B,C,1)
        doc = (Z * mask3).sum(dim=1) / mask3.sum(dim=1).clamp(min=1e-9)
        x = F.relu(self.fc1(doc))
        x = self.drop(x)
        return self.fc2(x).squeeze(-1)  # (B,)

class AttnPoolClassifier(nn.Module):
    def __init__(self, dim=768, hidden=256, dropout=0.2):
        super().__init__()
        self.attn = nn.Parameter(torch.randn(dim) * 0.02)
        self.fc1 = nn.Linear(dim, hidden)
        self.fc2 = nn.Linear(hidden, 1)
        self.drop = nn.Dropout(dropout)

    def forward(self, Z, mask):
        # Z: (B,C,768)
        scores = torch.einsum("bcd,d->bc", Z, self.attn)  # (B,C)
        scores = scores.masked_fill(mask == 0, -1e9)
        alpha = torch.softmax(scores, dim=1)
        doc = torch.einsum("bc,bcd->bd", alpha, Z)        # (B,768)
        x = F.relu(self.fc1(doc))
        x = self.drop(x)
        return self.fc2(x).squeeze(-1)

class AttnMLPPoolClassifier(nn.Module):
    def __init__(self, dim=768, attn_hidden=256, hidden=256, dropout=0.2):
        super().__init__()
        self.W = nn.Linear(dim, attn_hidden)
        self.v = nn.Linear(attn_hidden, 1, bias=False)

        self.fc1 = nn.Linear(dim, hidden)
        self.fc2 = nn.Linear(hidden, 1)
        self.drop = nn.Dropout(dropout)

    def forward(self, Z, mask):
        # Z: (B,C,768), mask: (B,C)
        h = torch.tanh(self.W(Z))              # (B,C,H)
        scores = self.v(h).squeeze(-1)         # (B,C)
        scores = scores.masked_fill(mask == 0, -1e9)
        alpha = torch.softmax(scores, dim=1)   # (B,C)

        doc = torch.einsum("bc,bcd->bd", alpha, Z)  # (B,768)

        x = F.relu(self.fc1(doc))
        x = self.drop(x)
        return self.fc2(x).squeeze(-1)
      


In [30]:
import torch
import torch.nn as nn

from sklearn.metrics import roc_auc_score
import torch.nn as nn

loss_fn = nn.BCEWithLogitsLoss()

@torch.no_grad()
def eval_loop_auc(model, loader, device):
    model.eval()
    total_loss, n = 0.0, 0

    all_logits = []
    all_labels = []

    for Z, mask, y in loader:
        Z = Z.to(device, non_blocking=True)
        mask = mask.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        logit = model(Z, mask)              # (B,)
        loss = loss_fn(logit, y)

        total_loss += loss.item() * y.size(0)
        n += y.size(0)

        all_logits.append(logit.cpu())
        all_labels.append(y.cpu())

    avg_loss = total_loss / max(1, n)

    logits = torch.cat(all_logits).numpy()
    labels = torch.cat(all_labels).numpy()

    auc = roc_auc_score(labels, logits)

    return avg_loss, auc


import torch.optim as optim

def train_with_early_stopping(
    model,
    train_loader,
    val_loader,
    device,
    max_epochs=50,
    patience=7,
    lr=1e-3,
    weight_decay=1e-2,
    save_path="best.pt",
):
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val = float("inf")
    bad_epochs = 0

    best_auc = -float("inf")
    patience = 7
    bad_epochs = 0

    for epoch in range(1, max_epochs + 1):
        model.train()
        total_loss, n = 0.0, 0

        for Z, mask, y in train_loader:
            Z = Z.to(device, non_blocking=True)
            mask = mask.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

            logit = model(Z, mask)
            loss = loss_fn(logit, y)
    
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        total_loss += loss.item() * y.size(0)
        n += y.size(0)

        train_loss = total_loss / n
        val_loss, val_auc = eval_loop_auc(model, val_loader, device)

        print(
            f"epoch {epoch:02d} | "
            f"train_loss={train_loss:.4f} | "
            f"val_loss={val_loss:.4f} | "
            f"val_auc={val_auc:.3f}"
        )
    
        if val_auc > best_auc + 1e-4:
            best_auc = val_auc
            bad_epochs = 0
            torch.save(model.state_dict(), save_path)
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                print("Early stopping on AUC.")
                break

    # load best weights before returning
    model.load_state_dict(torch.load(save_path, map_location=device))
    return model


In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# choose one:
mean_pool_model = MeanPoolClassifier().to(device)
# attn_pool_model = MeanPoolClassifier().to(device)
# model = AttnPoolClassifier().to(device)

mean_pool_model_train = train_with_early_stopping(
    mean_pool_model,
    train_loader,
    val_loader,
    device,
    max_epochs=50,
    patience=7,
    save_path="best_mean_pool_1d.pt",
)

test_loss, test_auc = eval_loop_auc(mean_pool_model_train, test_loader, device)
print(f"TEST | loss={test_loss:.4f} | acc={test_auc:.3f}")


epoch 01 | train_loss=0.7746 | val_loss=0.6875 | val_auc=0.485
epoch 02 | train_loss=0.6599 | val_loss=0.7022 | val_auc=0.525
epoch 03 | train_loss=0.6523 | val_loss=0.6914 | val_auc=0.547
epoch 04 | train_loss=0.6850 | val_loss=0.6923 | val_auc=0.484
epoch 05 | train_loss=0.6975 | val_loss=0.7044 | val_auc=0.529
epoch 06 | train_loss=0.7458 | val_loss=0.6881 | val_auc=0.519
epoch 07 | train_loss=0.6901 | val_loss=0.6946 | val_auc=0.516
epoch 08 | train_loss=0.6747 | val_loss=0.6882 | val_auc=0.536
epoch 09 | train_loss=0.6532 | val_loss=0.6969 | val_auc=0.537
epoch 10 | train_loss=0.6359 | val_loss=0.6896 | val_auc=0.507
Early stopping on AUC.
TEST | loss=0.6890 | acc=0.563


In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# choose one:
#mean_pool_model = MeanPoolClassifier().to(device)
attn_pool_model = AttnPoolClassifier().to(device)
# model = AttnPoolClassifier().to(device)

model = train_with_early_stopping(
    attn_pool_model,
    train_loader,
    val_loader,
    device,
    max_epochs=50,
    patience=7,
    save_path="best_attn_pool_1d.pt",
)

test_loss, test_auc = eval_loop_auc(model, test_loader, device)
print(f"TEST | loss={test_loss:.4f} | auc={test_auc:.3f}")


epoch 01 | train_loss=0.6735 | val_loss=0.6856 | val_auc=0.523
epoch 02 | train_loss=0.7293 | val_loss=0.6820 | val_auc=0.506
epoch 03 | train_loss=0.6914 | val_loss=0.7074 | val_auc=0.528
epoch 04 | train_loss=0.6436 | val_loss=0.6837 | val_auc=0.544
epoch 05 | train_loss=0.7147 | val_loss=0.6909 | val_auc=0.559
epoch 06 | train_loss=0.7318 | val_loss=0.6895 | val_auc=0.544
epoch 07 | train_loss=0.6477 | val_loss=0.6915 | val_auc=0.535
epoch 08 | train_loss=0.6810 | val_loss=0.6845 | val_auc=0.549
epoch 09 | train_loss=0.6715 | val_loss=0.6858 | val_auc=0.556
epoch 10 | train_loss=0.7177 | val_loss=0.6968 | val_auc=0.542
epoch 11 | train_loss=0.6188 | val_loss=0.6816 | val_auc=0.534
epoch 12 | train_loss=0.6082 | val_loss=0.6849 | val_auc=0.541
Early stopping on AUC.
TEST | loss=0.6931 | auc=0.543


In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# choose one:
#mean_pool_model = MeanPoolClassifier().to(device)
attn_NL_pool_model = AttnMLPPoolClassifier().to(device)
# model = AttnPoolClassifier().to(device)

model = train_with_early_stopping(
    attn_NL_pool_model,
    train_loader,
    val_loader,
    device,
    max_epochs=50,
    patience=7,
    save_path="best_attn_NL_1d.pt",
)

test_loss, test_auc = eval_loop_auc(model, test_loader, device)
print(f"TEST | loss={test_loss:.4f} | auc={test_auc:.3f}")


epoch 01 | train_loss=0.6645 | val_loss=0.7218 | val_auc=0.557
epoch 02 | train_loss=0.7022 | val_loss=0.6891 | val_auc=0.576
epoch 03 | train_loss=0.6585 | val_loss=0.6865 | val_auc=0.555
epoch 04 | train_loss=0.6749 | val_loss=0.6982 | val_auc=0.510
epoch 05 | train_loss=0.6345 | val_loss=0.6893 | val_auc=0.515
epoch 06 | train_loss=0.7048 | val_loss=0.6820 | val_auc=0.557
epoch 07 | train_loss=0.7265 | val_loss=0.6868 | val_auc=0.565
epoch 08 | train_loss=0.7269 | val_loss=0.6921 | val_auc=0.552
epoch 09 | train_loss=0.6394 | val_loss=0.6814 | val_auc=0.549
Early stopping on AUC.
TEST | loss=0.6919 | auc=0.530


# Training using embeddings+ finance features

In [16]:
import glob, os, torch
from torch.utils.data import Dataset

class CachedZFinDataset(Dataset):
    def __init__(self, cache_dir):
        self.paths = sorted(glob.glob(os.path.join(cache_dir, "*.pt")))

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        obj = torch.load(self.paths[idx], map_location="cpu")
        Z = obj["Z"].float()  # (C,768)
        y = torch.tensor(obj["y"], dtype=torch.float32)
        fin=obj["fin_features"].float()
        fin=fin.view(-1)
        return Z,fin, y


In [17]:
def collate_pad_chunks_with_fin(batch):
    # batch: [(Z, fin, y), ...]
    Z_list, fin_list, y_list = zip(*batch)

    B = len(Z_list)
    dim = Z_list[0].shape[1]
    C_max = max(z.shape[0] for z in Z_list)

    Z_pad = torch.zeros(B, C_max, dim, dtype=torch.float32)
    mask  = torch.zeros(B, C_max, dtype=torch.float32)

    for i, Z in enumerate(Z_list):
        C = Z.shape[0]
        Z_pad[i, :C] = Z
        mask[i, :C] = 1.0

    fin = torch.stack([f.view(-1) for f in fin_list]).float()  # (B,K)
    y = torch.tensor(y_list, dtype=torch.float32)              # (B,)

    return Z_pad, mask, fin, y


In [18]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    CachedZFinDataset("cache/train_CLS_1d"),
    batch_size=16,
    shuffle=True,
    collate_fn=collate_pad_chunks_with_fin,
    num_workers=2,
    pin_memory=True
)

val_loader = DataLoader(
    CachedZFinDataset("cache/val_CLS_1d"),
    batch_size=32,
    shuffle=False,
    collate_fn=collate_pad_chunks_with_fin,
    num_workers=2,
    pin_memory=True
)

test_loader = DataLoader(
    CachedZFinDataset("cache/test_CLS_1d"),
    batch_size=32,
    shuffle=False,
    collate_fn=collate_pad_chunks_with_fin,
    num_workers=2,
    pin_memory=True
)


In [20]:
class AttnPoolTwoTower(nn.Module):
    def __init__(self, dim=768, fin_dim=4, hidden=256, dropout=0.2):
        super().__init__()
        self.attn = nn.Parameter(torch.randn(dim) * 0.02)

        self.doc_proj = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        self.fin_proj = nn.Sequential(
            nn.LayerNorm(fin_dim),          # optional
            nn.Linear(fin_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        self.out = nn.Linear(2 * hidden, 1)

    def forward(self, Z, mask, fin):
        scores = torch.einsum("bcd,d->bc", Z, self.attn)
        scores = scores.masked_fill(mask == 0, -1e9)
        alpha = torch.softmax(scores, dim=1)
        doc = torch.einsum("bc,bcd->bd", alpha, Z)  # (B,768)

        a = self.doc_proj(doc)   # (B,hidden)
        b = self.fin_proj(fin)   # (B,hidden)

        x = torch.cat([a, b], dim=1)
        return self.out(x).squeeze(-1)


In [21]:
import torch
import torch.nn as nn

from sklearn.metrics import roc_auc_score
import torch.nn as nn

loss_fn = nn.BCEWithLogitsLoss()

@torch.no_grad()
def eval_loop_auc_fin(model, loader, device):
    model.eval()
    total_loss, n = 0.0, 0

    all_logits = []
    all_labels = []

    for Z, mask,fin, y in loader:
        Z = Z.to(device, non_blocking=True)
        mask = mask.to(device, non_blocking=True)
        fin=fin.to(device,non_blocking=True)
        y = y.to(device, non_blocking=True)

        logit = model(Z, mask,fin)              # (B,)
        loss = loss_fn(logit, y)

        total_loss += loss.item() * y.size(0)
        n += y.size(0)

        all_logits.append(logit.cpu())
        all_labels.append(y.cpu())

    avg_loss = total_loss / max(1, n)

    logits = torch.cat(all_logits).numpy()
    labels = torch.cat(all_labels).numpy()

    auc = roc_auc_score(labels, logits)

    return avg_loss, auc


import torch.optim as optim

def train_with_early_stopping_fin(
    model,
    train_loader,
    val_loader,
    device,
    max_epochs=50,
    patience=7,
    lr=1e-3,
    weight_decay=1e-2,
    save_path="best.pt",
):
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val = float("inf")
    bad_epochs = 0

    best_auc = -float("inf")
    patience = 7
    bad_epochs = 0

    for epoch in range(1, max_epochs + 1):
        model.train()
        total_loss, n = 0.0, 0

        for Z, mask,fin, y in train_loader:
            Z = Z.to(device, non_blocking=True)
            mask = mask.to(device, non_blocking=True)
            fin=fin.to(device,non_blocking=True)
            y = y.to(device, non_blocking=True)

            logit = model(Z, mask,fin)
            loss = loss_fn(logit, y)
    
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        total_loss += loss.item() * y.size(0)
        n += y.size(0)

        train_loss = total_loss / n
        val_loss, val_auc = eval_loop_auc_fin(model, val_loader, device)

        print(
            f"epoch {epoch:02d} | "
            f"train_loss={train_loss:.4f} | "
            f"val_loss={val_loss:.4f} | "
            f"val_auc={val_auc:.3f}"
        )
    
        if val_auc > best_auc + 1e-4:
            best_auc = val_auc
            bad_epochs = 0
            torch.save(model.state_dict(), save_path)
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                print("Early stopping on AUC.")
                break

    # load best weights before returning
    model.load_state_dict(torch.load(save_path, map_location=device))
    return model


In [22]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# choose one:
#mean_pool_model = MeanPoolClassifier().to(device)
attn_poolfin_model = AttnPoolTwoTower().to(device)
# model = AttnPoolClassifier().to(device)

model = train_with_early_stopping_fin(
    attn_poolfin_model,
    train_loader,
    val_loader,
    device,
    max_epochs=50,
    patience=7,
    save_path="best_attn_pool_1d.pt",
)

test_loss, test_auc = eval_loop_auc_fin(model, test_loader, device)
print(f"TEST | loss={test_loss:.4f} | auc={test_auc:.3f}")


epoch 01 | train_loss=0.7541 | val_loss=0.7192 | val_auc=0.472
epoch 02 | train_loss=0.5377 | val_loss=0.7033 | val_auc=0.497
epoch 03 | train_loss=0.6936 | val_loss=0.7079 | val_auc=0.443
epoch 04 | train_loss=0.6381 | val_loss=0.7037 | val_auc=0.460
epoch 05 | train_loss=0.6931 | val_loss=0.7123 | val_auc=0.450
epoch 06 | train_loss=0.7227 | val_loss=0.7138 | val_auc=0.476
epoch 07 | train_loss=0.7068 | val_loss=0.7062 | val_auc=0.468
epoch 08 | train_loss=0.7918 | val_loss=0.7051 | val_auc=0.468
epoch 09 | train_loss=0.6682 | val_loss=0.7908 | val_auc=0.460
Early stopping on AUC.
TEST | loss=0.6953 | auc=0.549


In [26]:
test_loss, test_auc = eval_loop_auc_fin(model, test_loader, device)
print("AUC:", test_auc, "Flipped AUC:", 1 - test_auc)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AUC: 0.46726895734597157 Flipped AUC: 0.5327310426540284


In [27]:
print(df[["abret_5d","r5d_direction"]].head(10))
print("fraction positive abret_5d:", (df["abret_5d"]>0).mean())
print("fraction r5d_direction==1:", (df["r5d_direction"]==1).mean())


   abret_5d  r5d_direction
0 -0.012433              0
1 -0.012433              0
2 -0.012433              0
3 -0.012433              0
4 -0.012433              0
5 -0.050790              0
6 -0.016076              0
7  0.028114              1
8 -0.017254              0
9 -0.019140              0
fraction positive abret_5d: 0.503668171557562
fraction r5d_direction==1: 0.503668171557562
