In [1]:
%pip install wandb

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install transformers==4.54.1 tokenizers==0.21.4 sentencepiece==0.2.0 tiktoken==0.9.0

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install --upgrade torchvision

Collecting torchvision
  Downloading torchvision-0.23.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torch==2.8.0 (from torchvision)
  Downloading torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch==2.8.0->torchvision)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch==2.8.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch==2.8.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch==2.8.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cudnn-cu12==9.10.2.21 (from torch==2.8

In [1]:
import torch
import numpy as np
import pandas as pd
import wandb

from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from torch import optim, nn
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm

In [2]:
train_data = pd.read_csv("nli_train_balanced_v2.csv")
val_data = pd.read_csv("nli_val.csv")
test_data = pd.read_csv("nli_test.csv")

# one-hot label encodings
le = LabelEncoder()

In [3]:
labels = list(set(train_data.l1.unique().tolist() + test_data.l1.unique().tolist() + val_data.l1.unique().tolist()))
le.fit(labels)
train_data['l1'] = le.transform(train_data['l1'])
val_data['l1'] = le.transform(val_data['l1'])
test_data['l1'] = le.transform(test_data['l1'])

In [4]:
class SentenceData(Dataset):
  
    def __init__(self, df, remove_low_count=False, label_encoder=None):
        self.df = df
        if remove_low_count:
            print("Removing:")
            print(label_encoder.inverse_transform(df['l1'].value_counts()[df['l1'].value_counts() <= 100].index))
            self.df = self.df[self.df['l1'].map(self.df['l1'].value_counts()) >= 100]
        self.sentences = df['sentence'].values.tolist()
        self.labels = df['l1'].values.tolist()
  
    def __len__(self):
        return len(self.sentences)
  
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        return str(sentence), label

In [5]:
train_data = SentenceData(train_data)
val_data = SentenceData(val_data)
test_data = SentenceData(test_data)

In [6]:
len(test_data)

224776

In [7]:
class TransformerFeatureExtractor(nn.Module):
    """
    Wrapper to extract intermediate representations from a transformer model.
    """
    
    def __init__(self, hf_model_name, layer_indices=[-7, -6, -5, -4], freeze_transformer=True):
        
        super().__init__()
        
        self.transformer = AutoModel.from_pretrained(hf_model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
        self.hidden_size = self.transformer.config.hidden_size
        self.layer_indices = layer_indices
        self.freeze_transformer = freeze_transformer
        
        if self.freeze_transformer:
            for param in self.transformer.parameters():
                param.requires_grad = False
        
    def forward(self, x):
        x = self.tokenizer(x, return_tensors="pt", truncation=True, padding=True, max_length=512).to("cuda")
        outputs = self.transformer(
            input_ids=x["input_ids"], 
            attention_mask=x["attention_mask"],
            output_hidden_states=True
        )
        
        hidden_states = outputs.hidden_states
        intermediate_reps = [hidden_states[idx] for idx in self.layer_indices]

        return torch.cat(intermediate_reps, dim=-1)

In [8]:
class CNN1DClassifier(nn.Module):
    """
    1D CNN for sequence classification on (intermediate) transformer representations
    """
    def __init__(self, input_dim, num_classes, num_filters=256, filter_sizes=[3, 4, 5, 6], dropout=0.4):
        
        super().__init__()
        self.convs = nn.ModuleList([
            nn.Conv1d(input_dim, num_filters, kernel_size=k)
            for k in filter_sizes
        ])
    
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(len(filter_sizes) * num_filters, num_classes)
        
    def forward(self, x):
        # transpose input shape [batch_size, seq_len, input_dim]
        # to [batch_size, input_dim, seq_len] for conv1d
        x = x.transpose(1, 2)
        x = [
            torch.max(torch.relu(conv(x)), dim=2)[0] # conv + relu + pooling
            for conv in self.convs
        ]
        x = torch.cat(x, dim=1)  # [batch_size, len(filter_sizes) * num_filters]
        x = self.dropout(x)
        return self.linear(x)

In [9]:
class TransformerCNN(nn.Module):    
    
    def __init__(
        self,
        hf_model_name,
        num_classes,
        layer_indices=[-7, -6, -5, -4], 
        freeze_transformer=True,
        cnn_config={}
        ):
        
        super().__init__()
        
        self.tfe = TransformerFeatureExtractor(
            hf_model_name, layer_indices, freeze_transformer
        )
        
        # input dimension for CNN = hidden_size * num_layers
        input_dim = self.tfe.hidden_size * len(self.tfe.layer_indices)
    
        self.cnn = CNN1DClassifier(
            input_dim=input_dim,
            num_classes=num_classes,
            **cnn_config
        )
        
    def forward(self, x):
        x = self.tfe(x)
        return self.cnn(x)

In [12]:
from tqdm.auto import tqdm
from collections import defaultdict
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.model_selection import KFold
from collections import Counter
import numpy as np
import torch
import warnings
import uuid
warnings.simplefilter("ignore")

def validate(clf, val_loader, criterion):
    clf.eval()
    val_loss = 0
    val_acc = 0
    val_samples = 0
    with torch.no_grad():
        for val_sents, val_labels in val_loader:
            val_labels = torch.tensor(val_labels, device="cuda")
            val_outputs = clf(val_sents)
            
            val_loss += criterion(val_outputs, val_labels).item()
            val_pred = torch.argmax(val_outputs, dim=1)
            val_acc += torch.sum(val_pred == val_labels).item()
            val_samples += val_labels.size(0)
    
    avg_val_loss = val_loss / len(val_loader)
    avg_val_acc = val_acc / val_samples
    
    return avg_val_loss, avg_val_acc

def train(clf, criterion, optimizer, train_loader, n_batches, epochs, run, log_freq, pbar):
    clf.train()
    epoch_loss = 0
    epoch_acc = 0
    for i, (sentences, labels) in enumerate(train_loader):
        optimizer.zero_grad(set_to_none=True)
        
        labels = torch.tensor(labels, device="cuda")
        outputs = clf(sentences)
        
        loss = criterion(outputs, labels)
        pred = torch.argmax(outputs, dim=1)
        train_acc = torch.sum(pred == labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += train_acc.item()
        
        if i % log_freq == 0:
            run.log({"batch_loss": loss, "batch_train_acc": train_acc / labels.size(0)})
        pbar.update(1)
    
    avg_train_loss = epoch_loss / len(train_loader)
    avg_train_acc = epoch_acc / len(train_loader.dataset)
    
    return avg_train_loss, avg_train_acc

def training_loop(clf, criterion, optimizer, train_loader, n_batches, epochs, val_loader, run):
    total_batches = len(train_loader) * epochs
    pbar = tqdm(total=total_batches, desc='Training')
    for e in range(epochs):
        run.log({"epoch": e})
        avg_train_loss, avg_train_acc = train(clf, criterion, optimizer, train_loader, n_batches, epochs, run, log_interval, pbar)
        avg_val_loss, avg_val_acc = validate(clf, val_loader, criterion)
        run.log({
            "epoch_train_loss": avg_train_loss,
            "epoch_train_acc": avg_train_acc,
            "epoch_val_loss": avg_val_loss,
            "epoch_val_acc": avg_val_acc
        })
        pbar.set_postfix({
            'epoch': f'{e+1}/{epochs}',
            'loss': f'{avg_train_loss:.4f}'
        })
    model_name = str(uuid.uuid4())
    try:
        torch.save(clf.state_dict(), f"checkpoints/{model_name}.pt")
    except RuntimeError: pass

def test_loop(clf, test_loader, le, run):
    clf.eval()
    y_true = []
    y_pred = []
    target_names = [str(name) for name in le.classes_]
    for i, (sentences, labels) in tqdm(enumerate(test_loader), total=len(test_loader)):
        labels = torch.tensor(labels, device="cuda")
        outputs = clf(sentences)
        preds = torch.argmax(outputs, dim=1)
        y_true += labels.tolist()
        y_pred += preds.tolist()
    
    report = classification_report(
            y_true,
            y_pred,
            target_names=target_names,
            labels=list(range(len(target_names)))
    )
    ma_p, ma_r, ma_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    mi_p, mi_r, mi_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='micro')
    w_p, w_r, w_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    run.log({
        "test_macro_precision": ma_p,
        "test_macro_recall": ma_r,
        "test_macro_f1": ma_f1,
        "test_micro_precision": mi_p,
        "test_micro_recall": mi_r,
        "test_micro_f1": mi_f1,
        "test_weighted_precision": w_p,
        "test_weighted_recall": w_r,
        "test_weighted_f1": w_f1
    })
    return report

In [13]:
def test_fold(clf, test_loader, le):
    """Test a single fold and return metrics"""
    clf.eval()
    y_true = []
    y_pred = []
    target_names = [str(name) for name in le.classes_]
    
    with torch.no_grad():
        for sentences, labels in test_loader:
            labels = torch.tensor(labels, device="cuda")
            outputs = clf(sentences)
            preds = torch.argmax(outputs, dim=1)
            y_true += labels.tolist()
            y_pred += preds.tolist()
    
    # macro/micro/weighted metrics
    ma_p, ma_r, ma_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    mi_p, mi_r, mi_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='micro')
    w_p, w_r, w_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    
    metrics = {
        "macro_precision": ma_p,
        "macro_recall": ma_r,
        "macro_f1": ma_f1,
        "micro_precision": mi_p,
        "micro_recall": mi_r,
        "micro_f1": mi_f1,
        "weighted_precision": w_p,
        "weighted_recall": w_r,
        "weighted_f1": w_f1
    }
    
    report = classification_report(
        y_true,
        y_pred,
        target_names=target_names,
        labels=list(range(len(target_names)))
    )
    
    return metrics, report

def cross_validation(clf_class, dataset, le, criterion_class, optimizer_class, 
                    n_splits=5, epochs=10, batch_size=32, lr=1e-3, 
                    log_interval=100, stratified=True, run=None, **model_kwargs):
    """
    Perform k-fold cross validation
    
    Args:
        clf_class: The model class (not instantiated)
        dataset: Your full dataset (torch Dataset)
        le: Label encoder
        criterion_class: Loss function class
        optimizer_class: Optimizer class (e.g., torch.optim.Adam)
        n_splits: Number of folds
        epochs: Number of epochs per fold
        batch_size: Batch size
        lr: Learning rate
        log_interval: Logging frequency
        stratified: Whether to use stratified k-fold
        run: Experiment tracking object (wandb, etc.)
        **model_kwargs: Arguments to pass to model constructor
    
    Returns:
        dict: Cross-validation results
    """

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    if stratified:
        all_labels = [dataset[i][1] for i in range(len(dataset))]
        splits = list(kf.split(range(len(dataset)), all_labels))
    else:
        splits = list(kf.split(range(len(dataset))))
    
    fold_results = []
    all_metrics = defaultdict(list)
    
    for fold, (train_val_idx, test_idx) in enumerate(splits):

        val_size = len(train_val_idx) // 5  # 20% for validation
        train_idx = train_val_idx[:-val_size]
        val_idx = train_val_idx[-val_size:]
        
        train_loader = DataLoader(Subset(dataset, train_idx), batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(Subset(dataset, val_idx), batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(Subset(dataset, test_idx), batch_size=batch_size, shuffle=False)
        
        clf = clf_class(**model_kwargs).cuda()
        criterion = criterion_class()
        optimizer = optimizer_class(clf.parameters(), lr=lr)

        # Train the model
        final_val_loss, final_val_acc = training_loop(
            clf, criterion, optimizer, train_loader, epochs, 
            val_loader, fold_run, log_interval
        )
        fold_metrics, fold_report = test_fold(clf, test_loader, le)
        fold_result = {
            'fold': fold + 1,
            'final_val_loss': final_val_loss,
            'final_val_acc': final_val_acc,
            'test_metrics': fold_metrics,
            'classification_report': fold_report
        }
        fold_results.append(fold_result)
        
        # accumulate metrics
        for metric_name, metric_value in fold_metrics.items():
            all_metrics[metric_name].append(metric_value)
        
        # log fold results
        if run is not None:
            fold_log = {f"fold_{fold+1}_{k}": v for k, v in fold_metrics.items()}
            fold_log[f"fold_{fold+1}_val_loss"] = final_val_loss
            fold_log[f"fold_{fold+1}_val_acc"] = final_val_acc
            run.log(fold_log)
    
    # overall statistics
    cv_results = {}
    for metric_name, values in all_metrics.items():
        cv_results[f"{metric_name}_mean"] = np.mean(values)
        cv_results[f"{metric_name}_std"] = np.std(values)
    
    if run is not None:
        cv_log = {f"cv_{k}": v for k, v in cv_results.items()}
        run.log(cv_log)
    
    return {
        'fold_results': fold_results,
        'cv_metrics': cv_results,
        'mean_macro_f1': cv_results['macro_f1_mean'],
        'std_macro_f1': cv_results['macro_f1_std']
    }

In [14]:
from torch.utils.data import Subset
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

n_classes = len(le.classes_)
n_layers = 1
batch_size = 32
lr = 1e-2
epochs = 10


embedding_model="KonradBRG/benali"

clf = TransformerCNN(
    embedding_model,
    n_classes,
    layer_indices=[-5], 
    freeze_transformer=True,
    cnn_config={
        "num_filters": 128,
        "dropout": 0.5
    }).to('cuda')

train_loader = DataLoader(train_data, batch_size=batch_size, num_workers=2, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size, num_workers=2, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_data, batch_size=batch_size)

# wandb
args = {
    "entity": "konrad-brg-university-of-t-bingen",
    "project": "BENALI",
    "config": {
        "learning_rate": lr,
        "architecture": "BENALI+CNN",
        "dataset": "dataset_clean.csv",
        "epochs": epochs,
        "log_interval": 50,
        "n_batches": len(train_loader),
        "criterion": "CrossEntropyLoss"
    },
}

train_config = {
    "clf": clf,
    "optimizer": AdamW(clf.parameters(), lr=lr),
    "criterion": CrossEntropyLoss(),
    "train_loader": train_loader,
    "val_loader": val_loader,
    "n_batches": len(train_loader),
    "epochs": epochs,
}

In [15]:
%env TOKENIZERS_PARALLELISM=true
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
from torch.utils.data import ConcatDataset

log_interval = args["config"]["log_interval"]
with wandb.init(**args) as run:
    run.watch(clf, log_freq=10)
    train_config.update({"run": run})
    training_loop(**train_config)
    report = test_loop(clf, test_loader, le, run)
    cv_results = cross_validation(
        clf_class=TransformerCNN,
        dataset=ConcatDataset([train_data, val_data, test_data]),
        le=le,
        criterion_class=CrossEntropyLoss,
        optimizer_class=AdamW,
        n_splits=5,
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        log_interval=50,
        stratified=True,
        run=run,
        num_workers=2,
        pin_memory=True,
        **{
            "embedding_model": "KonradBRG/benali",
            "n_classes": len(le.classes_),
            "layer_indices": [-5, -4, -3], 
            "freeze_transformer": True,
            "cnn_config": {
                "num_filters": 256,
                "dropout": 0.3
            }
        }  # Your TransformerCNN parameters
    )
    run.finish()
print(report)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


env: TOKENIZERS_PARALLELISM=true
env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


[34m[1mwandb[0m: Currently logged in as: [33mkonrad-brg[0m ([33mkonrad-brg-university-of-t-bingen[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training:   0%|          | 0/355300 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/7025 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_696/1531112262.py", line 11, in <module>
    cv_results = cross_validation(
                 ^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_696/4060361331.py", line 87, in cross_validation
    clf = clf_class(**model_kwargs).cuda()
          ^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: TransformerCNN.__init__() got an unexpected keyword argument 'num_workers'


0,1
batch_loss,▃▇▅▁▄▄▅▂▃▅▃▅▂▄▅▄▇▃▄▄▅▅▅▄▃▅▃▅▅▅▅▇▄▅▅▄█▅▅▅
batch_train_acc,▆▅▃▇█▆█▇▃█▆▃▂█▂▃▁▃▆▅▃▅▂▂▅▇▁▃▇▁▃▃▂▃▁▂▃▇▃▅
epoch,▁▂▃▃▄▅▆▆▇█
epoch_train_acc,█▅▄▃▂▂▂▂▁▁
epoch_train_loss,▁▅▆▇█▇▇▇██
epoch_val_acc,▁█▂▆▆▇▇▇▆▃
epoch_val_loss,█▁▄▄▃▁▃▁▃▄
test_macro_f1,▁
test_macro_precision,▁
test_macro_recall,▁

0,1
batch_loss,4.04294
batch_train_acc,0.09375
epoch,9.0
epoch_train_acc,0.08374
epoch_train_loss,4.27414
epoch_val_acc,0.09412
epoch_val_loss,4.16696
test_macro_f1,0.01748
test_macro_precision,0.02414
test_macro_recall,0.02399


TypeError: TransformerCNN.__init__() got an unexpected keyword argument 'num_workers'

In [None]:
print(report)