In [None]:
%pip install wandb

In [None]:
%pip install --upgrade torchvision

In [None]:
%pip install transformers==4.54.1 tokenizers==0.21.4 sentencepiece==0.2.0 tiktoken==0.9.0

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
%pip install nltk

In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import nltk, string

# nltk.sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK
# releases load it from the "punkt_tab" resource while older ones use "punkt";
# since the install above is unpinned, fetch both so either version works.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

class SentenceData(Dataset):
    """Torch dataset that turns sentences into dense feature vectors.

    Every sentence is represented by the horizontal concatenation of
    word-level TF-IDF, character-level TF-IDF, n-gram counts and a block of
    standardised surface statistics. All features are computed once in
    ``__init__`` and stored as a single dense float32 tensor.
    """

    # Number of columns produced by _extract_linguistic_features.
    _N_LING_FEATS = 11

    def __init__(self, df, fit_extractor=False, extractor=None, max_features=5000, ngram_range=(1,3)):
        """
        Args:
            df: pandas DataFrame with 'sentence' and 'l1' columns
            fit_extractor: if True, fit the feature extractor on this dataset
            extractor: optional existing feature extractor (shared between splits)
            max_features: vocabulary budget; each of the three vectorizers
                is capped at ``max_features // 4`` (only used when a new
                extractor is built)
            ngram_range: n-gram range of the count vectorizer (only used
                when a new extractor is built)

        Raises:
            ValueError: if neither a (fitted) extractor is supplied nor
                ``fit_extractor`` is set, since a freshly built extractor
                cannot transform anything.
        """
        # Fail early with a clear message instead of an sklearn "not fitted"
        # error raised from deep inside _transform.
        if extractor is None and not fit_extractor:
            raise ValueError(
                "SentenceData needs either an already fitted `extractor` "
                "or `fit_extractor=True`."
            )

        self.sentences = df["sentence"].values.tolist()
        self.labels = torch.tensor(df["l1"].values.tolist(), dtype=torch.long)

        # if no extractor provided, create one
        if extractor is None:
            self.extractor = self._build_extractor(max_features, ngram_range)
        else:
            self.extractor = extractor

        # fit on training data
        if fit_extractor:
            self.extractor["tfidf_word"].fit(self.sentences)
            self.extractor["tfidf_char"].fit(self.sentences)
            self.extractor["count"].fit(self.sentences)
            ling_feats = self._extract_linguistic_features(self.sentences)
            self.extractor["scaler"].fit(ling_feats)

        # transform into numeric features
        self.features = self._transform(self.sentences)

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(feature_vector, label)`` pair at position ``idx``."""
        return self.features[idx], self.labels[idx]

    # ---------------- internal helpers ----------------
    def _build_extractor(self, max_features, ngram_range):
        """Create the dict of unfitted vectorizers/scaler shared between splits."""
        return {
            "tfidf_word": TfidfVectorizer(max_features=max_features//4, ngram_range=(1,2), stop_words="english"),
            "tfidf_char": TfidfVectorizer(analyzer="char_wb", max_features=max_features//4, ngram_range=(2,4)),
            "count": CountVectorizer(max_features=max_features//4, ngram_range=ngram_range, stop_words="english"),
            "scaler": StandardScaler(),
            # filled in by _extract_linguistic_features on first use
            "ling_dim": None
        }

    def _extract_linguistic_features(self, texts):
        """Compute surface statistics for every text.

        Returns:
            float array of shape ``(len(texts), 11)``; an empty input yields
            a ``(0, 11)`` array rather than failing.
        """
        feats = []
        for text in texts:
            words = text.split()
            wc = len(words)
            cc = len(text)
            sent_count = len(nltk.sent_tokenize(text))
            avg_word_len = np.mean([len(w) for w in words]) if wc > 0 else 0

            punct_count = sum(1 for c in text if c in string.punctuation)
            digit_count = sum(1 for c in text if c.isdigit())
            upper_count = sum(1 for c in text if c.isupper())

            # NOTE: the character count appears twice (first and fifth
            # column). It is kept so the feature width stays the same as before.
            feats.append([
                cc, wc, sent_count, avg_word_len,
                cc, punct_count, digit_count, upper_count,
                punct_count / wc if wc else 0,
                digit_count / cc if cc else 0,
                upper_count / cc if cc else 0
            ])
        # reshape keeps the array 2-D even when `texts` is empty
        arr = np.asarray(feats, dtype=np.float64).reshape(-1, self._N_LING_FEATS)
        if self.extractor["ling_dim"] is None:
            self.extractor["ling_dim"] = arr.shape[1]
        return arr

    def _transform(self, texts):
        """Map texts to one dense float32 tensor using the fitted extractor."""
        tfidf_word = self.extractor["tfidf_word"].transform(texts).toarray()
        tfidf_char = self.extractor["tfidf_char"].transform(texts).toarray()
        count_feats = self.extractor["count"].transform(texts).toarray()
        ling_feats = self._extract_linguistic_features(texts)
        ling_scaled = self.extractor["scaler"].transform(ling_feats)

        # column order: word tf-idf | char tf-idf | counts | scaled statistics
        feats = np.hstack([tfidf_word, tfidf_char, count_feats, ling_scaled])
        return torch.tensor(feats, dtype=torch.float32)


In [None]:
import pandas as pd

# Read the three pre-split CSV files (train / validation / test) in one go.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("nli_train_upsampled.csv", "nli_val.csv", "nli_test.csv")
)

In [None]:
# one-hot label encodings
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

train_df['l1'] = le.fit_transform(train_data['l1'])
val_df['l1'] = le.transform(val_data['l1'])
test_df['l1'] = le.transform(test_data['l1'])

#for df in [train_data, val_data, test_data]:
#    df = df.drop(columns=["l2", "source", "word_count"])

In [None]:
# The training split is the only one allowed to fit the feature extractor.
train_data = SentenceData(train_df, fit_extractor=True)

# Validation and test reuse the extractor fitted on the training split.
val_data, test_data = (
    SentenceData(split_df, extractor=train_data.extractor)
    for split_df in (val_df, test_df)
)

# Only the training loader shuffles.
train_loader = DataLoader(train_data, shuffle=True, batch_size=64)
val_loader = DataLoader(val_data, batch_size=64)
test_loader = DataLoader(test_data, batch_size=64)

# Width of one feature vector, i.e. the classifier's input size.
input_dim = train_data.features.shape[1]


In [None]:
import torch
from torch import nn

class FeatureClassifier(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: width of the input feature vectors.
        n_classes: number of output logits.
        hidden_dim: width of the single hidden layer.
        dropout: drop probability applied after the ReLU. Defaults to 0.5,
            the value that used to be hard-coded.
    """

    def __init__(self, input_dim, n_classes, hidden_dim=256, dropout=0.5):
        super().__init__()
        # Layer order (and therefore the state_dict keys net.0.* / net.3.*)
        # is unchanged.
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, n_classes)
        )

    def forward(self, x):
        """Return the unnormalised class scores (logits) for ``x``."""
        return self.net(x)


In [None]:
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support, classification_report
from collections import Counter
import numpy as np
import torch
import warnings
import uuid
warnings.simplefilter("ignore")

def validate(clf, val_loader, criterion):
    """Evaluate ``clf`` on ``val_loader`` with gradient tracking disabled.

    The model is switched to eval mode and left in that mode.

    Returns:
        tuple ``(loss, accuracy)`` where ``loss`` is the mean of the
        per-batch criterion values and ``accuracy`` is the fraction of
        correctly classified samples.
    """
    clf.eval()
    batch_losses = []
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for feats, targets in val_loader:
            logits = clf(feats)
            batch_losses.append(criterion(logits, targets).item())
            n_correct += (logits.argmax(dim=1) == targets).sum().item()
            n_seen += targets.size(0)

    return sum(batch_losses) / len(val_loader), n_correct / n_seen

def train(clf, criterion, optimizer, train_loader, n_batches, epochs, run, log_freq, pbar):
    """Run one optimisation pass over ``train_loader``.

    Args:
        clf: model to optimise (switched to train mode here).
        criterion: loss function called as ``criterion(outputs, labels)``.
        optimizer: optimizer over ``clf``'s parameters.
        train_loader: DataLoader yielding ``(features, labels)`` batches.
        n_batches: unused; kept so existing callers keep working.
        epochs: unused; kept so existing callers keep working.
        run: logger with a ``log(dict)`` method (a wandb run).
        log_freq: batch-level metrics are logged every ``log_freq`` batches.
        pbar: progress bar advanced by one per batch.

    Returns:
        tuple ``(mean per-batch loss, fraction of correctly classified samples)``.
    """
    clf.train()
    epoch_loss = 0.0
    epoch_correct = 0
    for i, (sentences, labels) in enumerate(train_loader):
        optimizer.zero_grad(set_to_none=True)
        outputs = clf(sentences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Convert to plain Python numbers once, so neither the running sums
        # nor the logger hold on to tensors (the loss tensor carries its
        # autograd graph).
        batch_loss = loss.item()
        batch_correct = (outputs.argmax(dim=1) == labels).sum().item()
        epoch_loss += batch_loss
        epoch_correct += batch_correct

        if i % log_freq == 0:
            run.log({"batch_loss": batch_loss, "batch_train_acc": batch_correct / labels.size(0)})
        pbar.update(1)

    avg_train_loss = epoch_loss / len(train_loader)
    avg_train_acc = epoch_correct / len(train_loader.dataset)

    return avg_train_loss, avg_train_acc

def training_loop(clf, criterion, optimizer, train_loader, n_batches, epochs, val_loader, run, log_interval=100):
    """Train ``clf`` for ``epochs`` epochs, validating and logging after each one.

    Args:
        clf: model to train. If it exposes ``fit_features`` and reports
            ``fitted`` as false, its feature extractors are fitted first on
            everything yielded by ``train_loader``.
        criterion: loss function.
        optimizer: optimizer over ``clf``'s parameters.
        train_loader: training DataLoader.
        n_batches: forwarded unchanged to ``train``.
        epochs: number of passes over ``train_loader``.
        val_loader: validation DataLoader.
        run: logger with a ``log(dict)`` method (a wandb run).
        log_interval: batch-logging frequency forwarded to ``train``.
    """
    if hasattr(clf, 'fit_features') and not clf.fitted:
        print("Fitting feature extractors on training data...")
        all_train_texts = []
        for sentences, _ in tqdm(train_loader, desc="Collecting training texts"):
            all_train_texts.extend(sentences)
        clf.fit_features(all_train_texts)
        print(f"Feature extraction complete. Total features: {clf.n_features}")

    total_batches = len(train_loader) * epochs
    # The context manager closes the bar even when training raises or is
    # interrupted, so no stale progress bar is left behind in the notebook.
    with tqdm(total=total_batches, desc='Training') as pbar:
        for e in range(epochs):
            run.log({"epoch": e})
            avg_train_loss, avg_train_acc = train(clf, criterion, optimizer, train_loader, n_batches, epochs, run, log_interval, pbar)
            avg_val_loss, avg_val_acc = validate(clf, val_loader, criterion)
            run.log({
                "epoch_train_loss": avg_train_loss,
                "epoch_train_acc": avg_train_acc,
                "epoch_val_loss": avg_val_loss,
                "epoch_val_acc": avg_val_acc
            })
            pbar.set_postfix({
                'epoch': f'{e+1}/{epochs}',
                'train_loss': f'{avg_train_loss:.4f}',
                'val_loss': f'{avg_val_loss:.4f}',
                'val_acc': f'{avg_val_acc:.4f}'
            })

def test_loop(clf, test_loader, le, run):
    """Evaluate ``clf`` on the test set, log aggregate scores and print a report.

    Args:
        clf: trained model (switched to eval mode here).
        test_loader: DataLoader yielding ``(features, labels)`` batches.
        le: fitted label encoder; ``le.classes_`` supplies the class names.
        run: logger with a ``log(dict)`` method (a wandb run).

    Returns:
        tuple ``(report, scores)``: the text classification report and a dict
        of macro/micro/weighted precision, recall and F1.
    """
    clf.eval()
    target_names = [str(name) for name in le.classes_]
    # Use one explicit label set everywhere, so the aggregate scores cover
    # the same classes as the per-class report even if a class never occurs
    # in this split's labels or predictions.
    label_ids = list(range(len(target_names)))

    y_true = []
    y_pred = []
    with torch.no_grad():
        for sentences, labels in tqdm(test_loader, total=len(test_loader)):
            preds = torch.argmax(clf(sentences), dim=1)
            y_true.extend(labels.tolist())
            y_pred.extend(preds.tolist())

    report = classification_report(
            y_true,
            y_pred,
            target_names=target_names,
            labels=label_ids
    )

    scores = {}
    for avg in ("macro", "micro", "weighted"):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, labels=label_ids, average=avg
        )
        scores[f"test_{avg}_precision"] = float(precision)
        scores[f"test_{avg}_recall"] = float(recall)
        scores[f"test_{avg}_f1"] = float(f1)
    run.log(scores)

    print("Classification Report:")
    print(report)
    print("\nTest Scores:")
    for key, value in scores.items():
        print(f"{key}: {value:.4f}")

    return report, scores

In [None]:
from torch.utils.data import Subset
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# ---- Hyperparameters ----
n_classes = len(le.classes_)
n_layers = 2  # NOTE(review): not read by anything in this cell
batch_size = 64
lr = 1e-3
epochs = 4

# The loaders yield the dense feature vectors built by SentenceData, so the
# model is the FeatureClassifier defined above, sized to that feature width.
# (The previous name, DeepTraditionalL1Classifier, is not defined in this notebook.)
clf = FeatureClassifier(input_dim=input_dim, n_classes=n_classes)

train_loader = DataLoader(train_data, batch_size=batch_size, num_workers=2, shuffle=True, pin_memory=True)
# Validation does not need shuffling; a fixed order keeps the reported
# per-batch-averaged loss reproducible between runs.
val_loader = DataLoader(val_data, batch_size=batch_size, num_workers=2, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_data, batch_size=batch_size)

# wandb
args = {
    "entity": "konrad-brg-university-of-t-bingen",
    "project": "BENALI-Trad",
    "config": {
        "learning_rate": lr,
        # describes the model actually trained here (TF-IDF features + MLP)
        "architecture": "TFIDF+MLP",
        # NOTE(review): the splits are read from nli_*.csv files; confirm this name
        "dataset": "dataset_clean.csv",
        "epochs": epochs,
        # matches the interval used by the training cell / training_loop default
        "log_interval": 100,
        "n_batches": len(train_loader),
        "criterion": "CrossEntropyLoss"
    },
}

# Keyword arguments for training_loop; the wandb run is added right before training.
train_config = {
    "clf": clf,
    "optimizer": AdamW(clf.parameters(), lr=lr),
    "criterion": CrossEntropyLoss(),
    "train_loader": train_loader,
    "val_loader": val_loader,
    "n_batches": len(train_loader),
    "epochs": epochs,
}

In [None]:
%env TOKENIZERS_PARALLELISM=true
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
import wandb  # used below; not imported by any earlier cell shown in this notebook

log_interval = 100
# Pass the prepared entity/project/config; a bare wandb.init() ignores `args`.
with wandb.init(**args) as run:
    run.watch(clf, log_freq=log_interval)
    train_config.update({"run": run})
    # Forward log_interval explicitly so batch logging uses the same interval as run.watch.
    training_loop(**{**train_config, "log_interval": log_interval})
    report, scores = test_loop(clf, test_loader, le, run)
    # No explicit run.finish(): leaving the `with` block finishes the run.
print(report)

In [None]:
# Drop large objects from the kernel namespace. `model` and `tokenizer` are
# not created by the cells shown here, so a plain `del` raises NameError on a
# fresh run; pop() removes whichever of the names exist and ignores the rest.
for _name in ("model", "clf", "tokenizer"):
    globals().pop(_name, None)
del _name

In [None]:
import json
import os

# Make sure the output directory exists before opening the file for writing.
os.makedirs("results", exist_ok=True)
with open("results/clf_3_layers_bert_base.json", "w") as f:
    # json.dump takes the object first and the file handle second.
    json.dump(scores, f)