In [103]:
from functools import lru_cache
from pathlib import Path
from typing import Any

import evaluate
import numpy as np
import pandas as pd
import polars as pl
import torch
import torch.nn.functional as F
from datasets import Dataset
from joblib import Memory
from sklearn.model_selection import GroupShuffleSplit
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import (
    ModelOutput,  # or just use dict if not subclassing
)

# Bundle of classification metrics that `compute_metrics` reports at evaluation time.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# Cache directory next to the notebook's parent folder. `exist_ok=True` keeps
# the cell re-runnable and avoids the check-then-create race of a separate
# `exists()` test.
CACHE_DIR = Path.cwd().parent / ".cache"
CACHE_DIR.mkdir(exist_ok=True)


def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative inputs. Here the exponential is only
    ever taken of a non-positive number, so it cannot overflow.

    Args:
        x: Scalar or array-like of real values (e.g. raw logits).

    Returns:
        Values in ``[0, 1]`` with the same shape as ``x``; a NumPy scalar when
        ``x`` is a scalar.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in (0, 1], never overflows
    # For x >= 0 this is the textbook form; for x < 0 use the algebraically
    # equivalent exp(x) / (1 + exp(x)).
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Unwrap 0-d arrays so scalar inputs still yield a NumPy scalar.
    return result[()] if result.ndim == 0 else result


def compute_metrics(eval_pred):
    """Score thresholded predictions against the reference labels.

    Args:
        eval_pred: Pair ``(logits, labels)`` of array-likes supporting
            ``astype`` / ``reshape``.

    Returns:
        Whatever ``clf_metrics.compute`` returns for the flattened inputs.

    Note:
        Every output column is passed through the sigmoid, cut at 0.5, and
        flattened into one vector, so the metrics pool all columns together.
    """
    logits, references = eval_pred
    hard_predictions = (sigmoid(logits) > 0.5).astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=hard_predictions,
        references=flat_references,
    )


# Hyper-parameters shared by every Trainer built in this notebook.
training_args = TrainingArguments(
    # Where checkpoints are written, and how often we evaluate / save.
    output_dir="multilabel_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Name of the dataset field that holds the label vector.
    label_names=["labels"],
)

In [None]:
# Load the tokenizer belonging to the pretrained checkpoint. `model_path` is
# reused later in the notebook when the classification model is instantiated.
model_path = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [117]:
def get_full_data():
    """Load every English hate-speech TSV split into one polars DataFrame.

    Reads all ``*.tsv`` files under ``../hatespeech-data/split/English``
    (relative to the current working directory), drops the location/date
    columns, casts ``gender``/``age``/``ethnicity`` to ``Int8`` after turning
    the placeholder ``"x"`` into null, removes every row containing a null,
    and renames ``label`` to ``target``.

    Returns:
        pl.DataFrame: The cleaned, concatenated data.

    Raises:
        FileNotFoundError: If the directory contains no ``.tsv`` files.
    """
    english_hatespeech = Path.cwd().parent / "hatespeech-data" / "split" / "English"
    # `Path.glob` yields entries in arbitrary, platform-dependent order. Sort
    # so the row order (and hence any seeded split on it) is reproducible.
    tsv_files = sorted(english_hatespeech.glob("*.tsv"))
    if not tsv_files:
        # Fail with an actionable message instead of pandas' generic
        # "No objects to concatenate".
        raise FileNotFoundError(f"No .tsv files found in {english_hatespeech}")

    raw = pd.concat([pd.read_csv(tsv_file, sep="\t") for tsv_file in tsv_files]).drop(
        columns=["city", "state", "country", "date"]
    )
    return (
        pl.DataFrame(raw)
        .with_columns(
            # "x" is replaced by null so the column can be cast to an integer.
            *[
                pl.col(column).replace("x", None).cast(pl.Int8)
                for column in ("gender", "age", "ethnicity")
            ]
        )
        .drop_nulls()
        .rename({"label": "target"})
    )


def create_dataset(
    features: pl.DataFrame, labels: pl.Series, feature_names: list[str] | None = None
) -> Dataset:
    """Assemble a ``Dataset`` from feature columns plus a ``"target"`` column.

    Args:
        features: Frame holding the feature columns.
        labels: Series stored under the ``"target"`` key.
        feature_names: Columns of ``features`` to include; all columns when
            ``None``.

    Returns:
        A ``Dataset`` built from the selected columns and ``"target"``.
    """
    selected = features.columns if feature_names is None else feature_names

    columns = {}
    for name in selected:
        columns[name] = features[name].to_list()
    # Added last, so it overrides any feature column that is also named "target".
    columns["target"] = labels.to_list()

    return Dataset.from_dict(columns)


@lru_cache
def tokenize(text: str) -> dict[str, Any]:
    """Tokenize ``text`` with truncation enabled, memoised per input string.

    Because of ``lru_cache``, repeated calls with the same text return the
    *same* object; callers should not mutate the result in place.
    """
    encoding = tokenizer(text, truncation=True)
    return encoding


def preprocess_simple(example: dict[str, Any]) -> dict[str, Any]:
    """Tokenize one example and attach its 4-element float label vector.

    Args:
        example: Row with a ``"text"`` field and the numeric fields
            ``"target"``, ``"gender"``, ``"age"`` and ``"ethnicity"``.

    Returns:
        A new dict holding the tokenizer output plus ``"labels"``, ordered
        ``[target, gender, age, ethnicity]`` as floats.
    """
    # `tokenize` is lru_cached and returns the same object for repeated text.
    # Copy it before adding a key so the cache entry is never mutated.
    features = dict(tokenize(example["text"]))
    features["labels"] = [
        float(example[key]) for key in ("target", "gender", "age", "ethnicity")
    ]
    return features


def compute_loss_func(
    outputs: ModelOutput | dict,
    labels: torch.Tensor,
    num_items_in_batch: int | None = None,  # noqa: ARG001
) -> torch.Tensor:
    """
    Custom loss function for HuggingFace Trainer:
    - Binary log loss (BCE with logits) for the first output column
    - Squared loss (MSE) on the raw logits of the remaining columns, if any

    Args:
        outputs: ModelOutput or dict containing 'logits' of shape (batch_size, num_outputs)
        labels: Tensor of shape (batch_size, num_outputs), ground-truth labels;
            cast to the logits' dtype, so integer labels are accepted
        num_items_in_batch: Accepted for compatibility with the Trainer hook; unused here

    Returns:
        Scalar tensor: the BCE term plus the MSE term (each mean-reduced)
    """
    logits = outputs.logits if hasattr(outputs, "logits") else outputs["logits"]
    # Both loss functions require floating-point targets matching the logits.
    targets = labels.to(dtype=logits.dtype)

    loss = F.binary_cross_entropy_with_logits(logits[:, :1], targets[:, :1])

    # Regression loss (MSE) for remaining outputs
    if logits.shape[1] > 1:
        loss = loss + F.mse_loss(logits[:, 1:], targets[:, 1:])

    return loss


combined = get_full_data()

# Pads every batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def build_model():
    """Return a freshly initialised 4-output sequence classifier from `model_path`."""
    return AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=4,
        problem_type="multi_label_classification",
    )


K = 5
gss = GroupShuffleSplit(n_splits=K, train_size=0.8, random_state=110)
all_features = combined.drop("target", "tid", "uid")
all_labels = combined["target"]
all_users = combined["uid"]

# Splits are grouped by user id so one user's rows never straddle two sides.
for train_index, test_index in gss.split(all_features, all_labels, groups=all_users):
    train_features = all_features[train_index]
    train_labels = all_labels[train_index]
    train_groups = all_users[train_index]
    test_features = all_features[test_index]
    test_labels = all_labels[test_index]

    # nested cross-validation
    for inner_train_index, validation_index in gss.split(
        train_features, train_labels, groups=train_groups
    ):
        inner_train_features = train_features[inner_train_index]
        inner_train_labels = train_labels[inner_train_index]
        inner_train_groups = train_groups[inner_train_index]
        inner_validation_features = train_features[validation_index]
        inner_validation_labels = train_labels[validation_index]
        inner_validation_groups = train_groups[validation_index]
        assert inner_validation_groups.shape[0] == validation_index.shape[0]

        train_dataset = create_dataset(
            inner_train_features,
            inner_train_labels,
        ).map(preprocess_simple)

        validation_dataset = create_dataset(
            inner_validation_features,
            inner_validation_labels,
        ).map(preprocess_simple)

        # Start every run from the pretrained checkpoint. Reusing one model
        # object would carry fine-tuned weights into the next outer fold,
        # whose training rows can include users held out in an earlier fold.
        model = build_model()
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=validation_dataset,
            # `tokenizer=` is deprecated in Trainer; `processing_class` replaces it.
            processing_class=tokenizer,
            data_collator=data_collator,
            compute_loss_func=compute_loss_func,
            compute_metrics=compute_metrics,
        )
        trainer.train()
        # NOTE(review): this only leaves the inner loop, so one inner split is
        # trained per outer fold and the outer loop still runs all K folds.
        break

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 25621/25621 [00:02<00:00, 11077.27 examples/s]
Map: 100%|██████████| 6103/6103 [00:00<00:00, 11218.62 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [114]:
# Peek at the first example of the training split to check which fields it carries.
next(iter(train_dataset))

{'text': "hashtag totalblacktv 1 don't ask me to think ... i just want to entertain ! hashtag hashtag url",
 'gender': 0,
 'age': 0,
 'ethnicity': 1,
 'label': 1}

In [105]:
# Inspect which Python type comes back when the "labels" column is accessed by name.
type(train_dataset["labels"])

list

In [60]:
# Label vocabularies: two sentiment classes followed by the target classes.
sentiment_classes = ["pos", "neg"]
# Assuming Target column contains the classes we want to one-hot encode
target_classes = ["1", "2"]
# Sentiment classes first, then targets; positions double as output indices.
all_classes = [*sentiment_classes, *target_classes]

# Name -> index lookups, one per vocabulary.
sentiment_class2id = {name: index for index, name in enumerate(sentiment_classes)}
target_class2id = {name: index for index, name in enumerate(target_classes)}
class2id = {name: index for index, name in enumerate(all_classes)}
# Index -> name lookup over the combined vocabulary.
id2class = dict(enumerate(all_classes))

In [59]:
# Sanity check: tokenize one short string and inspect the raw encoding.
tokenizer("hello", truncation=True)

{'input_ids': [101, 7592, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  trainer = Trainer(


In [15]:
# Run fine-tuning with the `trainer` object constructed elsewhere in this
# notebook; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.439806,0.530275,0.530152,0.371145,0.927523
2,0.467200,0.432666,0.561206,0.546587,0.387779,0.925688
3,0.467200,0.504109,0.572477,0.553028,0.394295,0.925688


TrainOutput(global_step=819, training_loss=0.3579169612227779, metrics={'train_runtime': 204.6023, 'train_samples_per_second': 31.964, 'train_steps_per_second': 4.003, 'total_flos': 143747311387200.0, 'train_loss': 0.3579169612227779, 'epoch': 3.0})

In [16]:
# 12. Function to make predictions
def predict(text):
    """Run the model on ``text`` and decode its per-class probabilities.

    The forward pass runs in eval mode with gradient tracking disabled, so
    dropout is inactive and no autograd graph is built. The model's previous
    train/eval mode is restored afterwards.

    Args:
        text: Input passed straight to the tokenizer. Only the first row of
            the resulting logits is decoded.

    Returns:
        dict with:
            - "sentiment": the sentiment class with the highest probability
            - "targets": every target class whose probability exceeds 0.5
            - "probabilities": mapping of class name to probability
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(
        model.device
    )

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    logits = outputs.logits.detach().cpu().numpy()
    probs = sigmoid(logits)[0]
    predictions = (probs > 0.5).astype(int)

    # Sentiment: single best class among the leading sentiment slots.
    n_sentiment = len(sentiment_classes)
    sentiment = sentiment_classes[int(np.argmax(probs[:n_sentiment]))]

    # Targets: independent yes/no per remaining slot (can be multiple).
    target_predictions = [
        target_classes[i]
        for i, flag in enumerate(predictions[n_sentiment:])
        if flag == 1
    ]

    return {
        "sentiment": sentiment,
        "targets": target_predictions,
        "probabilities": {id2class[i]: float(probs[i]) for i in range(len(probs))},
    }