In [1]:
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import Dataset
from joblib import Memory
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import (
    ModelOutput,  # or just use dict if not subclassing
)

CACHE_DIR = Path().cwd().parent / ".cache"
if not CACHE_DIR.exists():
    CACHE_DIR.mkdir()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
memory = Memory(CACHE_DIR)


@memory.cache
def load_df(split: str = "train") -> pd.DataFrame:
    splits = {
        "train": "data/train-00000-of-00001.parquet",
        "test": "data/test-00000-of-00001.parquet",
    }
    return pd.read_parquet(
        "hf://datasets/FundSciImpact/semeval-2016/" + splits[split],
        columns=["Tweet", "Target", "Sentiment"],
    )


df = load_df()
df = df[df["Sentiment"] != "other"]

In [3]:
df[["Target", "Sentiment"]].value_counts().to_markdown()

"|                                             |   count |\n|:--------------------------------------------|--------:|\n| ('Feminist Movement', 'neg')                |     513 |\n| ('Hillary Clinton', 'neg')                  |     441 |\n| ('Legalization of Abortion', 'neg')         |     432 |\n| ('Atheism', 'pos')                          |     310 |\n| ('Hillary Clinton', 'pos')                  |     221 |\n| ('Climate Change is a Real Concern', 'neg') |     196 |\n| ('Legalization of Abortion', 'pos')         |     188 |\n| ('Atheism', 'neg')                          |     180 |\n| ('Climate Change is a Real Concern', 'pos') |     125 |\n| ('Feminist Movement', 'pos')                |     119 |"

In [4]:
sentiment_classes = ["pos", "neg"]
# Assuming Target column contains the classes we want to one-hot encode
target_classes = df["Target"].unique().tolist()  # Take first 5 unique values
all_classes = sentiment_classes + target_classes
# Create dictionaries for mapping
sentiment_class2id = {
    class_: class_id for class_id, class_ in enumerate(sentiment_classes)
}
target_class2id = {class_: class_id for class_id, class_ in enumerate(target_classes)}
# All class mappings (sentiment + targets)
all_classes = sentiment_classes + target_classes
class2id = {class_: class_id for class_id, class_ in enumerate(all_classes)}
id2class = {class_id: class_ for class_, class_id in class2id.items()}

In [5]:
dataset_dict = {
    "text": df["Tweet"].tolist(),
    "sentiment": df["Sentiment"].tolist(),
    "target": df["Target"].tolist(),
}
dataset = Dataset.from_dict(dataset_dict)

dataset = dataset.train_test_split(test_size=0.2)

# 3. Load tokenizer
model_path = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [6]:
# 4. Preprocess function
def preprocess_function(example):
    # Tokenize text
    tokenized = tokenizer(example["text"], truncation=True)

    # Prepare multilabel format
    labels = [0.0] * len(all_classes)

    # Set sentiment label (binary classification - either pos or neg)
    sentiment_id = class2id[example["sentiment"]]
    labels[sentiment_id] = 1.0

    # Set target labels (one-hot encoding for the 5 target classes)
    if example["target"] in target_classes:
        target_id = class2id[example["target"]]
        labels[target_id] = 1.0

    tokenized["labels"] = labels
    return tokenized

In [7]:
# Apply preprocessing
tokenized_dataset = dataset.map(preprocess_function)

Map: 100%|██████████| 2180/2180 [00:00<00:00, 10156.26 examples/s]
Map: 100%|██████████| 545/545 [00:00<00:00, 10343.56 examples/s]


In [8]:
# 5. Prepare data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 6. Metrics function
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [9]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=predictions, references=labels.astype(int).reshape(-1)
    )

In [10]:
# 7. Initialize model
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(all_classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# 8. Configure training arguments
training_args = TrainingArguments(
    output_dir="multilabel_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
def compute_loss_func(
    outputs: ModelOutput | dict,
    labels: torch.Tensor,
    num_items_in_batch: int,  # noqa: ARG001
    num_classification_labels: int = 2,
) -> torch.Tensor:
    """
    Custom loss function for HuggingFace Trainer:
    - Binary log loss for the first element
    - Squared loss (MSE) for the remaining elements

    Args:
        outputs: ModelOutput or dict containing 'logits' of shape (batch_size, num_outputs)
        labels: Tensor of shape (batch_size, num_outputs), ground-truth labels
        num_items_in_batch: Total number of items in the accumulated batch (unused here)
        num_classification_labels: Number of non-group based classification labels (default: 2)

    Returns:
        Scalar tensor representing the combined loss
    """
    logits = outputs.logits if hasattr(outputs, "logits") else outputs["logits"]

    # Binary classification loss for the first outputs
    log_loss = F.binary_cross_entropy_with_logits(
        logits[:, :num_classification_labels], labels[:, :num_classification_labels]
    )

    # Regression loss (MSE) for remaining outputs
    if logits.shape[1] > 1:
        mse_loss = F.mse_loss(
            logits[:, num_classification_labels:], labels[:, num_classification_labels:]
        )
        loss = log_loss + mse_loss
    else:
        loss = log_loss

    return loss


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_loss_func=compute_loss_func,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [15]:
# 10. Train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.439806,0.530275,0.530152,0.371145,0.927523
2,0.467200,0.432666,0.561206,0.546587,0.387779,0.925688
3,0.467200,0.504109,0.572477,0.553028,0.394295,0.925688


TrainOutput(global_step=819, training_loss=0.3579169612227779, metrics={'train_runtime': 204.6023, 'train_samples_per_second': 31.964, 'train_steps_per_second': 4.003, 'total_flos': 143747311387200.0, 'train_loss': 0.3579169612227779, 'epoch': 3.0})

In [16]:
# 12. Function to make predictions
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(
        model.device
    )
    outputs = model(**inputs)
    logits = outputs.logits.detach().cpu().numpy()
    probs = sigmoid(logits)[0]

    predictions = (probs > 0.5).astype(int)

    # Get sentiment prediction
    sentiment_idx = np.argmax(probs[: len(sentiment_classes)])
    sentiment = sentiment_classes[sentiment_idx]

    # Get target predictions (can be multiple)
    target_predictions = []
    for i, val in enumerate(predictions[len(sentiment_classes) :]):
        if val == 1:
            target_predictions.append(target_classes[i])

    return {
        "sentiment": sentiment,
        "targets": target_predictions,
        "probabilities": {id2class[i]: float(probs[i]) for i in range(len(probs))},
    }

In [18]:
predict("The world is burning and we cannot stop the oil!")

{'sentiment': 'neg',
 'targets': ['Hillary Clinton',
  'Climate Change is a Real Concern',
  'Legalization of Abortion',
  'Atheism'],
 'probabilities': {'pos': 0.04228944703936577,
  'neg': 0.9575344324111938,
  'Feminist Movement': 0.45859676599502563,
  'Hillary Clinton': 0.5194833278656006,
  'Climate Change is a Real Concern': 0.6952791810035706,
  'Legalization of Abortion': 0.5447657704353333,
  'Atheism': 0.5481216907501221}}

In [None]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e