<a href="https://colab.research.google.com/github/kamranr123/kamranr123.github.io/blob/master/fine_tune_multilingual_bert_on_emotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): none of the cells visible in this notebook read from or write to
# /content/drive (data and checkpoints live under /content) — confirm the mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Install dependencies
!pip install -q transformers datasets accelerate

In [None]:
!wget -P /content https://github.com/nazaninsbr/Persian-Emotion-Detection/raw/refs/heads/main/dataset.csv

In [1]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Disable W&B
# Set through the environment before any Trainer is built.
# NOTE(review): the TrainingArguments cell also passes report_to="none"; presumably
# either one is sufficient — confirm and keep a single mechanism.
os.environ["WANDB_DISABLED"] = "true"


In [2]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("/content/dataset.csv")

# Emotion label columns (vote counts 0-5)
label_cols = ["Anger", "Fear", "Happiness", "Hatred", "Sadness", "Wonder"]

# Binarize emotions: 1 if >=3 votes (majority), else 0.
# Vectorised comparison on the whole block of columns instead of a row-wise
# Python lambda; the resulting 0/1 integer columns are the same.
df[label_cols] = (df[label_cols].astype(int) >= 3).astype(int)

# Add Neutral: 1 if no emotions have majority (sum of binarized emotions == 0)
df["Neutral"] = (df[label_cols].sum(axis=1) == 0).astype(int)
final_labels = label_cols + ["Neutral"]

# One multi-hot list of Python floats per row, in `final_labels` order.
# Floats are used because the BCE-style losses further down expect float targets.
df["labels"] = pd.Series(
    df[final_labels].to_numpy(dtype=float).tolist(),
    index=df.index,
)

# Keep only text + labels
df = df[["text", "labels"]]

# Train/validation split (fixed seed so the split is reproducible)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Create Hugging Face datasets
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))
dataset = DatasetDict({"train": train_ds, "validation": val_ds})

# Verify: labels must be strictly {0., 1.} and both splits should show
# similar per-label positive rates.
train_labels = np.array(dataset["train"]["labels"])
val_labels = np.array(dataset["validation"]["labels"])
print("Unique training label values:", np.unique(train_labels))
print("Unique validation label values:", np.unique(val_labels))
print("Training label frequencies:", np.sum(train_labels, axis=0) / train_labels.shape[0])
print("Validation label frequencies:", np.sum(val_labels, axis=0) / val_labels.shape[0])

Unique training label values: [0. 1.]
Unique validation label values: [0. 1.]
Training label frequencies: [0.05570833 0.02308333 0.023625   0.042125   0.05891667 0.033625
 0.796125  ]
Validation label frequencies: [0.04916667 0.02266667 0.02083333 0.04083333 0.05933333 0.02983333
 0.80483333]


In [18]:
import numpy as np
# Per-label positive weight for a weighted BCE loss: (#negatives / #positives),
# counted on the training split only.
train_labels_np = np.stack(train_df["labels"].to_numpy())  # shape (N, num_labels)
pos = np.count_nonzero(train_labels_np == 1, axis=0).astype(np.float32)
neg = np.count_nonzero(train_labels_np == 0, axis=0).astype(np.float32)
# A label with no positive example would divide by zero; treat its count as 1.
pos = np.where(pos == 0, 1.0, pos)
pos_weight = neg / pos
print("pos_weight:", pos_weight)  # use this in loss


pos_weight: [16.950636   42.3213     41.32804    22.738873   15.973125   28.739777
  0.25608414]


In [10]:
# Eyeball the first ten multi-hot label vectors of the training split.
for sample_idx in range(10):
  sample = dataset["train"][sample_idx]
  print(sample['labels'])

[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]


In [4]:
# Alternative checkpoint: "google-bert/bert-base-multilingual-cased"
model_name = "HooshvareLab/bert-fa-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize a batch of examples, truncating each text to at most 256 tokens.

    No padding is applied here. The Trainer cell passes the tokenizer to the
    Trainer, which (per the Transformers Trainer documentation) then pads each
    batch to its longest sequence via DataCollatorWithPadding. This avoids storing
    and processing 256 tokens for every short text.
    """
    return tokenizer(batch["text"], truncation=True, max_length=256)

dataset = dataset.map(tokenize, batched=True)
# The "text" column is kept for inspection.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [5]:
model_name = "HooshvareLab/bert-fa-base-uncased"

# The number of outputs and the label names come from `final_labels` (built in the
# data-preparation cell), so the classification head cannot drift from the label
# set and saved checkpoints carry readable label names.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(final_labels),
    id2label={idx: name for idx, name in enumerate(final_labels)},
    label2id={name: idx for idx, name in enumerate(final_labels)},
    # Multi-label: every output is an independent sigmoid/BCE target.
    problem_type="multi_label_classification",
)
# NOTE(review): to experiment with dropout, pass hidden_dropout_prob=... /
# attention_probs_dropout_prob=... to from_pretrained(); assigning to model.config
# after loading presumably does not rebuild the already-constructed dropout layers.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from sklearn.metrics import f1_score, precision_recall_fscore_support, roc_auc_score, accuracy_score

def compute_metrics_simple(eval_pred):
    """Compute multi-label metrics from raw logits for the Trainer.

    Args:
        eval_pred: pair ``(logits, labels)``; both are indexed as 2-D arrays of
            shape (num_examples, num_labels) below.

    Returns:
        dict with ``f1_macro``, ``f1_micro``, one ``f1_label_{i}`` per label and
        ``roc_auc_mean`` (mean ROC-AUC over the labels for which it is defined,
        NaN if it is defined for none).
    """
    import warnings

    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs > 0.5).astype(int)
    labels = labels.astype(int)

    metrics = {
        "f1_macro": float(f1_score(labels, preds, average='macro', zero_division=0)),
        "f1_micro": float(f1_score(labels, preds, average='micro', zero_division=0)),
    }
    per_label = f1_score(labels, preds, average=None, zero_division=0)
    for i, v in enumerate(per_label):
        metrics[f"f1_label_{i}"] = float(v)

    # ROC-AUC is only defined for labels that have both positives and negatives.
    n_examples = labels.shape[0]
    aucs = []
    for i in range(labels.shape[1]):
        n_pos = labels[:, i].sum()
        if not (0 < n_pos < n_examples):
            continue
        try:
            aucs.append(roc_auc_score(labels[:, i], probs[:, i]))
        except ValueError as err:
            # e.g. NaN probabilities after a numerical blow-up: report it instead of
            # silently dropping the whole metric, but keep evaluation running.
            warnings.warn(f"ROC-AUC skipped for label {i}: {err}")
    metrics["roc_auc_mean"] = float(np.mean(aucs)) if aucs else float("nan")
    return metrics

In [7]:
import shutil
# Start from a clean output directory. `ignore_errors=True` keeps this cell from
# raising FileNotFoundError on a fresh runtime where the directory does not exist yet.
shutil.rmtree('/content/bert-persian-emotions', ignore_errors=True)

In [8]:
from transformers import TrainingArguments, DataCollatorWithPadding

# Training configuration, grouped by concern.
args = TrainingArguments(
    # --- where checkpoints go / logging ---
    output_dir="/content/bert-persian-emotions",
    report_to="none",
    # --- evaluation & checkpoint cadence; best checkpoint chosen by macro F1 ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    # --- optimisation ---
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    # --- batch sizes ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # effective batch 16 (instead of 64)
    per_device_eval_batch_size=16,
)

In [None]:
from transformers import Trainer
import torch.nn as nn
from transformers import TrainerCallback

class EarlyStoppingCallback(TrainerCallback):
    """Request a training stop when ``eval_f1_macro`` stops improving.

    An evaluation counts as an improvement only if the metric exceeds the best
    value seen so far by more than ``early_stopping_threshold``. After
    ``early_stopping_patience`` consecutive non-improving evaluations,
    ``control.should_training_stop`` is set to True.
    """

    def __init__(self, early_stopping_patience=2, early_stopping_threshold=0.01):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None      # best eval_f1_macro seen so far
        self.patience_counter = 0    # consecutive evaluations without improvement

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        current = metrics.get("eval_f1_macro")
        if current is None:
            # Metric not reported for this evaluation: nothing to track.
            return

        improved = (
            self.best_metric is None
            or current > self.best_metric + self.early_stopping_threshold
        )
        if improved:
            self.best_metric = current
            self.patience_counter = 0
            return

        self.patience_counter += 1
        if self.patience_counter >= self.early_stopping_patience:
            control.should_training_stop = True

# Focal loss
class FocalLoss(nn.Module):
    """Binary focal loss on logits with symmetric label smoothing.

    Targets are smoothed towards 0.5, the element-wise BCE-with-logits is computed,
    and each element is scaled by ``alpha * (1 - exp(-bce)) ** gamma`` before
    averaging over all elements.
    """

    def __init__(self, alpha=1.0, gamma=3.0, label_smoothing=0.1):
        super().__init__()
        self.alpha, self.gamma, self.label_smoothing = alpha, gamma, label_smoothing

    def forward(self, logits, targets):
        eps = self.label_smoothing
        smoothed = targets * (1 - eps) + 0.5 * eps
        per_element = nn.functional.binary_cross_entropy_with_logits(
            logits, smoothed, reduction='none'
        )
        # exp(-bce) plays the role of the probability assigned to the target.
        pt = torch.exp(-per_element)
        weighted = self.alpha * (1 - pt) ** self.gamma * per_element
        return weighted.mean()

class WeightedTrainer(Trainer):
    """Trainer whose loss is FocalLoss(alpha=1.0, gamma=3.0, label_smoothing=0.1)."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Labels are removed from the inputs so the model does not compute its own loss.
        targets = inputs.pop("labels").float()
        outputs = model(**inputs)
        criterion = FocalLoss(alpha=1.0, gamma=3.0, label_smoothing=0.1)
        loss = criterion(outputs.logits, targets)
        if return_outputs:
            return loss, outputs
        return loss

class WeightedBCETrainer(Trainer):
    """Trainer that uses BCE-with-logits, optionally weighted per label.

    Args:
        pos_weight: optional array-like with one weight per label, forwarded to the
            BCE loss as ``pos_weight``. ``None`` gives the unweighted loss.
        *args, **kwargs: passed unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, pos_weight=None, **kwargs):
        super().__init__(*args, **kwargs)

        if pos_weight is not None:
            # as_tensor accepts lists, numpy arrays and tensors without an extra copy warning.
            self.pos_weight = torch.as_tensor(pos_weight, dtype=torch.float).to(self.args.device)
            print('pos_weight', self.pos_weight)
        else:
            self.pos_weight = None

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Labels are removed from the inputs so the model does not compute its own loss.
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        logits = outputs.logits

        # Align targets and weights with wherever the logits actually live.
        labels = labels.to(logits.device)
        pos_weight = None if self.pos_weight is None else self.pos_weight.to(logits.device)

        loss = nn.functional.binary_cross_entropy_with_logits(
            logits, labels, pos_weight=pos_weight
        )
        return (loss, outputs) if return_outputs else loss


# Build the trainer with the per-label positive weights computed on the training split.
trainer = WeightedBCETrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # `tokenizer=` is deprecated in recent Transformers releases (it triggers the
    # warning seen in this cell's saved output); `processing_class=` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics_simple,
    # NOTE(review): a threshold of 0.1 means an epoch only counts as an improvement
    # if macro-F1 rises by more than 0.1; with patience=2 this will very likely stop
    # training after the third evaluation. The callback's default is 0.01 — confirm
    # that 0.1 is intended.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.1)],
    pos_weight=pos_weight
)


trainer.train()

  super().__init__(*args, **kwargs)


pos_weight tensor([16.9506, 42.3213, 41.3280, 22.7389, 15.9731, 28.7398,  0.2561],
       device='cuda:0')


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,F1 Label 0,F1 Label 1,F1 Label 2,F1 Label 3,F1 Label 4,F1 Label 5,F1 Label 6,Roc Auc Mean
1,0.5077,3.176405,0.180736,0.614403,0.062275,0.055453,0.048128,0.084942,0.110236,0.071823,0.832291,0.533672


In [17]:
# Sanity check: show the positive weights the trainer is actually using.
# NOTE(review): the saved output of this cell is `None`, while the trainer cell's
# saved output shows a populated tensor — the two outputs appear to come from
# different kernel states. Re-run top to bottom before trusting either.
print(trainer.pos_weight)

None


In [12]:
model.eval()
from sklearn.metrics import f1_score

# Get validation logits (use trainer.predict for convenience)
preds_output = trainer.predict(dataset["validation"])
val_logits = preds_output.predictions  # shape (N_val, num_labels)
val_probs = torch.sigmoid(torch.tensor(val_logits)).numpy()
val_labels = preds_output.label_ids.astype(int)

# Search thresholds per label (0.1..0.9 in steps of 0.05); the grid is built once.
candidate_thresholds = np.linspace(0.1, 0.9, 17)
best_thresholds = np.ones(val_labels.shape[1]) * 0.5
for i in range(val_labels.shape[1]):
    best_f1 = -1
    for t in candidate_thresholds:
        p = (val_probs[:, i] > t).astype(int)
        f1 = f1_score(val_labels[:, i], p, zero_division=0)
        # Strict '>' keeps the lowest threshold among ties.
        if f1 > best_f1:
            best_f1 = f1
            best_thresholds[i] = t
print("Best thresholds:", best_thresholds)

# Compute final metrics with these thresholds.
# NOTE: thresholds are tuned and scored on the same validation split, so these
# numbers are optimistic; a held-out test split is needed for an unbiased estimate.
final_preds = (val_probs > best_thresholds[None, :]).astype(int)
print("Final macro F1:", f1_score(val_labels, final_preds, average='macro', zero_division=0))
print("Final micro F1:", f1_score(val_labels, final_preds, average='micro', zero_division=0))

Best thresholds: [0.1 0.1 0.1 0.1 0.1 0.1 0.1]
Final macro F1: 0.146064090074684
Final micro F1: 0.7387846961740435


In [None]:
# Qualitative check on a few hand-written sentences.
texts = [
    "من امروز خیلی خوشحالم",      # Happy
    "احساس می‌کنم ناراحت و خسته‌ام", # Sad
    "از تاریکی می‌ترسم",           # Fear
    "قدم زدن زیر بارون شاید بهترین مسکن درد هاست..." # Neutral
]

model.eval()  # disable dropout for inference
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
# After training the model sits on the training device (GPU on Colab); the inputs
# must be moved there too, otherwise the forward pass fails with a device mismatch.
inputs = inputs.to(model.device)
with torch.no_grad():  # no gradients needed for prediction
    outputs = model(**inputs)
# Bring the result back to the CPU before converting to numpy.
probs = torch.sigmoid(outputs.logits).cpu().numpy()

for text, p in zip(texts, probs):
    # Report every label whose probability exceeds 0.5.
    labels_pred = [final_labels[i] for i, v in enumerate(p) if v > 0.5]
    print(text, "->", labels_pred)


In [None]:
import numpy as np
from sklearn.metrics import f1_score, classification_report

# Predict on the validation split and threshold the sigmoid probabilities at 0.5.
preds_logits = trainer.predict(dataset["validation"]).predictions  # raw logits
probs = 1 / (1 + np.exp(-preds_logits))
preds = (probs > 0.5).astype(int)

# Reference labels, binarized at 0.5 (soft labels → 0/1).
refs = np.stack([ex["labels"] for ex in dataset["validation"]])
refs_bin = (refs >= 0.5).astype(int)

# Class balance of the references.
pos_counts = refs_bin.sum(axis=0)
neg_counts = len(refs_bin) - pos_counts
print("pos counts per label:", pos_counts)
print("neg counts per label:", neg_counts)

# F1 for each label separately.
for i, name in enumerate(final_labels):
    label_f1 = f1_score(refs_bin[:, i], preds[:, i], zero_division=0)
    print(name, "F1:", label_f1)

# Precision / recall / F1 for every label in one table.
print(classification_report(refs_bin, preds, zero_division=0))


In [None]:
import numpy as np

# Convert Hugging Face dataset into numpy array of labels
all_labels = np.stack(dataset["train"]["labels"])  # shape: (num_samples, num_labels)

# Count positives and negatives per label
pos_counts = all_labels.sum(axis=0)
neg_counts = all_labels.shape[0] - pos_counts

print("pos counts per label:", pos_counts)
print("neg counts per label:", neg_counts)

# Mapping to label names. The loop variables are deliberately not called `pos`/`neg`:
# those names hold the per-label count arrays built in the pos_weight cell, and a
# loop here would silently replace them with scalars.
for name, n_pos, n_neg in zip(final_labels, pos_counts, neg_counts):
    print(f"{name:10s} | pos: {int(n_pos):5d} | neg: {int(n_neg):5d}")

# NOTE(review): the saved output of this cell shows fractional counts, which does
# not match the strictly 0/1 labels reported by the data-preparation cell — it
# appears to come from a different kernel state. Re-run top to bottom.
val_labels = np.array(dataset["validation"]["labels"])
print("Validation label frequencies:", np.sum(val_labels, axis=0) / val_labels.shape[0])

pos counts per label: [4435.79776857 3377.90284005 3411.00137803 4083.14986549 4608.76150663
 3826.38664122  257.        ]
neg counts per label: [19564.20223143 20622.09715995 20588.99862197 19916.85013451
 19391.23849337 20173.61335878 23743.        ]
Anger      | pos:  4435 | neg: 19564
Fear       | pos:  3377 | neg: 20622
Happiness  | pos:  3411 | neg: 20588
Hatred     | pos:  4083 | neg: 19916
Sadness    | pos:  4608 | neg: 19391
Wonder     | pos:  3826 | neg: 20173
Neutral    | pos:   257 | neg: 23743
Validation label frequencies: [0.16867392 0.14521033 0.14398886 0.1701612  0.18877742 0.16365703
 0.01953125]
