<a href="https://colab.research.google.com/github/kamranr123/kamranr123.github.io/blob/master/fine_tune_multilingual_bert_on_emotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Install dependencies
!pip install -q transformers datasets accelerate

In [2]:
!wget -P /content https://github.com/nazaninsbr/Persian-Emotion-Detection/raw/refs/heads/main/dataset.csv

--2025-09-21 12:31:38--  https://github.com/nazaninsbr/Persian-Emotion-Detection/raw/refs/heads/main/dataset.csv
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/nazaninsbr/Persian-Emotion-Detection/refs/heads/main/dataset.csv [following]
--2025-09-21 12:31:39--  https://raw.githubusercontent.com/nazaninsbr/Persian-Emotion-Detection/refs/heads/main/dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6754866 (6.4M) [application/octet-stream]
Saving to: ‘/content/dataset.csv’


2025-09-21 12:31:40 (105 MB/s) - ‘/content/dataset.csv’ saved [6754866/6754866]



In [3]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Disable W&B
os.environ["WANDB_DISABLED"] = "true"


In [15]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("/content/dataset.csv")

# Emotion label columns (vote counts 0-5)
label_cols = ["Anger", "Fear", "Happiness", "Hatred", "Sadness", "Wonder"]

# Add Neutral label
# df["Neutral"] = df[label_cols].apply(lambda row: 1 if row.max() < 2 else 0, axis=1).astype(int)

# Binarize emotions: 1 if >=2 votes (majority), else 0
df[label_cols] = df[label_cols].apply(lambda row: [1 if int(x) >= 2 else 0 for x in row], axis=1, result_type='expand')


# Add Neutral: 1 if no emotions have majority (sum of binarized emotions == 0)
df["Neutral"] = (df[label_cols].sum(axis=1) == 0).astype(int)

final_labels = label_cols + ["Neutral"]
df["labels"] = df[final_labels].apply(lambda row: [float(x) for x in row], axis=1)

# Keep only text + labels
df = df[["text", "labels"]]

# Train/validation split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Create Hugging Face datasets
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))
dataset = DatasetDict({"train": train_ds, "validation": val_ds})

# Verify
train_labels = np.array(dataset["train"]["labels"])
val_labels = np.array(dataset["validation"]["labels"])
print("Unique training label values:", np.unique(train_labels))
print("Unique validation label values:", np.unique(val_labels))
print("Training label frequencies:", np.sum(train_labels, axis=0) / train_labels.shape[0])
print("Validation label frequencies:", np.sum(val_labels, axis=0) / val_labels.shape[0])

Unique training label values: [0. 1.]
Unique validation label values: [0. 1.]
Training label frequencies: [0.22345833 0.14141667 0.13145833 0.19895833 0.24154167 0.17170833
 0.32766667]
Validation label frequencies: [0.22133333 0.1395     0.12733333 0.1905     0.24166667 0.16566667
 0.33616667]


In [23]:
import numpy as np
train_labels_np = np.stack(train_df["labels"].values)  # shape (N, num_labels)
neg = (train_labels_np == 0).sum(axis=0).astype(np.float32)
pos = (train_labels_np == 1).sum(axis=0).astype(np.float32)
# Avoid division by zero
pos = np.where(pos == 0, 1.0, pos)
pos_weight = neg / pos
print("pos_weight:", pos_weight)  # use this in loss


pos_weight: [3.4751072 6.0713024 6.606973  4.026178  3.1400723 4.823829  2.051882 ]


In [None]:
for i in range(10):
  print(dataset["train"][i]['labels'])

In [39]:
# model_name = "google-bert/bert-base-multilingual-cased"
# model_name = "HooshvareLab/bert-fa-base-uncased"
model_name = "/content/bert-persian-emotions/checkpoint-10500"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

dataset = dataset.map(tokenize, batched=True)
# dataset = dataset.remove_columns(["text"])


Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [40]:
from transformers import AutoConfig, AutoModelForSequenceClassification

num_labels = 7  # your 6 emotions + Neutral

config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    hidden_dropout_prob=0.3,   # dropout in transformer layers
    attention_probs_dropout_prob=0.3  # dropout in attention
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config
)

In [31]:
from sklearn.metrics import f1_score, precision_recall_fscore_support, roc_auc_score, accuracy_score
from sklearn.metrics import precision_recall_curve

def find_best_thresholds(y_true, y_probs):
    thresholds = []
    for i in range(y_true.shape[1]):
        precision, recall, th = precision_recall_curve(y_true[:, i], y_probs[:, i])
        f1 = 2 * precision * recall / (precision + recall + 1e-6)
        thresholds.append(th[f1.argmax()])
    return thresholds

def compute_metrics_simple(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()

    best_thresholds = find_best_thresholds(labels, probs)
    print("Best thresholds per class:", best_thresholds)

    # preds = (probs > 0.5).astype(int)
    preds = (probs > best_thresholds).astype(int)

    labels = labels.astype(int)
    f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
    f1_micro = f1_score(labels, preds, average='micro', zero_division=0)
    per_label = f1_score(labels, preds, average=None, zero_division=0)
    metrics = {"f1_macro": f1_macro, "f1_micro": f1_micro}
    for i, v in enumerate(per_label):
        metrics[f"f1_label_{i}"] = float(v)
    # ROC-AUC per label (only if label has positives)
    try:
        aucs = []
        for i in range(labels.shape[1]):
            if labels[:, i].sum() > 0 and labels[:, i].sum() < labels.shape[0]:
                aucs.append(roc_auc_score(labels[:, i], probs[:, i]))
            else:
                aucs.append(float("nan"))
        metrics["roc_auc_mean"] = np.nanmean(aucs)
    except Exception:
        pass
    return metrics

In [30]:
import shutil
shutil.rmtree('/content/bert-persian-emotions')

In [32]:
from transformers import TrainingArguments, DataCollatorWithPadding

args = TrainingArguments(
    output_dir="/content/bert-persian-emotions",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # effective batch 16 (instead of 64)
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=True,
    report_to="none"
)

In [33]:
from transformers import Trainer
import torch.nn as nn
from transformers import TrainerCallback
import torch.nn.functional as F

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=2, early_stopping_threshold=0.01):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        metric_value = metrics.get("eval_f1_macro")
        if metric_value is None:
            return
        if self.best_metric is None or metric_value > self.best_metric + self.early_stopping_threshold:
            self.best_metric = metric_value
            self.patience_counter = 0
        else:
            self.patience_counter += 1
            if self.patience_counter >= self.early_stopping_patience:
                control.should_training_stop = True

# Focal loss
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-bce)
        focal_loss = (1 - pt) ** self.gamma * bce
        if self.alpha is not None:
            focal_loss = self.alpha * focal_loss
        if self.reduction == 'mean':
            return focal_loss.mean()
        else:
            return focal_loss.sum()

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = FocalLoss(alpha=1.0, gamma=3.0, label_smoothing=0.1)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

class WeightedBCETrainer(Trainer):
    def __init__(self, *args, pos_weight=None, **kwargs):
        super().__init__(*args, **kwargs)

        if pos_weight is not None:
            self.register_buffer = None
            self.pos_weight = torch.tensor(pos_weight, dtype=torch.float).to(self.args.device)
        else:
            self.pos_weight = None

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels").float().to(self.args.device)
        outputs = model(**inputs)
        logits = outputs.logits
        # if self.pos_weight is not None:
        #     loss_fct = nn.BCEWithLogitsLoss(pos_weight=self.pos_weight)
        # else:
        #     loss_fct = nn.BCEWithLogitsLoss()
        loss_fct = FocalLoss(alpha=self.pos_weight, gamma=2)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss


trainer = WeightedBCETrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_simple,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.1)],
    pos_weight=pos_weight
)


trainer.train()

  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,F1 Label 0,F1 Label 1,F1 Label 2,F1 Label 3,F1 Label 4,F1 Label 5,F1 Label 6,Roc Auc Mean
1,0.533,0.523442,0.213282,0.343207,0.361522,0.0,0.21796,0.005208,0.389397,0.016453,0.502435,0.499759
2,0.5332,0.520693,0.308181,0.333066,0.320178,0.221461,0.22564,0.259344,0.375392,0.251814,0.503435,0.524233
3,0.5283,0.523376,0.309035,0.332186,0.317487,0.232914,0.225965,0.309493,0.354006,0.220199,0.503182,0.533597
4,0.5261,0.519961,0.331806,0.343131,0.362451,0.250439,0.226066,0.314395,0.38704,0.27949,0.502763,0.539327
5,0.5262,0.519993,0.329652,0.343368,0.343861,0.243602,0.226269,0.319562,0.389084,0.28003,0.505157,0.541734
6,0.5231,0.518606,0.332536,0.343147,0.352128,0.253048,0.225706,0.318938,0.388814,0.2821,0.507017,0.543446
7,0.5261,0.518828,0.332069,0.341406,0.348728,0.25552,0.224925,0.319495,0.387833,0.281633,0.506349,0.545033
8,0.5252,0.518771,0.334206,0.343736,0.362147,0.25641,0.225673,0.319112,0.387735,0.282577,0.505786,0.54326


Best thresholds per class: [np.float32(0.40486038), np.float32(0.30673468), np.float32(0.33764017), np.float32(0.3619537), np.float32(0.41117), np.float32(0.3589146), np.float32(0.45048025)]
Best thresholds per class: [np.float32(0.37759805), np.float32(0.32917604), np.float32(0.32124704), np.float32(0.36965632), np.float32(0.39987022), np.float32(0.3628563), np.float32(0.43002766)]
Best thresholds per class: [np.float32(0.37914848), np.float32(0.31184584), np.float32(0.31605253), np.float32(0.33709443), np.float32(0.36590973), np.float32(0.34609914), np.float32(0.42536652)]
Best thresholds per class: [np.float32(0.35835305), np.float32(0.3181673), np.float32(0.33622208), np.float32(0.35902697), np.float32(0.38554814), np.float32(0.34986588), np.float32(0.4208971)]
Best thresholds per class: [np.float32(0.36274344), np.float32(0.33187717), np.float32(0.32199275), np.float32(0.35420963), np.float32(0.37513402), np.float32(0.33622208), np.float32(0.40721554)]
Best thresholds per class: [

TrainOutput(global_step=12000, training_loss=0.5279533106486003, metrics={'train_runtime': 3436.6009, 'train_samples_per_second': 55.869, 'train_steps_per_second': 3.492, 'total_flos': 2.52597952512e+16, 'train_loss': 0.5279533106486003, 'epoch': 8.0})

In [None]:
print(trainer.pos_weight)

None


In [34]:
model.eval()
from sklearn.metrics import f1_score

# Get validation logits (use trainer.predict for convenience)
preds_output = trainer.predict(dataset["validation"])
val_logits = preds_output.predictions  # shape (N_val, num_labels)
val_probs = torch.sigmoid(torch.tensor(val_logits)).numpy()
val_labels = preds_output.label_ids.astype(int)

# Search thresholds per label (0.1..0.9)
best_thresholds = np.ones(val_labels.shape[1]) * 0.5
for i in range(val_labels.shape[1]):
    best_f1 = -1
    for t in np.linspace(0.1, 0.9, 17):
        p = (val_probs[:, i] > t).astype(int)
        f1 = f1_score(val_labels[:, i], p, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thresholds[i] = t
print("Best thresholds:", best_thresholds)

# Compute final metrics with these thresholds
final_preds = (val_probs > best_thresholds[None, :]).astype(int)
print("Final macro F1:", f1_score(val_labels, final_preds, average='macro'))
print("Final micro F1:", f1_score(val_labels, final_preds, average='micro'))

Best thresholds per class: [np.float32(0.37216288), np.float32(0.3165805), np.float32(0.32541335), np.float32(0.35779187), np.float32(0.380241), np.float32(0.34477428), np.float32(0.3859531)]
Best thresholds: [0.1 0.1 0.1 0.1 0.1 0.1 0.1]
Final macro F1: 0.332844457232974
Final micro F1: 0.337719905804128


In [43]:
texts = [
    "من امروز خیلی خوشحالم",      # Happy
    "احساس می‌کنم ناراحت و خسته‌ام", # Sad
    "از تاریکی می‌ترسم",           # Fear
    "قدم زدن زیر بارون شاید بهترین مسکن درد هاست..." # Neutral
]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {k: v.to(device) for k, v in inputs.items()}
outputs = model(**inputs)
probs = torch.sigmoid(outputs.logits).detach().cpu().numpy()

for text, p in zip(texts, probs):
    labels_pred = [final_labels[i] for i, v in enumerate(p) if v > 0.5]
    print(text, "->", labels_pred)


من امروز خیلی خوشحالم -> []
احساس می‌کنم ناراحت و خسته‌ام -> []
از تاریکی می‌ترسم -> []
قدم زدن زیر بارون شاید بهترین مسکن درد هاست... -> []


In [None]:
import numpy as np
from sklearn.metrics import f1_score, classification_report

# collect true labels and preds on validation set
preds_logits = trainer.predict(dataset["validation"]).predictions  # raw logits
probs = 1 / (1 + np.exp(-preds_logits))
preds = (probs > 0.5).astype(int)

# binarize references (soft labels → 0/1)
refs = np.stack([ex["labels"] for ex in dataset["validation"]])
refs_bin = (refs >= 0.5).astype(int)

# overall per-label counts
pos_counts = refs_bin.sum(axis=0)
neg_counts = refs_bin.shape[0] - pos_counts
print("pos counts per label:", pos_counts)
print("neg counts per label:", neg_counts)

# per-label f1
for i, name in enumerate(final_labels):
    print(name, "F1:", f1_score(refs_bin[:, i], preds[:, i], zero_division=0))

# full classification report
print(classification_report(refs_bin, preds, zero_division=0))


In [16]:
import numpy as np

# Convert Hugging Face dataset into numpy array of labels
all_labels = np.stack(dataset["train"]["labels"])  # shape: (num_samples, num_labels)

# Count positives and negatives per label
pos_counts = all_labels.sum(axis=0)
neg_counts = all_labels.shape[0] - pos_counts

print("pos counts per label:", pos_counts)
print("neg counts per label:", neg_counts)

# If you want mapping to label names:
for name, pos, neg in zip(final_labels, pos_counts, neg_counts):
    print(f"{name:10s} | pos: {int(pos):5d} | neg: {int(neg):5d}")

val_labels = np.array(dataset["validation"]["labels"])
print("Validation label frequencies:", np.sum(val_labels, axis=0) / val_labels.shape[0])

pos counts per label: [5363. 3394. 3155. 4775. 5797. 4121. 7864.]
neg counts per label: [18637. 20606. 20845. 19225. 18203. 19879. 16136.]
Anger      | pos:  5363 | neg: 18637
Fear       | pos:  3394 | neg: 20606
Happiness  | pos:  3155 | neg: 20845
Hatred     | pos:  4775 | neg: 19225
Sadness    | pos:  5797 | neg: 18203
Wonder     | pos:  4121 | neg: 19879
Neutral    | pos:  7864 | neg: 16136
Validation label frequencies: [0.22133333 0.1395     0.12733333 0.1905     0.24166667 0.16566667
 0.33616667]
