In [49]:
import os
import warnings

from dotenv import load_dotenv

import torch
import evaluate
import numpy as np

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    top_k_accuracy_score,
)
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from huggingface_hub import login
import wandb

DATASET_ID = "minhleduc/multilang-classify-dataset-02"
MODEL_NAMES = {
    "xlm-roberta": "FacebookAI/xlm-roberta-base",
    "mbert": "google-bert/bert-base-multilingual-cased",
    "distilbert-multilingual": "distilbert/distilbert-base-multilingual-cased",
    "deberta-v3": "microsoft/mdeberta-v3-base",
    "rembert": "google/rembert",
}


load_dotenv()
login(token=os.getenv("HF_TOKEN"))
wandb.login(key=os.getenv("WANDB_API_KEY"),
                )

warnings.filterwarnings("ignore")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/octoopt/.netrc
[34m[1mwandb[0m: Currently logged in as: [33moctoopt[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [20]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {DEVICE}")

Using device: cuda


# Dataset

In [7]:
# Load the dataset
dataset = load_dataset(DATASET_ID)
print("Dataset structure:")
print(dataset)
print("\nDataset info:")
print(f"Number of splits: {len(dataset)}")
for split_name, split_data in dataset.items():
    print(f"{split_name}: {len(split_data)} examples")

# Show first few examples
print("\nFirst 3 examples from train split:")
for i in range(min(3, len(dataset["train"]))):
    print(f"Example {i + 1}:")
    example = dataset["train"][i]
    for key, value in example.items():
        if isinstance(value, str) and len(value) > 100:
            print(f"  {key}: {value[:100]}...")
        else:
            print(f"  {key}: {value}")
    print()

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['Language', 'Text'],
        num_rows: 83358
    })
    validation: Dataset({
        features: ['Language', 'Text'],
        num_rows: 17862
    })
    test: Dataset({
        features: ['Language', 'Text'],
        num_rows: 17863
    })
})

Dataset info:
Number of splits: 3
train: 83358 examples
validation: 17862 examples
test: 17863 examples

First 3 examples from train split:
Example 1:
  Language: 28
  Text: wa wa nasaha walevi could ya wagonjwa wa wamelewa wamelewa

Example 2:
  Language: 35
  Text: vĩnh viễn quảng cáo tỷ lệ tăng trưởng gdp sẽ yêu cầu bao giờ tăng các lượt chia sẻ tương đối của lưu...

Example 3:
  Language: 33
  Text: özellikle şehrin en ünlü sakinleri olan geyşa nın son eğitim merkezi olarak ünlü



In [8]:
train_ds = dataset["train"]
val_ds = dataset["validation"]
test_ds = dataset["test"]

In [39]:
code_to_index = {
    0: {"code": "ar", "name": "Arabic"},
    1: {"code": "bg", "name": "Bulgarian"},
    2: {"code": "zh-cn", "name": "Chinese (Simplified)"},
    3: {"code": "da", "name": "Danish"},
    4: {"code": "nl", "name": "Dutch"},
    5: {"code": "en", "name": "English"},
    6: {"code": "et", "name": "Estonian"},
    7: {"code": "fr", "name": "French"},
    8: {"code": "de", "name": "German"},
    9: {"code": "el", "name": "Greek"},
    10: {"code": "hi", "name": "Hindi"},
    11: {"code": "id", "name": "Indonesian"},
    12: {"code": "it", "name": "Italian"},
    13: {"code": "ja", "name": "Japanese"},
    14: {"code": "kn", "name": "Kannada"},
    15: {"code": "ko", "name": "Korean"},
    16: {"code": "it", "name": "Italian"},
    17: {"code": "ml", "name": "Malayalam"},
    18: {"code": "el", "name": "Greek"},
    19: {"code": "fa", "name": "Persian"},
    20: {"code": "pl", "name": "Polish"},
    21: {"code": "pt", "name": "Portuguese"},
    22: {"code": "pt", "name": "Portuguese"},
    23: {"code": "pt", "name": "Portuguese"},
    24: {"code": "fa", "name": "Persian"},
    25: {"code": "ro", "name": "Romanian"},
    26: {"code": "ru", "name": "Russian"},
    27: {"code": "es", "name": "Spanish"},
    28: {"code": "sw", "name": "Swahili"},
    29: {"code": "sv", "name": "Swedish"},
    30: {"code": "sv", "name": "Swedish"},
    31: {"code": "ta", "name": "Tamil"},
    32: {"code": "th", "name": "Thai"},
    33: {"code": "tr", "name": "Turkish"},
    34: {"code": "ur", "name": "Urdu"},
    35: {"code": "vi", "name": "Vietnamese"},
}

In [40]:
def index2label(code_dict: dict):
    return {k: v["name"] for k, v in code_dict.items()}


def label2index(code_dict: dict):
    return {v["name"]: k for k, v in code_dict.items()}


LABEL_TO_INDEX = label2index(code_to_index)
INDEX_TO_LABEL = index2label(code_to_index)

In [41]:
LABEL_TO_INDEX

{'Arabic': 0,
 'Bulgarian': 1,
 'Chinese (Simplified)': 2,
 'Danish': 3,
 'Dutch': 4,
 'English': 5,
 'Estonian': 6,
 'French': 7,
 'German': 8,
 'Greek': 18,
 'Hindi': 10,
 'Indonesian': 11,
 'Italian': 16,
 'Japanese': 13,
 'Kannada': 14,
 'Korean': 15,
 'Malayalam': 17,
 'Persian': 24,
 'Polish': 20,
 'Portuguese': 23,
 'Romanian': 25,
 'Russian': 26,
 'Spanish': 27,
 'Swahili': 28,
 'Swedish': 30,
 'Tamil': 31,
 'Thai': 32,
 'Turkish': 33,
 'Urdu': 34,
 'Vietnamese': 35}

In [42]:
INDEX_TO_LABEL

{0: 'Arabic',
 1: 'Bulgarian',
 2: 'Chinese (Simplified)',
 3: 'Danish',
 4: 'Dutch',
 5: 'English',
 6: 'Estonian',
 7: 'French',
 8: 'German',
 9: 'Greek',
 10: 'Hindi',
 11: 'Indonesian',
 12: 'Italian',
 13: 'Japanese',
 14: 'Kannada',
 15: 'Korean',
 16: 'Italian',
 17: 'Malayalam',
 18: 'Greek',
 19: 'Persian',
 20: 'Polish',
 21: 'Portuguese',
 22: 'Portuguese',
 23: 'Portuguese',
 24: 'Persian',
 25: 'Romanian',
 26: 'Russian',
 27: 'Spanish',
 28: 'Swahili',
 29: 'Swedish',
 30: 'Swedish',
 31: 'Tamil',
 32: 'Thai',
 33: 'Turkish',
 34: 'Urdu',
 35: 'Vietnamese'}

In [13]:
train_ds[2]

{'Language': 33,
 'Text': 'özellikle şehrin en ünlü sakinleri olan geyşa nın son eğitim merkezi olarak ünlü'}

# Experiments

In [43]:
model_id = MODEL_NAMES["xlm-roberta"]

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(code_to_index),
    id2label=INDEX_TO_LABEL,
    label2id=LABEL_TO_INDEX,
).to(DEVICE)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
model

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

## Preprocess dataset

In [17]:
def preprocessing(sample):
    return tokenizer(sample["Text"], truncation=True)

In [18]:
tokenized_dataset = dataset.map(preprocessing, batched=True)
tokenized_dataset

Map: 100%|██████████| 83358/83358 [00:04<00:00, 16866.69 examples/s]
Map: 100%|██████████| 17862/17862 [00:01<00:00, 14543.88 examples/s]
Map: 100%|██████████| 17863/17863 [00:01<00:00, 14811.11 examples/s]


DatasetDict({
    train: Dataset({
        features: ['Language', 'Text', 'input_ids', 'attention_mask'],
        num_rows: 83358
    })
    validation: Dataset({
        features: ['Language', 'Text', 'input_ids', 'attention_mask'],
        num_rows: 17862
    })
    test: Dataset({
        features: ['Language', 'Text', 'input_ids', 'attention_mask'],
        num_rows: 17863
    })
})

In [19]:
train_preprocessed = tokenized_dataset["train"]
val_preprocessed = tokenized_dataset["validation"]
test_preprocessed = tokenized_dataset["test"]

train_preprocessed[0]

{'Language': 28,
 'Text': 'wa wa nasaha walevi could ya wagonjwa wa wamelewa wamelewa',
 'input_ids': [0,
  259,
  259,
  22182,
  528,
  259,
  94201,
  5809,
  151,
  218761,
  259,
  259,
  39,
  79870,
  259,
  39,
  79870,
  2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [47]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [54]:
# Model Evaluation Setup

"""
            Predicted
            Yes    No
Actual  Yes  TP   FN  <- Recall = TP/(TP+FN)
        No   FP   TN
            ↑
        Precision = TP/(TP+FP)
"""
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

Downloading builder script: 4.20kB [00:00, 17.2MB/s]
Downloading builder script: 6.79kB [00:00, 3.25MB/s]
Downloading builder script: 7.56kB [00:00, 5.47MB/s]
Downloading builder script: 7.38kB [00:00, 3.86MB/s]


In [63]:
BATCH_SIZE = 16 
EPOCHS = 3  
WEIGHT_DECAY = 0.01
LEARNING_RATE = 2e-5
LOGGING_STEPS = 10

training_args = TrainingArguments(
    output_dir="../results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    report_to="wandb",
    logging_dir="../logs",
    logging_strategy="epoch",
    logging_steps=LOGGING_STEPS,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    run_name="multilang-classify-xlm-roberta-base",

)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,

    compute_metrics=clf_metrics,
)

In [None]:
trainer.train()

## Evaluation

In [None]:
# Before training
from copy import deepcopy

bf_model = deepcopy(model)

bf_model.to(DEVICE)

bf_model.eval()

with torch.no_grad():
    bf_predictions = bf_model.generate(
        **tokenized_dataset["test"],
    )

In [None]:
# After training

predictions = trainer.predict(tokenized_dataset["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [55]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Basic metrics
    accuracy = accuracy_score(labels, predictions)

    # F1 scores
    f1_macro = f1_score(labels, predictions, average="macro")
    f1_micro = f1_score(labels, predictions, average="micro")
    f1_weighted = f1_score(labels, predictions, average="weighted")

    # Precision & Recall
    precision_macro = precision_score(labels, predictions, average="macro")
    recall_macro = recall_score(labels, predictions, average="macro")

    # Top-k accuracy
    top3_acc = top_k_accuracy_score(labels, eval_pred[0], k=3)
    top5_acc = top_k_accuracy_score(labels, eval_pred[0], k=5)

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "f1_weighted": f1_weighted,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "top3_accuracy": top3_acc,
        "top5_accuracy": top5_acc,
    }


# For detailed analysis after training:
def detailed_evaluation(y_true, y_pred, y_pred_proba, class_names):
    # Classification report
    report = classification_report(
        y_true, y_pred, target_names=class_names, output_dict=True
    )

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Per-class F1 scores
    per_class_f1 = f1_score(y_true, y_pred, average=None)

    return {
        "classification_report": report,
        "confusion_matrix": cm,
        "per_class_f1": per_class_f1,
    }

### Note on DataCollatorWithPadding

`DataCollatorWithPadding` in Hugging Face is a utility that handles **dynamic padding** of tokenized sequences in a batch during training or inference. Here's what it does:

#### Key Functions:

1. **Dynamic Batching**: When you have sequences of different lengths in a batch, it pads shorter sequences to match the longest sequence in that specific batch (not a global maximum)

2. **Efficient Memory Usage**: Instead of padding all sequences to a fixed maximum length, it only pads to the longest sequence in each batch, saving memory and computation

3. **Automatic Padding Token**: Uses the tokenizer's padding token (usually `[PAD]` or `<pad>`) to fill shorter sequences

4. **Attention Mask Handling**: Automatically creates or updates attention masks to ignore padded tokens during model computation

#### Example:
```python
# Without padding - sequences have different lengths:
batch = [
    [1, 2, 3],           # length 3
    [4, 5, 6, 7, 8],     # length 5  
    [9, 10]              # length 2
]

# After DataCollatorWithPadding:
padded_batch = [
    [1, 2, 3, 0, 0],     # padded to length 5
    [4, 5, 6, 7, 8],     # already length 5
    [9, 10, 0, 0, 0]     # padded to length 5
]
# + corresponding attention masks: [1,1,1,0,0], [1,1,1,1,1], [1,1,0,0,0]
```

#### In our Context:
In this multilingual classification notebook, it ensures that when training batches are created from our tokenized text data (which have varying lengths), they're properly padded so the model can process them efficiently as tensors of uniform shape.