In [1]:
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import json

In [2]:
# Load CoNLL Data
def load_conll_data(filepath):
    """Read a two-column CoNLL file into a Hugging Face ``Dataset``.

    Every non-blank line is expected to contain ``<token> <tag>`` separated
    by whitespace. Blank lines mark sentence boundaries. Lines that do not
    split into exactly two fields are skipped.

    Args:
        filepath: Path to a UTF-8 encoded CoNLL file.

    Returns:
        The result of ``Dataset.from_dict`` with two columns: ``tokens``
        (list of words per sentence) and ``ner_tags`` (list of tag strings
        per sentence, aligned with ``tokens``).
    """
    tokens, labels = [], []
    words, tags = [], []
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line: close the current sentence, if any.
                if words:
                    tokens.append(words)
                    labels.append(tags)
                    words, tags = [], []
                continue
            splits = line.split()
            if len(splits) == 2:
                words.append(splits[0])
                tags.append(splits[1])

    # Files frequently end without a trailing blank line; flush the sentence
    # that is still being accumulated so it is not silently lost.
    if words:
        tokens.append(words)
        labels.append(tags)

    return Dataset.from_dict({"tokens": tokens, "ner_tags": labels})

raw_dataset = load_conll_data("../data/labeled/ner_sample.conll")

# Split once: train_test_split already returns a DatasetDict keyed by
# "train" and "test", so both partitions come from the same seeded shuffle
# and the split is not computed twice.
dataset = raw_dataset.train_test_split(test_size=0.2, seed=42)

In [3]:

# Label Mapping
# Collect tags from every split, not only "train": a tag that occurs solely in
# the test split would otherwise be absent from label_to_id and raise KeyError
# when the labels are aligned.
label_list = sorted({
    tag
    for split in dataset.values()
    for tags in split["ner_tags"]
    for tag in tags
})
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Tokenizer & Alignment
model_checkpoint = "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples, label_all_tokens=False):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: A batch with ``tokens`` (list of word lists) and
            ``ner_tags`` (list of tag-string lists aligned with ``tokens``).
        label_all_tokens: When False (default) only the first sub-token of
            each word carries the word's label and the remaining sub-tokens
            are set to -100, so a word split into several pieces is counted
            once. When True every sub-token repeats the word's label.

    Returns:
        The tokenizer output with an added ``labels`` entry holding one list
        of label ids per sentence. Positions without a word (``word_ids``
        returns ``None``) are always -100, the sentinel that
        ``compute_metrics`` filters out.

    Raises:
        KeyError: If a tag is not present in ``label_to_id``.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    for i, word_labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Position not tied to any input word.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-token of a word (or every sub-token on request).
                label_ids.append(label_to_id[word_labels[word_idx]])
            else:
                # Continuation sub-token: mask it so it is not scored twice.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run tokenization and label alignment over every split, one batch at a time.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [4]:
# Sanity checks: first raw example, the tags present in the whole corpus,
# and the sorted label list used to build the id mappings.
print("Sample dataset entry:", raw_dataset[0])
print(
    "Unique labels found:",
    {tag for sentence_tags in raw_dataset["ner_tags"] for tag in sentence_tags},
)
print("Label list:", label_list)


Sample dataset entry: {'tokens': ['👋', 'BARDEFU', '2', 'IN', '1', 'Multi', 'purpose', 'juicer', '👉', 'ኳሊቲ', 'የሆነ', 'የጁስ', 'መፍጫ', '👉', 'የጀርመን', 'ቴክኖሎጂ', 'የሆነ', '👉', '3', 'ሌትር', 'ጁስ', 'የሚፈጭ', 'ጆግ', 'ያለው', '👉', 'የብና', 'እና', 'የቅመማ', 'ቅመም', 'መፍጫ', 'ያለው', '👉', '8000Watt', 'የሆነ', '👉', 'ምላጮቹ', 'ጠንካራ', 'የሆኑ', '👉', 'ለቤት', 'እንዲሁም', 'ለስራ', 'የሚሆን', 'አሪፍ', 'እቃ', '👉', 'ለአጠቃቀም', 'ቀላል', '👉', 'በረዶ', 'ይፈጫል', 'ዋጋ:-6800ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለን/', 'Limited', 'Stock', '🏢', 'አድራሻ', 'ቁ.1', '👉', 'መገናኛ', 'ታሜ', 'ጋስ', 'ህንፃ', 'ጎን', 'ስሪ', 'ኤም', 'ሲቲ', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ.', 'SL-05A(ከ', 'ሊፍቱ', 'ፊት', 'ለ', 'ፊት)', '📍ቁ.2', '👉ለቡ', 'መዳህኒዓለም', 'ቤተ/ክርስቲያን', 'ወደ', 'ሙዚቃ', 'ቤት', 'ከፍ', 'ብሎ', '#ዛም_ሞል', '2ኛ', 'ፎቅ', 'ቢሮ.ቁ', '214', '📲', '0909522840', '📲', '0923350054', '👍ለቡ', 'ቅርንጫፍ📲0973611819', '🔖', '💬', 'በTelegram', 'ለማዘዝ', '⤵️', 'ይጠቀሙ', '@shager_onlinestore', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን⤵️', 'https://t.me/Shageronlinestore'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [5]:
# Load model
# The classification head is sized to our own label inventory. When that size
# differs from the checkpoint's head, ignore_mismatched_sizes lets the load
# proceed and the head weights are freshly initialised.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    ignore_mismatched_sizes=True,
    label2id=label_to_id,
    id2label=id_to_label,
    num_labels=len(label_list),
)

# Training Arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./ner_model_afroxlmr",
    logging_dir="./logs_afroxlmr",
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Enable evaluation.
    do_eval=True,
)

# Collator built from the tokenizer; handed to the Trainer to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([1]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([1, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Metrics Function
def compute_metrics(p):
    """Compute macro-averaged token-level precision, recall and F1.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the class dimension on axis 2; ``labels`` holds the
            gold label ids, with -100 marking positions to ignore.

    Returns:
        A dict with keys ``precision``, ``recall`` and ``f1``.
    """
    scores, gold_ids = p
    gold_ids = np.asarray(gold_ids)
    pred_ids = np.argmax(scores, axis=2)

    # Keep only positions that carry a real label. Boolean indexing flattens
    # in row-major order, so gold and predicted ids stay aligned.
    keep = gold_ids != -100
    y_true = [id_to_label[int(idx)] for idx in gold_ids[keep]]
    y_pred = [id_to_label[int(idx)] for idx in pred_ids[keep]]

    # zero_division=0 yields the same scores as the default but without a
    # warning for every class that is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true,
        y_pred,
        average="macro",
        zero_division=0,
    )

    return {"precision": precision, "recall": recall, "f1": f1}

# Trainer
# NOTE(review): recent transformers releases deprecate the `tokenizer=` argument
# in favour of `processing_class=` — confirm the installed version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Fine-tune the model, then score it on the evaluation split.
trainer.train()
metrics = trainer.evaluate()
print(metrics)

# Persist the evaluation metrics as indented UTF-8 JSON.
with open("metrics_afroxlmr.json", "w", encoding="utf-8") as metrics_file:
    metrics_file.write(json.dumps(metrics, indent=2, ensure_ascii=False))

  trainer = Trainer(


Step,Training Loss




{'eval_loss': 0.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.9399, 'eval_samples_per_second': 1.064, 'eval_steps_per_second': 1.064, 'epoch': 3.0}
