# Amharic NER Model Comparison & Selection


#### This notebook loads Amharic NER data in `.conll` format, fine-tunes multiple models, evaluates them, and compares results.


In [1]:
import os
import time
import random
import numpy as np
from datasets import Dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
)
import torch

#  Load `.conll` Data

In [2]:
def load_conll(filepath):
    sentences = []
    labels = []
    tokens = []
    tags = []

    with open(filepath, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens = []
                    tags = []
            else:
                splits = line.split()
                token = splits[0]
                tag = splits[-1]
                tokens.append(token)
                tags.append(tag)
        if tokens:
            sentences.append(tokens)
            labels.append(tags)
    return sentences, labels

# Load your single .conll file
file_path = "/content/ner_dataset.conll"
all_sentences, all_labels_text = load_conll(file_path)
print(f"Loaded {len(all_sentences)} sentences from {file_path}")

Loaded 100 sentences from /content/ner_dataset.conll


# Shuffle and split into train and validation sets

In [3]:
combined = list(zip(all_sentences, all_labels_text))
random.seed(42)
random.shuffle(combined)
all_sentences, all_labels_text = zip(*combined)

split_ratio = 0.8
split_idx = int(len(all_sentences) * split_ratio)

train_sentences = list(all_sentences[:split_idx])
train_labels_text = list(all_labels_text[:split_idx])

val_sentences = list(all_sentences[split_idx:])
val_labels_text = list(all_labels_text[split_idx:])

print(f"Train size: {len(train_sentences)} sentences")
print(f"Validation size: {len(val_sentences)} sentences")

Train size: 80 sentences
Validation size: 20 sentences


# Create label maps and encode labels

In [4]:
def get_label_map(all_labels):
    unique_labels = sorted(set(label for sent in all_labels for label in sent))
    label2id = {label: i for i, label in enumerate(unique_labels)}
    id2label = {i: label for label, i in label2id.items()}
    return label2id, id2label

def encode_labels(labels, label2id):
    return [[label2id[label] for label in sent] for sent in labels]

label2id, id2label = get_label_map(train_labels_text + val_labels_text)
print(f"Labels: {label2id}")

train_labels = encode_labels(train_labels_text, label2id)
val_labels = encode_labels(val_labels_text, label2id)

LABEL_LIST = [id2label[i] for i in range(len(id2label))]
NUM_LABELS = len(LABEL_LIST)

train_data = {"tokens": train_sentences, "labels": train_labels}
val_data = {"tokens": val_sentences, "labels": val_labels}

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

Labels: {'B-LOC': 0, 'B-PRICE': 1, 'B-Product': 2, 'I-LOC': 3, 'I-PRICE': 4, 'I-Product': 5, 'O': 6}


# Tokenize and align labels

In [5]:
def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Define metric function for evaluation

In [6]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [LABEL_LIST[l] for l in label if l != -100] for label in labels
    ]
    true_predictions = [
        [LABEL_LIST[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Models to compare

In [7]:
MODELS = {
    # "bert-amharic": "Davlan/bert-base-amharic",
    "xlm-roberta": "xlm-roberta-base",
    "distilbert-multilingual": "distilbert-base-multilingual-cased",
    "bert-multilingual": "bert-base-multilingual-cased",
}

# Train, evaluate, and log metrics for each model

In [8]:
from transformers import TrainingArguments, Trainer
import time
import os
import torch
from torch.utils.data import DataLoader

results = []

for model_id, model_name in MODELS.items():
    print(f"\n=== Training and evaluating {model_id} ({model_name}) ===")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name, num_labels=NUM_LABELS
    )

    tokenized_train = train_dataset.map(
        lambda x: tokenize_and_align_labels(x, tokenizer), batched=True
    )
    tokenized_val = val_dataset.map(
        lambda x: tokenize_and_align_labels(x, tokenizer), batched=True
    )

    # ✅ Avoid possible local class conflict with `TrainingArguments`
    assert isinstance(TrainingArguments, type), "TrainingArguments is not a class!"

    training_args = TrainingArguments(
        output_dir=f"./models/{model_id}",
        evaluation_strategy="epoch",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        logging_dir=f"./logs/{model_id}",
        logging_steps=10,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=1,
        seed=42,
        report_to=[],
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    start_time = time.time()
    trainer.train()
    train_time = time.time() - start_time

    eval_metrics = trainer.evaluate()

    # Inference speed
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    total_samples = 0
    total_time = 0.0
    batch_size = 32

    val_loader = DataLoader(tokenized_val, batch_size=batch_size)
    for batch in val_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k in ["input_ids", "attention_mask"]}
        with torch.no_grad():
            start_inf = time.time()
            _ = model(**inputs)
            end_inf = time.time()
        total_samples += inputs["input_ids"].size(0)
        total_time += (end_inf - start_inf)

    speed = total_samples / total_time if total_time > 0 else 0

    # Model size
    model_path = f"./models/{model_id}"
    size_mb = sum(
        os.path.getsize(os.path.join(model_path, f)) for f in os.listdir(model_path)
        if os.path.isfile(os.path.join(model_path, f))
    ) / (1024 * 1024)

    results.append({
        "Model": model_id,
        "F1 Score": eval_metrics.get("eval_f1", 0),
        "Precision": eval_metrics.get("eval_precision", 0),
        "Recall": eval_metrics.get("eval_recall", 0),
        "Accuracy": eval_metrics.get("eval_accuracy", 0),
        "Speed (samples/sec)": round(speed, 2),
        "Size (MB)": round(size_mb, 2),
        "Training Time (s)": round(train_time, 2),
    })



=== Training and evaluating xlm-roberta (xlm-roberta-base) ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

# Display comparison table

In [None]:
print("\n\n=== Model Comparison Summary ===")
print(f"{'Model':<25} {'F1 Score':<10} {'Precision':<10} {'Recall':<10} {'Accuracy':<10} {'Speed':<15} {'Size (MB)':<10} {'Train Time(s)':<15}")
print("-" * 110)
for res in results:
    print(
        f"{res['Model']:<25} "
        f"{res['F1 Score']:<10.4f} "
        f"{res['Precision']:<10.4f} "
        f"{res['Recall']:<10.4f} "
        f"{res['Accuracy']:<10.4f} "
        f"{res['Speed (samples/sec)']:<15} "
        f"{res['Size (MB)']:<10} "
        f"{res['Training Time (s)']:<15}"
    )