In [1]:
from pathlib import Path
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
import pandas as pd
import numpy as np
import torch
import random
import os


In [2]:
def set_seed(seed=42):
    """Apply one seed to every random number generator used in this notebook.

    The same value is handed, in order, to Python's `random`, NumPy's global
    seeding function, `torch.manual_seed` and `torch.cuda.manual_seed_all`.

    Args:
        seed: Integer seed value (defaults to 42).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once, up front, before any data is split or any model is created.
set_seed(42)


In [3]:
def load_conll_file(file_path):
    """Read a two-column ``token tag`` CoNLL-style file into parallel lists.

    Each non-blank line is expected to hold exactly two whitespace-separated
    fields: the token and its tag. One or more blank lines end the current
    sentence. A final sentence that is not followed by a blank line is still
    returned.

    Lines that do not split into exactly two fields are skipped, as before,
    but they are no longer skipped silently: a single ``UserWarning`` reports
    how many were dropped and where, because a dropped line removes a token
    from the middle of a sentence.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of
        tokens of the i-th sentence and ``labels[i]`` the list of tag strings
        of the same length.

    Raises:
        OSError: If the file cannot be opened.
    """
    import warnings  # local import: only needed to report skipped lines

    sentences = []
    labels = []
    tokens = []
    tags = []
    skipped_line_numbers = []

    # "utf-8-sig" reads plain UTF-8 unchanged but strips a leading BOM, which
    # would otherwise stay attached to the first token (str.strip() does not
    # treat U+FEFF as whitespace).
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()

            if not line:
                # Blank line: close the sentence in progress, if there is one.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens = []
                    tags = []
                continue

            splits = line.split()
            if len(splits) != 2:
                skipped_line_numbers.append(line_number)
                continue

            token, tag = splits
            tokens.append(token)
            tags.append(tag)

    # Append last sentence if file doesn't end with a blank line
    if tokens:
        sentences.append(tokens)
        labels.append(tags)

    if skipped_line_numbers:
        preview = ", ".join(str(n) for n in skipped_line_numbers[:10])
        if len(skipped_line_numbers) > 10:
            preview += ", ..."
        warnings.warn(
            f"load_conll_file: skipped {len(skipped_line_numbers)} malformed "
            f"line(s) in {file_path!r} (expected exactly 'token tag'); "
            f"line numbers: {preview}",
            stacklevel=2,
        )

    return sentences, labels

# Location of the labelled CoNLL file, then the parsed tokens and tags.
conll_path = "/content/labeled_data.txt"
sentences, ner_tags = load_conll_file(conll_path)

# Sanity check: sentence count, then the first sentence shown token by token
# with each token left-aligned in a 10-character column next to its tag.
print(f"✅ Loaded {len(sentences)} sentences", "🧾 Sample:", sep="\n")
first_tokens, first_tags = sentences[0], ner_tags[0]
for tok, tag in zip(first_tokens, first_tags):
    print(f"{tok:10} {tag}")


✅ Loaded 50 sentences
🧾 Sample:
በቀላሉ       O
ያለ         O
የሚሆን       O
Electric   B-Product
Charcoal   I-Product
Burner     I-Product
1600       B-PRICE
ብር         I-PRICE
መገናኛ       B-LOC
መሰረት_ደፋር_ሞል I-LOC
ሁለተኛ_ፎቅ    I-LOC


In [4]:
from datasets import Dataset

# Tag inventory in challenge order; a tag's position in this list is its
# integer class id.
label_list = [
    "O",
    "B-Product", "I-Product",
    "B-PRICE", "I-PRICE",
    "B-LOC", "I-LOC"
]

# Both lookup directions, derived from the list position of each tag.
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = dict(enumerate(label_list))

# One record per sentence: the raw tokens plus their tags encoded as ids.
# A tag that is missing from label_list raises KeyError here.
data = [
    {"tokens": sentence_tokens,
     "ner_tags": list(map(label_to_id.__getitem__, sentence_tags))}
    for sentence_tokens, sentence_tags in zip(sentences, ner_tags)
]

# Build the dataset and carve out a seeded 80/20 train/eval split.
dataset = Dataset.from_list(data).train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

print("✅ Dataset prepared:")
train_dataset[0]


✅ Dataset prepared:


{'tokens': ['ህንጻ',
  'ወረድ',
  'ግራውንድ',
  'የሱቅ',
  'ላይ',
  'ብሎ',
  'JORDAN',
  '11',
  '5500',
  'Br',
  'አዲስ',
  'አበባ',
  'ሜክሲኮ'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 6]}

In [5]:
# Name of the pretrained checkpoint; the tokenizer below is loaded from it.
model_checkpoint = "xlm-roberta-base"  # you can swap this later for afroxlmr or bert-tiny-amharic

# Tokenizer that belongs to `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [6]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tag ids to sub-tokens.

    Intended for ``Dataset.map(..., batched=True)``. It uses the module-level
    ``tokenizer``.

    Padding is deliberately not applied here. Padding inside ``map`` stretches
    every example to the longest sequence of the whole map batch, whereas the
    ``DataCollatorForTokenClassification`` passed to the Trainer in this
    notebook pads inputs and labels per training batch.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of integer tag ids per
            sentence, parallel to ``"tokens"``).

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list per
        sentence holding, for every sub-token, either the tag id of its word
        (first sub-token of each word only) or -100 (special tokens and every
        later sub-token of a word), so that those positions are ignored in
        the loss.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        # Not read in this function; kept so the returned columns are
        # unchanged for any code that consumes them.
        return_offsets_mapping=True,
    )

    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the previous
                # word: ignored in the loss.
                label_ids.append(-100)
            else:
                # First sub-token of a new word carries that word's tag id.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run the tokenize-and-align step over both splits (train first, then eval),
# in batched mode.
tokenized_train, tokenized_eval = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, eval_dataset)
)

print("✅ Tokenization complete")


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

✅ Tokenization complete


In [7]:
# Token-classification model loaded from the same checkpoint as the tokenizer.
# The number of output labels and both id<->label mappings come from
# `label_list`, so the model config carries the tag names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import evaluate

# seqeval metric, loaded once and reused by every evaluation pass.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into seqeval precision, recall, F1 and accuracy.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-label
            scores and is reduced with ``argmax`` over axis 2; ``labels``
            holds the reference label ids, with -100 marking positions that
            must be ignored.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, taken from seqeval's ``overall_*`` results.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # Keep only positions that carry a real label (-100 marks ignored ones).
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, label_row)
            if gold != -100
        ]
        true_predictions.append([id_to_label[predicted] for predicted, _ in kept])
        true_labels.append([id_to_label[gold] for _, gold in kept])

    # zero_division=0 yields the same 0.0 scores as the default "warn" when a
    # class has no predicted or no true entities, but without emitting an
    # undefined-metric warning on every evaluation pass.
    results = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0,
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [11]:
# Root directory for this run; logs are written to its "logs" subfolder.
output_dir = "/content/ner-xlmr-model"

# NOTE(review): `report_to` is not set here. The recorded run of this notebook
# stops at an interactive W&B login prompt during training — consider setting
# `report_to` explicitly.
training_args = TrainingArguments(
    # Output locations
    output_dir=output_dir,
    logging_dir=f"{output_dir}/logs",
    # Optimisation settings
    num_train_epochs=6,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate, log and checkpoint on the same per-epoch schedule
    # (`eval_strategy` is the current name of `evaluation_strategy`).
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Model selection: reload the best checkpoint, ranked by the "f1" value
    # that compute_metrics returns.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [13]:
# Collator built from the same tokenizer that produced the features; it turns
# lists of tokenized examples into batches for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    # `tokenizer=` is deprecated in Trainer.__init__ (it raised a warning on
    # this call); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()


  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmobangaibsa35[0m ([33mmobangaibsa35-hilcoe-school-of-computer-science-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.7313,1.598431,0.0,0.0,0.0,0.442177
2,1.5494,1.324272,0.142857,0.033333,0.054054,0.55102
3,1.3197,1.126068,0.3125,0.166667,0.217391,0.632653
4,1.1588,1.004558,0.411765,0.233333,0.297872,0.653061
5,1.0361,0.912893,0.5,0.266667,0.347826,0.659864
6,0.9704,0.883859,0.375,0.2,0.26087,0.659864


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=30, training_loss=1.2942917505900065, metrics={'train_runtime': 779.0093, 'train_samples_per_second': 0.308, 'train_steps_per_second': 0.039, 'total_flos': 4899535660800.0, 'train_loss': 1.2942917505900065, 'epoch': 6.0})