In [None]:
from pathlib import Path
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
import pandas as pd
import numpy as np
import torch
import random
import os


In [None]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy, and
            PyTorch (CPU generator plus all CUDA devices).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once up front so everything below starts from the same RNG state.
set_seed(42)


In [None]:
def load_conll_file(file_path):
    """Read a CoNLL-style annotation file into parallel token/tag lists.

    The file holds one token per line as whitespace-separated columns, with
    blank lines separating sentences. The first column is used as the token
    and the last column as its tag, so plain ``token tag`` files and
    multi-column CoNLL layouts are both accepted.

    Args:
        file_path: Path to the UTF-8 encoded annotation file (a leading BOM
            is tolerated).

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is the matching list of tag lists; entry ``i``
        of both lists has the same length.

    Warns:
        UserWarning: Once, if any non-blank line had fewer than two columns
            and therefore had to be skipped.
    """
    import warnings

    sentences = []
    labels = []
    tokens = []
    tags = []
    skipped_line_numbers = []

    # "utf-8-sig" drops a leading byte-order mark if present, so it cannot
    # end up glued to the first token; BOM-less UTF-8 is read unchanged.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for line_number, line in enumerate(f, start=1):
            columns = line.split()

            if not columns:
                # Blank line: close the sentence in progress, if any.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens = []
                    tags = []
                continue

            if len(columns) < 2:
                # A token without a tag cannot be used; remember it so the
                # data loss is reported instead of passing silently.
                skipped_line_numbers.append(line_number)
                continue

            tokens.append(columns[0])
            tags.append(columns[-1])

    # Append last sentence if file doesn't end with a blank line
    if tokens:
        sentences.append(tokens)
        labels.append(tags)

    if skipped_line_numbers:
        preview = ", ".join(str(n) for n in skipped_line_numbers[:10])
        warnings.warn(
            f"{file_path}: skipped {len(skipped_line_numbers)} line(s) without a tag "
            f"column (first line numbers: {preview})",
            stacklevel=2,
        )

    return sentences, labels

# Annotated corpus location (Colab filesystem path).
conll_path = "/content/labeled_data.txt"
sentences, ner_tags = load_conll_file(conll_path)

# Report the corpus size, then preview the first sentence as token/tag rows.
print(f"✅ Loaded {len(sentences)} sentences")
print("🧾 Sample:")
first_tokens, first_tags = sentences[0], ner_tags[0]
for tok, tag in zip(first_tokens, first_tags):
    print(f"{tok:10} {tag}")


In [None]:
from datasets import Dataset

# Define label list in challenge order; a label's position is its class id.
label_list = [
    "O",
    "B-Product", "I-Product",
    "B-PRICE", "I-PRICE",
    "B-LOC", "I-LOC"
]

label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))

# Fail fast with a readable message when the corpus contains a tag that is
# not in label_list (lookups are case-sensitive), instead of a bare KeyError
# raised from inside the comprehension below.
unknown_tags = sorted({tag for tags in ner_tags for tag in tags} - set(label_to_id))
if unknown_tags:
    raise ValueError(
        f"Found tags that are not in label_list: {unknown_tags}. "
        "Fix the annotations or extend label_list."
    )

# Prepare data: one record per sentence, with tags encoded as integer ids.
data = [
    {"tokens": tokens, "ner_tags": [label_to_id[tag] for tag in tags]}
    for tokens, tags in zip(sentences, ner_tags)
]

# 80/20 train/evaluation split with a fixed seed so the split is repeatable.
full_dataset = Dataset.from_list(data)
dataset = full_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print("✅ Dataset prepared:")
train_dataset[0]


In [None]:
# Where the Trainer writes its outputs, and the label attached to the run.
output_dir = "/content/bert-base-amharic"
run_name = "bert-amharic-ner"

# Pretrained checkpoint shared by the tokenizer and the model.
model_checkpoint = "Davlan/bert-base-multilingual-cased-finetuned-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [None]:
# Load the checkpoint for token classification, sized to label_list and
# given both directions of the id <-> label mapping.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id_to_label,
    label2id=label_to_id,
    num_labels=len(label_list),
)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with ``"tokens"`` (a list of word lists) and
            ``"ner_tags"`` (a matching list of integer tag-id lists).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry.
        For every sequence, the first sub-word token of each word carries that
        word's tag id; special tokens and the remaining sub-word pieces carry
        ``-100`` so they are ignored in the loss.
    """
    # No padding here: the DataCollatorForTokenClassification handed to the
    # Trainer pads each mini-batch, so padding inside a batched map would only
    # stretch every example to the longest one in the map batch. Offset
    # mappings are not requested either, since nothing reads them.
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # word_ids maps each token position to its source word index
        # (None for special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First piece of a new word: it carries the word's tag.
                label_ids.append(label[word_idx])
            else:
                # Assign -100 to subword pieces so they are ignored in loss
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize both splits in batched mode (train first, then eval).
tokenized_train, tokenized_eval = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, eval_dataset)
)

print("✅ Tokenization complete")


In [None]:
import evaluate

# Load the "seqeval" metric once; compute_metrics below calls seqeval.compute
# on the decoded tag sequences.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores and is arg-maxed over axis 2; ``labels`` holds the
            gold ids, with ``-100`` marking positions to leave out.

    Returns:
        Dict with the ``precision``, ``recall``, ``f1`` and ``accuracy``
        values taken from seqeval's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label, then decode both
        # sides of each kept pair to tag strings.
        kept_pairs = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([id_to_label[predicted] for predicted, _ in kept_pairs])
        true_labels.append([id_to_label[gold] for _, gold in kept_pairs])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    metric_names = ("precision", "recall", "f1", "accuracy")
    return {name: results[f"overall_{name}"] for name in metric_names}


In [None]:

# Fine-tuning configuration. Evaluation, logging and checkpointing all happen
# once per epoch; the checkpoint with the best evaluation F1 is reloaded when
# training finishes.
training_args = TrainingArguments(
    output_dir=output_dir,
    run_name=run_name,
    eval_strategy="epoch",
    # Log the training loss every epoch. With the default step-based logging
    # interval this small dataset never reaches a logging step, so the
    # results table showed "No log" for the training loss.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir=f"{output_dir}/logs",
    save_strategy="epoch",
    # Cap the number of checkpoints kept on disk: the whole output_dir is
    # zipped and downloaded later, and one checkpoint per epoch would all end
    # up in that archive.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


In [None]:
# Collator that assembles tokenized examples into token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Wire the model, data, collation and metrics together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

# Kept as the last expression of the cell so the training result is displayed.
trainer.train()


In [None]:
# Save the final model into the configured output directory. Using output_dir
# instead of repeating the path literal keeps this in sync with TrainingArguments.
trainer.save_model(output_dir)


In [None]:
# Store the tokenizer files next to the model weights so the directory can be
# loaded on its own. Uses output_dir rather than a second copy of the path literal.
tokenizer.save_pretrained(output_dir)

In [None]:
# Archive the whole output directory recursively. This includes any per-epoch
# checkpoint folders and the logs folder that training wrote under it.
!zip -r bert-base-amharic-model.zip /content/bert-base-amharic


In [None]:
# Colab-only: hand the archive produced by the zip step to the browser as a download.
from google.colab import files
files.download("bert-base-amharic-model.zip")

| Epoch | Training Loss | Validation Loss | Precision | Recall | F1 | Accuracy |
| --- | --- | --- | --- | --- | --- | --- |
| 1 | No log | 1.572634 | 0.000000 | 0.000000 | 0.000000 | 0.455782 |
| 2 | No log | 1.271012 | 0.000000 | 0.000000 | 0.000000 | 0.591837 |
| 3 | No log | 1.060682 | 0.727273 | 0.266667 | 0.390244 | 0.687075 |
| 4 | No log | 0.893102 | 0.473684 | 0.300000 | 0.367347 | 0.727891 |
| 5 | No log | 0.790093 | 0.600000 | 0.400000 | 0.480000 | 0.761905 |
| 6 | No log | 0.752542 | 0.600000 | 0.400000 | 0.480000 | 0.761905 |
