In [None]:
# Step 1: Install dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel.
# evaluate and seqeval are pinned to the versions this notebook was last run with;
# transformers and datasets are left to the versions already provided by the runtime.
%pip install -q transformers datasets evaluate==0.4.4 seqeval==1.2.2

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=b83b7276c9bd1f326fb203848fa48e7bce5a2ef85aeb7ec095eaaba66860bca7
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval, evaluate
Successfully installed evaluate-0.4.4 seqeval-1.2.2


In [None]:
# 🧲 Step 2: Import Libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate
import torch

In [None]:
# 🧾 Step 3: Load the CoNLL-formatted dataset
def read_conll_data(filepath):
    """Parse a whitespace-separated CoNLL file into one DataFrame row per sentence.

    Each non-blank line is split on whitespace; the first field is taken as the
    token and the second field as its tag (any further fields are ignored).
    Lines with fewer than two fields are skipped. A blank line ends the current
    sentence, and a trailing sentence without a final blank line is still kept.

    Args:
        filepath: Path of the UTF-8 encoded CoNLL file.

    Returns:
        pandas.DataFrame with the columns ``tokens`` and ``ner_tags``, each cell
        holding a list of strings. The two columns are always present, even when
        the file contains no sentences, so downstream column access never fails
        with a KeyError on an empty file.
    """
    columns = ["tokens", "ner_tags"]
    sentences = []
    tokens, labels = [], []
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) line: sentence boundary.
                if tokens:
                    sentences.append({"tokens": tokens, "ner_tags": labels})
                    tokens, labels = [], []
            elif len(fields) >= 2:
                tokens.append(fields[0])
                labels.append(fields[1])
    # Flush the last sentence when the file does not end with a blank line.
    if tokens:
        sentences.append({"tokens": tokens, "ner_tags": labels})
    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(sentences, columns=columns)

# Location of the annotated corpus, relative to the notebook's working directory.
conll_path = "amharic_ner_data.conll"
df = read_conll_data(conll_path)

In [None]:
# 🧪 Step 4: Label Encoding
# Collect every distinct tag that occurs in the corpus, in sorted order so the
# tag -> id assignment is the same on every run.
label_list = sorted({tag for sentence_tags in df["ner_tags"] for tag in sentence_tags})
id_to_label = dict(enumerate(label_list))
label_to_id = {tag: tag_id for tag_id, tag in id_to_label.items()}


def _encode_tags(tags):
    """Map one sentence's string tags to their integer ids."""
    return [label_to_id[tag] for tag in tags]


# Add the integer-encoded tags and keep only the columns the model pipeline needs.
df["labels"] = df["ner_tags"].apply(_encode_tags)
dataset = Dataset.from_pandas(df[["tokens", "labels"]])

In [None]:
# 🪄 Step 5: Load Tokenizer and Model
# Pretrained checkpoint to fine-tune.
model_checkpoint = "Davlan/afro-xlmr-base"  # You can change this to "Ge'ez/bert-tiny-amharic" or similar

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# The classification head is sized to the tag set, and both id<->tag mappings
# are passed to the model so they are stored with its configuration.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 🧵 Step 6: Tokenize and Align Labels
def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and align its word-level labels to sub-tokens.

    The sentence's words are tokenized with truncation enabled. Only the first
    sub-token of each word receives that word's label. Special tokens (word id
    ``None``) and continuation pieces of an already-labelled word receive -100,
    the value ``compute_metrics`` filters out and the default ignore index of
    PyTorch's cross-entropy loss. Labelling continuation pieces with the word's
    own tag would repeat ``B-`` tags inside a single word and make evaluation
    count sub-tokens instead of words.

    Args:
        example: Mapping with ``tokens`` (list of words) and ``labels``
            (list of integer label ids, one per word).

    Returns:
        The tokenizer output for the sentence with an added ``labels`` entry
        holding one label id per sub-token.
    """
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_labels = example["labels"]
    labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # Special token, or a later piece of a word that is already labelled.
            labels.append(-100)
        else:
            # First sub-token of a new word carries the word's label.
            labels.append(word_labels[word_idx])
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization/alignment function to every sentence, one example at a time.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels)

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [None]:
# 🧪 Step 7: Split Dataset
# Hold out 20% of the sentences for evaluation. The seed is fixed so the same
# sentences land in each split on every run and metrics stay comparable.
# NOTE(review): this rebinds `tokenized_dataset` from a single dataset to the
# train/test pair, so re-running this cell on its own presumably fails —
# re-run the tokenization cell first.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# 🧱 Step 8: Training Setup
# Seqeval metric, used by compute_metrics.
metric = evaluate.load("seqeval")

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Hyperparameters for the fine-tuning run; checkpoints go to "amharic-ner-model".
args = TrainingArguments(
    output_dir="amharic-ner-model",
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",  # run evaluation at the end of every epoch
    save_total_limit=2,
)

def compute_metrics(p):
    """Score a batch of predictions with the loaded seqeval metric.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token
            scores whose last axis is the label dimension; ``labels`` holds the
            reference label ids, with -100 marking positions to ignore.

    Returns:
        Whatever ``metric.compute`` returns for the string-tag sequences that
        remain after dropping the ignored positions.
    """
    scores, reference_ids = p
    predicted_ids = scores.argmax(axis=-1)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Keep only positions that carry a real label (-100 marks ignored positions).
        kept = [
            (predicted, reference)
            for predicted, reference in zip(predicted_row, reference_row)
            if reference != -100
        ]
        true_predictions.append([id_to_label[predicted] for predicted, _ in kept])
        true_labels.append([id_to_label[reference] for _, reference in kept])

    return metric.compute(predictions=true_predictions, references=true_labels)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
# 🚀 Step 9: Fine-Tune the Model
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Actually run the fine-tuning. Without this call the classification head keeps
# its random initial weights and any model saved afterwards is untrained.
trainer.train()

  trainer = Trainer(


In [None]:
# 💾 Step 10: Save the Model
# Write the model and the tokenizer files into the same export directory.
final_model_dir = "amharic_ner_final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('amharic_ner_final_model/tokenizer_config.json',
 'amharic_ner_final_model/special_tokens_map.json',
 'amharic_ner_final_model/sentencepiece.bpe.model',
 'amharic_ner_final_model/added_tokens.json',
 'amharic_ner_final_model/tokenizer.json')

In [None]:
import os
import zipfile

# Directory to archive and the name of the resulting zip file.
model_folder = "amharic_ner_final_model"
zip_path = "amharic_ner_model.zip"

# Add every file under the model directory to a deflate-compressed archive.
# Entry names are stored relative to the model directory, so the archive
# unpacks without the top-level folder.
archive = zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED)
with archive:
    for current_dir, _subdirs, filenames in os.walk(model_folder):
        for filename in filenames:
            source_path = os.path.join(current_dir, filename)
            archive.write(source_path, os.path.relpath(source_path, model_folder))

print(f"Model zipped as: {zip_path}")


Model zipped as: amharic_ner_model.zip
