# 📘 Amharic NER Fine-Tuning on Telegram Messages

# Install dependencies

In [1]:
!pip install --quiet --no-cache-dir \
  transformers \
  datasets \
  evaluate \
  seqeval \
  accelerate \
  fsspec \
  gcsfs

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m144.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m131.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m152.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m150.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m133.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m156.2 MB/s[0m eta 

# Upload the ner_dataset.conll file

In [2]:
from google.colab import files
uploaded = files.upload()  # Upload ner_dataset.conll

Saving ner_dataset.conll to ner_dataset.conll


# Load CoNLL data into Hugging Face format

In [3]:
from datasets import Dataset, DatasetDict

def read_conll_file(path):
    tokens, labels, data = [], [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens and len(tokens) == len(labels):
                    data.append({"tokens": tokens, "ner_tags": labels})
                else:
                    print(f"Skipping incomplete sentence: {tokens}, {labels}")
                tokens, labels = [], []
            else:
                splits = line.split()
                if len(splits) != 2:
                    print(f"Skipping invalid line: {line}")
                    continue
                tokens.append(splits[0])
                labels.append(splits[1])
    if tokens and len(tokens) == len(labels):
        data.append({"tokens": tokens, "ner_tags": labels})
    else:
        print(f"Skipping incomplete final sentence: {tokens}, {labels}")
    print(f"Loaded {len(data)} sentences from {path}")
    return data

dataset_data = read_conll_file("ner_dataset.conll")
dataset = Dataset.from_list(dataset_data).train_test_split(test_size=0.2)
raw_datasets = DatasetDict({"train": dataset["train"], "validation": dataset["test"]})

Skipping incomplete final sentence: [], []
Loaded 100 sentences from ner_dataset.conll


# Inspect label distribution

In [4]:
from collections import Counter
label_counts = Counter([label for example in dataset_data for label in example["ner_tags"]])
print("Label distribution:", label_counts)

Label distribution: Counter({'O': 3168, 'B-LOC': 618, 'B-PRICE': 488, 'I-LOC': 327, 'B-Product': 274, 'I-PRICE': 57, 'I-Product': 37})


# Tokenizer and label preparation

In [5]:
from transformers import AutoTokenizer

model_name = "xlm-roberta-base"  # Consider "distilbert-base-multilingual-cased" for smaller datasets
tokenizer = AutoTokenizer.from_pretrained(model_name)

unique_labels = sorted({label for row in dataset_data for label in row['ner_tags']})
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}
print("Unique labels:", unique_labels)

def encode_tags(example):
    example["ner_tags"] = [label2id[tag] for tag in example["ner_tags"]]
    return example

raw_datasets = raw_datasets.map(encode_tags)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Unique labels: ['B-LOC', 'B-PRICE', 'B-Product', 'I-LOC', 'I-PRICE', 'I-Product', 'O']


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

# Tokenize and align labels

In [6]:
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()

    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            if word_idx >= len(example["ner_tags"]):
                print(f"Error: word_idx {word_idx} exceeds ner_tags length {len(example['ner_tags'])}")
                label_ids.append(-100)
            else:
                label_ids.append(example["ner_tags"][word_idx])
        else:
            label_ids.append(-100)  # Use -100 for subword tokens
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=False)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

# Initialize model and training arguments

In [7]:
from transformers import (
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
import torch

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id
)

# Compute class weights
label_counts_dict = {label: label_counts.get(label, 1) for label in unique_labels}
total = sum(label_counts_dict.values())
class_weights = torch.tensor([total / (len(unique_labels) * label_counts_dict[label]) for label in unique_labels]).to("cuda" if torch.cuda.is_available() else "cpu")
print("Class weights:", class_weights)

training_args = TrainingArguments(
    output_dir="./amharic-ner-model",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=1,
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    run_name="amharic-ner-run",
    report_to="none"  # Remove if using W&B
)

data_collator = DataCollatorForTokenClassification(tokenizer)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights: tensor([ 1.1486,  1.4546,  2.5907,  2.1708, 12.4536, 19.1853,  0.2241])


# Define metrics and custom trainer

In [8]:
import evaluate
import numpy as np

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_preds, true_labels = [], []
    for pred, lab in zip(predictions, labels):
        temp_pred, temp_lab = [], []
        for p, l in zip(pred, lab):
            if l != -100:
                temp_pred.append(id2label[p])
                temp_lab.append(id2label[l])
        true_preds.append(temp_pred)
        true_labels.append(temp_lab)

    print("Sample predictions:", true_preds[:1])
    print("Sample labels:", true_labels[:1])
    return seqeval.compute(predictions=true_preds, references=true_labels, zero_division=0)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, len(unique_labels)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Loc,Price,Product,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,1.7743,1.595149,"{'precision': 0.16363636363636364, 'recall': 0.13846153846153847, 'f1': 0.15000000000000002, 'number': 130}","{'precision': 0.28185328185328185, 'recall': 0.7604166666666666, 'f1': 0.4112676056338028, 'number': 96}","{'precision': 0.06227106227106227, 'recall': 0.918918918918919, 'f1': 0.11663807890222985, 'number': 37}",0.136612,0.475285,0.212224,0.174619
2,1.3935,1.209858,"{'precision': 0.2653061224489796, 'recall': 0.5, 'f1': 0.3466666666666667, 'number': 130}","{'precision': 0.28627450980392155, 'recall': 0.7604166666666666, 'f1': 0.41595441595441596, 'number': 96}","{'precision': 0.11267605633802817, 'recall': 0.43243243243243246, 'f1': 0.1787709497206704, 'number': 37}",0.239875,0.585551,0.340331,0.394924
3,1.0157,0.954152,"{'precision': 0.308411214953271, 'recall': 0.25384615384615383, 'f1': 0.27848101265822783, 'number': 130}","{'precision': 0.42391304347826086, 'recall': 0.8125, 'f1': 0.5571428571428573, 'number': 96}","{'precision': 0.12258064516129032, 'recall': 0.5135135135135135, 'f1': 0.19791666666666663, 'number': 37}",0.29148,0.494297,0.366714,0.546193
4,0.7525,0.782089,"{'precision': 0.37575757575757573, 'recall': 0.47692307692307695, 'f1': 0.42033898305084744, 'number': 130}","{'precision': 0.45664739884393063, 'recall': 0.8229166666666666, 'f1': 0.587360594795539, 'number': 96}","{'precision': 0.1891891891891892, 'recall': 0.5675675675675675, 'f1': 0.28378378378378377, 'number': 37}",0.360802,0.61597,0.455056,0.6
5,0.6645,0.693423,"{'precision': 0.4453125, 'recall': 0.43846153846153846, 'f1': 0.441860465116279, 'number': 130}","{'precision': 0.50625, 'recall': 0.84375, 'f1': 0.6328124999999999, 'number': 96}","{'precision': 0.22549019607843138, 'recall': 0.6216216216216216, 'f1': 0.33093525179856115, 'number': 37}",0.412821,0.612167,0.493109,0.661929
6,0.6192,0.595862,"{'precision': 0.4768211920529801, 'recall': 0.5538461538461539, 'f1': 0.5124555160142349, 'number': 130}","{'precision': 0.5477707006369427, 'recall': 0.8958333333333334, 'f1': 0.6798418972332017, 'number': 96}","{'precision': 0.2903225806451613, 'recall': 0.7297297297297297, 'f1': 0.41538461538461535, 'number': 37}",0.461347,0.703422,0.557229,0.714721
7,0.5273,0.551628,"{'precision': 0.5, 'recall': 0.47692307692307695, 'f1': 0.4881889763779527, 'number': 130}","{'precision': 0.5590062111801242, 'recall': 0.9375, 'f1': 0.7003891050583657, 'number': 96}","{'precision': 0.29, 'recall': 0.7837837837837838, 'f1': 0.4233576642335766, 'number': 37}",0.47013,0.688213,0.558642,0.726904
8,0.4571,0.510905,"{'precision': 0.5227272727272727, 'recall': 0.5307692307692308, 'f1': 0.5267175572519083, 'number': 130}","{'precision': 0.5859872611464968, 'recall': 0.9583333333333334, 'f1': 0.7272727272727273, 'number': 96}","{'precision': 0.3111111111111111, 'recall': 0.7567567567567568, 'f1': 0.44094488188976383, 'number': 37}",0.498681,0.718631,0.588785,0.760406
9,0.4222,0.489165,"{'precision': 0.5227272727272727, 'recall': 0.5307692307692308, 'f1': 0.5267175572519083, 'number': 130}","{'precision': 0.5923566878980892, 'recall': 0.96875, 'f1': 0.7351778656126482, 'number': 96}","{'precision': 0.3076923076923077, 'recall': 0.7567567567567568, 'f1': 0.4375, 'number': 37}",0.5,0.722433,0.59098,0.765482
10,0.3897,0.48127,"{'precision': 0.5755395683453237, 'recall': 0.6153846153846154, 'f1': 0.5947955390334573, 'number': 130}","{'precision': 0.5923566878980892, 'recall': 0.96875, 'f1': 0.7351778656126482, 'number': 96}","{'precision': 0.31521739130434784, 'recall': 0.7837837837837838, 'f1': 0.44961240310077516, 'number': 37}",0.520619,0.768061,0.620584,0.772589


Sample predictions: [['B-Product', 'B-Product', 'B-Product', 'B-Product', 'B-Product', 'B-Product', 'B-Product', 'B-Product', 'B-Product', 'B-Product', 'B-PRICE', 'B-PRICE', 'I-PRICE', 'B-Product', 'I-LOC', 'B-Product', 'B-PRICE', 'B-Product', 'B-Product', 'B-Product', 'B-Product', 'I-LOC', 'B-Product', 'B-PRICE', 'B-PRICE', 'B-PRICE', 'B-PRICE']]
Sample labels: [['O', 'O', 'B-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
Sample predictions: [['B-Product', 'B-Product', 'B-Product', 'B-Product', 'B-Product', 'O', 'B-Product', 'B-Product', 'B-Product', 'B-LOC', 'B-PRICE', 'B-PRICE', 'I-PRICE', 'B-Product', 'I-LOC', 'B-Product', 'B-PRICE', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-PRICE', 'B-PRICE', 'B-PRICE', 'B-PRICE']]
Sample labels: [['O', 'O', 'B-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
Sample predictions: [['B-Produ

TrainOutput(global_step=50, training_loss=0.8485044449567795, metrics={'train_runtime': 4147.6467, 'train_samples_per_second': 0.193, 'train_steps_per_second': 0.012, 'total_flos': 82904226327120.0, 'train_loss': 0.8485044449567795, 'epoch': 10.0})

# Save model

In [9]:
model.save_pretrained("amharic-ner-model")
tokenizer.save_pretrained("amharic-ner-model")

('amharic-ner-model/tokenizer_config.json',
 'amharic-ner-model/special_tokens_map.json',
 'amharic-ner-model/sentencepiece.bpe.model',
 'amharic-ner-model/added_tokens.json',
 'amharic-ner-model/tokenizer.json')

In [13]:
import shutil

shutil.make_archive("amharic-ner-model", 'zip', "amharic-ner-model")


'/content/amharic-ner-model.zip'

# Download the .zip File

In [14]:
from IPython.display import FileLink
FileLink('amharic-ner-model.zip')