# Task 3: Fine-Tune NER Model (Improved Workflow)

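This notebook fine-tunes `xlm-roberta-base` for Amharic named-entity recognition (NER) with the improved `FixedNERTrainer`, then runs inference on a sample sentence with `FinalNERPredictor`.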

In [1]:
# === Configuration ===
# Everything a reader might want to tune lives in this cell.

# Input data and output location (relative paths).
CONLL_FILE = "../data/labeled/conll_labeled.txt"
OUTPUT_DIR = "../models/fixed_ner_model"

# Pretrained checkpoint handed to FixedNERTrainer.
MODEL_NAME = "xlm-roberta-base"

# Hyperparameters forwarded to FixedNERTrainer.train().
LEARNING_RATE = 3e-5
BATCH_SIZE = 8
EPOCHS = 10


## Train the Model with FixedNERTrainer

In [3]:

import sys
import os

# Put the project's src/ directory (one level above the working directory)
# at the front of sys.path so the `model_training` package can be imported.
# The membership check keeps re-running this cell from adding duplicates.
SRC_PATH = os.path.abspath(os.path.join(os.getcwd(), "../src"))
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
from model_training.fixed_ner_trainer import FixedNERTrainer

# Fine-tune MODEL_NAME on the labelled CoNLL file using the values from the
# configuration cell.
trainer = FixedNERTrainer(MODEL_NAME)

# Bind the return value to a name rather than leaving the call as the cell's
# last expression: otherwise the notebook echoes the returned object's repr
# (an uninformative `<...WeightedTrainer at 0x...>` line) below the logs.
# Keeping the reference also makes the returned object available afterwards.
training_result = trainer.train(
    conll_file=CONLL_FILE,
    output_dir=OUTPUT_DIR,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE
)


  from .autonotebook import tqdm as notebook_tqdm
INFO:model_training.fixed_ner_trainer:Loading tokenizer and model...
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:model_training.fixed_ner_trainer:Loading and preparing data...
INFO:model_training.fixed_ner_trainer:Loaded 50 sentences
INFO:model_training.fixed_ner_trainer:Label distribution: {'I-PRICE': 17, 'B-LOC': 18, 'B-PRICE': 19, 'B-PRODUCT': 14, 'O': 2047}
INFO:model_training.fixed_ner_trainer:Class weights: {0: 0.006839276990718124, 1: 0.7368421052631579, 2: 0.8235294117647058, 3: 0.7777777777777778, 4: 0.03309692671394799, 5: 1.0, 6: 0.03309692671394799}
INFO:model_training.fixed_ner_trainer:Training set: 40 sentences
INFO:model_training.fixed_ner_trainer:Validation set: 10 sent

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Entity F1
1,1.8723,1.686584,0.83965,0.898292,0.965741,0.83965,"{'PRICE': 0.0, 'LOC': 0.0}"
2,1.8432,1.679666,0.813411,0.884257,0.96862,0.813411,"{'PRICE': 0.0, 'LOC': 0.0}"
3,1.8481,1.66983,0.813411,0.884257,0.96862,0.813411,"{'PRICE': 0.0, 'LOC': 0.0}"
4,1.8426,1.652499,0.760933,0.851717,0.971409,0.760933,"{'PRICE': 0.0, 'LOC': 0.024096385542168676}"
5,1.7881,1.632211,0.699708,0.81182,0.982755,0.699708,"{'PRICE': 0.0, 'LOC': 0.05504587155963303}"
6,1.7818,1.602839,0.559767,0.706395,0.982678,0.559767,"{'PRICE': 0.0, 'LOC': 0.03821656050955414}"
7,1.7327,1.560053,0.419825,0.579907,0.985553,0.419825,"{'PRICE': 0.4, 'LOC': 0.029411764705882353}"
8,1.6456,1.491256,0.367347,0.525447,0.985544,0.367347,"{'PRICE': 0.25, 'LOC': 0.0273972602739726}"
9,1.5375,1.36378,0.361516,0.513257,0.984924,0.361516,"{'PRICE': 0.3157894736842105, 'LOC': 0.02830188679245283}"
10,1.2697,1.146452,0.3207,0.464588,0.98313,0.3207,"{'PRICE': 0.08450704225352113, 'LOC': 0.034482758620689655}"


INFO:model_training.fixed_ner_trainer:Saving model...
INFO:model_training.fixed_ner_trainer:Training completed!


<model_training.fixed_ner_trainer.FixedNERTrainer.create_weighted_trainer.<locals>.WeightedTrainer at 0x259d1523aa0>

## Inference: Predict Entities in New Text

In [5]:
# NOTE(review): FinalNERPredictor is imported from `ner_trainer`, while the
# model was trained with `fixed_ner_trainer.FixedNERTrainer` - confirm this is
# the intended predictor for models produced by the fixed trainer.
from model_training.ner_trainer import FinalNERPredictor

# Value passed as `confidence_threshold` to predict_with_confidence(); named
# here so it can be tuned in one place instead of being buried in the call.
CONFIDENCE_THRESHOLD = 0.4

# Point the predictor at the directory that was given to training as
# `output_dir`.
predictor = FinalNERPredictor(OUTPUT_DIR)
sample_text = 'አዲስ አበባ ላይ የሚገኙ የህጻናት ሻይ በ 250 ብር ሽያጭ ላይ ነው'
predicted_entities = predictor.predict_with_confidence(
    sample_text, confidence_threshold=CONFIDENCE_THRESHOLD
)
print('Predicted entities:', predicted_entities)


Predicted entities: []


In [6]:
from sklearn.model_selection import train_test_split

# Load your data as before
def load_conll_data(file_path):
    """Read a tab-separated CoNLL file into parallel token and label lists.

    Every non-blank line is expected to have the form ``token<TAB>label``.
    Blank (or whitespace-only) lines end the current sentence. A trailing
    sentence that is not followed by a blank line is still returned.

    Args:
        file_path: Path of the CoNLL file. It is decoded as UTF-8; a leading
            byte-order mark, if present, is ignored.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of
        tokens of the i-th sentence and ``labels[i]`` the list of labels for
        those tokens, in the same order and of the same length.

    Warns:
        UserWarning: If one or more non-blank lines did not split into exactly
            two tab-separated fields. Such lines are skipped (as before), but
            the number of skipped lines is now reported instead of being
            dropped silently.
    """
    import warnings

    sentences = []
    labels = []
    current_tokens = []
    current_labels = []
    skipped_lines = 0

    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also drops a
    # leading BOM. With plain 'utf-8' the BOM would stay attached to the first
    # token, because str.strip() does not treat U+FEFF as whitespace.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Sentence boundary: flush only if something was collected,
                # so consecutive blank lines do not create empty sentences.
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
                continue

            parts = line.split('\t')
            if len(parts) != 2:
                # Not a "token<TAB>label" line; skip it but keep count.
                skipped_lines += 1
                continue

            token, label = parts
            current_tokens.append(token)
            current_labels.append(label)

    # The file may end without a final blank line.
    if current_tokens:
        sentences.append(current_tokens)
        labels.append(current_labels)

    if skipped_lines:
        warnings.warn(
            f"{file_path}: skipped {skipped_lines} line(s) that are not in "
            "'token<TAB>label' format",
            stacklevel=2,
        )
    return sentences, labels

# Read the same file the model was trained on. CONLL_FILE comes from the
# configuration cell; reusing it avoids a second copy of the path literal
# that could drift out of sync with the training input.
sentences, labels = load_conll_data(CONLL_FILE)

# Hold out 20% of the sentences for validation; the fixed random_state makes
# the split reproducible across runs.
train_sents, val_sents, train_labels, val_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)

# Sanity check: the split must partition the data without losing sentences.
assert len(train_sents) + len(val_sents) == len(sentences)
assert len(train_labels) + len(val_labels) == len(labels)