In [10]:
#Import Libraries
# Metrics
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

In [11]:
#Load CoNLL File
def load_conll_data(filepath):
    tokens, labels = [], []
    with open(filepath, encoding='utf-8') as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if not line:
                if words:
                    tokens.append(words)
                    labels.append(tags)
                    words, tags = [], []
            else:
                splits = line.split()
                if len(splits) == 2:
                    words.append(splits[0])
                    tags.append(splits[1])
    return Dataset.from_dict({"tokens": tokens, "ner_tags": labels})

raw_dataset = load_conll_data("../data/labeled/ner_sample.conll")

dataset = DatasetDict({
    "train": raw_dataset.train_test_split(test_size=0.2, seed=42)["train"],
    "test": raw_dataset.train_test_split(test_size=0.2, seed=42)["test"],
})


In [12]:
#Tokenize and Align Labels
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Build label mappings
label_list = sorted(set(tag for tags in dataset['train']['ner_tags'] for tag in tags))
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}


In [13]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            else:
                label_ids.append(label_to_id[label[word_idx]])  # use same label for subwords
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [14]:
#Define Model and Training Arguments
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list)
)

args = TrainingArguments(
    output_dir="./ner_model",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)


data_collator = DataCollatorForTokenClassification(tokenizer)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
#Metrics Function

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    pred_labels = [[id_to_label[p] for (p, l) in zip(pred, label) if l != -100]
                   for pred, label in zip(predictions, labels)]

    precision, recall, f1, _ = precision_recall_fscore_support(
        [item for sublist in true_labels for item in sublist],
        [item for sublist in pred_labels for item in sublist],
        average="macro"
    )

    return {"precision": precision, "recall": recall, "f1": f1}

In [16]:
#Train the Model
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


metrics = trainer.evaluate()
print(metrics)

  trainer = Trainer(


Step,Training Loss




{'eval_loss': 0.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.0798, 'eval_samples_per_second': 12.528, 'eval_steps_per_second': 12.528, 'epoch': 3.0}


In [17]:
import json
with open("metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)




In [18]:

from transformers import pipeline

# Load the fine-tuned model
model_path = "./ner_model/checkpoint-3"
ner_pipeline = pipeline("ner", model=model_path, tokenizer=model_path, aggregation_strategy="simple")

# Example raw sentence
text = "የቦሌ ሻፍራ አዳዲስ እቃዎች አሉ"

# Run inference
predictions = ner_pipeline(text)
for entity in predictions:
    print(entity)

Device set to use mps:0


{'entity_group': 'LABEL_0', 'score': np.float32(1.0), 'word': 'የቦሌ ሻፍራ አዳዲስ እቃዎች አሉ', 'start': 0, 'end': 20}
