In [29]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split
import torch


In [21]:
# Step 3: Load CoNLL-formatted data
def load_conll_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read().strip().split('\n\n')
    
    sentences, labels = [], []
    for sentence in data:
        words, tags = [], []
        for line in sentence.split('\n'):
            word, tag = line.split()
            words.append(word)
            tags.append(tag)
        sentences.append(words)
        labels.append(tags)
    
    return sentences, labels

In [31]:
file_path = "../data/merged_amharic_ner_data.conll"  # Update with your actual file path
sentences, labels = load_conll_data(file_path)
print(f"Loaded {len(sentences)} sentences.")


Loaded 6454 sentences.


In [34]:
#Step 4: Prepare the Dataset
def prepare_dataset(sentences, labels):
    df = pd.DataFrame({'tokens': sentences, 'ner_tags': labels})
    dataset = Dataset.from_pandas(df)
    return dataset

dataset = prepare_dataset(sentences, labels)


In [35]:
# Step 5: Define label mappings (label to id and id to label)
def get_label_encodings(labels):
    unique_labels = set()
    for label_list in labels:
        unique_labels.update(label_list)
    label_list = sorted(unique_labels)
    label2id = {label: idx for idx, label in enumerate(label_list)}
    id2label = {idx: label for label, idx in label2id.items()}
    return label_list, label2id, id2label

label_list, label2id, id2label = get_label_encodings(labels)
num_labels = len(label_list)
print("Labels:", label_list)


Labels: ['B-LOC', 'B-PRICE', 'B-PROD', 'B-PRODUCT', 'B-Price', 'B-Product', 'I-LOC', 'I-PRICE', 'I-PRODUCT', 'I-Price', 'I-Product', 'O']


In [26]:
# Step 6: Load the pre-trained model and tokenizer
model_name = "xlm-roberta-base"  # Change to your preferred model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)




In [27]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


In [28]:
# Step 8: Set up the trainer
# Convert to Hugging Face DatasetDict
from datasets import DatasetDict

dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")


In [30]:
# Step 9: Set Up Training Arguments

training_args = TrainingArguments(
    output_dir="./results",          # Output directory
    evaluation_strategy="epoch",     # Evaluate every epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay
    logging_dir='./logs',            # Directory for logs
    logging_steps=10,
    save_total_limit=2,              # Limit the total amount of checkpoints
    load_best_model_at_end=True,     # Load the best model when finished training
    metric_for_best_model="f1",      # Use F1 score to select the best model
    greater_is_better=True
)


In [None]:
#Step 10: Define Evaluation Metrics
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    
    true_labels = [[id2label[label_id] for label_id in label if label_id != -100] for label in labels]
    true_predictions = [[id2label[pred_id] for pred_id, label_id in zip(prediction, label) if label_id != -100]
                        for prediction, label in zip(predictions, labels)]
    
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [None]:
#Step 11: Initialize the Trainer
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:
#Step 12: Train the Model
trainer.train()


In [None]:
# Step 13: Evaluate the Model
evaluation_results = trainer.evaluate()
print(evaluation_results)


In [None]:
#Step 14: Save the Model
model.save_pretrained("./fine_tuned_ner_model")
tokenizer.save_pretrained("./fine_tuned_ner_model")


In [None]:
#Step 15: Perform Predictions with the Fine-Tuned Model
def predict_ner(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [id2label[prediction.item()] for prediction in predictions[0]]
    
    return list(zip(tokens, predicted_labels))


In [None]:
sample_text = "ምርቶች ከአዲስ አበባ በቅናሽ ዋጋ ይሰጣሉ።"
prediction = predict_ner(sample_text, model, tokenizer)
print(prediction)
