In [32]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split
import torch
import numpy as np
import evaluate



In [33]:
# Step 3: Load CoNLL-formatted data
def load_conll_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read().strip().split('\n\n')

    sentences, labels = [], []
    for sentence in data:
        words, tags = [], []
        for line in sentence.split('\n'):
            word, tag = line.split()
            words.append(word)
            tags.append(tag)
        sentences.append(words)
        labels.append(tags)

    return sentences, labels

In [34]:
file_path = "/content/drive/MyDrive/merged_amharic_ner_data.conll"  # Update with your actual file path
sentences, labels = load_conll_data(file_path)
print(f"Loaded {len(sentences)} sentences.")


Loaded 6450 sentences.


In [35]:
#Step 4: Prepare the Dataset
def prepare_dataset(sentences, labels):
    df = pd.DataFrame({'tokens': sentences, 'ner_tags': labels})
    dataset = Dataset.from_pandas(df)
    return dataset

dataset = prepare_dataset(sentences, labels)


In [36]:
# Step 5: Define label mappings (label to id and id to label)
def get_label_encodings(labels):
    unique_labels = set()
    for label_list in labels:
        unique_labels.update(label_list)
    label_list = sorted(unique_labels)
    label2id = {label: idx for idx, label in enumerate(label_list)}
    id2label = {idx: label for label, idx in label2id.items()}
    return label_list, label2id, id2label

label_list, label2id, id2label = get_label_encodings(labels)
num_labels = len(label_list)
print("Labels:", label_list)


Labels: ['B-LOC', 'B-PRICE', 'B-PROD', 'B-PRODUCT', 'B-Price', 'B-Product', 'I-LOC', 'I-PRICE', 'I-PRODUCT', 'I-Price', 'I-Product', 'O']


In [37]:
# Step 6: Load the pre-trained model and tokenizer
model_name = "xlm-roberta-base"  # Change to your preferred model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/6450 [00:00<?, ? examples/s]

In [39]:
# Step 8: Set up the trainer
# Convert to Hugging Face DatasetDict
from datasets import DatasetDict

dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")


Training samples: 5160
Validation samples: 1290


In [40]:
# Step 9: Set Up Training Arguments

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",              # Output directory
    eval_strategy="epoch",               # Updated to use eval_strategy instead of deprecated evaluation_strategy
    save_strategy="epoch",               # Ensuring save strategy matches eval strategy
    learning_rate=2e-5,                  # Learning rate
    per_device_train_batch_size=16,      # Batch size for training
    per_device_eval_batch_size=16,       # Batch size for evaluation
    num_train_epochs=3,                  # Number of epochs
    weight_decay=0.01,                   # Weight decay
    load_best_model_at_end=True,         # Load the best model at the end of training
)



In [43]:
# Step 10: Define Evaluation Metrics
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_labels = [[id2label[label_id] for label_id in label if label_id != -100] for label in labels]
    true_predictions = [[id2label[pred_id] for pred_id, label_id in zip(prediction, label) if label_id != -100]
                        for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [44]:
#Step 11: Initialize the Trainer
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [45]:
#Step 12: Train the Model
trainer.train()


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.058906,0.869427,0.890338,0.879758,0.983278
2,0.174800,0.03965,0.894623,0.92927,0.911618,0.988379
3,0.174800,0.025382,0.936635,0.952099,0.944304,0.993293


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=969, training_loss=0.1060663947626041, metrics={'train_runtime': 1071.0222, 'train_samples_per_second': 14.453, 'train_steps_per_second': 0.905, 'total_flos': 2148445507590144.0, 'train_loss': 0.1060663947626041, 'epoch': 3.0})

In [46]:
# Step 13: Evaluate the Model
evaluation_results = trainer.evaluate()
print(evaluation_results)


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.02538190968334675, 'eval_precision': 0.9366352516543012, 'eval_recall': 0.9520994700366898, 'eval_f1': 0.9443040533710706, 'eval_accuracy': 0.993292673415589, 'eval_runtime': 21.8664, 'eval_samples_per_second': 58.995, 'eval_steps_per_second': 3.704, 'epoch': 3.0}


In [47]:
#Step 14: Save the Model
model.save_pretrained("./fine_tuned_ner_model")
tokenizer.save_pretrained("./fine_tuned_ner_model")


('./fine_tuned_ner_model/tokenizer_config.json',
 './fine_tuned_ner_model/special_tokens_map.json',
 './fine_tuned_ner_model/sentencepiece.bpe.model',
 './fine_tuned_ner_model/added_tokens.json',
 './fine_tuned_ner_model/tokenizer.json')

In [50]:
def predict_ner(text, model, tokenizer, id2label):
    # Set the model to evaluation mode
    model.eval()

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Move inputs to the same device as the model
    device = next(model.parameters()).device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)

    # Get the predicted labels
    predictions = torch.argmax(outputs.logits, dim=2)

    # Convert predictions to labels
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [id2label[prediction.item()] for prediction in predictions[0]]

    # Return tokens and their corresponding predicted labels
    return list(zip(tokens, predicted_labels))


In [52]:
# Sample input text
sample_text = "ምርቶች ከአዲስ አበባ በቅናሽ ዋጋ ይሰጣሉ።"

# Call the predict function
predictions = predict_ner(sample_text, model, tokenizer, id2label)

# Print predictions
for token, label in predictions:
    print(f"{token}: {label}")


<s>: O
▁ምርቶች: O
▁ከአዲስ: O
▁አበባ: O
▁በ: O
ቅና: O
ሽ: O
▁ዋጋ: O
▁ይሰ: O
ጣ: O
ሉ።: O
</s>: O
