<a href="https://colab.research.google.com/github/Kalze1/Amharic_Named_Entity_Recognition/blob/task-1/notebook/Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report
import torch

In [3]:
# Function to read CoNLL formatted data
def read_conll(file_path):
    sentences = []
    labels = []
    with open(file_path, "r", encoding="utf-8") as file:
        sentence = []
        label = []
        for line in file:
            line = line.strip()
            if not line:  # New sentence
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
            else:
                word, tag = line.split()
                sentence.append(word)
                label.append(tag)
        # Append last sentence
        if sentence:
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

# Load your labeled data (replace 'path/to/your/data.txt' with your actual file path)
sentences, labels = read_conll("../data/labeled_cleaned_tokenized_dataset.conll")

# Create a DataFrame
data = {'tokens': sentences, 'ner_tags': labels}
df = pd.DataFrame(data)

# Convert the DataFrame to a Hugging Face dataset
dataset = Dataset.from_pandas(df)


In [4]:
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 473
})

In [5]:
# Load the pre-trained model and tokenizer
model_name = "xlm-roberta-base"  # Or use any Amharic-supporting model like "bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=6)  # Adjust `num_labels` as per your entities


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ImportError: 
AutoModelForTokenClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


In [13]:
# Print unique labels in the dataset to verify their format
all_labels = [label for sublist in labels for label in sublist]
unique_labels = set(all_labels)
print("Unique labels in the dataset:", unique_labels)


Unique labels in the dataset: {'I-LOC', 'B-PRICE', 'B-LOC', 'O', 'I-PRICE', 'B-PRODUCT'}


In [23]:
# Update the label list based on your actual data
label_list = sorted(unique_labels)  # Sort to match the correct sequence of labels
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}
print("Label to ID Mapping:", label_to_id)


Label to ID Mapping: {'B-LOC': 0, 'B-PRICE': 1, 'B-PRODUCT': 2, 'I-LOC': 3, 'I-PRICE': 4, 'O': 5}


In [15]:
# Tokenize the dataset and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens
            elif word_idx != previous_word_idx:  # Start of a new word
                if label[word_idx] in label_to_id:
                    label_ids.append(label_to_id[label[word_idx]])
                else:
                    print(f"Unmapped label encountered: {label[word_idx]}")  # Debugging
                    label_ids.append(-100)
            else:
                label_ids.append(label_to_id[label[word_idx]] if label[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenizer and label alignment
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/974 [00:00<?, ? examples/s]

In [27]:
tokenized_dataset

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 974
})

In [26]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)




In [18]:
print("Number of labels expected by the model:", model.config.num_labels)
print("Label to ID Mapping:", label_to_id)


Number of labels expected by the model: 5
Label to ID Mapping: {'B-LOC': 0, 'B-PRICE': 1, 'B-PRODUCT': 2, 'I-LOC': 3, 'I-PRICE': 4, 'O': 5}


In [None]:
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer

# Define the train-test split ratio
train_test_split_ratio = 0.8  # 80% for training, 20% for validation

# Split the dataset using the `train_test_split` method from Hugging Face `datasets` library
split_dataset = tokenized_dataset.train_test_split(test_size=1 - train_test_split_ratio, seed=42)

# Access the train and validation datasets from the split result
train_dataset = split_dataset['train']
validation_dataset = split_dataset['test']

# Define the data collator with padding and truncation enabled
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Updated to avoid the warning about evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer with the newly split datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator  # Use the updated data collator
)

# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
# Save the model
model.save_pretrained("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")
