#### TASK 3 : Model Comparison & Selection
#### Using Google CoLab (GPU)

In [None]:
# !pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
import pandas as pd
import re
from functools import lru_cache
from contextlib import contextmanager
import time
from datasets import Dataset, Features, Value, Sequence, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline
from evaluate import load
import numpy as np
import shutil
# from google.colab import files

In [4]:
# Function to parse .conll file
def read_conll_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    tokens = []
    labels = []
    current_tokens = []
    current_labels = []

    for line in lines:
        line = line.strip()
        if line == "":  # End of a sentence
            if current_tokens:
                tokens.append(current_tokens)
                labels.append(current_labels)
                current_tokens = []
                current_labels = []
        else:
            parts = line.split()  # Assuming the format is: token [space] label
            if len(parts) == 2:
                current_tokens.append(parts[0])
                current_labels.append(parts[1])

    # Add the last sentence if the file doesn't end with a blank line
    if current_tokens:
        tokens.append(current_tokens)
        labels.append(current_labels)

    return {"tokens": tokens, "labels": labels}

# Load and preprocess the data
data = read_conll_file("C:/Users/ibsan/Desktop/TenX/week-5/data/labeled_messages.conll")
unique_labels = sorted(list(set(label for sublist in data["labels"] for label in sublist)))
print(f"Unique labels: {unique_labels}")

Unique labels: ['B-LOC', 'B-PRICE', 'B-PRODUCT', 'I-PRICE', 'I-PRODUCT', 'O']


In [5]:
# Define the features for the dataset
features = Features({
    "tokens": Sequence(Value("string")),
    "labels": Sequence(ClassLabel(names=unique_labels)),  # Treat labels as ClassLabel
})

# Convert to Hugging Face Dataset with explicit features
dataset = Dataset.from_dict(data, features=features)

# Split into train and validation sets
dataset = dataset.train_test_split(test_size=0.2, seed=42)
print("Dataset structure:", dataset)

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 40
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 10
    })
})


In [14]:
from collections import Counter

# Flatten the list of labels
all_labels = [label for sublist in data["labels"] for label in sublist]

# Count the frequency of each label
label_counts = Counter(all_labels)

# Print the label distribution
print("Label distribution:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

Label distribution:
O: 896
B-PRICE: 105
I-PRICE: 103
B-LOC: 50
B-PRODUCT: 84
I-PRODUCT: 50


In [7]:
# Load the tokenizer
model_name = "xlm-roberta-base"  # Replace with "bert-tiny-amharic" or "afroxmlr" if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,  # Truncate sequences to the model's max length
        padding="max_length",  # Pad sequences to the model's max length
        max_length=512,  # Set max length (adjust based on your model and dataset)
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:  # Special tokens (e.g., [CLS], [SEP])
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # New word
                label_ids.append(label[word_idx])
            else:  # Same word (subword)
                label_ids.append(-100)  # Use -100 to ignore subwords in the loss function
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
print("Tokenized dataset:", tokenized_dataset)

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Tokenized dataset: DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 40
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 10
    })
})


In [12]:
# Load the pre-trained model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels)  # Number of unique labels
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="C:/Users/ibsan/Desktop/TenX/week-5/model_output/results",  # Directory to save the model
    evaluation_strategy="epoch",  # Evaluate after each epoch
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,  # Number of epochs
    weight_decay=0.01,  # Weight decay for regularization
    save_strategy="epoch",  # Save model after each epoch
    logging_dir="C:/Users/ibsan/Desktop/TenX/week-5/model_output/logs",  # Directory for logs
    logging_steps=10,  # Log every 10 steps
    report_to="none",  # Disable external logging (e.g., to Weights & Biases)
)

print("Training arguments set up successfully!")

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training arguments set up successfully!




In [13]:
# Define the Trainer
trainer = Trainer(
    model=model,  # The pre-trained model
    args=training_args,  # Training arguments
    train_dataset=tokenized_dataset["train"],  # Training dataset
    eval_dataset=tokenized_dataset["test"],  # Evaluation dataset
    tokenizer=tokenizer,  # Tokenizer
    compute_metrics=compute_metrics,  # Add this line to include the metrics function
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")

print("Model fine-tuning completed and saved successfully!")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.648995,0.057851,0.1,0.073298,0.428571
2,No log,1.441319,0.0,0.0,0.0,0.69112
3,No log,1.312345,0.0,0.0,0.0,0.69112


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model fine-tuning completed and saved successfully!


In [17]:
# Load the evaluation metric
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

# Define the compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_labels = [[unique_labels[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [unique_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Compute metrics with zero_division=0 to avoid warnings
    precision = precision_score(true_labels, true_predictions, zero_division=0)
    recall = recall_score(true_labels, true_predictions, zero_division=0)
    f1 = f1_score(true_labels, true_predictions, zero_division=0)
    accuracy = accuracy_score(true_labels, true_predictions)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

# Evaluate the model
eval_results = trainer.evaluate()

# Print evaluation results
print("Evaluation Results:")
print(f"Precision: {eval_results['eval_precision']:.4f}")
print(f"Recall: {eval_results['eval_recall']:.4f}")
print(f"F1-Score: {eval_results['eval_f1']:.4f}")
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")

Evaluation Results:
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000
Accuracy: 0.6911


Using the Fine-Tuned Model for **Inference**

In [18]:
# Create a NER pipeline
ner_pipeline = pipeline(
    "token-classification",
    model="./fine-tuned-model",
    tokenizer="./fine-tuned-model",
    aggregation_strategy="simple"  # Combine subwords into full words
)

# Function to extract entities from new text
def extract_entities(text):
    predictions = ner_pipeline(text)
    entities = []
    for pred in predictions:
        entities.append({
            "entity": pred["entity_group"],  # Entity type (e.g., PRODUCT, PRICE, LOCATION)
            "word": pred["word"],  # Extracted word
            "start": pred["start"],  # Start position in the text
            "end": pred["end"],  # End position in the text
            "score": pred["score"],  # Confidence score
        })
    return entities

# Example: New Amharic text
new_text = "በአዲስ አበባ ውስጥ አዲስ ስልክ በ 5000 ብር ይገኛል።"  # Example Amharic text
entities = extract_entities(new_text)

# Print the extracted entities
print("Extracted Entities:")
for entity in entities:
    print(
        f"Entity: {entity['entity']}, Word: {entity['word']}, "
        f"Start: {entity['start']}, End: {entity['end']}, Score: {entity['score']:.2f}"
    )

Device set to use cpu


Extracted Entities:
Entity: LABEL_5, Word: በአዲስ አበባ ውስጥ አዲስ ስልክ በ 5000 ብር ይገኛል።, Start: 0, End: 36, Score: 0.25


In [None]:
# saving the model in a zip file
import shutil
import os

# Define the custom directory where the zipped model will be saved
custom_directory = "C:/Users/ibsan/Desktop/TenX/week-5/model_output"

# Define the name of the zipped file (without the .zip extension)
zip_file_name = "fine_tuned_model"

# Full path for the output zip file
output_zip_path = os.path.join(custom_directory, zip_file_name)

# Ensure the custom directory exists (create it if it doesn't)
os.makedirs(custom_directory, exist_ok=True)

# Define the source directory (the directory to be zipped)
source_dir = "C:/Users/ibsan/Desktop/TenX/week-5/fine-tuned-model"  # Update this path if needed

# Check if the source directory exists
if not os.path.exists(source_dir):
    print(f"Error: The directory '{source_dir}' does not exist.")
    # Optionally, create the directory if it doesn't exist
    os.makedirs(source_dir, exist_ok=True)
    print(f"Created directory: '{source_dir}'")
else:
    # Zip the source directory and save it to the custom directory
    shutil.make_archive(output_zip_path, 'zip', source_dir)
    print(f"Model saved as {output_zip_path}.zip")

Model saved as C:/Users/ibsan/Desktop/TenX/week-5/model_output\fine_tuned_model.zip
