#### TASK 3 : Model Comparison & Selection
#### Using Google CoLab (GPU)

In [1]:
from google.colab import files
uploaded = files.upload()  # This will prompt you to upload files


Saving merged_data.conll to merged_data.conll


In [7]:
files_path=uploaded


In [8]:
# Get the uploaded file name (only one file is expected here)
file_path = list(uploaded.keys())[0]  # First file uploaded

# Step 2: Define a function to parse the .conll file
def read_conll_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    tokens = []
    labels = []
    current_tokens = []
    current_labels = []

    for line in lines:
        line = line.strip()
        if line == "":  # End of a sentence
            if current_tokens:
                tokens.append(current_tokens)
                labels.append(current_labels)
                current_tokens = []
                current_labels = []
        else:
            parts = line.split()  # Assuming the format is: token [space] label
            if len(parts) == 2:
                current_tokens.append(parts[0])
                current_labels.append(parts[1])

    # Add the last sentence if the file doesn't end with a blank line
    if current_tokens:
        tokens.append(current_tokens)
        labels.append(current_labels)

    return {"tokens": tokens, "labels": labels}

# Step 3: Load and process the uploaded .conll file
data = read_conll_file(file_path)

# Step 4: Convert to Hugging Face Dataset
dataset = Dataset.from_dict(data)

# Step 5: Split into train and validation sets
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Step 6: Inspect the dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 12661
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 3166
    })
})


In [9]:
# Part 1: Load the dataset and tokenize
from datasets import Dataset, Features, Value, Sequence, ClassLabel
from transformers import AutoTokenizer

# Load the uploaded .conll file
data = read_conll_file(file_path)

# Get unique labels from the dataset
unique_labels = sorted(list(set(label for sublist in data["labels"] for label in sublist)))

# Define the features for the dataset
features = Features({
    "tokens": Sequence(Value("string")),
    "labels": Sequence(ClassLabel(names=unique_labels)),  # Treat labels as ClassLabel
})

# Convert to Hugging Face Dataset with explicit features
dataset = Dataset.from_dict(data, features=features)

# Split into train and validation sets
dataset = dataset.train_test_split(test_size=0.2, seed=42)

print("Dataset structure:", dataset)

# Load the tokenizer
model_name = "xlm-roberta-base"  # Replace with "bert-tiny-amharic" or "afroxmlr" if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,  # Truncate sequences to the model's max length
        padding="max_length",  # Pad sequences to the model's max length
        max_length=512,  # Set max length (adjust based on your model and dataset)
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:  # Special tokens (e.g., [CLS], [SEP])
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # New word
                label_ids.append(label[word_idx])
            else:  # Same word (subword)
                label_ids.append(-100)  # Use -100 to ignore subwords in the loss function
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

print("Tokenized dataset:", tokenized_dataset)

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 12661
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 3166
    })
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/12661 [00:00<?, ? examples/s]

Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

Tokenized dataset: DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 12661
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3166
    })
})


In [11]:
# Part 2: Set up training arguments
from transformers import AutoModelForTokenClassification, TrainingArguments

# Load the pre-trained model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels)  # Number of unique labels
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save the model
    evaluation_strategy="epoch",  # Evaluate after each epoch
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,  # Number of epochs
    weight_decay=0.01,  # Weight decay for regularization
    save_strategy="epoch",  # Save model after each epoch
    logging_dir="./logs",  # Directory for logs
    logging_steps=10,  # Log every 10 steps
    report_to="none",  # Disable external logging (e.g., to Weights & Biases)
)

print("Training arguments set up successfully!")

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training arguments set up successfully!




In [12]:
# Part 3: Fine-tune the model
from transformers import Trainer

# Define the Trainer
trainer = Trainer(
    model=model,  # The pre-trained model
    args=training_args,  # Training arguments
    train_dataset=tokenized_dataset["train"],  # Training dataset
    eval_dataset=tokenized_dataset["test"],  # Evaluation dataset
    tokenizer=tokenizer,  # Tokenizer
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")

print("Model fine-tuning completed and saved successfully!")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0097,0.007782
2,0.0041,0.002369
3,0.004,0.00204


Model fine-tuning completed and saved successfully!


In [17]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, Trainer
from evaluate import load  # Use the evaluate library
import numpy as np

# Define the compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_labels = [[unique_labels[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [unique_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Compute metrics
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Redefine the Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Include compute_metrics
)
# Evaluate the model
eval_results = trainer.evaluate()

# Print evaluation results
print("Evaluation Results:")
print(f"Precision: {eval_results['eval_precision']:.4f}")
print(f"Recall: {eval_results['eval_recall']:.4f}")
print(f"F1-Score: {eval_results['eval_f1']:.4f}")
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")


  trainer = Trainer(


Evaluation Results:
Precision: 0.9944
Recall: 0.9949
F1-Score: 0.9946
Accuracy: 0.9996


Using the Fine-Tuned Model for **Inference**

In [None]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer
model_path = "./fine-tuned-model"
tokenizer_path = "./fine-tuned-model"

# Create a NER pipeline
ner_pipeline = pipeline(
    "token-classification",
    model=model_path,
    tokenizer=tokenizer_path,
    aggregation_strategy="simple"  # Combine subwords into full words
)

# Function to extract entities from new text
def extract_entities(text):
    # Use the NER pipeline to predict entities
    predictions = ner_pipeline(text)

    # Format the predictions
    entities = []
    for pred in predictions:
        entities.append({
            "entity": pred["entity_group"],  # Entity type (e.g., PRODUCT, PRICE, LOCATION)
            "word": pred["word"],  # Extracted word
            "start": pred["start"],  # Start position in the text
            "end": pred["end"],  # End position in the text
            "score": pred["score"],  # Confidence score
        })

    return entities

# Example: New Amharic text usage
new_text = "በአዲስ አበባ ውስጥ አዲስ ስልክ በ 5000 ብር ይገኛል።"  # Example Amharic text
entities = extract_entities(new_text)

# Print the extracted entities
print("Extracted Entities:")
for entity in entities:
    print(
        f"Entity: {entity['entity']}, Word: {entity['word']}, "
        f"Start: {entity['start']}, End: {entity['end']}, Score: {entity['score']:.2f}"
    )


Device set to use cuda:0


Extracted Entities:
Entity: LABEL_51, Word: በአዲስ, Start: 0, End: 4, Score: 1.00
Entity: LABEL_67, Word: አበባ ውስጥ አዲስ ስልክ በ, Start: 5, End: 22, Score: 1.00
Entity: LABEL_66, Word: 5000, Start: 23, End: 27, Score: 0.86
Entity: LABEL_74, Word: ብር ይገኛል።, Start: 28, End: 36, Score: 1.00


In [None]:
import shutil

# Zip the model directory for download
shutil.make_archive("/content/fine_tuned_model", 'zip', "./fine-tuned-model")


'/content/fine_tuned_model.zip'