In [5]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report
import torch
import shap

In [6]:
!pip install transformers datasets seqeval



In [None]:
# ===============================
# 1. Import Libraries and Setup
# ===============================
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import json
import shap  # Install if needed: !pip install shap

# ===============================
# 2. Load and Prepare Dataset
# ===============================
# Define the read_conll function
def read_conll(file_path):
    sentences, labels = [], []
    with open(file_path, "r", encoding="utf-8") as file:
        sentence, label = [], []
        for line in file:
            line = line.strip()
            if not line:  # New sentence
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                sentence, label = [], []
            else:
                word, tag = line.split()
                sentence.append(word)
                label.append(tag)
        if sentence:  # Last sentence
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

# Load the labeled dataset
file_path = "/content/drive/MyDrive/data/labeled_cleaned_tokenized_dataset.conll"
sentences, labels = read_conll(file_path)

# Create Dataset
data = {"tokens": sentences, "ner_tags": labels}
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

# ===============================
# 3. Model and Tokenizer Setup
# ===============================
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=6)

# ===============================
# 4. Verify and Map Labels
# ===============================
# Check unique labels
unique_labels = set(label for sublist in labels for label in sublist)
label_list = sorted(unique_labels)
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}
print("Label to ID Mapping:", label_to_id)

# ===============================
# 5. Tokenization and Label Alignment
# ===============================
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] in label_to_id:
                label_ids.append(label_to_id[label[word_idx]])
            else:
                print(f"Unmapped label: {label[word_idx]}")
                label_ids.append(-100)
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# ===============================
# 6. Train-Test Split
# ===============================
train_test_split_ratio = 0.8
split_dataset = tokenized_dataset.train_test_split(test_size=1 - train_test_split_ratio, seed=42)
train_dataset = split_dataset["train"]
validation_dataset = split_dataset["test"]

# ===============================
# 7. Training Arguments
# ===============================
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
)

# ===============================
# 8. Fine-Tuning with Trainer
# ===============================
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
)

trainer.train()

# ===============================
# 9. Evaluate Fine-Tuned Model
# ===============================
def evaluate_model(trainer, validation_dataset, id_to_label):
    predictions, labels, _ = trainer.predict(validation_dataset)
    predictions = predictions.argmax(-1)
    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    predicted_labels = [[id_to_label[p] for p in pred if p != -100] for pred in predictions]
    from seqeval.metrics import classification_report
    print(classification_report(true_labels, predicted_labels))

evaluate_model(trainer, validation_dataset, id_to_label)

# ===============================
# 10. Baseline Model Comparison
# ===============================
baseline_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=6)
baseline_trainer = Trainer(
    model=baseline_model,
    args=training_args,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
)
print("Baseline Model Performance:")
evaluate_model(baseline_trainer, validation_dataset, id_to_label)

# ===============================
# 11. Save Metrics and Model
# ===============================
fine_tuned_metrics = trainer.evaluate()
with open("fine_tuned_metrics.json", "w") as f:
    json.dump(fine_tuned_metrics, f, indent=4)

baseline_metrics = baseline_trainer.evaluate()
with open("baseline_metrics.json", "w") as f:
    json.dump(baseline_metrics, f, indent=4)

model.save_pretrained("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")

# ===============================
# 12. SHAP for Interpretability
# ===============================
explainer = shap.Explainer(model, tokenizer)
sample = validation_dataset[0]
inputs = tokenizer(sample["tokens"], return_tensors="pt", is_split_into_words=True, truncation=True)
shap_values = explainer(inputs)
shap.summary_plot(shap_values, inputs)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Label to ID Mapping: {'B-LOC': 0, 'B-PRODUCT': 1, 'I-LOC': 2, 'I-PRICE': 3, 'O': 4}


Map:   0%|          | 0/473 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.0256,0.009661


In [None]:
    # Function to read CoNLL formatted data
def read_conll(file_path):
    sentences = []
    labels = []
    with open(file_path, "r", encoding="utf-8") as file:
        sentence = []
        label = []
        for line in file:
            line = line.strip()
            if not line:  # New sentence
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
            else:
                word, tag = line.split()
                sentence.append(word)
                label.append(tag)
        # Append last sentence
        if sentence:
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

# Load your labeled data (replace 'path/to/your/data.txt' with your actual file path)
sentences, labels = read_conll("/content/drive/MyDrive/data/labeled_cleaned_tokenized_dataset.conll")

# Create a DataFrame
data = {'tokens': sentences, 'ner_tags': labels}
df = pd.DataFrame(data)

# Convert the DataFrame to a Hugging Face dataset
dataset = Dataset.from_pandas(df)


In [None]:
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 473
})

In [None]:
# Check unique labels
# Set the label list and mapping
all_labels = [label for sublist in labels for label in sublist]
unique_labels = sorted(set(all_labels))
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
num_labels = len(unique_labels)
print("Label to ID Mapping:", label_to_id)

Label to ID Mapping: {'B-LOC': 0, 'B-PRODUCT': 1, 'I-LOC': 2, 'I-PRICE': 3, 'O': 4}


In [None]:
# Load the pre-trained model and tokenizer
model_name = "xlm-roberta-base"  # Or use any Amharic-supporting model like "bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)  # Adjust `num_labels` as per your entities

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the dataset and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",  # Ensure consistent length
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map subwords to original words
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                label_ids.append(-100)
            elif word_id != previous_word_id:  # First subword of a word
                label_ids.append(label_to_id[label[word_id]])
            else:  # Subsequent subwords
                label_ids.append(-100)
            previous_word_id = word_id
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Apply the tokenizer and label alignment
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/473 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/473 [00:00<?, ? examples/s]

In [None]:
#  Train-Test Split

train_test_split_ratio = 0.8
split_dataset = tokenized_dataset.train_test_split(test_size=1 - train_test_split_ratio, seed=42)
train_dataset = split_dataset["train"]
validation_dataset = split_dataset["test"]

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
)




In [None]:
# Load DistilBERT for token classification
model_name_distilbert = "distilbert-base-multilingual-cased"
tokenizer_distilbert = AutoTokenizer.from_pretrained(model_name_distilbert)
model_distilbert = AutoModelForTokenClassification.from_pretrained(model_name_distilbert, num_labels=num_labels)

trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer_distilbert,
    data_collator=data_collator
)

trainer_distilbert.train()


# Save the fine-tuned DistilBERT model and tokenizer
save_path_distilbert = "/content/drive/MyDrive/data/fine-tuned-model-distilbert"
model_distilbert.save_pretrained(save_path_distilbert)
tokenizer_distilbert.save_pretrained(save_path_distilbert)

NameError: name 'AutoTokenizer' is not defined

In [None]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
print("Number of labels expected by the model:", model.config.num_labels)
print("Label to ID Mapping:", label_to_id)


In [None]:
all_labels = [label for example in tokenized_dataset for label in example['labels']]
print("Minimum label value:", min(all_labels))
print("Maximum label value:", max(all_labels))
print("Labels outside range:", [label for label in all_labels if label < -100 or label >= num_labels])
print("Number of unique labels in the dataset:", len(label_to_id))
print("Number of labels configured in the model:", model.config.num_labels)


Minimum label value: -100
Maximum label value: 4
Labels outside range: []
Number of unique labels in the dataset: 5
Number of labels configured in the model: 5


In [None]:
cleaned_dataset = tokenized_dataset.filter(
    lambda example: all(label in range(-100, num_labels) for label in example['labels'])
)


Filter:   0%|          | 0/473 [00:00<?, ? examples/s]

In [None]:
# Define the trainer with the newly split datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()


NameError: name 'Trainer' is not defined

In [None]:
save_path_distilbert = "/content/drive/MyDrive/data/fine-tuned-model-distilbert"
model_distilbert.save_pretrained(save_path_distilbert)
tokenizer_distilbert.save_pretrained(save_path_distilbert)

NameError: name 'model_distilbert' is not defined

In [None]:
from seqeval.metrics import classification_report

# Function to evaluate the model and print classification report
def evaluate_model(model, tokenizer, validation_dataset):
    # Create a new trainer object for evaluation
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # Get predictions on the validation dataset
    predictions, labels, _ = trainer.predict(validation_dataset)

    # Convert logits to predicted labels
    predictions = np.argmax(predictions, axis=2)

    # Align predictions and true labels (skip padding labels -100)
    true_labels = [
        [id_to_label[label] for label in label_seq if label != -100]
        for label_seq in labels
    ]
    predicted_labels = [
        [id_to_label[pred] for pred, label in zip(pred_seq, label_seq) if label != -100]
        for pred_seq, label_seq in zip(predictions, labels)
    ]

    # Generate classification report
    report = classification_report(true_labels, predicted_labels)
    print(report)

    return report

In [None]:
print("Evaluating DistilBERT:")
distilbert_report = evaluate_model(model_distilbert, tokenizer_distilbert, validation_dataset)

Evaluating DistilBERT:


NameError: name 'model_distilbert' is not defined

In [None]:
#Baseline Model Comparison
# ===============================
baseline_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=6)
baseline_trainer = Trainer(
    model=baseline_model,
    args=training_args,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
)
print("Baseline Model Performance:")
evaluate_model(baseline_trainer, validation_dataset, id_to_label)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Baseline Model Performance:


              precision    recall  f1-score   support

         LOC       0.00      0.01      0.00        93
       PRICE       0.00      0.00      0.00         1
     PRODUCT       0.00      0.01      0.01        94

   micro avg       0.00      0.01      0.00       188
   macro avg       0.00      0.01      0.00       188
weighted avg       0.00      0.01      0.00       188



In [None]:
# Save the model
model.save_pretrained("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")

('./fine-tuned-model/tokenizer_config.json',
 './fine-tuned-model/special_tokens_map.json',
 './fine-tuned-model/sentencepiece.bpe.model',
 './fine-tuned-model/added_tokens.json',
 './fine-tuned-model/tokenizer.json')

In [None]:
from collections import Counter

all_labels = [label for sublist in labels for label in sublist]
print("Label distribution:", Counter(all_labels))


Label distribution: Counter({'O': 5840, 'I-LOC': 2792, 'B-PRODUCT': 467, 'B-LOC': 465, 'I-PRICE': 1})
