In [None]:
!pip install transformers datasets seqeval


In [3]:
# NOTE(review): presumably needed by the transformers `Trainer` used later in
# this notebook — confirm, and consider pinning a version for reproducibility.
!pip install accelerate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [4]:
from google.colab import files

# Ask for a file upload and keep the result in `uploaded`, then list the working
# directory. Gotcha: in the recorded run a repeat upload of "conll_dataset.txt"
# was saved as "conll_dataset (2).txt", so re-running this cell does not replace
# the existing "conll_dataset.txt".
uploaded = files.upload()  # Upload your .txt CoNLL file
!ls  # Confirm file upload


Saving conll_dataset.txt to conll_dataset (2).txt
'conll_dataset (1).txt'   conll_dataset.txt    sample_data
'conll_dataset (2).txt'   labeled_tokens.csv


In [23]:
from datasets import Dataset
import pandas as pd

def parse_conll_to_dataset(file_path):
    """Read a CoNLL-style text file into a ``Dataset`` with one row per sentence.

    Every non-blank line is split on whitespace; the first field is the token
    and the second field is its tag. One or more blank lines end the current
    sentence. A trailing sentence without a final blank line is kept.

    Args:
        file_path: Path of the text file. It is decoded as UTF-8, and a leading
            byte-order mark (if any) is dropped so it cannot end up glued to
            the first token.

    Returns:
        The result of ``Dataset.from_dict`` with two columns, ``tokens`` and
        ``ner_tags``, each holding one list per sentence.

    Raises:
        ValueError: If a non-blank line has fewer than two fields. The message
            names the file and the 1-based line number.
    """
    all_tokens = []
    all_labels = []
    tokens = []
    labels = []

    # 'utf-8-sig' reads plain UTF-8 unchanged and strips a BOM when present.
    with open(file_path, encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()

            if not fields:
                # Blank line: close the sentence collected so far (if any).
                if tokens:
                    all_tokens.append(tokens)
                    all_labels.append(labels)
                    tokens = []
                    labels = []
                continue

            if len(fields) < 2:
                # Fail with a pointer to the offending line instead of an
                # opaque IndexError from fields[1].
                raise ValueError(
                    f"{file_path}, line {line_no}: expected '<token> <label>', "
                    f"got {line.strip()!r}"
                )

            tokens.append(fields[0])
            labels.append(fields[1])

    # The file may end without a blank line after the last sentence.
    if tokens:
        all_tokens.append(tokens)
        all_labels.append(labels)

    data = {'tokens': all_tokens, 'ner_tags': all_labels}
    return Dataset.from_dict(data)

# Parse the CoNLL file into sentences and show a summary of the result.
CONLL_PATH = "conll_dataset.txt"
dataset = parse_conll_to_dataset(CONLL_PATH)
print(dataset)



In [24]:
# Collect every tag that occurs in the corpus, then give each one an integer id
# that follows the sorted order of the tag strings.
unique_labels = {tag for sentence_tags in dataset['ner_tags'] for tag in sentence_tags}
label_list = sorted(unique_labels)
print("Labels:", label_list)

# Forward (tag -> id) and reverse (id -> tag) lookups over the same ordering.
label_to_id = {tag: idx for idx, tag in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))



Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Checkpoints to compare; later cells refer to them by position in this list.
model_names = [
    "xlm-roberta-base",
    "distilbert-base-multilingual-cased",
]

# One tokenizer per checkpoint, loaded in the same order as `model_names`.
tokenizer_xlm, tokenizer_distil = (
    AutoTokenizer.from_pretrained(checkpoint) for checkpoint in model_names
)

def tokenize_and_align_labels(examples, tokenizer, label_map=None):
    """Tokenize pre-split sentences and expand word-level tags to sub-tokens.

    Args:
        examples: Batch with ``tokens`` (list of word lists) and ``ner_tags``
            (list of tag-string lists, parallel to ``tokens``).
        tokenizer: Callable invoked with the word lists; its result must offer
            ``word_ids(batch_index=...)`` and item assignment.
        label_map: Optional mapping from tag string to integer id. When omitted,
            the notebook-level ``label_to_id`` mapping is used.

    Returns:
        The tokenizer output with an added ``labels`` entry: one list of ids
        per sentence, aligned with the sub-tokens. Positions that have no word
        (``word_ids`` yields ``None``) get ``-100``. The first sub-token of a
        word gets the word's tag. Later sub-tokens of a ``B-XXX`` word get
        ``I-XXX``, or ``-100`` when ``I-XXX`` is not in the mapping; later
        sub-tokens of any other word repeat the word's tag.
    """
    if label_map is None:
        label_map = label_to_id

    tokenized_inputs = tokenizer(examples['tokens'],
                                 is_split_into_words=True,
                                 truncation=True,
                                 padding='max_length',
                                 max_length=128)

    labels = []
    for i, word_labels in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # ignore token for loss
            elif word_idx != previous_word_idx:
                # First sub-token of a word keeps the word's own tag.
                label_ids.append(label_map[word_labels[word_idx]])
            else:
                current_label = word_labels[word_idx]
                if current_label.startswith("B-"):
                    # Swap only the prefix: str.replace would also rewrite a
                    # "B-" inside the entity name (e.g. "B-SUB-X").
                    inside_label = "I-" + current_label[2:]
                    # A corpus may contain B-XXX without ever containing
                    # I-XXX; ignore such sub-tokens instead of raising KeyError.
                    label_ids.append(label_map.get(inside_label, -100))
                else:
                    label_ids.append(label_map[current_label])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize the corpus once per tokenizer. `fn_kwargs` hands the tokenizer to the
# alignment function on every batch.
tokenized_dataset_xlm = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer_xlm},
)
tokenized_dataset_distil = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer_distil},
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

In [21]:
# Draw ONE random train/validation partition of row positions and apply it to
# both tokenized datasets, so the two models are trained and evaluated on
# exactly the same sentences.
# The datasets here are built with `Dataset.from_dict`, so they carry no
# `__index_level_0__` column (that column only appears when a dataset is made
# from a pandas frame with a custom index). The row positions are therefore
# carried through the split explicitly.
row_split = Dataset.from_dict(
    {"row": list(range(len(tokenized_dataset_xlm)))}
).train_test_split(test_size=0.2, seed=42)
train_rows = list(row_split['train']['row'])
val_rows = list(row_split['test']['row'])

train_dataset_xlm = tokenized_dataset_xlm.select(train_rows)
val_dataset_xlm = tokenized_dataset_xlm.select(val_rows)

# For DistilBERT, same split:
train_dataset_distil = tokenized_dataset_distil.select(train_rows)
val_dataset_distil = tokenized_dataset_distil.select(val_rows)


In [22]:

from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate
import numpy as np

def fine_tune_ner_model(model_name, train_ds, val_ds, label_list, report_to="none"):
    """Fine-tune a token-classification checkpoint and evaluate it.

    Args:
        model_name: Checkpoint identifier given to ``from_pretrained``. The part
            after the last ``/`` names the output and log directories.
        train_ds: Dataset passed to ``Trainer`` as ``train_dataset``.
        val_ds: Dataset passed to ``Trainer`` as ``eval_dataset``.
        label_list: Tag strings; the position of a tag is its class id.
        report_to: Forwarded to ``TrainingArguments(report_to=...)``. Defaults
            to ``"none"`` so training does not stop at an interactive
            experiment-tracker login prompt; pass e.g. ``"wandb"`` to opt in.

    Returns:
        Tuple ``(eval_results, model)``: the dict returned by
        ``trainer.evaluate()`` and the trained model object.
    """
    import inspect

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Store the tag names in the model config so a saved checkpoint reports
    # real tags instead of generic LABEL_<n> names.
    id2label = dict(enumerate(label_list))
    label2id = {tag: idx for idx, tag in id2label.items()}
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
    )

    metric = evaluate.load("seqeval")

    def compute_metrics(p):
        """Turn (logits, label ids) into seqeval precision/recall/F1/accuracy."""
        logits, gold_ids = p
        pred_ids = np.argmax(logits, axis=2)

        # Drop every position whose gold id is -100 (ignored by the loss).
        true_labels = [
            [label_list[gold] for gold in gold_row if gold != -100]
            for gold_row in gold_ids
        ]
        true_preds = [
            [label_list[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
            for pred_row, gold_row in zip(pred_ids, gold_ids)
        ]

        results = metric.compute(predictions=true_preds, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    run_name = model_name.split('/')[-1]

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever the installed version accepts.
    args_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=f"./results-{run_name}",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=f"./logs-{run_name}",
        logging_steps=10,
        load_best_model_at_end=True,
        report_to=report_to,
        **{eval_key: "epoch"},
    )

    # Likewise, `Trainer(tokenizer=...)` was superseded by `processing_class`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results, model


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Fine-tune each checkpoint on its own tokenized split, then print the
# evaluation metrics returned for both.
results_xlm, model_xlm = fine_tune_ner_model(
    model_name=model_names[0],
    train_ds=train_dataset_xlm,
    val_ds=val_dataset_xlm,
    label_list=label_list,
)
results_distil, model_distil = fine_tune_ner_model(
    model_name=model_names[1],
    train_ds=train_dataset_distil,
    val_ds=val_dataset_distil,
    label_list=label_list,
)

for heading, metrics in (
    ("XLM-Roberta Results:", results_xlm),
    ("DistilBERT Results:", results_distil),
):
    print(heading, metrics)


NameError: name 'tokenized_dataset' is not defined

In [None]:
# Write the trained XLM-R model and its tokenizer into the same directory.
BEST_MODEL_DIR = "./best_model_xlm"
for artifact in (model_xlm, tokenizer_xlm):
    artifact.save_pretrained(BEST_MODEL_DIR)
