In [5]:
!pip install transformers datasets seqeval evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
!pip install -U transformers datasets seqeval evaluate
!pip uninstall -y transformers tokenizers
!pip install transformers tokenizers

Collecting transformers
  Using cached transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Using cached tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.45.1-py3-none-any.whl (9.9 MB)
Using cached tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.20.0 transformers-4.45.1
Found existing installation: transformers 4.45.1
Uninstalling transformers-4.45.1:
  Successfully uninstalled transformers-4.45.1
Found existing installation: tokenizers 0.20.0
Uninstalling tokenizers-0.20.0:
  Successfully uninstalled tokenizers-0.20.0
Collecting transformers
  Using cached transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers
  Using cached tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadat

In [3]:
import torch
from transformers import XLMRobertaForTokenClassification, DistilBertForTokenClassification, BertForTokenClassification
from transformers import XLMRobertaTokenizerFast, DistilBertTokenizerFast, BertTokenizerFast
from datasets import Dataset
from transformers import Trainer, TrainingArguments
import numpy as np
import pandas as pd
import evaluate

In [4]:
# Load CoNLL formatted data
def read_conll(file_path):
    sentences = []
    labels = []
    sentence = []
    label = []
    with open(file_path, 'r') as f:
        for line in f:
            if line == "\n":
                sentences.append(sentence)
                labels.append(label)
                sentence = []
                label = []
            else:
                word, tag = line.strip().split()
                sentence.append(word)
                label.append(tag)
    return sentences, labels

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Load your dataset
file_path = '/content/drive/MyDrive/labeled_data.conll'
sentences, labels = read_conll(file_path)

In [7]:
# Convert data to a pandas DataFrame for easier manipulation
data = pd.DataFrame({'sentence': sentences, 'label': labels})

In [11]:
# Create a label-to-id mapping (label encoding)
unique_labels = list(set([label for sublist in labels for label in sublist]))
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

In [15]:
# Preprocess the dataset for model training
def tokenize_and_align_labels(tokenizer, sentences, labels):
    tokenized_inputs = tokenizer(sentences, truncation=True, is_split_into_words=True, padding=True)
    label_all_tokens = True
    new_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their corresponding word in the original sentence
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word
                # Convert label to integer using label_to_id
                label_ids.append(label_to_id[label[word_idx]])
            else:
                label_ids.append(-100 if label_all_tokens else label_to_id[label[word_idx]])
            previous_word_idx = word_idx
        new_labels.append(label_ids)
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [16]:
# Load fast tokenizers
xlm_tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased")
bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")



In [17]:
# Tokenize the dataset
xlm_dataset = Dataset.from_dict(tokenize_and_align_labels(xlm_tokenizer, sentences, labels))
distilbert_dataset = Dataset.from_dict(tokenize_and_align_labels(distilbert_tokenizer, sentences, labels))
bert_dataset = Dataset.from_dict(tokenize_and_align_labels(bert_tokenizer, sentences, labels))

In [18]:
# Define the models for fine-tuning
xlm_model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(label_to_id))
distilbert_model = DistilBertForTokenClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=len(label_to_id))
bert_model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_to_id))

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Load metrics for evaluation using 'evaluate' library
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [23]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Ensure predictions and labels are properly aligned, ignoring special tokens (-100)
    true_labels = [
        [id_to_label[label] for label in label_seq if label != -100]
        for label_seq in labels
    ]
    true_predictions = [
        [id_to_label[pred] for pred, label in zip(pred_seq, label_seq) if label != -100]
        for pred_seq, label_seq in zip(predictions, labels)
    ]

    # Now, compute the evaluation metrics
    results = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "accuracy": results["overall_accuracy"],
        "f1": results["overall_f1"],
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
    }


# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [24]:
# Define a trainer for each model
def train_model(model, dataset, tokenizer):
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    return trainer


In [25]:
# Train and evaluate each model
xlm_trainer = train_model(xlm_model, xlm_dataset, xlm_tokenizer)
distilbert_trainer = train_model(distilbert_model, distilbert_dataset, distilbert_tokenizer)
bert_trainer = train_model(bert_model, bert_dataset, bert_tokenizer)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.022431,0.994138,0.857306,0.859026,0.855594
2,No log,0.013488,0.997041,0.911428,0.89272,0.930936
3,No log,0.012861,0.996483,0.900495,0.868505,0.934932


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.146567,0.963055,0.194794,0.368254,0.13242
2,No log,0.081832,0.983838,0.49563,0.546832,0.453196
3,No log,0.068217,0.988039,0.716599,0.876238,0.606164


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.097307,0.974319,0.589056,0.825953,0.457763
2,No log,0.067953,0.985722,0.71068,0.820628,0.626712
3,No log,0.056402,0.988192,0.743666,0.878011,0.644977


  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
# Evaluate models on the validation set
xlm_results = xlm_trainer.evaluate()
distilbert_results = distilbert_trainer.evaluate()
bert_results = bert_trainer.evaluate()

In [27]:
# Compare models
print("XLM-Roberta Results:", xlm_results)
print("DistilBERT Results:", distilbert_results)
print("mBERT Results:", bert_results)

XLM-Roberta Results: {'eval_loss': 0.012861184775829315, 'eval_accuracy': 0.996482804823582, 'eval_f1': 0.9004947773501925, 'eval_precision': 0.8685047720042418, 'eval_recall': 0.934931506849315, 'eval_runtime': 18.3256, 'eval_samples_per_second': 39.671, 'eval_steps_per_second': 2.51, 'epoch': 3.0}
DistilBERT Results: {'eval_loss': 0.06821694225072861, 'eval_accuracy': 0.9880387449754354, 'eval_f1': 0.7165991902834008, 'eval_precision': 0.8762376237623762, 'eval_recall': 0.6061643835616438, 'eval_runtime': 7.9084, 'eval_samples_per_second': 91.927, 'eval_steps_per_second': 5.817, 'epoch': 3.0}
mBERT Results: {'eval_loss': 0.05640244483947754, 'eval_accuracy': 0.9881922733363109, 'eval_f1': 0.7436656794998354, 'eval_precision': 0.878010878010878, 'eval_recall': 0.6449771689497716, 'eval_runtime': 13.3608, 'eval_samples_per_second': 54.413, 'eval_steps_per_second': 3.443, 'epoch': 3.0}
