In [1]:
!pip install transformers datasets seqeval
!pip install torch --upgrade

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from seqeval.metrics import classification_report

In [17]:
def read_conll(file_path):
    """
    Parses a CoNLL file into a DataFrame with two columns: 'words' and 'labels'.
    """
    words, labels = [], []
    with open(file_path, 'r') as file:
        word_list, label_list = [], []
        for line in file:
            if line.strip():
                word, label = line.strip().split()
                word_list.append(word)
                label_list.append(label)
            else:
                if word_list:  # End of a sentence
                    words.append(word_list)
                    labels.append(label_list)
                    word_list, label_list = [], []
    return pd.DataFrame({"words": words, "labels": labels})

# Load your dataset in CoNLL format
conll_data = read_conll("/content/drive/MyDrive/labeled_data.conll")

# Split into train and validation set
train_df, val_df = train_test_split(conll_data, test_size=0.2)

In [18]:
def convert_to_hf_format(df):
    return Dataset.from_pandas(df)

train_dataset = convert_to_hf_format(train_df)
val_dataset = convert_to_hf_format(val_df)

#  Load the tokenizer and model
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=10)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
#  Extract the unique labels from the dataset
label_list = sorted(list(set([label for sublist in conll_data['labels'] for label in sublist])))
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

#  Tokenize and align labels, but first convert the labels to their integer form
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Start of a new word
                label_ids.append(label_to_id[label[word_idx]])  # Convert label to its integer ID
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization and alignment to both training and validation datasets
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)

#  Make sure to update the model's number of labels
num_labels = len(label_list)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

#  Metrics calculation function (for evaluation)
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return {
        "classification_report": classification_report(true_labels, true_predictions),
    }


Map:   0%|          | 0/581 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
#  Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
)



In [22]:
#  Fine-tune the model using Hugging Face Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Classification Report
1,0.1598,0.11847,precision recall f1-score support  LOC 0.58 0.79 0.67 185  PRICE 0.00 0.00 0.00 89  Product 0.00 0.00 0.00 100  micro avg 0.58 0.39 0.47 374  macro avg 0.19 0.26 0.22 374 weighted avg 0.29 0.39 0.33 374
2,0.0548,0.046988,precision recall f1-score support  LOC 0.93 0.92 0.93 185  PRICE 0.97 0.94 0.95 89  Product 0.00 0.00 0.00 100  micro avg 0.94 0.68 0.79 374  macro avg 0.63 0.62 0.63 374 weighted avg 0.69 0.68 0.69 374
3,0.0417,0.037342,precision recall f1-score support  LOC 0.93 0.91 0.92 185  PRICE 0.94 0.94 0.94 89  Product 0.60 0.03 0.06 100  micro avg 0.93 0.68 0.79 374  macro avg 0.83 0.63 0.64 374 weighted avg 0.85 0.68 0.70 374


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.58      0.79      0.67       185
       PRICE       0.00      0.00      0.00        89
     Product       0.00      0.00      0.00       100

   micro avg       0.58      0.39      0.47       374
   macro avg       0.19      0.26      0.22       374
weighted avg       0.29      0.39      0.33       374
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.93      0.92      0.93       185
       PRICE       0.97      0.94      0.95        89
     Product       0.00      0.00      0.00       100

   micro avg       0.94      0.68      0

TrainOutput(global_step=111, training_loss=0.1960782829235803, metrics={'train_runtime': 254.9062, 'train_samples_per_second': 6.838, 'train_steps_per_second': 0.435, 'total_flos': 272101326973806.0, 'train_loss': 0.1960782829235803, 'epoch': 3.0})

In [23]:
#  Evaluate the model
results = trainer.evaluate()

Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.93      0.91      0.92       185
       PRICE       0.94      0.94      0.94        89
     Product       0.60      0.03      0.06       100

   micro avg       0.93      0.68      0.79       374
   macro avg       0.83      0.63      0.64       374
weighted avg       0.85      0.68      0.70       374
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


In [24]:
#  Save the model
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/sentencepiece.bpe.model',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')