In [2]:
import torch
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset, Features, Sequence, ClassLabel, Value
import numpy as np
from seqeval.metrics import classification_report

In [3]:
def read_tsv_file(file_path):
    """Read a two-column, tab-separated token/tag file into sentences.

    Every non-blank line must hold exactly one token and one tag separated by
    a single tab. Blank (or whitespace-only) lines mark sentence boundaries;
    consecutive blank lines do not produce empty sentences.

    Args:
        file_path: Path of the UTF-8 encoded TSV file to read.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists. ``sentences[i]`` is
        the list of tokens of the i-th sentence and ``labels[i]`` is the list
        of tags for those tokens, both in file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields. The message names the file and the
            1-based line number so the offending row can be located.
    """
    sentences = []
    labels = []
    tokens = []
    ner_tags = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank line closes the sentence collected so far (if any).
                if tokens:
                    sentences.append(tokens)
                    labels.append(ner_tags)
                    tokens = []
                    ner_tags = []
                continue

            fields = stripped.split('\t')
            if len(fields) != 2:
                # Fail with context instead of an opaque unpacking error.
                raise ValueError(
                    f"{file_path}:{line_no}: expected 'token<TAB>tag', "
                    f"got {len(fields)} field(s): {stripped!r}"
                )
            token, tag = fields
            tokens.append(token)
            ner_tags.append(tag)

    # The file may not end with a blank line; keep the trailing sentence.
    if tokens:
        sentences.append(tokens)
        labels.append(ner_tags)
    return sentences, labels

In [4]:
# Parse the three BC2GM splits into parallel token / tag lists.
train_sentences, train_labels = read_tsv_file('../datasets/ner/BC2GM/train.tsv')
dev_sentences, dev_labels = read_tsv_file('../datasets/ner/BC2GM/dev.tsv')
test_sentences, test_labels = read_tsv_file('../datasets/ner/BC2GM/test.tsv')

# Tag inventory plus the id <-> name lookup tables derived from it.
label_list = ['B', 'I', 'O']
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in enumerate(label_list)}

# Dataset schema: a sequence of string tokens and a parallel sequence of
# class labels drawn from label_list.
features = Features({
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=label_list)),
})

# Column-oriented payloads, one per split.
train_data = dict(tokens=train_sentences, ner_tags=train_labels)
dev_data = dict(tokens=dev_sentences, ner_tags=dev_labels)
test_data = dict(tokens=test_sentences, ner_tags=test_labels)

# Build one Dataset per split, all sharing the same schema.
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_dict(split_data, features=features)
    for split_data in (train_data, dev_data, test_data)
)


In [5]:

# Load the tokenizer matching the checkpoint. The fast implementation is
# requested explicitly; the label-alignment step calls word_ids() on its output.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    use_fast=True,
)

def tokenize_and_align_labels(batch):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    The module-level ``tokenizer`` is applied to ``batch['tokens']`` with
    truncation enabled. For each sentence, the tag of a word is assigned to
    the first sub-token of that word; every other position (positions whose
    word id is ``None`` and continuation sub-tokens) receives ``-100``.

    Args:
        batch: Mapping with ``'tokens'`` (list of word lists) and
            ``'ner_tags'`` (list of per-word tag lists, parallel to tokens).

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding one
        aligned label list per sentence.
    """
    encoded = tokenizer(
        batch['tokens'],
        truncation=True,
        is_split_into_words=True,
    )

    def _align(sentence_idx, word_tags):
        # Walk the sub-token -> word mapping of one sentence.
        aligned = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=sentence_idx):
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        return aligned

    encoded['labels'] = [
        _align(idx, tags) for idx, tags in enumerate(batch['ner_tags'])
    ]
    return encoded



In [6]:
# Tokenize every split in batched mode, adding the aligned `labels` column.
train_dataset, dev_dataset, test_dataset = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
)

Map:   0%|          | 0/12574 [00:00<?, ? examples/s]

Map:   0%|          | 0/2519 [00:00<?, ? examples/s]

Map:   0%|          | 0/5038 [00:00<?, ? examples/s]

In [7]:
# Return only the listed columns, formatted as torch tensors, for every split.
columns = ['input_ids', 'attention_mask', 'labels']
for _split in (train_dataset, dev_dataset, test_dataset):
    _split.set_format(type='torch', columns=columns)

In [8]:
# Instantiate the token-classification model from the pretrained checkpoint.
# Its classifier weights are newly initialized (see the warning printed by
# this cell), so the model has to be fine-tuned before it is used.
model = AutoModelForTokenClassification.from_pretrained(
    'distilbert-base-uncased',
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label_list),
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Hyperparameters and checkpointing policy for the fine-tuning run.
# NOTE: with PyTorch this cell needs `accelerate>=0.21.0` installed in the
# kernel's environment; without it, it fails with the ImportError shown below.
training_args = TrainingArguments(
    output_dir='../results/ner/BC2GM/distilbert-base-uncased',
    # Evaluate and save a checkpoint once per epoch.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Schedule length and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer settings.
    learning_rate=5e-5,
    weight_decay=0.01,
)



ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`