In [None]:
import os

import numpy as np
import torch
from datasets import Dataset, Features, Sequence, ClassLabel, Value
from huggingface_hub import login
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)

# Authenticate with the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. When the variable is unset, `token` is
# None and login() falls back to its interactive prompt.
# SECURITY: any token that was ever saved or committed in a notebook must be
# treated as leaked and revoked at https://huggingface.co/settings/tokens.
login(token=os.environ.get('HF_TOKEN'))

In [None]:
# Function to read TSV files and parse sentences and labels
def read_tsv_file(file_path):
    """Read a two-column, blank-line-delimited TSV file of tagged tokens.

    Every non-blank line must hold exactly ``token<TAB>tag``. One or more
    blank (whitespace-only) lines end the current sentence. A trailing
    sentence that is not followed by a blank line is still returned.

    Args:
        file_path: Path of the UTF-8 encoded TSV file to read.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists: ``sentences[i]`` is
        the list of tokens of the i-th sentence and ``labels[i]`` the list of
        tag strings for those tokens.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            tab-separated fields. The message names the file and the
            1-based line number of the offending line.
    """
    sentences = []
    labels = []
    tokens = []
    ner_tags = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Sentence boundary; repeated blank lines must not emit
                # empty sentences, hence the guard on `tokens`.
                if tokens:
                    sentences.append(tokens)
                    labels.append(ner_tags)
                    tokens = []
                    ner_tags = []
                continue

            fields = line.split('\t')
            if len(fields) != 2:
                # Fail with a locatable message instead of an opaque
                # tuple-unpacking error.
                raise ValueError(
                    f"{file_path}:{line_no}: expected 'token<TAB>tag' but "
                    f"found {len(fields)} field(s): {raw_line!r}"
                )
            token, tag = fields
            tokens.append(token)
            ner_tags.append(tag)

    # Flush the last sentence when the file does not end with a blank line.
    if tokens:
        sentences.append(tokens)
        labels.append(ner_tags)
    return sentences, labels

In [None]:
# Parse the three BC2GM splits into parallel token / tag lists.
train_sentences, train_labels = read_tsv_file('../datasets/ner/BC2GM/train.tsv')
dev_sentences, dev_labels = read_tsv_file('../datasets/ner/BC2GM/dev.tsv')
test_sentences, test_labels = read_tsv_file('../datasets/ner/BC2GM/test.tsv')

# Tag inventory and the two lookup tables between tag names and integer ids
# (ids follow the order of `label_list`).
label_list = ['B', 'I', 'O']
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

# Schema shared by every split: a sequence of string tokens and a parallel
# sequence of class labels whose names come from `label_list`.
features = Features({
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=label_list)),
})

# Column-oriented dicts, one per split.
train_data = dict(tokens=train_sentences, ner_tags=train_labels)
dev_data = dict(tokens=dev_sentences, ner_tags=dev_labels)
test_data = dict(tokens=test_sentences, ner_tags=test_labels)

# Build the `datasets.Dataset` objects with the explicit schema.
train_dataset = Dataset.from_dict(train_data, features=features)
dev_dataset = Dataset.from_dict(dev_data, features=features)
test_dataset = Dataset.from_dict(test_data, features=features)


In [None]:

# Load the DistilBERT tokenizer. `use_fast=True` requests the fast
# implementation; `tokenize_and_align_labels` calls `.word_ids()` on its output.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    use_fast=True,
)

# Tokenize and align labels
def tokenize_and_align_labels(batch):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        batch: Mapping with a ``'tokens'`` entry (list of word lists) and a
            ``'ner_tags'`` entry (list of per-word tag-id lists), as passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output with an added ``'labels'`` entry. For each
        sequence, the first sub-token of every word carries that word's tag
        id; every other position (positions whose word id is ``None`` and
        continuation sub-tokens) is set to ``-100``.
    """
    encoded = tokenizer(
        batch['tokens'],
        truncation=True,
        padding='max_length',
        is_split_into_words=True,
    )

    aligned = []
    for row, word_tags in enumerate(batch['ner_tags']):
        last_word = None
        row_labels = []
        for word in encoded.word_ids(batch_index=row):
            # Only the first sub-token of a real word receives a label.
            starts_new_word = word is not None and word != last_word
            row_labels.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned.append(row_labels)

    encoded['labels'] = aligned
    return encoded

In [None]:
# Tokenize every split in batches; each name is rebound to the mapped dataset.
train_dataset, dev_dataset, test_dataset = [
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
]

In [None]:
# Expose only the model inputs and labels, returned as PyTorch tensors.
columns = ['input_ids', 'attention_mask', 'labels']
for tokenized_split in (train_dataset, dev_dataset, test_dataset):
    tokenized_split.set_format(type='torch', columns=columns)

In [None]:
# Instantiate DistilBERT for token classification; the number of labels and
# both id/name mappings come from `label_list`.
model = AutoModelForTokenClassification.from_pretrained(
    'distilbert-base-uncased',
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label_list),
)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir='../results/ner/BC2GM/distilbert-base-uncased',
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy='epoch',
    save_strategy='epoch',
)

In [None]:
 #Define the metric function using seqeval

def compute_metrics(p):
    """Score token-classification predictions with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` the
            gold label ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with ``'precision'``, ``'recall'``, ``'f1'`` and ``'accuracy'``,
        each rounded to two decimals.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions that carry a real label (gold id != -100).
        kept = [(gold, pred) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_labels.append([id2label[gold] for gold, _ in kept])
        true_predictions.append([id2label[pred] for _, pred in kept])

    scorers = {
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
        'accuracy': accuracy_score,
    }
    return {
        name: round(scorer(true_labels, true_predictions), 2)
        for name, scorer in scorers.items()
    }


In [None]:
# Wire the model, training configuration, train/dev splits, tokenizer and
# metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Fine-tune the model using the configuration in `training_args`
# (3 epochs, with evaluation on `dev_dataset` and a checkpoint every epoch).
trainer.train()

In [None]:
# Push the fine-tuned model to the Hugging Face Hub under this repository name.
# Only `model` is pushed here; the tokenizer is not uploaded by this cell.
# NOTE(review): the repo is presumably created under the logged-in account's
# namespace ('kbulutozler' is what the reload cell uses) -- confirm.
model.push_to_hub('distilbert-base-uncased-FT-ner-BC2GM')

In [None]:
# Reload the fine-tuned model from the Hub and wrap it in a fresh Trainer so
# the test set can be scored with `trainer.predict`. Only the model is passed:
# no training arguments, tokenizer or metric function are supplied here.
model = AutoModelForTokenClassification.from_pretrained(
    'kbulutozler/distilbert-base-uncased-FT-ner-BC2GM'
)
trainer = Trainer(model=model)

In [None]:
# Evaluate the model on the test set.
# The third value returned by `trainer.predict` is discarded; the metrics are
# computed explicitly by passing the predictions and labels to `compute_metrics`.
predictions, labels, _ = trainer.predict(test_dataset)
compute_metrics((predictions, labels))