In [1]:
import datasets
import numpy as np
import torch
import torch.nn.functional as F
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification, Trainer, TrainingArguments
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

2024-04-01 22:28:23.589397: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
# Load the "ktgiahieu/maccrobat2018_2020" dataset through `datasets.load_dataset`.
# Later cells index the result with the 'train' key.
dataset = datasets.load_dataset("ktgiahieu/maccrobat2018_2020")

In [22]:
label_list = []

# Every non-blank line of label_list.txt must contain a ':'; the text between
# the first and second ':' (stripped) is taken as the label. The position of a
# label in `label_list` becomes its integer class id in later cells.
with open('./label_list.txt', 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        if not line.strip():
            # Tolerate blank/trailing lines instead of failing with IndexError.
            continue

        parts = line.split(':')
        if len(parts) < 2:
            raise ValueError(
                f"label_list.txt line {line_number}: expected '<key>: <label>', got {line!r}"
            )

        label_list.append(parts[1].strip())

In [23]:
# Hold out 20% of the 'train' split. A fixed seed keeps the split (and hence the
# reported validation metrics) reproducible across kernel restarts.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

In [24]:
# Remove the 'test' entry from `dataset` and keep it aside as the validation data.
validation_ds = dataset.pop('test')

In [25]:
# Re-attach the held-out data under the 'validation' key.
dataset['validation'] = validation_ds

In [27]:
# Fast tokenizer for 'bert-base-uncased'; later cells call `.word_ids(...)` on its
# output to map sub-word tokens back to the original words.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [28]:
# Lookup tables between integer class ids (positions in `label_list`) and tag strings.
idx_to_label = dict(enumerate(label_list))
label_to_idx = {tag: position for position, tag in idx_to_label.items()}

In [29]:
def tags_to_indices(example):
    """Replace each entry of ``example['tags']`` with its id from ``label_to_idx``.

    The example is modified in place and also returned. A tag missing from
    ``label_to_idx`` raises ``KeyError``.
    """
    encoded_tags = []
    for tag in example['tags']:
        encoded_tags.append(label_to_idx[tag])
    example['tags'] = encoded_tags
    return example

In [30]:
def indices_to_tags(example):
    """Replace each id in ``example['tags']`` with its tag string from ``idx_to_label``.

    The example is modified in place and also returned. An id missing from
    ``idx_to_label`` raises ``KeyError``.
    """
    example['tags'] = list(map(idx_to_label.__getitem__, example['tags']))
    return example

In [31]:
# Convert string tags to integer ids in every split. The function is passed
# directly; wrapping it in a lambda added nothing.
# NOTE: this rebinds `dataset`, so re-running the cell would try to encode
# already-encoded tags; re-run from the split cell instead.
dataset = dataset.map(tags_to_indices)

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [32]:
# Tokenize and align labels function
def tokenize_and_align_labels(example, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align word tags to sub-word tokens.

    Args:
        example: Batch with ``"tokens"`` (one word list per sentence) and
            ``"tags"`` (one tag list per sentence, indexed by word position).
        label_all_tokens: When true, every sub-word token of a word receives the
            word's tag; otherwise only the first sub-word token does and the
            remaining ones receive -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Tokens whose word
        id is ``None`` are labelled -100.
    """
    encoded = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, token_word_ids):
        # Walk the tokens of one sentence, remembering which word the previous
        # token belonged to so that continuation pieces can be detected.
        aligned = []
        last_word = None
        for current_word in token_word_ids:
            if current_word is None:
                aligned.append(-100)
            elif current_word != last_word or label_all_tokens:
                aligned.append(word_tags[current_word])
            else:
                aligned.append(-100)
            last_word = current_word
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(example['tags'])
    ]
    return encoded

In [33]:
# Apply tokenization + label alignment to every split. `batched=True` is required
# because `tokenize_and_align_labels` iterates over a batch of tag lists.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [34]:
# Pass the id<->tag maps along with the label count so that the model config
# (and therefore the saved checkpoint) carries the real tag names instead of
# generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_list),
    id2label=idx_to_label,
    label2id=label_to_idx,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Define adversarial training function
def adversarial(inputs, labels, model, epsilon=0.1):
    """Build an FGSM adversarial example in embedding space.

    Token ids are discrete integers and cannot carry gradients, so the
    perturbation is applied to the input embeddings instead:
    ``embeds + epsilon * sign(d loss / d embeds)``. No clamping is applied
    because embeddings are not confined to a fixed value range.

    Args:
        inputs: Mapping of model inputs (for example a tokenizer output). It
            must contain ``input_ids`` or ``inputs_embeds``; remaining entries
            such as ``attention_mask`` are forwarded to the model unchanged.
            A ``labels`` entry, if present, is dropped.
        labels: Integer class ids with one entry per token position of the
            logits; positions equal to -100 are ignored by the loss.
        model: Token-classification model that provides
            ``get_input_embeddings()``, accepts ``inputs_embeds=...`` and
            returns an object with a ``.logits`` attribute.
        epsilon: Size of the signed-gradient step.

    Returns:
        dict: Keyword arguments for ``model(**result)`` in which ``input_ids``
        has been replaced by detached, perturbed ``inputs_embeds``. Tensors are
        on the same device as the embedding weights.

    Raises:
        ValueError: If ``inputs`` has neither ``input_ids`` nor ``inputs_embeds``.
    """
    embedding_layer = model.get_input_embeddings()
    device = embedding_layer.weight.device

    model_kwargs = {
        key: value.to(device) if torch.is_tensor(value) else value
        for key, value in inputs.items()
    }
    model_kwargs.pop("labels", None)  # the loss is computed explicitly below
    input_ids = model_kwargs.pop("input_ids", None)

    if "inputs_embeds" in model_kwargs:
        clean_embeds = model_kwargs.pop("inputs_embeds").detach()
    elif input_ids is not None:
        clean_embeds = embedding_layer(input_ids).detach()
    else:
        raise ValueError("inputs must contain 'input_ids' or 'inputs_embeds'")

    labels = torch.as_tensor(labels, device=device)
    if not bool((labels != -100).any()):
        # Nothing contributes to the loss, so there is no gradient direction.
        model_kwargs["inputs_embeds"] = clean_embeds
        return model_kwargs

    embeds = clean_embeds.clone().requires_grad_(True)

    # Use eval mode so dropout does not randomise the gradient, then restore
    # whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        logits = model(inputs_embeds=embeds, **model_kwargs).logits
        loss = F.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            labels.reshape(-1),
            ignore_index=-100,
        )
        # autograd.grad (rather than loss.backward()) leaves the parameters'
        # .grad buffers untouched, so the optimizer step is not polluted.
        (embed_grad,) = torch.autograd.grad(loss, embeds)
    finally:
        model.train(was_training)

    perturbed = clean_embeds + epsilon * embed_grad.sign()

    attention_mask = model_kwargs.get("attention_mask")
    if attention_mask is not None:
        # Leave padding positions exactly as they were.
        keep = attention_mask.unsqueeze(-1).bool()
        perturbed = torch.where(keep, perturbed, clean_embeds)

    model_kwargs["inputs_embeds"] = perturbed.detach()
    return model_kwargs

In [36]:
# Custom dataset class for adversarial training
class AdversarialDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields one adversarially perturbed item per example.

    Each example must provide ``"tokens"`` (the words of one sentence) and
    ``"tags"`` (one integer class id per word).

    Args:
        dataset: Indexable collection of examples.
        tokenizer: Callable tokenizer whose output offers
            ``word_ids(batch_index=...)`` (i.e. a fast tokenizer).
        model: Model handed to ``adversarial`` to compute the perturbation.
        epsilon: Perturbation size handed to ``adversarial``.
        max_length: Length every item is padded/truncated to.
    """

    def __init__(self, dataset, tokenizer, model, epsilon=0.1, max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.model = model
        self.epsilon = epsilon
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return a single feature dict (model inputs plus ``"labels"``).

        The item is one mapping rather than an ``(inputs, labels)`` tuple so a
        collator can read its keys. It holds the tensors of the mapping returned
        by ``adversarial`` (batch dimension removed, moved to CPU) and the
        token-aligned labels.
        """
        example = self.dataset[idx]

        # `tokens` is a list of words forming ONE sentence; without
        # is_split_into_words the tokenizer would treat each word as its own
        # sentence and return a batch of len(tokens) padded rows.
        inputs = self.tokenizer(
            example["tokens"],
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Word-level tags must be expanded to the padded sub-word sequence:
        # every piece of a word gets the word's tag, and special/padding tokens
        # (word id None) get -100 so the loss ignores them.
        word_tags = example["tags"]
        token_labels = [
            -100 if word_id is None else word_tags[word_id]
            for word_id in inputs.word_ids(batch_index=0)
        ]
        labels = torch.tensor(token_labels).unsqueeze(0)  # (1, seq) to match the inputs

        perturbed_inputs = adversarial(inputs, labels, self.model, self.epsilon)

        item = {
            key: value.squeeze(0).cpu() if torch.is_tensor(value) else value
            for key, value in perturbed_inputs.items()
        }
        item["labels"] = labels.squeeze(0)
        return item


In [37]:
# Create adversarial dataset
# Wraps the tokenized 'train' split; the wrapper reads each example's "tokens" and
# "tags" fields and uses `model` with step size `epsilon` to perturb the item.
adversarial_train_dataset = AdversarialDataset(tokenized_dataset["train"], tokenizer, model, epsilon=0.1)

In [40]:
# Define training arguments
# NOTE(review): `num_train_epochs` is the total number of epochs for one
# `trainer.train()` call — confirm 100 is intended for this dataset size.
args = TrainingArguments(
    'adversarial-ner',  # first positional argument; presumably the output directory — confirm
    evaluation_strategy='epoch',  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=100,
    weight_decay=0.01,
)

In [41]:
# Define compute_metrics function
def compute_metrics(p):
    """Compute precision, recall, F1 and accuracy for token classification.

    Args:
        p: Pair ``(logits, label_ids)``. ``logits`` is indexed as
            ``[sequence, token, class]`` (argmax is taken over axis 2) and
            ``label_ids`` as ``[sequence, token]``; positions whose label is
            -100 are excluded from scoring.

    Returns:
        dict with the keys ``'precision'``, ``'recall'``, ``'f1'`` and
        ``'accuracy'``.
    """
    logits, label_ids = p
    predictions = np.argmax(logits, axis=2)

    # Keep ONE tag list per sequence. Concatenating every sequence into a single
    # list lets the scorer join an entity at the end of one sequence with tags
    # at the start of the next, which distorts the entity-level scores.
    true_predictions = []
    true_labels = []
    for seq_preds, seq_labels in zip(predictions, label_ids):
        scored = [
            (pred, gold) for pred, gold in zip(seq_preds, seq_labels) if gold != -100
        ]
        true_predictions.append([label_list[pred] for pred, _ in scored])
        true_labels.append([label_list[gold] for _, gold in scored])

    return {
        'precision': precision_score(true_labels, true_predictions),
        'recall': recall_score(true_labels, true_predictions),
        'f1': f1_score(true_labels, true_predictions),
        'accuracy': accuracy_score(true_labels, true_predictions),
    }

In [42]:
# Create Trainer for adversarial training
# Training items come from `adversarial_train_dataset`; evaluation uses the plain
# tokenized 'validation' split, and both go through the same data collator.
# NOTE(review): confirm that the items produced by `AdversarialDataset.__getitem__`
# have the structure `DataCollatorForTokenClassification` expects — verify by
# collating a small batch before starting a long run.
trainer = Trainer(
    model,
    args,
    train_dataset=adversarial_train_dataset,
    eval_dataset=tokenized_dataset["validation"],
    data_collator=DataCollatorForTokenClassification(tokenizer),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Training loop
# A single `trainer.train()` call already runs all `args.num_train_epochs` epochs
# (with per-epoch evaluation as configured). Calling it inside a loop over the
# epoch count would train num_train_epochs ** 2 epochs and rebuild the optimizer
# and learning-rate schedule on every call.
trainer.train()

In [None]:
# Save the model
# Writes the model to the relative directory 'ner_model'.
model.save_pretrained('ner_model')

In [None]:
# Save the tokenizer
# Writes the tokenizer to the relative directory 'tokenizer' — a different
# directory from the model, so both paths are needed when reloading.
tokenizer.save_pretrained('tokenizer')