In [17]:
import datasets
from sklearn.model_selection import KFold
from transformers import BertTokenizerFast, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import numpy as np

In [6]:
dataset = datasets.load_dataset("ktgiahieu/maccrobat2018_2020")

split_dataset = dataset['train'].train_test_split(test_size=0.1)

train_val_dataset = split_dataset['train']
test_dataset = split_dataset['test']

kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [7]:
label_list = []

with open('./label_list.txt', 'r') as f:
    for line in f:
        parts = line.split(':')
        label = parts[1].strip()
        
        label_list.append(label)

In [8]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [9]:
idx_to_label = {i: label for i, label in enumerate(label_list)}
label_to_idx = {label: i for i, label in enumerate(label_list)}

In [10]:
def tags_to_indices(example):
    example['tags'] = [label_to_idx[tag] for tag in example['tags']]
    return example

In [11]:
def indices_to_tags(example):
    example['tags'] = [idx_to_label[idx] for idx in example['tags']]
    return example

In [13]:
train_val_dataset = train_val_dataset.map(lambda x: tags_to_indices(x))
test_dataset = test_dataset.map(lambda x: tags_to_indices(x))

Map: 100%|██████████| 360/360 [00:00<00:00, 2301.84 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 1481.22 examples/s]


In [14]:
def tokenize_and_align_labels(example, label_all_tokens=True):
    tokenized_input = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    for i, label in enumerate(example['tags']):
        word_ids = tokenized_input.word_ids(batch_index=i)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It returns a list indicating the word corresponding to each token.
        previous_word_idx = None
        
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_input["labels"] = labels
    return tokenized_input

In [18]:
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_list))
metric = datasets.load_metric('seqeval')
data_collator = DataCollatorForTokenClassification(tokenizer)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [27]:
configurations = [
    {'learning_rate': 2e-5, 'batch_size': 32, 'epochs': 30, 'weight_decay': 0.01},
    {'learning_rate': 2e-5, 'batch_size': 32, 'epochs': 30, 'weight_decay': 0.02},
    {'learning_rate': 2e-5, 'batch_size': 32, 'epochs': 60, 'weight_decay': 0.01},
    {'learning_rate': 2e-5, 'batch_size': 32, 'epochs': 60, 'weight_decay': 0.02},
    {'learning_rate': 2e-5, 'batch_size': 32, 'epochs': 100, 'weight_decay': 0.01}
]

In [20]:
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

def compute_metrics(p):
    logits, label_ids = p
    predictions = np.argmax(logits, axis=2)

    # Flatten the batches and remove the special tokens
    true_predictions = [
        label_list[p] for batch_preds, batch_labels in zip(predictions, label_ids) 
        for p, l in zip(batch_preds, batch_labels) if l != -100
    ]

    true_labels = [
        label_list[l] for batch_labels in label_ids
        for l in batch_labels if l != -100
    ]

    # Calculate metrics
    precision = precision_score([true_labels], [true_predictions])
    recall = recall_score([true_labels], [true_predictions])
    f1 = f1_score([true_labels], [true_predictions])
    accuracy = accuracy_score([true_labels], [true_predictions])

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy
    }


In [28]:
from transformers import set_seed

set_seed(42)

results = []

for i, config in enumerate(configurations):
    fold_results = []
    print(f"Training with config: {config}")
    for fold, (train_index, val_index) in enumerate(kf.split(train_val_dataset)):
        print(f"Starting fold {fold+1}/{kf.get_n_splits()}")
        train_fold = train_val_dataset.select(train_index)
        val_fold = train_val_dataset.select(val_index)

        train_fold = train_fold.map(tokenize_and_align_labels, batched=True)
        val_fold = val_fold.map(tokenize_and_align_labels, batched=True)

        training_args = TrainingArguments(
            output_dir=f'./model_fold_{fold}',
            evaluation_strategy='epoch',
            learning_rate=config['learning_rate'],
            per_device_train_batch_size=config['batch_size'],
            num_train_epochs=config['epochs'],
            weight_decay=config['weight_decay'],
            logging_dir=f'./logs_fold_{fold}',  # Add logging
            logging_steps=10  # Log every 10 steps
        )

        model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_list))
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_fold,
            eval_dataset=val_fold,
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        trainer.train()
        eval_results = trainer.evaluate()
        fold_results.append(eval_results)
        print(f"Results for fold {fold+1}: {eval_results}")

    model.save_pretrained(f'./ner_models/ner_model_config_{i}')
    tokenizer.save_pretrained(f'./tokenizers/tokenizer_config_{i}')

    avg_accuracy = np.mean([fr['eval_accuracy'] for fr in fold_results])
    results.append({'config': config, 'average_accuracy': avg_accuracy})
    print(f"Avg. Accuracy for current config: {avg_accuracy}")


Training with config: {'learning_rate': 2e-05, 'batch_size': 32, 'epochs': 1, 'weight_decay': 0.01}
Starting fold 1/5


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 53%|█████▎    | 239/450 [18:14<16:06,  4.58s/it]
100%|██████████| 9/9 [01:19<00:00,  8.25s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                             
[A                                            
100%|██████████| 9/9 [01:23<00:00,  8.25s/it]
                                             
100%|██████████| 9/9 [01:23<00:00,  9.28s/it]  


{'eval_loss': 3.172919750213623, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.6338393083443634, 'eval_runtime': 4.2466, 'eval_samples_per_second': 16.955, 'eval_steps_per_second': 2.119, 'epoch': 1.0}
{'train_runtime': 83.4764, 'train_samples_per_second': 3.45, 'train_steps_per_second': 0.108, 'train_loss': 3.7714106241861978, 'epoch': 1.0}


100%|██████████| 9/9 [00:04<00:00,  1.92it/s]


Results for fold 1: {'eval_loss': 3.172919750213623, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.6338393083443634, 'eval_runtime': 5.1535, 'eval_samples_per_second': 13.971, 'eval_steps_per_second': 1.746, 'epoch': 1.0}
Starting fold 2/5


Map: 100%|██████████| 288/288 [00:00<00:00, 741.95 examples/s]
Map: 100%|██████████| 72/72 [00:00<00:00, 785.08 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 9/9 [01:29<00:00,  9.81s/it]
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))


                                             
[A                                            
100%|██████████| 9/9 [01:35<00:00,  9.81s/it]
                                             
100%|██████████| 9/9 [01:35<00:00, 10.59s/it]  


{'eval_loss': 2.7188925743103027, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.6411092985318108, 'eval_runtime': 5.3731, 'eval_samples_per_second': 13.4, 'eval_steps_per_second': 1.675, 'epoch': 1.0}
{'train_runtime': 95.3259, 'train_samples_per_second': 3.021, 'train_steps_per_second': 0.094, 'train_loss': 3.4452633327907987, 'epoch': 1.0}


100%|██████████| 9/9 [00:04<00:00,  1.89it/s]


Results for fold 2: {'eval_loss': 2.7188925743103027, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.6411092985318108, 'eval_runtime': 5.347, 'eval_samples_per_second': 13.465, 'eval_steps_per_second': 1.683, 'epoch': 1.0}
Starting fold 3/5


Map: 100%|██████████| 288/288 [00:00<00:00, 937.73 examples/s]
Map: 100%|██████████| 72/72 [00:00<00:00, 867.55 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 9/9 [01:08<00:00,  7.15s/it]
[A
[A
[A
[A

                                             
[A                                            
100%|██████████| 9/9 [01:09<00:00,  7.15s/it]
                                             
100%|██████████| 9/9 [01:09<00:00,  7.69s/it]  


{'eval_loss': 2.714573621749878, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.6399550160039217, 'eval_runtime': 0.7829, 'eval_samples_per_second': 91.97, 'eval_steps_per_second': 11.496, 'epoch': 1.0}
{'train_runtime': 69.2228, 'train_samples_per_second': 4.16, 'train_steps_per_second': 0.13, 'train_loss': 3.423275417751736, 'epoch': 1.0}


100%|██████████| 9/9 [00:00<00:00, 13.36it/s]


Results for fold 3: {'eval_loss': 2.714573621749878, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.6399550160039217, 'eval_runtime': 0.7506, 'eval_samples_per_second': 95.927, 'eval_steps_per_second': 11.991, 'epoch': 1.0}
Starting fold 4/5


Map: 100%|██████████| 288/288 [00:00<00:00, 896.50 examples/s]
Map: 100%|██████████| 72/72 [00:00<00:00, 905.46 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 22%|██▏       | 2/9 [00:15<00:54,  7.82s/it]