In [None]:
# Install required packages
!pip install -q \
    "pyarrow<15.0.0" \
    transformers \
    datasets \
    tokenizers \
    seqeval \
    tensorflow_probability --upgrade \
    evaluate \
    hyperopt \
    "ray[tune]"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.9 MB/s[0m eta [

In [None]:
# Load libraries
import torch
from datasets import load_from_disk, DatasetDict

from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer
)

import numpy as np
import evaluate
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Load dataset
finer = load_from_disk("/content/drive/MyDrive/Code/hfdata_finer.json")

In [17]:
# Set model
model_checkpoint = "bert-base-cased"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Create tokenized dataset
tokenized_finer = finer.map(tokenize_and_align_labels, batched=True)
tokenized_finer



DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3187
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 903
    })
    valid: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 464
    })
})

In [None]:
# Show example of tokenized input sequence
example = finer["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'Following', 'the', 'closing', 'bell', ',', 'American', 'Express', '(', 'A', '##X', '##P', ')', 'reported', 'weaker', 'earnings', 'and', 'revenue', 'than', 'analysts', 'anticipated', '.', '[SEP]']


In [None]:
# Initialise data_collator for token classification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Load seqeval
seqeval = evaluate.load("seqeval")

# Map and define text and numeral labels
label_list = ["O","B-PER","I-PER","B-LOC","I-LOC","B-ORG","I-ORG"]

id2label = {
    0: "O",
    1: "B-PER",
    2: "I-PER",
    3: "B-LOC",
    4: "I-LOC",
    5: "B-ORG",
    6: "I-ORG"
}

label2id = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-LOC": 3,
    "I-LOC": 4,
    "B-ORG": 5,
    "I-ORG": 6
}

labels = [label_list[i] for i in example[f"ner_tags"]]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(p.predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    mlb = MultiLabelBinarizer()
    true_labels = mlb.fit_transform(true_labels)
    true_predictions = mlb.transform(true_predictions)

    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(true_labels, true_predictions, average='macro')
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(true_labels, true_predictions, average='micro')
    accuracy = accuracy_score(true_labels, true_predictions)

    return {
        'macro_precision': precision_macro,
        'macro_recall': recall_macro,
        'macro_f1': f1_macro,
        'micro_precision': precision_micro,
        'micro_recall': recall_micro,
        'micro_f1': f1_micro,
        'accuracy': accuracy
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
# Train bert-large-case using best hyperparameters from bert-base-cased protocol
model = AutoModelForTokenClassification.from_pretrained(
    'bert-large-cased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Train model with best hyperparameters
training_args = TrainingArguments(
    "bert-finetuned-ner",
    num_train_epochs=5,
    learning_rate=8.288916866885136e-06,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_finer["train"],
    eval_dataset=tokenized_finer["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro Precision,Macro Recall,Macro F1,Micro Precision,Micro Recall,Micro F1,Accuracy
1,No log,0.045731,0.91717,0.936225,0.925547,0.947315,0.964912,0.956033,0.864224
2,No log,0.035888,0.930278,0.950861,0.939853,0.960163,0.970072,0.965092,0.898707
3,0.095600,0.037233,0.934364,0.956825,0.944618,0.962245,0.973168,0.967676,0.905172
4,0.095600,0.03884,0.94235,0.961244,0.951313,0.966292,0.976264,0.971253,0.915948
5,0.015400,0.04011,0.936741,0.9592,0.947177,0.962322,0.975232,0.968734,0.909483


TrainOutput(global_step=1000, training_loss=0.05548418760299682, metrics={'train_runtime': 461.274, 'train_samples_per_second': 34.546, 'train_steps_per_second': 2.168, 'total_flos': 1972632486298188.0, 'train_loss': 0.05548418760299682, 'epoch': 5.0})

In [None]:
# Evaluate test dataset and print results.
res = trainer.evaluate(tokenized_finer["test"])

for metric_name, metric_value in res.items():
    print(f"{metric_name}: {metric_value:.4f}")

eval_loss: 0.0468
eval_macro_precision: 0.9505
eval_macro_recall: 0.9461
eval_macro_f1: 0.9473
eval_micro_precision: 0.9618
eval_micro_recall: 0.9711
eval_micro_f1: 0.9664
eval_accuracy: 0.8815
eval_runtime: 5.6388
eval_samples_per_second: 160.1420
eval_steps_per_second: 10.1090
epoch: 5.0000
