In [1]:
from transformers import (
    set_seed,
)

In [2]:
# Seed the random number generators through transformers' helper (fixed value 123) before anything else runs.
set_seed(123)

In [3]:
from datasets import load_dataset

# A single JSON file provides both splits: the first 90% of its rows become
# "train" and the remaining 10% become "validation".
data_files = "./train_jp.json"
datasets = load_dataset(
    "json",
    data_files=data_files,
    split=dict(train="train[:90%]", validation="train[90%:]"),
)

print(datasets)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 6707
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 745
    })
})


In [4]:
# Read the schema (column names and feature types) off the training split.
column_names = datasets["train"].column_names
features = datasets["train"].features

# Bare tuple as the last expression so the notebook displays both values.
column_names, features

(['tokens', 'tags'],
 {'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
  'tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)})

In [5]:
# Names of the dataset columns holding the pre-split words and their per-word tags;
# later cells index examples through these two variables.
text_column_name = "tokens"
label_column_name = "tags"

# Displayed for confirmation.
text_column_name, label_column_name

('tokens', 'tags')

In [6]:
def get_label_list(labels):
    """Collect the distinct labels found in a collection of label sequences.

    The tags column of this dataset is a plain sequence of strings rather than
    a `Sequence[ClassLabel]`, so the label vocabulary has to be discovered by
    scanning the data.

    Args:
        labels: iterable of iterables of hashable, mutually comparable labels
            (one inner iterable per example).

    Returns:
        A new list of the unique labels in ascending sort order.
    """
    seen = set()
    for sequence in labels:
        seen.update(sequence)
    return sorted(seen)

# Build the label vocabulary from the training split only, then derive the
# tag -> integer id mapping from each tag's position in the sorted list.
label_list = get_label_list(datasets["train"][label_column_name])
num_labels = len(label_list)
label_to_id = {tag: index for index, tag in enumerate(label_list)}

label_list, label_to_id

(['B', 'I', 'O'], {'B': 0, 'I': 1, 'O': 2})

In [7]:
from transformers import AutoConfig, BertTokenizerFast, BertForTokenClassification

model_name_or_path = "./out_new3"

# No explicit config object is passed: tokenizer and model are both restored
# from whatever is stored under `model_name_or_path`.
# NOTE(review): presumably a fine-tuned checkpoint whose label ids match `label_to_id` — confirm.
tokenizer = BertTokenizerFast.from_pretrained(model_name_or_path)
model = BertForTokenClassification.from_pretrained(model_name_or_path)

In [8]:
# Padding strategy forwarded to the tokenizer call in `tokenize_and_align_labels`.
padding = "max_length" # alternative considered: False

In [9]:
# Whether to put the label for one word on all tokens generated by that word, or just on the first one
# (in which case the other tokens of the word get the ignore index -100).
label_all_tokens = False

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to tokens.

    Reads the module-level `tokenizer`, `text_column_name`, `label_column_name`,
    `padding`, `label_to_id` and `label_all_tokens`.

    Args:
        examples: batch mapping with a list of word lists under
            `text_column_name` and a parallel list of tag lists under
            `label_column_name`.

    Returns:
        The tokenizer output with an extra "labels" entry: one list of label
        ids per example, using -100 wherever a token must be ignored.
    """
    encoded = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        # The inputs are already lists of words, each carrying its own tag.
        is_split_into_words=True,
    )

    aligned = []
    for row, word_tags in enumerate(examples[label_column_name]):
        row_word_ids = encoded.word_ids(batch_index=row)
        # Pair every token's word id with the word id of the token before it
        # (None in front of the very first token).
        predecessors = [None] + list(row_word_ids[:-1])

        row_labels = []
        for current, before in zip(row_word_ids, predecessors):
            # Tokens without a word id (special tokens) are masked with -100
            # so the loss function skips them.
            if current is None:
                row_labels.append(-100)
                continue
            # The first token of a word always receives the word's label;
            # continuation tokens receive it only when label_all_tokens is set.
            if current != before or label_all_tokens:
                row_labels.append(label_to_id[word_tags[current]])
            else:
                row_labels.append(-100)

        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded


# Only the validation split is tokenized here; the training split is not
# needed because this notebook evaluates an already-trained checkpoint.
eval_dataset = datasets["validation"].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=None,
    load_from_cache_file=None,
)

In [12]:
# Sanity check: the tokenized split should still have one row per validation example.
len(eval_dataset)

745

In [14]:
# `%pip` installs into the environment of the running kernel (a bare `!pip` may resolve to a
# different interpreter), and the version is pinned so a re-run reproduces the same setup.
%pip install accelerate==0.23.0

Collecting accelerate
  Obtaining dependency information for accelerate from https://files.pythonhosted.org/packages/d9/92/2d3aecf9f4a192968035880be3e2fc8b48d541c7128f7c936f430d6f96da/accelerate-0.23.0-py3-none-any.whl.metadata
  Downloading accelerate-0.23.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
   ---------------------------------------- 0.0/258.1 kB ? eta -:--:--
   --------- ------------------------------ 61.4/258.1 kB 1.6 MB/s eta 0:00:01
   ---------------------- ----------------- 143.4/258.1 kB 2.1 MB/s eta 0:00:01
   ---------------------------------- ----- 225.3/258.1 kB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 258.1/258.1 kB 1.8 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0


In [15]:
# Metrics
import numpy as np
from datasets import load_metric
from accelerate import Accelerator

# Plain Accelerator with no tracking/logging configuration. The variant with tracking would be:
# Accelerator(log_with=args.report_to, project_dir=args.output_dir) if args.with_tracking else Accelerator()
accelerator = Accelerator()

# Device chosen by the accelerator; read by `get_labels` further down.
device = accelerator.device
# Metric object loaded from a local script; batches are accumulated into it during evaluation.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in favour of the
# separate `evaluate` package — confirm before upgrading `datasets`.
metric = load_metric("./my_seqeval.py")

def _format_metric_results(results, return_entity_level_metrics):
    """Turn the raw output of `metric.compute` into the reported dictionary.

    Args:
        results: mapping returned by `metric.compute`; values may themselves
            be dictionaries.
        return_entity_level_metrics: when true, return every entry with nested
            dictionaries flattened to "<key>_<subkey>"; otherwise return only
            the four overall scores under short names.

    Returns:
        A flat dictionary of metric values.

    Raises:
        KeyError: in summary mode, if one of the "overall_*" entries is missing.
    """
    if return_entity_level_metrics:
        flat = {}
        for key, value in results.items():
            if isinstance(value, dict):
                # Unpack nested dictionaries into "<key>_<subkey>" entries.
                for sub_key, sub_value in value.items():
                    flat[f"{key}_{sub_key}"] = sub_value
            else:
                flat[key] = value
        return flat
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


def compute_metrics(p, return_entity_level_metrics: bool = False):
    """Score one set of raw model outputs against gold label ids.

    Args:
        p: pair `(predictions, labels)`. `predictions` is reduced with
            `np.argmax(..., axis=2)`, so it must have at least three axes;
            `labels` holds label ids with -100 at positions to ignore.
        return_entity_level_metrics: see `_format_metric_results`. Defaults to
            False, matching `compute_metrics2`.

    Returns:
        Dictionary of metric values computed by the module-level `metric`.
    """
    logits, labels = p
    predictions = np.argmax(logits, axis=2)

    # Drop positions whose gold label is the ignore index (-100) and map the
    # remaining ids to label strings via the module-level `label_list`.
    true_predictions = [
        [label_list[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        for pred_row, gold_row in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[gold_id] for gold_id in gold_row if gold_id != -100]
        for gold_row in labels
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return _format_metric_results(results, return_entity_level_metrics)


def compute_metrics2(return_entity_level_metrics: bool = False):
    """Finalize the batches previously added to the module-level `metric`.

    Args:
        return_entity_level_metrics: see `_format_metric_results`.

    Returns:
        Dictionary of metric values for everything accumulated so far.
    """
    return _format_metric_results(metric.compute(), return_entity_level_metrics)


  metric = load_metric("./my_seqeval.py")


In [16]:
def get_labels(predictions, references):
    """Convert batched id tensors into nested lists of label strings.

    Args:
        predictions: tensor of predicted label ids, one row per example.
        references: tensor of gold label ids with the same layout; positions
            holding -100 are dropped from both outputs.

    Returns:
        `(true_predictions, true_labels)`: two parallel lists with one list of
        label strings per example, looked up in the module-level `label_list`.
    """
    # Transform the predictions and references tensors to numpy arrays.
    # `.cpu()` returns the tensor itself when it already lives on the CPU, so a
    # single code path is correct wherever the tensors are stored; nothing is
    # inferred from a global device setting.
    y_pred = predictions.detach().cpu().numpy()
    y_true = references.detach().cpu().numpy()

    # Remove ignored index (special tokens)
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(y_pred, y_true):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])
    return true_predictions, true_labels

In [17]:
import torch



# Expose the model inputs and labels as torch tensors so the DataLoader can batch them.
eval_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=128)

# Run the forward passes on the device selected by the accelerator, and make
# sure dropout is disabled for evaluation.
model.to(device)
model.eval()

processed_count = 0
with torch.no_grad():
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Only the logits are needed; labels are not passed to the model so
        # no loss is computed and thrown away.
        outputs = model(input_ids, attention_mask=attention_mask)

        predictions = outputs.logits.argmax(dim=-1)
        predictions_gathered, labels_gathered = accelerator.gather((predictions, labels))

        preds, refs = get_labels(predictions_gathered, labels_gathered)
        metric.add_batch(
            predictions=preds,
            references=refs,
        )  # predictions and references are expected to be nested lists of labels, not label ids
        processed_count += len(input_ids)
        # Carriage return keeps the running count on a single output line.
        print(processed_count, end="\r")

745

In [18]:
# Aggregate every batch passed to `metric.add_batch` in the evaluation loop into overall scores.
compute_metrics2()

{'precision': 0.4701195219123506,
 'recall': 0.6519337016574586,
 'f1': 0.5462962962962964,
 'accuracy': 0.9085395205545393}

In [None]:
# Presumably an end-of-notebook marker: displays the tuple once the cells above have finished.
("DONE",)