In [251]:
# imports
import datasets
import evaluate
import numpy as np
from numpy.core import overrides
from datasets import load_dataset, Dataset
from datasets import Features
from datasets import Sequence
from datasets import ClassLabel
from datasets import Value
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertConfig, AutoModelForSequenceClassification, \
    TrainingArguments, Trainer, AutoModelForTokenClassification
import torch

In [252]:
RES_PATH = os.path.abspath("../resources/data/") + "/"

In [253]:
def load_dataset_dict(dataset_name, dataset_path, limit=None):
    class_label = Sequence(ClassLabel(num_classes=3, names=["B", "I", "O"]))

    with open(dataset_path + dataset_name + "/train.json", "r", encoding="utf-8") as file_train:
        raw_train = pd.read_json(file_train, encoding="utf-8", orient="index").head(limit).fillna("").rename(columns={"label": "labels"})
    with open(dataset_path + dataset_name + "/test.json", "r", encoding="utf-8") as file_test:
        raw_test = pd.read_json(file_test, encoding="utf-8", orient="index").head(limit).fillna("").rename(columns={"label": "labels"})
    with open(dataset_path + dataset_name + "/dev.json", "r", encoding="utf-8") as file_dev:
        raw_dev = pd.read_json(file_dev, encoding="utf-8", orient="index").head(limit).fillna("").rename(columns={"label": "labels"})

    return datasets.DatasetDict({
        "train": Dataset.from_pandas(raw_train, preserve_index=True).rename_column("__index_level_0__", "id").cast_column("labels", class_label),
        "test": Dataset.from_pandas(raw_test, preserve_index=True).rename_column("__index_level_0__", "id").cast_column("labels", class_label),
        "dev": Dataset.from_pandas(raw_dev, preserve_index=True).rename_column("__index_level_0__", "id").cast_column( "labels", class_label)
    })

In [254]:
laptop_dataset_dict = load_dataset_dict("laptop", RES_PATH, 10)
rest_dataset_dict = load_dataset_dict("rest", RES_PATH, 10)

Casting the dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

In [255]:
laptop_dataset_dict["train"][0]

{'labels': [0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2],
 'sentence': ['Keyboard',
  'is',
  'great',
  'but',
  'primary',
  'and',
  'secondary',
  'control',
  'buttons',
  'could',
  'be',
  'more',
  'durable',
  '.'],
 'id': 0}

In [256]:
label2id = {
    "B": 0,
    "I": 1,
    "O": 2
}

label_list = rest_dataset_dict["train"].features["labels"].feature.names
#label_list = list(map(label2id.get, label_list))
label_list

['B', 'I', 'O']

In [257]:
bert_auto_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [258]:
example = rest_dataset_dict["train"][0]
tokenized_input = bert_auto_tokenizer(example["sentence"], truncation=True, padding=True, is_split_into_words=True)
tokens = bert_auto_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
example["labels"]

[2, 2, 2, 0]

In [259]:
max_length = 100

In [260]:
def tokenize_and_align_labels(batch_data):
    tokenized_inputs = bert_auto_tokenizer(batch_data["sentence"], truncation=True, padding=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(batch_data["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100 -> ignored by PT
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [261]:
tokenized_rest_dataset_dict = rest_dataset_dict.map(tokenize_and_align_labels, batched=True)
tokenized_laptop_dataset_dict = laptop_dataset_dict.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [262]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=bert_auto_tokenizer)

In [263]:
#clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
clf_metrics = evaluate.load("seqeval")
#clf_metrics = evaluate.evaluator("text-classification")

In [264]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    print(labels)
    true_predictions = []
    for prediction, label in zip(predictions, labels):
        for (p, l) in zip(prediction, label):
            if l != -100:
                true_predictions.append(label_list[p])

    true_labels = []
    for prediction, label in zip(predictions, labels):
        for (p, l) in zip(prediction, label):
            if l != -100:
                true_labels.append(label_list[l])

    results = clf_metrics.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [265]:
id2label = {
    0: "B",
    1: "I",
    2: "O",
}
label2id = {
    "B": 0,
    "I": 1,
    "O": 2
}

In [266]:
model_bert = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream

In [267]:
training_args = TrainingArguments(
    output_dir="distilbert-base-uncased_auto-seq-model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False)

trainer = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_laptop_dataset_dict["train"],
    eval_dataset=tokenized_laptop_dataset_dict["test"],
    tokenizer=bert_auto_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

In [268]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


[[-100    0    1    2    2    2    2    2    2    2    2    2    2    2
     2    2 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100 -100]
 [-100    0    1    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100 -100]
 [-100    2    2    2    2    2    2    2 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100 -100]
 [-100    0    1    2    2    2 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -1

ValueError: Got a string but expected a list instead: 'B'

In [None]:
#ps=trainer.predict(laptop_dataset_dict_enc["dev"])
#print(ps.predictions.shape, ps.label_ids.shape)
bert_auto_tokenizer.convert_tokens_to_ids()

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
trainer.evaluate(laptop_dataset_dict["dev"])

In [None]:
# model evaluation with metrics precision, recall, accuracy, f1-score and confusion matrix
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html