In [1]:
# imports
import os
import numpy as np
import pandas as pd
import torch

import evaluate
from datasets import load_dataset, ClassLabel, Features, Sequence, Value
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer, pipeline
from huggingface_hub import notebook_login

KeyboardInterrupt: 

In [None]:
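# Authenticate with the Hugging Face Hub before pushing datasets or models (interactive prompt).
#notebook_login()
# SECURITY: a Hugging Face access token used to be pasted here in plain text and has been removed.
# Revoke that token in the Hub account settings, and supply credentials only through
# notebook_login() or the HF_TOKEN environment variable - never store them in the notebook.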

In [None]:
# Absolute path of the data directory (resolved against the current working directory),
# kept with a trailing "/" so callers can append relative paths to it directly.
RES_PATH = "{}/".format(os.path.abspath("../resources/data/"))

In [None]:
def load_dataset_dict(dataset_name, dataset_path, limit=None):
    """Load the train/dev/test JSONL files of one dataset as a dataset dict.

    The files ``train.jsonl``, ``dev.jsonl`` and ``test.jsonl`` are read from
    ``<dataset_path>/<dataset_name>/`` and exposed as the splits ``"train"``,
    ``"validation"`` and ``"test"``. The ``label`` column is declared as a
    sequence of the classes ``B``/``I``/``O`` and renamed to ``labels``.

    Args:
        dataset_name: Name of the sub-directory that holds the three files.
        dataset_path: Directory containing ``dataset_name``; a trailing slash
            is optional.
        limit: If given, keep at most this many leading examples of every
            split. ``None`` (the default) keeps everything.

    Returns:
        The loaded splits with the ``labels`` column in place of ``label``.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    # Validate before touching the file system so a bad call fails fast.
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be non-negative or None, got {limit!r}")

    # os.path.join copes with dataset_path both with and without a trailing separator.
    dataset_dir = os.path.join(dataset_path, dataset_name)
    data_files = {
        "train": os.path.join(dataset_dir, "train.jsonl"),
        "validation": os.path.join(dataset_dir, "dev.jsonl"),
        "test": os.path.join(dataset_dir, "test.jsonl"),
    }
    features = Features({
        "id": Value(dtype="string", id=None),
        "label": Sequence(ClassLabel(num_classes=3, names=["B", "I", "O"]), length=-1, id=None),
        "sentence": Sequence(Value(dtype="string", id=None), length=-1, id=None),
    })

    dataset_dict = load_dataset("json", data_files=data_files, features=features)
    dataset_dict = dataset_dict.rename_column("label", "labels")

    if limit is not None:
        # Truncate every split to its first `limit` rows (or fewer if the split is shorter).
        dataset_dict = type(dataset_dict)({
            split_name: split.select(range(min(limit, len(split))))
            for split_name, split in dataset_dict.items()
        })
    return dataset_dict

In [None]:
# Load the "laptop" dataset splits from RES_PATH.
# The commented lines are optional alternatives: loading the "test" dataset instead,
# and publishing the loaded splits to the Hub.
#laptop_dataset_dict = load_dataset_dict("test", RES_PATH)
laptop_dataset_dict = load_dataset_dict(dataset_name="laptop", dataset_path=RES_PATH)
#laptop_dataset_dict.push_to_hub("laptop-reviews")

In [None]:
# Display the first row of the train split as a quick check of the loaded columns.
laptop_dataset_dict["train"][0]

In [None]:
# Tag names indexed by class id; the order matches the ClassLabel names declared when loading the data.
label_list = list("BIO")  # -> ["B", "I", "O"]
label_list

In [None]:
# Tokenizer of the DistilBERT checkpoint used throughout this notebook.
bert_auto_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
)

In [None]:
# Tokenize one pre-split training sentence and list the resulting tokens.
example = laptop_dataset_dict["train"][0]
tokenized_input = bert_auto_tokenizer(
    example["sentence"],
    truncation=True,
    padding=True,
    is_split_into_words=True,  # the sentence is already a list of words
)
tokens = bert_auto_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

In [None]:
def tokenize_and_align_labels(batch_data, tokenizer=None):
    """Tokenize a batch of pre-split sentences and align the word labels to tokens.

    Every token produced by the tokenizer receives one label id:

    * the first token of a word gets that word's label,
    * every further token of the same word gets ``-100``,
    * tokens that belong to no word (``word_ids`` entry is ``None``) get ``-100``.

    No padding is requested here: padding inside a batched ``map`` would pad
    every row to the longest sentence of the whole map batch. Padding is left
    to the data collator, which pads per training batch.

    Args:
        batch_data: Mapping with a ``"sentence"`` entry (list of word lists)
            and a ``"labels"`` entry (list of per-word label id lists).
        tokenizer: Tokenizer to use; it must return an object offering
            ``word_ids(batch_index=...)``. Defaults to the notebook-level
            ``bert_auto_tokenizer``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of aligned label ids per sentence.
    """
    if tokenizer is None:
        tokenizer = bert_auto_tokenizer

    tokenized_inputs = tokenizer(batch_data["sentence"], is_split_into_words=True, truncation=True)

    aligned_labels = []
    for row_index, word_labels in enumerate(batch_data["labels"]):
        # word_ids maps each token position to the index of its source word (or None).
        word_ids = tokenized_inputs.word_ids(batch_index=row_index)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # No source word, or a continuation token of the previous word: use -100.
                label_ids.append(-100)
            else:
                # First token of a new word carries the word's label.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [None]:
# Run the tokenization / label alignment over every split, in batched mode.
tokenized_laptop_dataset_dict = laptop_dataset_dict.map(
    function=tokenize_and_align_labels,
    batched=True,
)

In [None]:
# Display the feature schema of the tokenized train split.
tokenized_laptop_dataset_dict["train"].features

In [None]:
# Batch collator for token classification; padding=True asks it to pad the batches it builds.
data_collator = DataCollatorForTokenClassification(
    padding=True,
    tokenizer=bert_auto_tokenizer,
)

In [None]:
# seqeval metric; compute_metrics reads its overall accuracy / precision / recall / F1.
clf_metrics = evaluate.load(path="seqeval")

In [None]:
# Tag names for the example sentence: each label id is looked up in label_list.
labels = list(map(label_list.__getitem__, example["labels"]))

In [None]:
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with ``argmax`` over axis 2; ``labels``
            holds the reference label ids, with ``-100`` marking positions
            to ignore.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, taken from the ``overall_*`` entries of the seqeval result.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # Keep only positions with a real label; -100 marks positions to skip.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, label_row) if gold != -100]
        true_labels.append([label_list[gold] for _, gold in kept])
        true_predictions.append([label_list[pred] for pred, _ in kept])

    # zero_division=0: seqeval documents only 0, 1 and "warn" for this argument, so an
    # undefined precision/recall is reported as 0 rather than passing an unsupported value.
    results = clf_metrics.compute(predictions=true_predictions, references=true_labels, zero_division=0)

    return {
        "accuracy": results["overall_accuracy"],
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
    }

In [None]:
# Class id <-> tag name lookups passed to the model configuration.
id2label = dict(enumerate(("B", "I", "O")))
label2id = {tag: class_id for class_id, tag in id2label.items()}

In [None]:
# DistilBERT with a token-classification head sized for the three tags.
bert_token_classificator = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),  # 3: B, I, O
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Fine-tuning configuration: evaluate and save once per epoch, reload the best checkpoint
# when training ends, and push the result to the Hub.
# NOTE(review): recent transformers releases appear to rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="aspect_extraction_laptop_reviews",
    # optimisation
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # publishing
    push_to_hub=True,
)

In [None]:
# Wire model, data splits, tokenizer, collator and metric function into one Trainer.
trainer = Trainer(
    args=training_args,
    model=bert_token_classificator,
    tokenizer=bert_auto_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_laptop_dataset_dict["train"],
    eval_dataset=tokenized_laptop_dataset_dict["validation"],
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Upload the trainer's results to the Hugging Face Hub.
# NOTE(review): presumably needs valid Hub credentials in this session - confirm.
trainer.push_to_hub()

In [None]:
# Run evaluation; no dataset is passed, so this presumably uses the eval_dataset
# given to the Trainer (the validation split) - confirm.
trainer.evaluate()

In [None]:
# Run the trained model on the tokenized test split and display the prediction output.
trainer.predict(tokenized_laptop_dataset_dict["test"])

In [None]:
# Quick inference check: run the published checkpoint through a "ner" pipeline on one review.
text = "I Did not enjoy the new Apple operating system. I wish I could go back to the previous one!"
classifier = pipeline(
    task="ner",
    model="jannikseus/aspect_extraction_laptop_reviews",
)
classifier(text)

In [None]:
# Manual inference on the same checkpoint: tokenize, run the model, map class ids to tag names.
inf_tokenizer = AutoTokenizer.from_pretrained("jannikseus/aspect_extraction_laptop_reviews")
inf_model = AutoModelForTokenClassification.from_pretrained("jannikseus/aspect_extraction_laptop_reviews")

inputs = inf_tokenizer(text, return_tensors="pt")
with torch.no_grad():  # inference only, no gradients needed
    logits = inf_model(**inputs).logits

# Highest-scoring class per token; only the first sequence of the batch is decoded.
inf_predictions = logits.argmax(dim=2)
inf_predicted_token_class = [
    inf_model.config.id2label[class_id] for class_id in inf_predictions[0].tolist()
]
inf_predicted_token_class