<a href="https://colab.research.google.com/github/icoxfog417/transformer-sandbox/blob/main/transfer_classifier/finetune_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run may resolve to library
# releases that differ from the ones that produced the recorded outputs below.
%pip -q install torch transformers datasets fugashi ipadic scikit-learn

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import EarlyStoppingCallback


In [3]:
class AmazonReview():
    """Prepare the `amazon_reviews_multi` dataset for binary sentiment classification.

    Review titles are tokenized, star ratings are mapped to binary labels
    (1 star -> 0 / negative, 2-4 stars -> dropped, anything else -> 1 / positive)
    and the result is exposed as torch-formatted columns.
    """

    # Columns exposed to the model. Only those actually present in the
    # tokenized dataset are selected, because not every tokenizer emits
    # `token_type_ids`.
    MODEL_COLUMNS = ("input_ids", "token_type_ids", "attention_mask", "labels")

    def __init__(self, lang: str = "ja"):
        """Remember which language configuration of the dataset to load."""
        self.lang = lang

    def load(self, split: str, shuffle=True, seed=None):
        """Load one split of the dataset for `self.lang`.

        Args:
            split: Split expression forwarded to `load_dataset`.
            shuffle: When true, return a shuffled view of the split.
            seed: Optional seed forwarded to `Dataset.shuffle` so the order
                can be reproduced; `None` keeps the previous unseeded behavior.

        Returns:
            The loaded (and optionally shuffled) dataset.
        """
        dataset = load_dataset("amazon_reviews_multi", self.lang, split=split)
        if shuffle:
            return dataset.shuffle(seed=seed)
        return dataset

    def tokenize(self, dataset, tokenizer, batched=True, max_length=512):
        """Tokenize the `review_title` column of `dataset`.

        Every title is truncated and padded to exactly `max_length` tokens.

        Args:
            dataset: Object exposing `map(fn, batched=...)`.
            tokenizer: Callable tokenizer applied to the titles.
            batched: Forwarded to `dataset.map`.
            max_length: Fixed sequence length used for truncation and padding.

        Returns:
            The result of `dataset.map` with the tokenizer outputs added.
        """

        def encode(examples):
            return tokenizer(examples["review_title"], truncation=True,
                             max_length=max_length, padding="max_length")

        return dataset.map(encode, batched=batched)

    @staticmethod
    def _convert_star(star):
        """Map a star rating to a label: 2-4 -> -1 (ignored), 1 -> 0, otherwise 1."""
        if 1 < star < 5:
            return -1
        if star == 1:
            return 0
        return 1

    def labels(self, dataset, batched=True):
        """Add a `labels` column derived from the `stars` column.

        Args:
            dataset: Object exposing `map(fn, batched=...)`.
            batched: Forwarded to `dataset.map`. In batched mode `stars` is a
                sequence of ratings; otherwise it is a single rating.

        Returns:
            The result of `dataset.map` with the `labels` column added.
        """
        convert_star = self._convert_star

        def encode(examples):
            stars = examples["stars"]
            if batched:
                return {"labels": np.array([convert_star(s) for s in stars])}
            # Non-batched map passes one example at a time, so `stars` is a
            # scalar and must not be iterated.
            return {"labels": convert_star(stars)}

        return dataset.map(encode, batched=batched)

    def format(self, dataset, tokenizer, batched=True, max_length=512):
        """Tokenize, label, drop ignored ratings and switch to torch format.

        Args:
            dataset: Raw dataset with `review_title` and `stars` columns.
            tokenizer: Callable tokenizer applied to the titles.
            batched: Forwarded to the underlying `map` calls.
            max_length: Fixed sequence length forwarded to `tokenize`.

        Returns:
            The filtered dataset (labels 0 or 1 only) formatted as torch
            tensors restricted to the model input columns that exist.
        """
        tokenized = self.tokenize(dataset, tokenizer, batched, max_length)
        labeled = self.labels(tokenized, batched)
        # Labels of -1 mark the 2-4 star reviews that are excluded.
        filtered = labeled.filter(lambda example: example["labels"] >= 0)
        available = set(filtered.column_names)
        filtered.set_format(type="torch",
                            columns=[c for c in self.MODEL_COLUMNS if c in available])
        return filtered

    def statistics(self, formatted):
        """Count positive (1) and negative (0) labels in a single pass.

        Args:
            formatted: Sized iterable of examples whose `labels` value can be
                converted with `int()` (tensor scalar, numpy scalar or int).

        Returns:
            Dict with the keys `total`, `positive` and `negative`.
        """
        positives = 0
        negatives = 0
        for example in formatted:
            label = int(example["labels"])
            if label == 1:
                positives += 1
            elif label == 0:
                negatives += 1

        return {
            "total": len(formatted),
            "positive": positives,
            "negative": negatives,
        }


In [4]:
# Read data
# Split-slicing syntax is described at https://huggingface.co/docs/datasets/splits.html
review = AmazonReview(lang="ja")

# Define pretrained tokenizer and model
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Only the "validation" split is loaded; it is divided 80/20 below.
# train_test_split shuffles on its own, so the extra unseeded shuffle in
# load() is skipped and the split is seeded to make the partition reproducible.
dataset = review.load("validation", shuffle=False)

dataset = dataset.train_test_split(test_size=0.2, seed=0)
dataset_train = review.format(dataset["train"], tokenizer)
dataset_validation = review.format(dataset["test"], tokenizer)

# Sanity-check the class balance of both partitions.
print(review.statistics(dataset_train))
print(review.statistics(dataset_validation))

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


{'total': 1622, 'positive': 803, 'negative': 819}
{'total': 378, 'positive': 197, 'negative': 181}


In [5]:
# Define Trainer parameters
def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: Two-item (predictions, labels) pair. The predictions are reduced
            to class ids with an argmax over axis 1.

    Returns:
        Dict with the keys `accuracy`, `precision`, `recall` and `f1`, each
        computed by the matching scikit-learn scorer with default settings.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    # Name -> scorer table; the order fixes the key order of the result.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(y_true=labels, y_pred=predicted) for name, scorer in scorers}


# Define Trainer
args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=100,
    # Checkpoint on the same schedule as evaluation: load_best_model_at_end
    # needs a saved checkpoint for each evaluated step, so the save strategy
    # must match the evaluation strategy (save_steps a multiple of eval_steps).
    save_strategy="steps",
    save_steps=100,
    seed=0,
    load_best_model_at_end=True,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,
    compute_metrics=compute_metrics,
    # Stop once 3 consecutive evaluations fail to improve the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train pre-trained model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Runtime,Samples Per Second
100,No log,0.471348,0.833333,0.965278,0.705584,0.815249,7.0319,53.755
200,No log,0.369668,0.878307,0.841629,0.944162,0.889952,7.0429,53.671
300,No log,0.267108,0.931217,0.943005,0.923858,0.933333,7.0346,53.734
400,No log,0.298745,0.936508,0.939086,0.939086,0.939086,7.0278,53.786
500,0.282000,0.284665,0.944444,0.973118,0.918782,0.94517,7.0446,53.658
600,0.282000,0.278831,0.939153,0.943878,0.939086,0.941476,7.0397,53.695


TrainOutput(global_step=600, training_loss=0.25006734689076743, metrics={'train_runtime': 506.8492, 'train_samples_per_second': 1.202, 'total_flos': 1629782501597184.0, 'epoch': 2.96, 'init_mem_cpu_alloc_delta': 1652424704, 'init_mem_gpu_alloc_delta': 443266560, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 422342656, 'train_mem_gpu_alloc_delta': 1856252928, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 6126096384})