# Imports

In [17]:
import numpy as np
import pandas as pd
import torch

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
from torch.utils.data import Dataset

# Data Preprocessing

In [18]:
df = pd.read_csv("./data/japanese_toxicity_dataset.csv")
labels = df["Toxic/Not Toxic"].replace({"Not Toxic": 0, "Toxic": 1}).to_list()
text = df["Text"].to_list()
text_train, text_test, labels_train, labels_test = train_test_split(text, labels, test_size=0.2, stratify=labels)

# Tokenize Data

In [19]:
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
text_train = tokenizer(text_train, padding="max_length", truncation=True)
text_test = tokenizer(text_test, padding="max_length", truncation=True)

# Dataset Class

In [20]:
class ToxicityDataset(Dataset):
    def __init__(self, encodings: list, labels: list) -> None:
        self.encodings: list = encodings
        self.labels: list = labels

    def __getitem__(self, index: int) -> dict:
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item["label"] = self.labels[index]
        return item

    def __len__(self) -> int:
        return len(self.labels)

# Metrics Function

In [21]:
def compute_metrics(eval_prediction: EvalPrediction) -> dict:
    true = eval_prediction.label_ids
    prediction = np.argmax(eval_prediction.predictions, axis=1)

    metrics: dict = {}
    metrics["accuracy"] = accuracy_score(true, prediction)
    metrics["precision"] = precision_score(true, prediction)
    metrics["recall"] = recall_score(true, prediction)
    metrics["f1"] = f1_score(true, prediction)
    return metrics

# Create/Train BERT Model

In [22]:
training_dataset = ToxicityDataset(text_train, labels_train)
testing_dataset = ToxicityDataset(text_test, labels_test)
model = AutoModelForSequenceClassification.from_pretrained("cl-tohoku/bert-base-japanese", num_labels=2)
training_args = TrainingArguments(
    output_dir="./models",
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=testing_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.08609665185213089, 'eval_accuracy': 0.96, 'eval_precision': 0.9693877551020408, 'eval_recall': 0.95, 'eval_f1': 0.9595959595959594, 'eval_runtime': 97.8116, 'eval_samples_per_second': 2.045, 'eval_steps_per_second': 0.256, 'epoch': 1.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.13430924713611603, 'eval_accuracy': 0.97, 'eval_precision': 0.9433962264150944, 'eval_recall': 1.0, 'eval_f1': 0.970873786407767, 'eval_runtime': 97.6443, 'eval_samples_per_second': 2.048, 'eval_steps_per_second': 0.256, 'epoch': 2.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.1387237310409546, 'eval_accuracy': 0.975, 'eval_precision': 0.9523809523809523, 'eval_recall': 1.0, 'eval_f1': 0.975609756097561, 'eval_runtime': 97.6552, 'eval_samples_per_second': 2.048, 'eval_steps_per_second': 0.256, 'epoch': 3.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.07674000412225723, 'eval_accuracy': 0.975, 'eval_precision': 0.9797979797979798, 'eval_recall': 0.97, 'eval_f1': 0.9748743718592964, 'eval_runtime': 97.8323, 'eval_samples_per_second': 2.044, 'eval_steps_per_second': 0.256, 'epoch': 4.0}
{'loss': 0.088, 'learning_rate': 2.5e-05, 'epoch': 5.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.1450657695531845, 'eval_accuracy': 0.98, 'eval_precision': 0.9615384615384616, 'eval_recall': 1.0, 'eval_f1': 0.9803921568627451, 'eval_runtime': 97.6885, 'eval_samples_per_second': 2.047, 'eval_steps_per_second': 0.256, 'epoch': 5.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.1481538563966751, 'eval_accuracy': 0.98, 'eval_precision': 0.9615384615384616, 'eval_recall': 1.0, 'eval_f1': 0.9803921568627451, 'eval_runtime': 97.6983, 'eval_samples_per_second': 2.047, 'eval_steps_per_second': 0.256, 'epoch': 6.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.14582760632038116, 'eval_accuracy': 0.98, 'eval_precision': 0.9615384615384616, 'eval_recall': 1.0, 'eval_f1': 0.9803921568627451, 'eval_runtime': 97.6111, 'eval_samples_per_second': 2.049, 'eval_steps_per_second': 0.256, 'epoch': 7.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.1466090977191925, 'eval_accuracy': 0.98, 'eval_precision': 0.9615384615384616, 'eval_recall': 1.0, 'eval_f1': 0.9803921568627451, 'eval_runtime': 97.6933, 'eval_samples_per_second': 2.047, 'eval_steps_per_second': 0.256, 'epoch': 8.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.14837326109409332, 'eval_accuracy': 0.98, 'eval_precision': 0.9615384615384616, 'eval_recall': 1.0, 'eval_f1': 0.9803921568627451, 'eval_runtime': 97.6903, 'eval_samples_per_second': 2.047, 'eval_steps_per_second': 0.256, 'epoch': 9.0}
{'loss': 0.0, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.14896900951862335, 'eval_accuracy': 0.98, 'eval_precision': 0.9615384615384616, 'eval_recall': 1.0, 'eval_f1': 0.9803921568627451, 'eval_runtime': 97.7082, 'eval_samples_per_second': 2.047, 'eval_steps_per_second': 0.256, 'epoch': 10.0}
{'train_runtime': 12957.8256, 'train_samples_per_second': 0.617, 'train_steps_per_second': 0.077, 'train_loss': 0.0440301960259676, 'epoch': 10.0}


TrainOutput(global_step=1000, training_loss=0.0440301960259676, metrics={'train_runtime': 12957.8256, 'train_samples_per_second': 0.617, 'train_steps_per_second': 0.077, 'train_loss': 0.0440301960259676, 'epoch': 10.0})

# Save Model

In [None]:
trainer.save_model("./models/bert_toxicity")