# Imports

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# Load Model

In [9]:
# Use the first CUDA GPU when torch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# The tokenizer is resolved from the "cl-tohoku/bert-base-japanese" identifier,
# while the sequence-classification weights are read from the relative
# ./models/bert_toxicity path.
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
model = AutoModelForSequenceClassification.from_pretrained("./models/bert_toxicity")

# Bundle model, tokenizer and device into a single callable pipeline.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Load Unclassified Data/Classify Function

In [10]:
def _unescape(text: str) -> str:
    r"""Expand backslash escapes such as ``\u3042`` in ``text``.

    Characters that are already literal (non-escaped) are left untouched.

    Raises:
        UnicodeDecodeError: if ``text`` contains a malformed escape sequence
            (for example a trailing backslash or a truncated ``\x``/``\u``).
    """
    # unicode_escape interprets its input bytes as Latin-1. Encoding with
    # latin-1 + backslashreplace therefore round-trips every literal character
    # unchanged, whereas encoding as UTF-8 would turn each literal non-ASCII
    # character into mojibake.
    # NOTE: a backslash immediately followed by a literal character above
    # U+00FF is still read as an escaped backslash by this idiom.
    decoded = text.encode("latin-1", "backslashreplace").decode("unicode_escape")
    # JSON-style escapes spell astral characters (e.g. emoji) as surrogate
    # pairs, which unicode_escape decodes into two separate surrogates. A
    # UTF-16 round trip merges each pair into one code point; unpaired
    # surrogates pass through unchanged.
    return decoded.encode("utf-16", "surrogatepass").decode("utf-16", "surrogatepass")


def classify(path: str, pipe: TextClassificationPipeline) -> tuple:
    """Classify every line of a text file and count the predicted labels.

    Each line of the file is stripped of surrounding whitespace, has its
    backslash escapes expanded, and is then passed to ``pipe`` in a single
    call together with all other lines.

    Args:
        path: Path of the file to read; it is decoded as UTF-8.
        pipe: Callable that takes a list of strings and returns one mapping
            per string containing a ``"label"`` key.

    Returns:
        A ``(num_nontoxic, num_toxic)`` tuple. A result counts as toxic when
        its label equals ``"LABEL_1"``; every other label counts as non-toxic.

    Raises:
        UnicodeDecodeError: if a line contains a malformed escape sequence.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as file:
        data = [_unescape(line.strip()) for line in file]
    results = pipe(data)
    num_toxic = sum(result["label"] == "LABEL_1" for result in results)
    num_nontoxic = len(results) - num_toxic
    return num_nontoxic, num_toxic

# Show Results for Each Dataset

In [13]:
# Files to score, grouped by the keyword that appears in each file name.
datasets = [
    # "ainu" files
    "data/twitter_ainu_since_2018_to_2020.csv",
    "data/twitter_ainu_since_2018_to_2022.csv",
    "data/twitter_ainu_since_2018_to_present.csv",
    # "burakumin" files
    "data/twitter_burakumin_since_2018_to_2020.csv",
    "data/twitter_burakumin_since_2018_to_2022.csv",
    "data/twitter_burakumin_since_2018_to_present.csv",
    # "zainichi" files
    "data/twitter_zainichi_since_2018_to_2020.csv",
    "data/twitter_zainichi_since_2018_to_2022.csv",
    "data/twitter_zainichi_since_2018_to_present.csv",
]

# Run the classifier over each file in list order and print one summary line
# per file with its non-toxic and toxic line counts.
for dataset in datasets:
    num_nontoxic, num_toxic = classify(path=dataset, pipe=pipe)
    print(f"Dataset: {dataset}, Number Non-Toxic: {num_nontoxic}, Number Toxic: {num_toxic}")

Dataset: data/twitter_ainu_since_2018_to_2020.csv, Number Non-Toxic: 23066, Number Toxic: 12570
Dataset: data/twitter_ainu_since_2018_to_2022.csv, Number Non-Toxic: 21147, Number Toxic: 9066
Dataset: data/twitter_ainu_since_2018_to_present.csv, Number Non-Toxic: 21552, Number Toxic: 10650
Dataset: data/twitter_burakumin_since_2018_to_2020.csv, Number Non-Toxic: 46, Number Toxic: 28
Dataset: data/twitter_burakumin_since_2018_to_2022.csv, Number Non-Toxic: 23, Number Toxic: 10
Dataset: data/twitter_burakumin_since_2018_to_present.csv, Number Non-Toxic: 13, Number Toxic: 8
Dataset: data/twitter_zainichi_since_2018_to_2020.csv, Number Non-Toxic: 170483, Number Toxic: 544002
Dataset: data/twitter_zainichi_since_2018_to_2022.csv, Number Non-Toxic: 154670, Number Toxic: 479918
Dataset: data/twitter_zainichi_since_2018_to_present.csv, Number Non-Toxic: 114666, Number Toxic: 346844
