[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kubotaissei/defamation_japanese_twitter/blob/master/notebooks/inference_example.ipynb)

In [1]:
# Install transformers (pinned to 4.26) and sentencepiece for this notebook.
# NOTE(review): `datasets` is imported further down but is not installed here —
# confirm the target runtime already provides it.
!pip install transformers==4.26 sentencepiece

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Notebook-wide settings, read as plain class attributes (never instantiated here)."""

    # --- base ---
    gpu_id = ["0"]  # joined with "," into CUDA_VISIBLE_DEVICES by the import cell
    output_dir = "output"
    seed = 777

    # --- dataset ---
    # Label aggregation method, one of:
    #   "mv" = Majority Voting, "ds" = Dawid-Skene, "gl" = GLAD
    agg_type = "ds"
    # NOTE(review): the tokenization cell derives its own `max_len` from the
    # data; this attribute is not read by the code visible in this notebook.
    max_len = 497

    # --- model ---
    # Identifier handed to `from_pretrained` for both tokenizer and model.
    model = "kubota/luke-large-defamation-detection-japanese"

# Library

In [30]:
# ====================================================
# Library
# ====================================================
import os
import random
import warnings
warnings.filterwarnings("ignore")

os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(CFG.gpu_id)

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer)
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


# Helper functions for scoring

In [4]:
# ====================================================
# Helper functions for scoring
# ====================================================
def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    Args:
        x: array-like of shape (n_rows, n_classes).

    Returns:
        np.ndarray of the same shape whose rows are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting each row's maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing (inf / inf -> NaN) on
    # large-magnitude inputs.
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=1, keepdims=True)


def compute_metrics(eval_pred):
    """Compute accuracy, macro-F1 and macro one-vs-rest AUC for an evaluation run.

    Args:
        eval_pred: a pair ``(predictions, labels)``. ``predictions`` is a 2-D
            array of per-class scores; ``labels`` is either a 1-D array of
            class ids or a 2-D array (e.g. soft labels) that is reduced to
            class ids with a row-wise argmax.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"`` and ``"auc"``.
    """
    predictions, labels = eval_pred
    if labels.ndim == 2:
        # Soft / one-hot labels: take the most probable class as the target.
        labels = np.argmax(labels, axis=1)

    predicted = np.argmax(predictions, axis=1)
    # Derive the class count from the score matrix instead of hard-coding it,
    # so the one-hot targets always match the prediction width.
    num_classes = predictions.shape[1]

    acc = accuracy_score(labels, predicted)
    f1 = f1_score(labels, predicted, average="macro")
    auc = roc_auc_score(
        np.identity(num_classes)[labels],  # one-hot encode the class ids
        softmax(predictions),
        multi_class="ovr",
        average="macro",
    )
    return {
        "accuracy": acc,
        "f1": f1,
        "auc": auc,
    }

# Utils

In [5]:
# ====================================================
# Utils
# ====================================================
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) random generators.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

# Seed every RNG up front with the notebook-wide seed from CFG.
seed_everything(CFG.seed)

In [6]:
# Load the tokenizer and the sequence-classification model named by CFG.model.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
model = AutoModelForSequenceClassification.from_pretrained(CFG.model)

In [7]:
# NOTE(review): unpickling can execute arbitrary code — only load a test.pkl
# that comes from a trusted source.
test_df = pd.read_pickle("test.pkl")

# Replace the original "label" column with the soft label produced by the
# aggregation method selected in CFG.agg_type.
soft_label_col = f"{CFG.agg_type}_soft_label"
test_df = test_df.drop(columns="label").rename(columns={soft_label_col: "label"})

# Token count of every test text (missing values treated as ""), measured
# without special tokens.
# NOTE(review): lengths are taken on the raw text, whereas `tokenize` below
# runs `clean_text` first — confirm the raw length is a valid upper bound.
tk1 = tqdm(test_df["text"].fillna("").values, total=len(test_df))
text_lengths = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in tk1
]

# Reserve two extra positions for the special tokens added at encoding time.
max_len = max(text_lengths) + 2 # CLS + SEP
print(f'Text max(lengths): {max(text_lengths)}')

def clean_text(text):
    """Normalize one raw text before tokenization.

    Applies, in order: removal of ASCII spaces, removal of the ``@user``
    placeholder, removal of full-width spaces, conversion of ``__BR__``
    markers to newlines, removal of non-breaking spaces and carriage returns,
    and finally stripping of leading newlines.

    Args:
        text: the raw text. Non-string values (``None``, NaN from a missing
            cell, ...) are treated as empty text instead of raising.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on `.replace`; the length scan
        # in this notebook guards the same column with `fillna("")`.
        return ""

    # Order matters: spaces are removed first, so "@ user" also collapses to
    # "@user" and is then dropped.
    replacements = (
        (" ", ""),
        ("@user", ""),
        ("\u3000", ""),  # full-width (ideographic) space
        ("__BR__", "\n"),
        ("\xa0", ""),  # non-breaking space
        ("\r", ""),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text.lstrip("\n")

def tokenize(batch):
    """Clean and encode the "text" entries of a batch with the global tokenizer.

    Args:
        batch: mapping whose "text" entry is an iterable of raw strings.

    Returns:
        The tokenizer output for the cleaned texts, padded, truncated to at
        most the module-level `max_len`, and returned as PyTorch tensors.
    """
    cleaned_texts = [clean_text(raw) for raw in batch["text"]]
    encoding_options = dict(
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
    return tokenizer(cleaned_texts, **encoding_options)

# Convert the DataFrame to a datasets.Dataset and add the tokenizer outputs.
# batched=True hands `tokenize` lists of rows; batch_size=None presumably sends
# the whole dataset as a single batch (the progress bar above shows 1 batch).
test_data = Dataset.from_pandas(test_df).map(tokenize, batched=True, batch_size=None)

100%|██████████| 879/879 [00:00<00:00, 3824.81it/s]


Text max(lengths): 495


  0%|          | 0/1 [00:00<?, ?ba/s]

In [31]:
# Use a Trainer with no explicit TrainingArguments purely to run inference
# and score the test set through `compute_metrics`.
trainer = Trainer(model, compute_metrics=compute_metrics)
result = trainer.predict(test_data)
print(result.metrics)

# Predicted class id = index of the highest score in each prediction row.
test_df["pred_label"] = result.predictions.argmax(1).tolist()
preview_columns = ["text", "ds_hard_label", "pred_label"]
display(test_df[preview_columns].head())

# Per-class precision / recall / F1 against the "ds_hard_label" column.
# NOTE(review): this column name is hard-coded while the soft label above is
# selected via CFG.agg_type — confirm that is intended.
report = classification_report(
    test_df["ds_hard_label"],
    test_df["pred_label"],
    target_names=model.config.id2label.values(),
    output_dict=True,
)
display(pd.DataFrame(report))

{'test_loss': 0.4339948296546936, 'test_accuracy': 0.6473265073947668, 'test_f1': 0.6356487994276481, 'test_auc': 0.8492810797008399, 'test_runtime': 21.5323, 'test_samples_per_second': 40.822, 'test_steps_per_second': 5.109}


Unnamed: 0,text,ds_hard_label,pred_label
0,@ReutersJapan確かに侵略戦争で民間人を殺害したのは裁かれるべきですが、本来ロシア...,0,3
1,日本人が「劣化」したワケ…受験勉強で「学歴の高いバカ」が大量生産されている！中野信子氏と和田...,0,0
2,静岡県知事がリニアを通さない。これ自体が日本が中国に支配されていて日本を潰すのは施政者など力...,3,3
3,日本に外国人を住まわせちゃいけない理由\n\n・最早区別は差別と同義\n・別の国から来た人は...,0,0
4,やられてもやり返すな。同類だぞみたいな日本人特有の文化なに？そんな考えのやつばっかだから日本...,0,2


Unnamed: 0,中傷性のない発言,脅迫的な発言,侮蔑的な発言,名誉を低下させる発言,accuracy,macro avg,weighted avg
precision,0.713068,0.690141,0.590747,0.519231,0.647327,0.628297,0.651314
recall,0.625935,0.803279,0.677551,0.486486,0.647327,0.648313,0.647327
f1-score,0.666667,0.742424,0.631179,0.502326,0.647327,0.635649,0.646537
support,401.0,122.0,245.0,111.0,0.647327,879.0,879.0
