# Training Sample Script Using the Hugging Face Trainer

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kubotaissei/defamation_japanese_twitter/blob/master/notebooks/train_example.ipynb)

In [1]:
!pip install transformers==4.26 datasets==2.9.0 sentencepiece crowd-kit

# CFG

In [1]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Notebook-wide settings, read everywhere as class attributes (e.g. ``CFG.lr``)."""

    debug = False

    # ---- base ----
    gpu_id = ["0"]
    output_dir = "output"
    twitter_bearer_token = ""  # Fill in your Twitter API bearer token

    # ---- dataset ----
    # Label aggregation method: "mv" (Majority Voting), "ds" (Dawid-Skene) or "gl" (GLAD)
    agg_type = "ds"
    # Prediction task: "label" (defamation type) or "target" (defamation target)
    type = "label"
    # "soft" (soft labels) or "hard" (hard labels)
    label_type = "soft"
    max_len = 497

    # ---- model ----
    model = "studio-ousia/luke-japanese-large"
    num_classes = 4

    # ---- scheduler ----
    scheduler_type = "cosine"

    # ---- train ----
    seed = 777
    lr = 1e-5
    weight_decay = 0.2
    epochs = 4
    gradient_accumulation_steps = 1
    batch_size = 4
    save_total_limit = 0

# The "target" task uses three classes instead of the default number.
if CFG.type == "target":
    CFG.num_classes = 3

# Debug runs train for a single epoch.
if CFG.debug:
    CFG.epochs = 1

# Library

In [2]:
# ====================================================
# Library
# ====================================================
import gc
import json
import os
import random
import warnings
warnings.filterwarnings("ignore")

os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(CFG.gpu_id)

import numpy as np
import pandas as pd
import requests
import torch
from tqdm import tqdm
from crowdkit.aggregation import GLAD, DawidSkene, MajorityVote
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


# Helper functions for scoring

In [3]:
# ====================================================
# Helper functions for scoring
# ====================================================
def softmax(x):
    """Row-wise softmax of a 2-D array.

    Args:
        x: numpy array of shape (n_rows, n_columns).

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting the row maximum leaves the result mathematically unchanged
    # but keeps np.exp from overflowing to inf (and the ratio from becoming NaN)
    # when the inputs are large.
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)


def compute_metrics(eval_pred):
    """Compute accuracy, macro F1 and macro one-vs-rest AUC for an evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is a 2-D array
            with one column per class. ``labels`` is either a 1-D array of class
            indices or a 2-D array (e.g. soft labels), in which case the arg-max
            of each row is used as the reference class.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"`` and ``"auc"``.
    """
    predictions, labels = eval_pred
    if labels.ndim == 2:
        labels = np.argmax(labels, axis=1)
    predicted_classes = np.argmax(predictions, axis=1)
    # Take the number of classes from the predictions rather than hard-coding 4,
    # so the one-hot reference matrix also matches a 3-class model.
    num_classes = predictions.shape[1]
    acc = accuracy_score(labels, predicted_classes)
    f1 = f1_score(labels, predicted_classes, average="macro")
    auc = roc_auc_score(
        np.identity(num_classes)[labels],
        softmax(predictions),
        multi_class="ovr",
        average="macro",
    )
    return {
        "accuracy": acc,
        "f1": f1,
        "auc": auc,
    }

# Utils

In [4]:
# ====================================================
# Utils
# ====================================================
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Args:
        seed: integer seed applied to every generator.
    """
    # Exported as a string because environment values must be text.
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

# Seed all RNGs with the configured seed.
seed_everything(CFG.seed)

# Data Loading

In [4]:
# ====================================================
# Data Loading
# ====================================================

# sample code from https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/main/Tweet-Lookup/get_tweets_with_bearer_token.py


def create_url(ids: list):
    """Build the tweet-lookup URL for a list of tweet IDs.

    Args:
        ids: list of tweet IDs as strings; they are comma-joined into the
            ``ids`` query parameter.

    Returns:
        The request URL, which also asks for the ``created_at`` tweet field.
    """
    id_query = f"ids={','.join(ids)}"
    field_query = "tweet.fields=created_at"
    return "https://api.twitter.com/2/tweets?{}&{}".format(id_query, field_query)


def bearer_oauth(r):
    """
    Auth hook for bearer-token authentication.

    Sets the ``Authorization`` and ``User-Agent`` headers on the request ``r``
    and returns that same request object.
    """
    auth_headers = {
        "Authorization": f"Bearer {CFG.twitter_bearer_token}",
        "User-Agent": "v2TweetLookupPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r


def connect_to_endpoint(url):
    """GET ``url`` with bearer authentication and return the decoded JSON body.

    Args:
        url: fully built request URL.

    Returns:
        The parsed JSON response.

    Raises:
        Exception: if the response status code is not 200.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # Without a timeout a stalled connection would block the whole
    # dataset.map() download forever.
    response = requests.request("GET", url, auth=bearer_oauth, timeout=30)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()


def get_text_data(examples):
    """Fetch the text and creation time for a batch of tweet IDs.

    Args:
        examples: batch mapping with an ``"id"`` entry holding a list of tweet
            IDs.

    Returns:
        dict with ``"text"`` and ``"created_at"`` lists aligned with
        ``examples["id"]``; an entry is ``None`` when the response contains no
        tweet for that ID.
    """
    tweet_ids = examples["id"]
    json_response = connect_to_endpoint(create_url(tweet_ids))
    # The response may have no "data" key at all (e.g. when none of the
    # requested tweets could be returned); treat that as "nothing found"
    # instead of failing with a KeyError.
    tweets_by_id = {tweet["id"]: tweet for tweet in json_response.get("data", [])}

    texts = []
    created_at = []
    for tweet_id in tweet_ids:
        tweet = tweets_by_id.get(tweet_id)
        texts.append(None if tweet is None else tweet["text"])
        created_at.append(None if tweet is None else tweet["created_at"])
    return {"text": texts, "created_at": created_at}

def _build_votes(df, column, recode):
    """Reshape the per-tweet annotation lists in ``column`` into one row per (worker, task, label).

    ``recode`` is a ``{old: new}`` mapping applied to the label values.
    Each tweet is expected to carry exactly three annotations.
    """
    labels = pd.concat([df[column].apply(lambda x: x[i]) for i in range(3)])
    return pd.DataFrame.from_dict(
        dict(
            worker=pd.concat([df["user_id_list"].apply(lambda x: x[i]) for i in range(3)]),
            task=df["id"].to_list() * 3,
            # Non-inplace replace: the chained `frame["label"].replace(..., inplace=True)`
            # form is not guaranteed to write back to the frame.
            label=labels.replace(recode),
        )
    )


def _add_aggregated_columns(df, votes, kind):
    """Add ``{mv,ds,gl}_{hard,soft}_{kind}`` columns to ``df`` from the crowd-kit aggregators."""
    task_ids = df["id"].to_list()
    aggregators = (
        ("mv", lambda: MajorityVote()),
        ("ds", lambda: DawidSkene(n_iter=200)),
        ("gl", lambda: GLAD(n_iter=200)),
    )
    # Aggregator outputs are indexed by task id, so they are re-ordered to the
    # row order of `df` before being attached positionally.
    for prefix, make_aggregator in aggregators:
        hard = make_aggregator().fit_predict(votes)
        df[f"{prefix}_hard_{kind}"] = hard.reindex(task_ids).to_list()
    for prefix, make_aggregator in aggregators:
        soft = make_aggregator().fit_predict_proba(votes)
        # Columns are sorted by label value so that list position k holds the
        # probability of label k (assumes every label occurs — TODO confirm).
        df[f"{prefix}_soft_{kind}"] = (
            soft.reindex(task_ids).sort_index(axis=1).to_numpy().tolist()
        )


def get_dataset(train_path = "train.pkl", test_path = "test.pkl"):
    """Return ``(train_df, test_df)``, building and caching them on first use.

    If both pickle files exist they are loaded and returned as-is. Otherwise the
    annotation dataset is downloaded, tweet texts are fetched through the
    Twitter API, annotator labels are aggregated, and the result is split by
    tweet creation time and written to the two pickle paths.

    Args:
        train_path: pickle path for the cached training frame.
        test_path: pickle path for the cached test frame.

    Returns:
        Tuple ``(train_df, test_df)`` of pandas DataFrames.
    """
    if os.path.exists(train_path) and os.path.exists(test_path):
        # NOTE: pickle files execute code on load; only use caches you created yourself.
        return pd.read_pickle(train_path), pd.read_pickle(test_path)

    dataset = load_dataset("kubota/defamation-japanese-twitter")
    dataset = dataset.map(get_text_data, batched=True, batch_size=100)

    # Drop rows with missing values (tweets whose original has been deleted).
    df = dataset["train"].to_pandas().dropna()
    # Exclude tweets where all annotators disagree or where two or more chose C.
    keep = df["label"].apply(
        lambda l: np.median(l) != 0.0 if len(set(l)) != len(l) else False
    )
    # Copy so that the new columns are added to an independent frame.
    df = df[keep].copy()

    # Reshape for label aggregation.
    d_target = _build_votes(df, "target", {3: 0})
    d_label = _build_votes(df, "label", {4: 0})

    # Aggregate labels separately for the defamation target and the defamation type.
    _add_aggregated_columns(df, d_target, "target")
    _add_aggregated_columns(df, d_label, "label")
    display(df.groupby("mv_hard_target").count()["id"])
    display(df.groupby("mv_hard_label").count()["id"])

    # Split into training and test data by creation time.
    train_df = df.query("created_at < '2022-05-21 00:00:00+00:00'").reset_index(drop=True)
    test_df = df.query("created_at > '2022-05-21 00:00:00+00:00'").reset_index(drop=True)
    train_df.to_pickle(train_path)
    test_df.to_pickle(test_path)
    return train_df, test_df

train_df, test_df = get_dataset()

# Replace the raw "label" column with the aggregated column selected in CFG.
selected_label_column = f"{CFG.agg_type}_{CFG.label_type}_{CFG.type}"
train_df = train_df.drop(columns="label").rename(columns={selected_label_column: "label"})
test_df = test_df.drop(columns="label").rename(columns={selected_label_column: "label"})

print(f"train.shape: {train_df.shape}")
display(train_df[["text", "label"]].head())
print(f"test.shape: {test_df.shape}")
display(test_df[["text", "label"]].head())

train.shape: (7119, 24)


Unnamed: 0,text,label
0,ロシアは異常な国‼️,"[0.01905700520092961, 0.000806961516404502, 0...."
1,いや、、怪しいな、、。第三者が「こっちだよ、こっちに友達いたよ」て誘導したとしか思えない、、。,"[0.9618334745855203, 0.0004892807042054704, 0...."
2,風評被害なかったの？状況知らないけど、自分なら管理出来ないなら飼うな。って、当たり前にレスな...,"[0.9618334745855203, 0.0004892807042054704, 0...."
3,もう何言ってるのか分からん状態になってきたな。。側近も止められないのかな・・・「さすがにそん...,"[0.9618334745855203, 0.0004892807042054704, 0...."
4,「耐えなけば...」と言う町長。訳が分からない！住民にこれだけ損害を与えておいて、どのような...,"[0.9618334745855203, 0.0004892807042054704, 0...."


test.shape: (1448, 24)


Unnamed: 0,text,label
0,中国人の健康保険タダ乗り出来なくなるからこれはいいワクチン打ちに行ったら、日本語怪しいアジア...,"[0.3605458931966271, 2.401694192479152e-15, 0...."
1,犯罪国家を国連は殲滅せよ！！,"[3.700261900179377e-05, 0.9987935254549858, 0...."
2,町の責任で、若者の人生狂わせたとか同情の声があるが、自分は遅かれ早かれ犯罪は犯したと思う。人...,"[0.09318441187853412, 0.0004033875637437423, 0..."
3,また悲惨な事故が起こる前に高齢者の免許強制返納させた方がいい運転する資格ない,"[0.3312375860956289, 8.635079984672648e-15, 0...."
4,北を早く滅ぼそう,"[3.700261900179377e-05, 0.9987935254549858, 0...."


# Tokenizer

In [5]:
# ====================================================
# Tokenizer
# ====================================================

tokenizer = AutoTokenizer.from_pretrained(CFG.model)

# Token count (without special tokens) of every text in the train and test sets.
# Each progress bar is created right before it is consumed, so its timing
# covers only its own loop.
tk0 = tqdm(train_df["text"].fillna("").values, total=len(train_df))
text_lengths = [
    len(tokenizer(text, add_special_tokens=False)['input_ids']) for text in tk0
]
tk1 = tqdm(test_df["text"].fillna("").values, total=len(test_df))
text_lengths += [
    len(tokenizer(text, add_special_tokens=False)['input_ids']) for text in tk1
]

CFG.max_len = max(text_lengths) + 2 # CLS + SEP
print(f'Text max(lengths): {max(text_lengths)}')

100%|██████████| 7119/7119 [00:01<00:00, 4247.70it/s]
100%|██████████| 1448/1448 [00:01<00:00, 727.70it/s] 

Text max(lengths): 472





# Dataset

In [6]:
# ====================================================
# Dataset
# ====================================================
# Wrap both DataFrames as Hugging Face datasets under the "train"/"test" split names.
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(split_frame)
        for split_name, split_frame in (("train", train_df), ("test", test_df))
    }
)

def clean_text(text):
    """Normalise a tweet string before tokenisation.

    Removes ASCII spaces, ``@user`` placeholders, full-width spaces,
    non-breaking spaces and carriage returns, turns ``__BR__`` markers into
    newlines, and finally strips leading newlines.
    """
    # Applied strictly in this order: spaces are removed before "@user".
    substitutions = (
        (" ", ""),
        ("@user", ""),
        ("　", ""),
        ("__BR__", "\n"),
        ("\xa0", ""),
        ("\r", ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text.lstrip("\n")

def tokenize(batch):
    """Clean and tokenise the ``"text"`` entries of a batch.

    Texts are passed through ``clean_text`` and then encoded with the
    notebook's tokenizer, with padding and truncation to ``CFG.max_len``.
    """
    cleaned_texts = [clean_text(text) for text in batch["text"]]
    return tokenizer(
        cleaned_texts,
        max_length=CFG.max_len,
        truncation=True,
        padding=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

# batch_size=None passes each split to `tokenize` as a single batch.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Training

In [7]:
# ====================================================
# Training
# ====================================================
model = AutoModelForSequenceClassification.from_pretrained(
    CFG.model, num_labels=CFG.num_classes
)

training_args = TrainingArguments(
    output_dir=CFG.output_dir,
    num_train_epochs=CFG.epochs,
    learning_rate=CFG.lr,
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size,
    weight_decay=CFG.weight_decay,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    evaluation_strategy="epoch",
    lr_scheduler_type=CFG.scheduler_type,
    disable_tqdm=False,
    save_total_limit=CFG.save_total_limit,
    # Log once per epoch. Deriving a step count from len(train) // batch_size
    # is only right for one device without gradient accumulation.
    logging_strategy="epoch",
    push_to_hub=False,
    log_level="error",
    # Mixed precision needs a CUDA device; fall back to full precision without one.
    fp16=torch.cuda.is_available(),
    seed=CFG.seed,
    save_strategy="no",
    report_to="tensorboard"
)

# Note: the test split is used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],
    tokenizer=tokenizer,
)

trainer.train()

Some weights of the model checkpoint at studio-ousia/luke-japanese-large were not used when initializing LukeForSequenceClassification: ['entity_predictions.transform.dense.bias', 'lm_head.dense.weight', 'entity_predictions.transform.LayerNorm.bias', 'lm_head.layer_norm.weight', 'entity_predictions.bias', 'entity_predictions.transform.LayerNorm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'entity_predictions.transform.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing LukeForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LukeForSequenceClassifica

Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,0.4211,0.379707,0.675414,0.633065,0.864727
2,0.3362,0.383698,0.671961,0.622264,0.86523
3,0.2842,0.396729,0.676105,0.648729,0.868334
4,0.2554,0.405629,0.671271,0.64492,0.866171


TrainOutput(global_step=1780, training_loss=0.3240535478913382, metrics={'train_runtime': 1022.6499, 'train_samples_per_second': 27.845, 'train_steps_per_second': 1.741, 'total_flos': 3.0709835913443904e+16, 'train_loss': 0.3240535478913382, 'epoch': 4.0})

In [8]:
output = trainer.predict(dataset_encoded["test"])

# Softmax once and reuse it for both derived columns.
# Note: despite its name, "pred_logits" stores the softmax probabilities.
test_probabilities = softmax(output.predictions)
test_df["pred_logits"] = test_probabilities.tolist()
test_df[f"pred_{CFG.type}"] = test_probabilities.argmax(1)

# With CFG.label_type == "hard" the aggregated hard-label column has already
# been renamed to "label", so fall back to that name instead of a KeyError.
gold_column = f"{CFG.agg_type}_hard_{CFG.type}"
if gold_column not in test_df.columns:
    gold_column = "label"
test_df[["text", gold_column, f"pred_{CFG.type}"]].head()

Unnamed: 0,text,ds_hard_label,pred_label
0,中国人の健康保険タダ乗り出来なくなるからこれはいいワクチン打ちに行ったら、日本語怪しいアジア...,3,0
1,犯罪国家を国連は殲滅せよ！！,1,1
2,町の責任で、若者の人生狂わせたとか同情の声があるが、自分は遅かれ早かれ犯罪は犯したと思う。人...,3,0
3,また悲惨な事故が起こる前に高齢者の免許強制返納させた方がいい運転する資格ない,3,1
4,北を早く滅ぼそう,1,1


In [9]:
%load_ext tensorboard
%tensorboard --logdir output/runs