# Sample Training Script Using the Hugging Face Trainer

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kubotaissei/defamation_japanese_twitter/blob/master/notebooks/train_example.ipynb)

In [1]:
!pip install transformers==4.26 datasets==2.8.0 sentencepiece crowd-kit

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Notebook-wide settings, read as plain class attributes (``CFG.<name>``)."""

    # Setting this to True shortens training to one epoch (see the override
    # directly below the class).
    debug = False

    # --- base ---
    # GPU ids; joined with "," into CUDA_VISIBLE_DEVICES by the import cell.
    gpu_id = ["0"]
    output_dir = "output"
    twitter_bearer_token = ""  # Fill in your Twitter API bearer token

    # --- dataset ---
    # The three values below select the aggregated label column, which is
    # named "<agg_type>_<label_type>_<type>".
    agg_type = "mv"  # one of: mv, ds, gl
    type = "label"  # one of: label, target
    label_type = "hard"  # one of: hard, soft
    # Initial value only; the tokenizer cell overwrites it with the longest
    # tokenized text length plus two special tokens.
    max_len = 497

    # --- model ---
    model = "studio-ousia/luke-japanese-large"
    num_classes = 4

    # --- scheduler ---
    scheduler_type = "cosine"

    # --- train ---
    seed = 777
    lr = 1e-5
    weight_decay = 0.2
    epochs = 4
    gradient_accumulation_steps = 1
    batch_size = 4
    save_total_limit = 0


# Debug runs only need a smoke test, so train for a single epoch.
if CFG.debug:
    CFG.epochs = 1

# Library

In [3]:
# ====================================================
# Library
# ====================================================
import gc
import json
import os
import random
import warnings
warnings.filterwarnings("ignore")

os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(CFG.gpu_id)

import numpy as np
import pandas as pd
import requests
import torch
from tqdm import tqdm
from crowdkit.aggregation import GLAD, DawidSkene, MajorityVote
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


# Helper functions for scoring

In [4]:
# ====================================================
# Helper functions for scoring
# ====================================================
def softmax(x):
    """Return the row-wise softmax of a 2-D array of logits.

    The per-row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (and the division from producing ``nan``) on large logits.

    Args:
        x: Array-like of shape (n_samples, n_classes).

    Returns:
        Array of the same shape whose rows each sum to 1.
    """
    x = np.asarray(x)
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=1, keepdims=True)


def compute_metrics(eval_pred):
    """Compute accuracy, macro F1 and macro one-vs-rest AUC for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            one row of class logits per sample and ``labels`` the integer
            class ids.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"`` and ``"auc"``.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions)
    predicted_classes = np.argmax(predictions, axis=1)

    # Derive the class count from the logits instead of hard-coding 4, so the
    # one-hot targets always have as many columns as the probabilities.
    num_classes = predictions.shape[1]
    one_hot_labels = np.identity(num_classes)[labels]

    acc = accuracy_score(labels, predicted_classes)
    f1 = f1_score(labels, predicted_classes, average="macro")
    auc = roc_auc_score(
        one_hot_labels,
        softmax(predictions),
        multi_class="ovr",
        average="macro",
    )
    return {
        "accuracy": acc,
        "f1": f1,
        "auc": auc,
    }

# Utils

In [5]:
# ====================================================
# Utils
# ====================================================
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

# Seed every RNG with the configured seed before any data or model work runs.
seed_everything(CFG.seed)

# Data Loading

In [6]:
# ====================================================
# Data Loading
# ====================================================

# Adapted from the Twitter API v2 sample code: https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/main/Tweet-Lookup/get_tweets_with_bearer_token.py


def create_url(ids: list):
    """Build the tweet-lookup URL for the given tweet ids.

    Args:
        ids: Tweet ids as strings; they are comma-joined into the ``ids``
            query parameter.

    Returns:
        The request URL, which also asks for the ``created_at`` tweet field.
    """
    query = "&".join((f"ids={','.join(ids)}", "tweet.fields=created_at"))
    return f"https://api.twitter.com/2/tweets?{query}"


def bearer_oauth(r):
    """
    Auth hook for bearer token authentication.

    Sets the Authorization and User-Agent headers on the outgoing request
    ``r`` and returns that same request.
    """
    r.headers.update(
        {
            "Authorization": f"Bearer {CFG.twitter_bearer_token}",
            "User-Agent": "v2TweetLookupPython",
        }
    )
    return r


def connect_to_endpoint(url, timeout=30):
    """GET ``url`` with bearer-token auth and return the decoded JSON body.

    Args:
        url: Fully built request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block forever on a stalled connection.

    Returns:
        The parsed JSON response.

    Raises:
        Exception: If the response status code is not 200; the message
            carries the status code and the response body.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.request("GET", url, auth=bearer_oauth, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()


def get_text_data(examples):
    """Fetch text and creation time for a batch of tweet ids.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch dict whose ``"id"`` entry is a list of tweet ids.

    Returns:
        Dict with ``"text"`` and ``"created_at"`` lists aligned with
        ``examples["id"]``; ids missing from the response map to ``None``.
    """
    tweet_ids = examples["id"]
    json_response = connect_to_endpoint(create_url(tweet_ids))

    # "data" may be absent when none of the requested tweets could be
    # returned (presumably all deleted or private); treat that as an empty
    # result instead of failing with a KeyError.
    tweets_by_id = {tweet["id"]: tweet for tweet in json_response.get("data", [])}

    texts = []
    created_ats = []
    for tweet_id in tweet_ids:
        tweet = tweets_by_id.get(tweet_id)
        texts.append(None if tweet is None else tweet["text"])
        created_ats.append(None if tweet is None else tweet["created_at"])
    return {"text": texts, "created_at": created_ats}

def _has_usable_majority(labels):
    """Return True if at least two annotators agree and the majority is not 0."""
    # With one repeated value among the annotations the median equals the
    # majority value, so ``median != 0`` rejects a majority of 0.
    return bool(len(set(labels)) != len(labels) and np.median(labels) != 0.0)


def _to_long_annotations(df, column, remap):
    """Reshape per-tweet annotation lists into (worker, task, label) rows.

    ``column`` holds one annotation list per tweet; the first three entries
    are used, paired with the matching entries of ``user_id_list``. ``remap``
    is applied to the label values afterwards.
    """
    n_annotators = 3  # the code reads annotator slots 0, 1 and 2 of each list
    annotations = pd.DataFrame.from_dict(
        dict(
            worker=pd.concat(
                [df["user_id_list"].apply(lambda x, i=i: x[i]) for i in range(n_annotators)]
            ),
            task=df["id"].to_list() * n_annotators,
            label=pd.concat(
                [df[column].apply(lambda x, i=i: x[i]) for i in range(n_annotators)]
            ),
        )
    )
    # Assign the result back: an in-place replace on a selected column is a
    # chained assignment and is not guaranteed to modify the frame.
    annotations["label"] = annotations["label"].replace(remap)
    return annotations


def get_dataset(train_path="train.pkl", test_path="test.pkl"):
    """Return the train and test DataFrames, building and caching them if needed.

    If both pickle files exist they are loaded and returned. Otherwise the
    dataset is downloaded, tweet texts are fetched via the Twitter API, the
    crowd annotations are aggregated (majority vote, Dawid-Skene, GLAD; hard
    and soft variants; for both ``target`` and ``label``), the rows are split
    by ``created_at`` and both splits are written to the pickle paths.

    Args:
        train_path: Cache file for the training split.
        test_path: Cache file for the test split.

    Returns:
        Tuple ``(train_df, test_df)``.
    """
    if os.path.exists(train_path) and os.path.exists(test_path):
        # NOTE: unpickling executes arbitrary code; only load cache files
        # that were produced by this notebook.
        return pd.read_pickle(train_path), pd.read_pickle(test_path)

    dataset = load_dataset("kubota/defamation-japanese-twitter")
    dataset = dataset.map(get_text_data, batched=True, batch_size=100)

    # Drop rows with missing values (tweets whose original has been deleted).
    df = dataset["train"].to_pandas().dropna()
    # Exclude tweets where all annotators disagree or where two or more
    # annotators chose C. Copy so the new columns are added to an
    # independent frame rather than to a slice of the original.
    df = df[df["label"].apply(_has_usable_majority)].copy()

    # Reshape for label aggregation.
    d_target = _to_long_annotations(df, "target", {3: 0})
    d_label = _to_long_annotations(df, "label", {4: 0})

    # Aggregate labels separately for the defamation target and type.
    aggregators = (
        ("mv", MajorityVote),
        ("ds", lambda: DawidSkene(n_iter=200)),
        ("gl", lambda: GLAD(n_iter=200)),
    )
    task_ids = df["id"].to_list()
    for kind, annotations in (("target", d_target), ("label", d_label)):
        for name, make_aggregator in aggregators:
            hard = make_aggregator().fit_predict(annotations)
            # The aggregated result is indexed by task id, so align it to the
            # row order of ``df`` explicitly instead of assigning by position.
            df[f"{name}_hard_{kind}"] = hard.reindex(task_ids).to_list()
        for name, make_aggregator in aggregators:
            soft = make_aggregator().fit_predict_proba(annotations)
            df[f"{name}_soft_{kind}"] = soft.reindex(task_ids).to_numpy().tolist()
    display(df.groupby("mv_hard_target").count()["id"])
    display(df.groupby("mv_hard_label").count()["id"])

    # Split into training and test data by creation time.
    train_df = df.query("created_at < '2022-05-21 00:00:00+00:00'").reset_index(drop=True)
    test_df = df.query("created_at > '2022-05-21 00:00:00+00:00'").reset_index(drop=True)
    train_df.to_pickle(train_path)
    test_df.to_pickle(test_path)
    return train_df, test_df

train_df, test_df = get_dataset()

# Replace the raw per-annotator "label" column with the aggregated column
# selected by the configuration, exposing it under the name "label".
selected_label_column = f"{CFG.agg_type}_{CFG.label_type}_{CFG.type}"
label_rename = {selected_label_column: "label"}
train_df = train_df.drop(columns="label").rename(columns=label_rename)
test_df = test_df.drop(columns="label").rename(columns=label_rename)

preview_columns = ["text", "label"]
print(f"train.shape: {train_df.shape}")
display(train_df[preview_columns].head())
print(f"test.shape: {test_df.shape}")
display(test_df[preview_columns].head())

train.shape: (3122, 17)


Unnamed: 0,text,label
0,ジェンダーとかそういうのは関係なく、男の子はこんなもん、女の子だからこういうもん、そういう言...,0
1,@sharenewsjapan1 河村は阿呆か？\n今韓国と仲良くお手てつないでなどできるは...,2
2,低能共がドヤ顔で写真撮る権利なんて要らねえよ。\n糞どもはやく死なねえかな。\n鉄道は静かに...,1
3,ライル：（…消えてしまえ、消えてしまえ消えろ消えろ消えろ消えろっ!!!!マフィアなんて全部、...,1
4,自分が住んでいる都市は街全体がおっきな老人ホームみたいになっていて、老人の住みやすさが優先さ...,3


test.shape: (879, 17)


Unnamed: 0,text,label
0,@ReutersJapan 確かに侵略戦争で民間人を殺害したのは裁かれるべきですが、本来ロシ...,0
1,日本人が「劣化」したワケ…受験勉強で「学歴の高いバカ」が大量生産されている！ 中野信子氏と和...,0
2,静岡県知事がリニアを通さない。これ自体が日本が中国に支配されていて日本を潰すのは施政者など力...,3
3,日本に外国人を住まわせちゃいけない理由\n\n・最早区別は差別と同義\n・別の国から来た人は...,0
4,やられてもやり返すな。同類だぞみたいな日本人特有の文化なに？そんな考えのやつばっかだから日本...,0


# Tokenizer

In [7]:
# ====================================================
# Tokenizer
# ====================================================

tokenizer = AutoTokenizer.from_pretrained(CFG.model)

# Measure the token count (without special tokens) of every train and test
# text so that max_len can be set to cover the longest one.
tk0 = tqdm(train_df["text"].fillna("").values, total=len(train_df))
tk1 = tqdm(test_df["text"].fillna("").values, total=len(test_df))
text_lengths = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for progress_bar in (tk0, tk1)
    for text in progress_bar
]

longest_text = max(text_lengths)
CFG.max_len = longest_text + 2 # CLS + SEP
print(f'Text max(lengths): {longest_text}')

100%|██████████| 3122/3122 [00:00<00:00, 4245.52it/s]
100%|██████████| 879/879 [00:00<00:00, 959.52it/s]

Text max(lengths): 495





# Dataset

In [8]:
# ====================================================
# Dataset
# ====================================================
# Wrap both DataFrames as Hugging Face datasets under the split names
# "train" and "test".
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(split_frame)
        for split_name, split_frame in (("train", train_df), ("test", test_df))
    }
)

def clean_text(text):
    """Normalise a tweet text before tokenisation.

    Applies the substitutions below one after another (the order matters,
    since an earlier removal can create a later match), then strips leading
    newlines.
    """
    substitutions = (
        (" ", ""),
        ("@user", ""),
        ("　", ""),
        ("__BR__", "\n"),
        ("\xa0", ""),
        ("\r", ""),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.lstrip("\n")

def tokenize(batch):
    """Clean and tokenize the ``"text"`` entries of a batch.

    Texts are passed through ``clean_text`` and then encoded with the
    module-level ``tokenizer``: padded to the longest sequence in the batch,
    truncated to ``CFG.max_len`` and returned as PyTorch tensors.
    """
    cleaned_texts = [clean_text(raw_text) for raw_text in batch["text"]]
    encode_options = dict(
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=CFG.max_len,
        return_tensors="pt",
    )
    return tokenizer(cleaned_texts, **encode_options)

# batch_size=None hands each split to tokenize() as one single batch, so the
# padding=True inside tokenize() pads every example of a split to that
# split's longest sequence.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Training

In [9]:
# ====================================================
# Training
# ====================================================
model = AutoModelForSequenceClassification.from_pretrained(
    CFG.model, num_labels=CFG.num_classes
)

# Log roughly once per epoch. logging_steps counts optimizer steps, so the
# number of examples is divided by the effective batch size (per-device batch
# size times gradient accumulation), and clamped to at least 1 so that a tiny
# (e.g. debug) dataset never yields an interval of 0.
# NOTE(review): this does not account for multiple GPUs — confirm if training
# on more than one device.
steps_per_epoch = max(
    1,
    len(dataset_encoded["train"])
    // (CFG.batch_size * CFG.gradient_accumulation_steps),
)

training_args = TrainingArguments(
    output_dir=CFG.output_dir,
    num_train_epochs=CFG.epochs,
    learning_rate=CFG.lr,
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size,
    weight_decay=CFG.weight_decay,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    evaluation_strategy="epoch",
    lr_scheduler_type=CFG.scheduler_type,
    disable_tqdm=False,
    save_total_limit=CFG.save_total_limit,
    logging_steps=steps_per_epoch,
    push_to_hub=False,
    log_level="error",
    fp16=True,
    seed=CFG.seed,
    save_strategy="no",  # no checkpoints are written during training
    report_to="tensorboard"
)

# The test split is passed as eval_dataset, so the metrics reported after
# each epoch are computed on the test data.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],
    tokenizer=tokenizer,
)

trainer.train()

Some weights of the model checkpoint at studio-ousia/luke-japanese-large were not used when initializing LukeForSequenceClassification: ['lm_head.layer_norm.bias', 'entity_predictions.transform.LayerNorm.weight', 'entity_predictions.transform.LayerNorm.bias', 'lm_head.dense.weight', 'entity_predictions.transform.dense.bias', 'entity_predictions.bias', 'entity_predictions.transform.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing LukeForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LukeForSequenceClassifica

Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,1.0178,0.849494,0.641638,0.596703,0.836809
2,0.697,0.974318,0.648464,0.607049,0.827209
3,0.4985,1.033791,0.642776,0.607933,0.833853
4,0.3729,1.084045,0.650739,0.615489,0.831961


TrainOutput(global_step=784, training_loss=0.6448173109365969, metrics={'train_runtime': 319.8105, 'train_samples_per_second': 39.048, 'train_steps_per_second': 2.451, 'total_flos': 9120488340374208.0, 'train_loss': 0.6448173109365969, 'epoch': 4.0})