In [1]:
import os

from dotenv import load_dotenv

# Select the GPU through environment variables before the CUDA-aware
# libraries are imported in the following cells.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # see issue #152
        "CUDA_VISIBLE_DEVICES": "2",
    }
)

# Kept as the cell's last expression so its return value is displayed.
load_dotenv()

True

In [2]:
import pandas as pd
import lightning as L

from tqdm.auto import tqdm
from lightning.pytorch.callbacks import ModelCheckpoint

from src.model.modeling_char_encoder import LitCharEncoder
from src.data.dataset import get_train_dataloader, get_dev_dataloader, get_test_dataloader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Reproducibility ---
SEED = 42

# --- Dataset and special tokens ---
DATASET_NAME = 'jwengr/ToxiBenchCN'
SPACE_TOKEN = '[SEP]'
UNK_TOKEN = '[UNK]'
PAD_TOKEN = '[PAD]'

# --- Model and optimisation ---
BASE_MODEL_NAME = 'google-bert/bert-base-chinese'
EPOCHS = 10
LEARNING_RATE = 5e-5
MINI_BATCH_SIZE = 32  # passed as batch_size to every dataloader
N_BATCH = 1           # passed to the Trainer as accumulate_grad_batches

# --- Sequence lengths ---
TRAIN_MAX_LENGTH = 128
VALID_MAX_LENGTH = 128
INFERENCE_SENTENCE_MAX_LENGTH = 64
INFERENCE_SENTENCE_MIN_LENGTH = 32
INFERENCE_SENTENCE_N_OVERLAP = 3

# Seed once, up front; left as the last expression so the seed is echoed.
L.seed_everything(SEED)

Seed set to 42


42

In [4]:
# Train and dev loaders receive an explicit max_length; the test loader is
# built with the dataset name and batch size only.
train_dl = get_train_dataloader(
    DATASET_NAME,
    batch_size=MINI_BATCH_SIZE,
    max_length=TRAIN_MAX_LENGTH,
)
dev_dl = get_dev_dataloader(
    DATASET_NAME,
    batch_size=MINI_BATCH_SIZE,
    max_length=VALID_MAX_LENGTH,
)
test_dl = get_test_dataloader(
    DATASET_NAME,
    batch_size=MINI_BATCH_SIZE,
)

In [None]:
# Build the LitCharEncoder from the notebook-level constants.
encoder_kwargs = dict(
    base_model_name=BASE_MODEL_NAME,
    space_token=SPACE_TOKEN,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    lr=LEARNING_RATE,
    epochs=EPOCHS,
    inference_sentence_max_length=INFERENCE_SENTENCE_MAX_LENGTH,
    inference_sentence_min_length=INFERENCE_SENTENCE_MIN_LENGTH,
    inference_sentence_n_overlap=INFERENCE_SENTENCE_N_OVERLAP,
)
lit_char_encoder = LitCharEncoder(**encoder_kwargs)

# Checkpoints are written to "<dataset>/<base model>-..." under dirpath.
# The "{epoch...}" and "{valid_loss...}" parts are a plain (non-f) string on
# purpose: they stay as placeholders for the checkpoint callback to fill in.
dataset_tag = DATASET_NAME.split('/')[1]
base_model_tag = BASE_MODEL_NAME.split('/')[1]
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints/charencoder',
    filename=f"{dataset_tag}/{base_model_tag}" + "-{epoch:02d}-{valid_loss:.4f}",
    every_n_epochs=1,
    save_top_k=-1,
)

trainer = L.Trainer(
    max_epochs=EPOCHS,
    accumulate_grad_batches=N_BATCH,
    enable_checkpointing=True,
    callbacks=[checkpoint_callback],
    # precision='bf16',
)

trainer.fit(
    model=lit_char_encoder,
    train_dataloaders=train_dl,
    val_dataloaders=dev_dl,
)

In [6]:
# Restore a saved checkpoint, passing the same constructor settings as the
# training cell; this rebinds `lit_char_encoder`.
checkpoint_path = 'checkpoints/charencoder/ToxiBenchCN/bert-base-chinese-epoch=00-valid_loss=1.1117.ckpt'
restore_kwargs = {
    'base_model_name': BASE_MODEL_NAME,
    'space_token': SPACE_TOKEN,
    'unk_token': UNK_TOKEN,
    'pad_token': PAD_TOKEN,
    'lr': LEARNING_RATE,
    'epochs': EPOCHS,
    'inference_sentence_max_length': INFERENCE_SENTENCE_MAX_LENGTH,
    'inference_sentence_min_length': INFERENCE_SENTENCE_MIN_LENGTH,
    'inference_sentence_n_overlap': INFERENCE_SENTENCE_N_OVERLAP,
}
lit_char_encoder = LitCharEncoder.load_from_checkpoint(checkpoint_path, **restore_kwargs)

Some weights of the model checkpoint at google-bert/bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Fresh Trainer with default arguments, used for the prediction step below;
# this rebinds the `trainer` name created in the training cell.
trainer = L.Trainer()

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [8]:
# Run inference over the test loader with the restored model.
preds = trainer.predict(
    model=lit_char_encoder,
    dataloaders=test_dl,
)

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2]
/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 48/48 [00:08<00:00,  5.62it/s]


In [9]:
# Flatten the per-batch outputs of trainer.predict into one flat list.
prediction = [item for batch_preds in tqdm(preds) for item in batch_preds]

100%|██████████| 48/48 [00:00<00:00, 257450.88it/s]


In [10]:
# One pass over the test loader, accumulating the 'sentence',
# 'sentence_noisy' and 'category' fields of every batch.
true, inputs, categories = [], [], []
for test_batch in test_dl:
    true += test_batch['sentence']
    inputs += test_batch['sentence_noisy']
    categories += test_batch['category']

In [11]:
# One row per test example; columns keep the order input / pred / true / category.
result_df = pd.DataFrame(
    {
        'input': inputs,
        'pred': prediction,
        'true': true,
        'category': categories,
    }
)

In [17]:
# Rows where the prediction and the reference differ in length.
length_mismatch = result_df['pred'].map(len) != result_df['true'].map(len)
result_df.loc[length_mismatch]

Unnamed: 0,input,pred,true,category
22,💦🌧️黑子出来答话,💦🌧️黑子出来就话,地域黑子出来答话,emo
79,正什么歪成什么样子了都是群🈵👄🔪🉐仁1️⃣的🐶在搞笑,正什么歪成什么样子了都是群群👄不不仁1️⃣的人在搞笑,正什么歪成什么样子了都是群满嘴道德仁义的狗在搞笑,emo
88,爱家乡不分男女，可📉🀂人是为了黑2️⃣生😅,爱家乡不分男女，可以女人是为了黑2️⃣生活,爱家乡不分男女，可江西人是为了黑而生😅,emo
97,阿🌶️🌊都不要黑人,阿🌶️🌊都不要黑人,阿拉伯都不要黑人,emo
99,这男的也是个👅🐶，这种情况⬇还不⬇💀🖐️起诉法院,这男的也是个坏佬，这种情况下还不会双🖐️起诉法院,这男的也是个舔狗，这种情况下还不下死手起诉法院,emo
111,异性恋的恋童🍺应该⬇💦🌧️嗷,异性恋的恋童恋应该抵应🌧️嗷,异性恋的恋童癖应该下地狱嗷,emo
140,异性恋的恋童🍺应该⬇💦🌧️嗷,异性恋的恋童恋应该抵应🌧️嗷,异性恋的恋童癖应该下地狱嗷,emo
162,所以说嘛他在哪个吧都是这个13味2️⃣🐴他就对了,所以说嘛他在哪个吧都是这个13味2️⃣🐴他就对了,所以说嘛他在哪个吧都是这个13味儿骂他就对了,emo
202,你✌️die们在呼唤你,你✌️die恶在呼唤你,你野die们在呼唤你,emo
254,我7️⃣🔟有🀄🦶7️⃣🔟的人和黑人🐶🐶,我7️⃣北有男族歧视的人和黑人🐶🐶,我歧视有种族歧视的人和黑人🐶🐶,emo


In [25]:
# Report the metrics separately for each category.
# NOTE(review): `calculate_metric` is not imported in any cell visible here;
# import it in the imports cell so this runs on a fresh kernel.
# groupby yields the categories in sorted order, which keeps the report order
# reproducible (iterating a set of strings is not stable across interpreter
# runs) and avoids re-filtering and copying the frame for every category.
for cat, cat_df in result_df.groupby('category'):
    result, result_list = calculate_metric(
        cat_df['input'].tolist(),
        cat_df['true'].tolist(),
        cat_df['pred'].tolist(),
    )
    print(cat, result)

trad {'S_D_p': 62.376, 'S_D_r': 71.591, 'S_D_f1': 66.667, 'S_C_p': 51.485, 'S_C_r': 59.091, 'S_C_f1': 55.026, 'C_D_p': 75.309, 'C_D_r': 99.728, 'C_D_f1': 85.815, 'C_C_p': 68.519, 'C_C_r': 90.736, 'C_C_f1': 78.078}
emo {'S_D_p': 53.815, 'S_D_r': 52.344, 'S_D_f1': 53.069, 'S_C_p': 8.835, 'S_C_r': 8.594, 'S_C_f1': 8.713, 'C_D_p': 87.037, 'C_D_r': 86.823, 'C_D_f1': 86.93, 'C_C_p': 24.938, 'C_C_r': 24.877, 'C_C_f1': 24.907}
homo {'S_D_p': 19.512, 'S_D_r': 19.2, 'S_D_f1': 19.355, 'S_C_p': 5.285, 'S_C_r': 5.2, 'S_C_f1': 5.242, 'C_D_p': 87.149, 'C_D_r': 71.039, 'C_D_f1': 78.274, 'C_C_p': 31.965, 'C_C_r': 26.056, 'C_C_f1': 28.71}
swap {'S_D_p': 5.761, 'S_D_r': 5.578, 'S_D_f1': 5.668, 'S_C_p': 2.469, 'S_C_r': 2.39, 'S_C_f1': 2.429, 'C_D_p': 69.578, 'C_D_r': 47.187, 'C_D_f1': 56.236, 'C_C_p': 31.295, 'C_C_r': 21.224, 'C_C_f1': 25.294}
radical {'S_D_p': 40.0, 'S_D_r': 40.664, 'S_D_f1': 40.329, 'S_C_p': 11.02, 'S_C_r': 11.203, 'S_C_f1': 11.111, 'C_D_p': 84.559, 'C_D_r': 81.657, 'C_D_f1': 83.083, 'C

In [11]:
# NOTE(review): `ids` is not defined in any cell visible here and this cell's
# execution count is out of sequence; it looks like leftover scratch state.
# Confirm it is still needed, or it will fail on Restart & Run All.
len(ids)

30

In [21]:
# Sample sentence (contains emoji and a space) for stepping through the
# tokenizer's per-character encoding by hand in the next cell.
sentence = '你应该可怜那些💦🌧️7️⃣🔟的人 他们就是活在网络上的㊗🐛'
# Bind the tokenizer to the name `self` so the next cell can use `self.*`
# attribute access directly, as if it were running inside a tokenizer method.
self = lit_char_encoder.encoder.tokenizer

In [27]:
# Manual walk-through of the per-grapheme encoding, printing the running
# number of ids after every grapheme.
# The space token is substituted inside the loop (not via str.replace on the
# whole sentence up front) so it is processed as a single unit.
# NOTE(review): `graphemes` is not imported in any cell visible here.
encoded_ids = []
token_type_ids = []
for char in graphemes(sentence):
    if char == ' ':
        char = self.space_token
    if self.target_chars_dict:
        # target_chars_dict is looked up by code point. ord() only accepts a
        # one-code-point string, but a grapheme (emoji + variation selector,
        # keycap sequence) or the substituted space token can be longer; such
        # units are marked 0 here instead of raising TypeError.
        # NOTE(review): confirm 0 is the intended type for multi-code-point units.
        in_targets = len(char) == 1 and bool(self.target_chars_dict.get(ord(char)))
        token_type_ids.append(1 if in_targets else 0)
    else:
        # Without a target dictionary every position gets type 1.
        token_type_ids.append(1)
    # Keep at most the first id so each grapheme contributes one position.
    encoded_id = self.base_tokenizer.encode(char, add_special_tokens=False)[:1]
    if not encoded_id:
        # Nothing came back from the base tokenizer: fall back to the unknown-token id.
        encoded_id = [self.unk_token_id]
    encoded_ids.extend(encoded_id)
    print(char, len(encoded_ids))


你 1
应 2
该 3
可 4
怜 5
那 6
些 7
💦 8
🌧️ 9
7️⃣ 10
🔟 11
的 12
人 13
[SEP] 14
他 15
们 16
就 17
是 18
活 19
在 20
网 21
络 22
上 23
的 24
㊗ 25
🐛 26
