In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import pandas as pd
import lightning as L

from tqdm.auto import tqdm
from lightning.pytorch.callbacks import ModelCheckpoint

from src.model.modeling_char_encoder import LitCharEncoder
from src.model.modeling_bind import LitBIND
from src.data.dataset import get_train_dataloader, get_dev_dataloader, get_test_dataloader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SEED=42
DATASET_NAME = 'jwengr/ToxiBenchCN'
MINI_BATCH_SIZE=16
N_BATCH = 2
BASE_MODEL_NAME='Qwen/Qwen3-0.6B-Base'
EPOCHS=10
LEARNING_RATE = 1e-4
USE_BNTD=True
TRAIN_MAX_LENGTH=128
VALID_MAX_LENGTH=128
INFERENCE_SENTENCE_MAX_LENGTH=64
INFERENCE_SENTENCE_MIN_LENGTH=32
INFERENCE_SENTENCE_N_OVERLAP=3

L.seed_everything(SEED)

train_dl = get_train_dataloader(DATASET_NAME, batch_size=MINI_BATCH_SIZE, max_length=TRAIN_MAX_LENGTH)
dev_dl = get_dev_dataloader(DATASET_NAME, batch_size=MINI_BATCH_SIZE, max_length=VALID_MAX_LENGTH)
test_dl = get_test_dataloader(DATASET_NAME, batch_size=MINI_BATCH_SIZE)

Seed set to 42


In [4]:
lit_bind = LitBIND.load_from_checkpoint(
    'checkpoints/bind/ToxiBenchCN-Qwen3-0.6B-Base-addbce-focalloss-epoch=01-valid_loss=0.2061.ckpt',
    base_model_name=BASE_MODEL_NAME,
    lr=LEARNING_RATE,
    epochs=EPOCHS,
    use_bntd=USE_BNTD,
    inference_sentence_max_length=INFERENCE_SENTENCE_MAX_LENGTH,
    inference_sentence_min_length=INFERENCE_SENTENCE_MIN_LENGTH,
    inference_sentence_n_overlap=INFERENCE_SENTENCE_N_OVERLAP,
)

use full attn qwen3


In [5]:
trainer = L.Trainer()

Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [6]:
preds = trainer.predict(lit_bind, test_dl)

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 96/96 [00:37<00:00,  2.54it/s]


In [7]:
prediction = []

for pred in tqdm(preds):
    prediction.extend(pred)

100%|██████████| 96/96 [00:00<00:00, 974947.18it/s]


In [8]:
categories = []
inputs = []
true = []
for batch in test_dl:
    true.extend(batch['sentence'])
    inputs.extend(batch['sentence_noisy'])
    categories.extend(batch['category'])

In [9]:
result_df = pd.DataFrame()
result_df['input'] = inputs
result_df['pred'] = prediction
result_df['true'] = true
result_df['category'] = categories

In [10]:
from src.metrics.metric import calculate_metric

In [11]:
for cat in set(result_df['category']):
    cat_df = result_df[result_df['category']==cat].copy()
    result, result_list = calculate_metric(cat_df['input'].tolist(), cat_df['true'].tolist(), cat_df['pred'].tolist())
    print(cat, result)

radical {'S_D_p': 54.251, 'S_D_r': 55.602, 'S_D_f1': 54.918, 'S_C_p': 25.506, 'S_C_r': 26.141, 'S_C_f1': 25.82, 'C_D_p': 90.431, 'C_D_r': 89.467, 'C_D_f1': 89.946, 'C_C_p': 63.876, 'C_C_r': 63.195, 'C_C_f1': 63.534}
homo {'S_D_p': 13.278, 'S_D_r': 12.8, 'S_D_f1': 13.035, 'S_C_p': 8.299, 'S_C_r': 8.0, 'S_C_f1': 8.147, 'C_D_p': 83.269, 'C_D_r': 57.394, 'C_D_f1': 67.952, 'C_C_p': 60.92, 'C_C_r': 41.989, 'C_C_f1': 49.713}
pinyin {'S_D_p': 81.933, 'S_D_r': 81.25, 'S_D_f1': 81.59, 'S_C_p': 7.143, 'S_C_r': 7.083, 'S_C_f1': 7.113, 'C_D_p': 95.769, 'C_D_r': 97.724, 'C_D_f1': 96.737, 'C_C_p': 38.385, 'C_C_r': 39.168, 'C_C_f1': 38.773}
swap {'S_D_p': 5.907, 'S_D_r': 5.578, 'S_D_f1': 5.738, 'S_C_p': 4.219, 'S_C_r': 3.984, 'S_C_f1': 4.098, 'C_D_p': 62.236, 'C_D_r': 40.671, 'C_D_f1': 49.194, 'C_C_p': 42.296, 'C_C_r': 27.641, 'C_C_f1': 33.433}
trad {'S_D_p': 66.82, 'S_D_r': 82.386, 'S_D_f1': 73.791, 'S_C_p': 62.212, 'S_C_r': 76.705, 'S_C_f1': 68.702, 'C_D_p': 77.053, 'C_D_r': 99.728, 'C_D_f1': 86.936

In [10]:
tokenizer.base_tokenizer.encode('🌧️', add_special_tokens=False)[:1]

[100]

In [14]:
list('🌧️')[0]

'🌧'

In [21]:
res[1].shape

torch.Size([32, 30])

In [22]:
res[2].shape

torch.Size([32, 30])

In [17]:
for batch in train_dl:
    sentence_len = tokenizer.batch_encode_char(batch['sentence'])[0].shape[1]
    sentence_noisy_len = tokenizer.batch_encode_char(batch['sentence_noisy'])[0].shape[1]
    if sentence_len!=sentence_noisy_len:
        print(batch)

{'sentence_noisy': ['网络由于其匿名性🉑以🔍人的恶念🙂'], 'sentence': ['网络由于其匿名性可以放大人的恶念🙂']}
{'sentence_noisy': ['看了徐静🌧️的视频了吧，他说的是五千詹姆斯，你整1️⃣万，是怕蜀国输的👋惨败的不快🎱'], 'sentence': ['看了徐静雨的视频了吧，他说的是五千詹姆斯，你整一万，是怕蜀国输的不惨败的不快吧']}
{'sentence_noisy': ['理想状态下👧找过黑人👨朋友，没人接盘…现实🀄，几千万光棍排着队接盘呢，抬回家跪舔…就算🉐了艾滋也🈶艾滋群体里大量👨性跪👅根本原因在于，男性内卷，👧有恃无恐'], 'sentence': ['理想状态下女生找过黑人男朋友，没人接盘…现实中，几千万光棍排着队接盘呢，抬回家跪舔…就算得了艾滋也有艾滋群体里大量男性跪舔根本原因在于，男性内卷，女生有恃无恐']}
{'sentence_noisy': ['说明1️⃣⬇，我只关爱自己身边的👩性，比如👴婆女儿等，别的👩关👴🐦🔟。v'], 'sentence': ['说明一下，我只关爱自己身边的女性，比如老婆女儿等，别的女人关我鸟事。v']}
{'sentence_noisy': ['证件都没🈶，抓了也没用，你能怎么样¿动🖐️？还👋是又接触到了法律，好好动动脑子。3️⃣非没有触犯法律¿亏你口口声声法律。关起来可以🎱¿让他🚪在外面犯罪¿动动你猪脑子。不想遣返才是真正的原因。'], 'sentence': ['证件都没有，抓了也没用，你能怎么样？动手？还不是又接触到了法律，好好动动脑子。三非没有触犯法律？亏你口口声声法律。关起来可以吧？让他们在外面犯罪？动动你猪脑子。不想遣返才是真正的原因。']}
{'sentence_noisy': ['👴看了几个屈机头条的视频，非常的喜欢，里面的几位博士学者学识渊博，风趣幽默，对国家一片赤诚。非常的令人崇敬！可怜我只能在微博⬆关注搬运工，还得眼疾🖐️快的⬇载，不能让渣浪抢了先。'], 'sentence': ['我看了几个屈机头条的视频，非常的喜欢，里面的几位博士学者学识渊博，风趣幽默，对国家一片赤诚。非常的令人崇敬！可怜我只能在微博上关注搬运工，还得眼疾手快的下载，不能让渣浪抢了先。']}
{'sentence_noisy': ['肩上的脚丫🎱的👨同性恋就是脑子长

In [13]:
tokenizer.batch_encode_char(batch['sentence'])[0].shape

torch.Size([16, 514])

In [9]:
len(tokenizer.encode_char('太可怕了，永远反黑👍️')[0])

46

In [10]:
len(tokenizer.encode_char('太🉑怕了，永远🍚黑👍️')[0])

46

In [5]:
from src.tokenizer.modeling_tokenizer import BonitaTokenizer

In [6]:
tokenizer = BonitaTokenizer(BASE_MODEL_NAME)

In [7]:
encoded_ids, label_ids = tokenizer.encode('안녋하세욟', '안녕하세요')

[126246, 56940, 56940, 56940, 74165, 233, 56940, 56940, 16186, 56940, 56940, 56940, 41429, 56940, 56940, 56940, 14922, 253, 56940, 56940]
[126246, 56940, 56940, 56940, 144370, 56940, 56940, 56940, 16186, 56940, 56940, 56940, 41429, 56940, 56940, 56940, 35711, 56940, 56940, 56940]


In [8]:
len(encoded_ids), len(label_ids)

(49, 49)

In [7]:
sentence_ids = [encoded_id for encoded_id, label_id in zip(encoded_ids, label_ids) if label_id!=-100]

In [9]:
tokenizer.decode(encoded_ids, label_ids)

'안녕하세요'

In [33]:
char_start, char_end = 0xAC00, 0xD7A3  # 가-힣
kor_chars = list(set([chr(code) for code in range(char_start, char_end + 1)]))
char_ids = []
for kor_char in kor_chars:
    ids = tokenizer.encode(kor_char, add_special_tokens=False)
    print(ids)


[1]
[2022]
[1]
[1]
[1]
[3313]
[1]
[1]
[1]
[1]
[1]
[1]
[3865]
[1]
[1]
[1]
[1]
[1]
[2990]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[2205]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[2299]
[1]
[2835]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[3697]
[1]
[1]
[1]
[1]
[1]
[2422]
[1]
[1]
[2128]
[1]
[1]
[1]
[3491]
[1]
[1]
[1]
[1]
[2531]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[2742]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[3198]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[2900]
[1]
[1]
[2170]
[1]
[1]
[1]
[1]
[1]
[1]
[3010]
[2206]
[1]
[3728]
[1]
[1]
[1]
[1]
[1]
[1]
[2140]
[1]
[1]
[2695]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[2689]
[1]
[1]
[1]
[1]
[1]
[1]
[3742]
[1]
[2301]
[1]
[1]
[1]
[1]
[2544]
[1]
[1]
[1]
[1]
[2727]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[3686]
[1]
[1]
[1]
[1]
[1]
[2706]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[2174]
[1]
[1]
[3488]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[3690]
[3518]
[2042]
[1]
[2716]
[1]
[1]
[1

In [26]:
chos = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
joongs = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']
jongs = [tokenizer.pad_token, 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
jamos = list(set(chos) | set(joongs) | set(jongs))
jamo_ids = tokenizer(jamos, add_special_tokens=False)['input_ids']
jamo_to_id = {jamo: jamo_id[-1] for jamo, jamo_id in zip(jamos, jamo_ids)}
cho_ids = [jamo_to_id[cho] for cho in chos]
joong_ids = [jamo_to_id[joong] for joong in joongs]
jong_ids = [jamo_to_id[jong] for jong in jongs]
id_to_jamo = {jamo_id: jamo for jamo, jamo_id in jamo_to_id.items()}

In [27]:
id_to_jamo

{3: 'ㄺ',
 184: 'ㄷ',
 195: 'ㅜ',
 190: 'ㅈ',
 196: 'ㅠ',
 197: 'ㅡ',
 198: 'ㅣ',
 183: 'ㄴ',
 194: 'ㅏ',
 1: '[PAD]',
 189: 'ㅇ',
 182: 'ㄱ',
 188: 'ㅅ',
 191: 'ㅋ',
 186: 'ㅁ',
 192: 'ㅌ',
 185: 'ㄹ',
 187: 'ㅂ',
 193: 'ㅎ'}

In [24]:
max_len = -1
for hanzi_char in hanzi_chars:
    ids = tokenizer.encode(hanzi_char, add_special_tokens=False)
    if len(ids)>max_len:
        max_len=len(ids)

In [26]:
len(hanzi_chars)

88464

In [13]:
loss ,logits, pred_ids, sentence_denoised = krop.forward(batch['sentence'], batch['sentence_noisy'], task='char', pred=True)

In [12]:
batch

{'sentence': ['퓨갸 녀뮤 좋야용! 짤 뇰코 캅님따!!',
  '쳬갑 친냔 1월인갼? 2월춤웨 캅봤는톄, 67쯩엣써 슉쏢룰 잡앗셔오. 껴씰톡 클렇교 팡됴 엶쩡 넓꾜, 씹섧또 깩꿋햐교 부까 진쨔 끝넥줍뉘댜. 굵립꼬 억퀵갸 윌폰 슥쉼맞섦뭬셧 낢싣갸 좋우먼 뵤인닳꼬 하눼옴. 윕엑 100쭝엣 쓰탸뻑습또 졍맑 좋교, 헤욀옇행 깐눈 겄 까뜬 눅뀌뮈 뜨렁써오. 큰떼 쌀착 았쒸운 겡 옐씨튐 뮈떼 샹갸뚤뤼 꺼읠 공쉬릴예옷ㅠㅠ 굻련뎨 웠텨퍅끄됴 엶청 꾸케 잊덞람꾜욘. 삯윤났동 잇코, 엎웨 햇윤때 헷쑵욕쟝또 잇썽섟 청맒 좋야써옷. 굻릴교 웰싶틴옛셔 푿싼 룟뎨얼튼 뇰익귁쿡카 뵤잎뎌라곰요.'],
 'sentence_noisy': ['뷰가 너무 좋아요! 잘 놀고 갑니다!!',
  '제가 지난 1월인가? 2월쯤에 가봤는데, 67층에서 숙소를 잡았어요. 거실도 그렇고 방도 엄청 넓고, 시설도 깨끗하고 뷰가 진짜 끝내줍니다. 그리고 여기가 일본 스시마섬에서 날씨가 좋으면 보인다고 하네요. 위에 100층에 스타벅스도 정말 좋고, 해외여행 가는 것 같은 느낌이 들었어요. 근데 살짝 아쉬운 게 엘시티 밑에 상가들이 거의 공실이에요ㅠㅠ 그런데 워터파크도 엄청 크게 있더라고요. 사우나도 있고, 옆에 해운대 해수욕장도 있어서 정말 좋았어요. 그리고 엘시티에서 부산 롯데월드 놀이기구가 보이더라고요.']}

In [14]:
sentence_denoised

['됸뛧!쵧칧!튋녈쪉!!햋!맀녤!볍튥칟!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!',
 '뜃껻!뻘쨄!!쾗츻킥!!!뭻긿렷!눻륄뗐겴!!!!믾턬궐!웄셓쫘!턻츙퉑싶!!우촢럿!껬꿏엜!규룗!뵨뜅!뵙웄!!뜗섗혖!뵗햬맹웑!쨬탻!뙑섓!턓늣큄쾄룓!!칙퉗톺!퓽밨룗!윱섓!쨣옼퍙높뉍셤!벘풗퉑!뗑뜓켇!퍣펮꽓껮!줫팙뜙!!좔쉡!!!!뵧헭!먭퉧륤툓꽣!푯짊!닓턙!!쯩븡덣뵣!껙좇!뼣!킷턤!헭뙙툣!큒섓궤케!!떙슴!낔젋!뗤떭뗙!빣!앍뭃옫!턤껣!덙뜄렝뵣!끣먤!땣즓꾒섙녜!!!떙끭뺤!쌜뼓엾왜뙓!씐흴!즧쉥!띭뭙듷럭펏!!턮뚀감꽣!휈홂!!뼙꾤!섪뵓퓁!셓젒먈팣썻!뜻왙뗤!튀팓!엷퍒쉥뙧!!툘꾤샙!맞먮떓퉑뭤!띷끮!뭗큫낑옄!짫덒툈쨪숵!눪뮘팣뗀칣뚓!']