In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
import datasets
import lightning as L

from lightning.pytorch.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader

from src.model.modeling_bonita import LitBonita

In [5]:
def train_valid_test_split(
    ds,
    split_ratio,
    seed: int,
):
    ds_train_test = ds.train_test_split(
        test_size=split_ratio[2] / sum(split_ratio), seed=seed
    )
    ds_train_valid = ds_train_test["train"].train_test_split(
        test_size=split_ratio[1] / (split_ratio[1] + split_ratio[0]), seed=seed
    )
    ds_train = ds_train_valid["train"]
    ds_valid = ds_train_valid["test"]
    ds_test = ds_train_test["test"]
    return ds_train, ds_valid, ds_test

In [6]:
SEED=42
DATASET_NAME = 'AutoML/bitabuse'
MINI_BATCH_SIZE=2
N_BATCH = 16
BASE_MODEL_NAME='Qwen/Qwen3-0.6B-Base'
EPOCHS=20
LEARNING_RATE = 5e-5
USE_BNTD=True
MAX_LENGTH=32
SPLITS = (1, 20, 79)
INFERENCE_SENTENCE_MAX_LENGTH=256
INFERENCE_SENTENCE_MIN_LENGTH=128
INFERENCE_SENTENCE_N_OVERLAP=1

In [7]:
L.seed_everything(SEED)

Seed set to 42


42

In [8]:
ds = datasets.load_dataset(DATASET_NAME, split="train")
def preprocessing(example):
    example['sentence_noisy'] = example['text'][:MAX_LENGTH]
    example['sentence'] = example['label'][:MAX_LENGTH]
    return example
ds = ds.map(preprocessing)
ds_train, ds_valid, ds_test = train_valid_test_split(
    ds, SPLITS, SEED
)


In [8]:
ds_valid = ds_valid.select(range(len(ds_train)))

In [9]:
dl_train, dl_valid, dl_test = DataLoader(ds_train, batch_size=MINI_BATCH_SIZE), DataLoader(ds_valid, batch_size=MINI_BATCH_SIZE), DataLoader(ds_test, batch_size=1)

In [9]:
lit_bind = LitBonita(
    base_model_name=BASE_MODEL_NAME,
    lr=LEARNING_RATE,
    epochs=EPOCHS,
    use_bntd=USE_BNTD,
    inference_sentence_max_length=INFERENCE_SENTENCE_MAX_LENGTH,
    inference_sentence_min_length=INFERENCE_SENTENCE_MIN_LENGTH,
    inference_sentence_n_overlap=INFERENCE_SENTENCE_N_OVERLAP
)

In [10]:
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints/bonita',
    filename=f"{DATASET_NAME.split('/')[1]}/{BASE_MODEL_NAME.split('/')[1]}"+"-{epoch:02d}-{valid_loss:.4f}",
    every_n_epochs=1,
    save_top_k=-1,
)

In [11]:
trainer = L.Trainer(
    callbacks=[checkpoint_callback],
    precision='16-mixed',
    max_epochs=EPOCHS,
    enable_checkpointing=True,
    accumulate_grad_batches=N_BATCH
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(lit_bind, dl_train, dl_valid)

You are using a CUDA device ('NVIDIA GeForce RTX 5070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
c:\Users\jinwo\.virtualenvs\KROP-L3im0CPD\Lib\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:658: Checkpoint directory C:\Users\jinwo\Documents\KROP\checkpoints\bonita exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type   | Params | Mode 
------------------------------------------
0 | bonita | Bonita | 596 M  | train
------------------------------------------
596 M     Trainable params
0         Non-trainable params
596 M     Total params
2,384.200 Total estimated model params size (MB)
1         Modules in train mode
427       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\jinwo\.virtualenvs\KROP-L3im0CPD\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\jinwo\.virtualenvs\KROP-L3im0CPD\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [10]:
lit_bonita = LitBonita.load_from_checkpoint(
    'checkpoints/bonita/bitabuse/Qwen3-0.6B-Base-epoch=06-valid_loss=0.0177.ckpt',
    base_model_name=BASE_MODEL_NAME,
    lr=LEARNING_RATE,
    epochs=EPOCHS,
    use_bntd=USE_BNTD,
    inference_sentence_max_length=INFERENCE_SENTENCE_MAX_LENGTH,
    inference_sentence_min_length=INFERENCE_SENTENCE_MIN_LENGTH,
    inference_sentence_n_overlap=INFERENCE_SENTENCE_N_OVERLAP
)

In [11]:
trainer = L.Trainer()

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [12]:
preds = trainer.predict(lit_bonita, dl_test)

You are using a CUDA device ('NVIDIA GeForce RTX 5070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\jinwo\.virtualenvs\KROP-L3im0CPD\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

In [13]:
preds = sum(preds, [])

In [14]:
trues = ds_test['sentence'][:]

In [1]:
import pandas as pd

In [16]:
df = pd.DataFrame()

In [17]:
df['true'] = trues
df['pred'] = preds

In [19]:
df.to_csv('./results/bonita_bituse.csv', index=False)

In [17]:
import pandas as pd
df = pd.read_csv('./results/bind_bituse.csv')

In [18]:
df['src'] = ds_test['sentence_noisy'][:]

In [19]:
from src.metrics.metric import word_accuracy

In [20]:
df['word_accuracy'] = df.apply(lambda x: word_accuracy(x['pred'], x['true'], x['src']), axis=1)

	input: i now have the .mp4 file with yo
	input_words: ['i', 'now', 'have', 'the', 'mp4', 'file', 'with', 'yo']
	input: asking money ( $500 ) for no sho
	input_words: ['asking', 'money', '500', 'for', 'no', 'sho']
	input: my software has turned on your c
	input_words: ['my', 'software', 'has', 'turned', 'on', 'your', 'c']
	input: when you wer viewing vidos, your
	input_words: ['when', 'you', 'wer', 'viewing', 'vidos', 'your']
	input: i have the - mp4 - with you jerk
	input_words: ['i', 'have', 'the', 'mp4', 'with', 'you', 'jerk']
	input: they are telling that they are u
	input_words: ['they', 'are', 'telling', 'that', 'they', 'are', 'u']
	input: you have the final chance to sav
	input_words: ['you', 'have', 'the', 'final', 'chance', 'to', 'sav']
	input: recieved the following email fro
	input_words: ['recieved', 'the', 'following', 'email', 'fro']
	input: they want you to send them 0.5 b
	input_words: ['they', 'want', 'you', 'to', 'send', 'them', '0', '5', 'b']
	input: my amount was lo

In [21]:
df['word_accuracy'].mean()

np.float64(0.9091538306165322)

In [23]:
from OpenAttack.metric import BLEU, JaccardWord
from OpenAttack.text_process.tokenizer import PunctTokenizer

  """


AttributeError: 'FileFinder' object has no attribute 'find_loader'