In [12]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"


from dotenv import load_dotenv
load_dotenv()

import lightning as L

from tqdm.auto import tqdm
from lightning.pytorch.callbacks import ModelCheckpoint

from src.model.modeling_bind import LitBIND
from src.data.dataset import get_train_dataloader, get_dev_dataloader, get_test_dataloader

SEED=42
DATASET_NAME = 'jwengr/PeOcrSanskritPreproc'
MINI_BATCH_SIZE = 4
N_BATCH = 8
BASE_MODEL_NAME='Qwen/Qwen3-0.6B-Base'
EPOCHS=10
LEARNING_RATE = 1e-4
USE_BNTD=True
TRAIN_MAX_LENGTH=128
VALID_MAX_LENGTH=128
INFERENCE_SENTENCE_MAX_LENGTH=64
INFERENCE_SENTENCE_MIN_LENGTH=32
INFERENCE_SENTENCE_N_OVERLAP=3

L.seed_everything(SEED)

train_dl = get_train_dataloader(DATASET_NAME, batch_size=MINI_BATCH_SIZE, max_length=TRAIN_MAX_LENGTH)
dev_dl = get_dev_dataloader(DATASET_NAME, batch_size=MINI_BATCH_SIZE, max_length=VALID_MAX_LENGTH)
test_dl = get_test_dataloader(DATASET_NAME, batch_size=MINI_BATCH_SIZE)

lit_bind = LitBIND(
    base_model_name=BASE_MODEL_NAME,
    lr=LEARNING_RATE,
    epochs=EPOCHS,
    use_bntd=USE_BNTD,
    inference_sentence_max_length=INFERENCE_SENTENCE_MAX_LENGTH,
    inference_sentence_min_length=INFERENCE_SENTENCE_MIN_LENGTH,
    inference_sentence_n_overlap=INFERENCE_SENTENCE_N_OVERLAP,
    n_tokens_per_char=12
)

checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints/bind',
    filename=f"{DATASET_NAME.split('/')[1]}-{BASE_MODEL_NAME.split('/')[1]}-addbce-focalloss"+"-{epoch:02d}-{valid_loss:.4f}",
    monitor='valid_loss',
    mode='min',
    save_weights_only=True,
    save_top_k=3,
)

trainer = L.Trainer(
    callbacks=[checkpoint_callback],
    precision='bf16',
    max_epochs=EPOCHS,
    accumulate_grad_batches=N_BATCH
)

trainer.fit(lit_bind, train_dl, dev_dl)

Seed set to 42


use full attn qwen3


/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/fabric/connector.py:571: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead!
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:701: Checkpoint directory /home/jjw1214/KROP/checkpoints/bind exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:231: Precision bf16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.

  | Name | Type | Params | Mode 
--------------------------------------
0 | bind | BIND 

Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  6.40it/s]

/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


                                                                           

/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 3/16009 [00:00<56:07,  4.75it/s, v_num=50]  


Detected KeyboardInterrupt, attempting graceful shutdown ...


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
lit_bind.bind.tokenizer.input_chars_dict

{}

In [None]:
lit_bind.bind.tokenizer.target_chars_dict

{}

In [None]:
sentence, sentence_noisy = ['ते ल म्बनावनती कुतो हेतोः कुतः पृथिव्याः साधिते', ' घटानधिकरणमित्या दितत्पुरुषस्यापि स्वघटकनामद्वयोपस्थाप्यार्थद्वयावच्छिन्नस्वघटकनामान्तरार्थविषयता कबोधजनकतया नोक्तनियम इत्यत आह ', 'अन्यन्तयाह पौरेतरध्नमुल्कापसव्यकरणं दिवाकरहिमांश्वोः', 'स्वगुणागज्या व्यासस्तत्र भलिप्ताः स्फुटः परिधिः'], ['ते ल म्बनावनती कुतो हेतोः कुतः पृथिव्याः साधिते', ' घटानधिकरणमित्या दितत्पुलपस्यापि स्वचटकनामद्वयोपस्थाप्यार्थद्वयावच्छिन्नस्वघटकनामान्तरार्थविषयता कवोधजनकतया नोक्तनियम हत्यत आह घ', 'अन्यन्तयाह पौरेतरध्नमुल्कापसव्यकरणं दिवाकरहिमांश्बोः', 'स्वगुणागज्या व्यासस्तत्र भलिप्ताः स्फुटः परिधिः']

In [None]:
sentence

['ते ल म्बनावनती कुतो हेतोः कुतः पृथिव्याः साधिते',
 ' घटानधिकरणमित्या दितत्पुरुषस्यापि स्वघटकनामद्वयोपस्थाप्यार्थद्वयावच्छिन्नस्वघटकनामान्तरार्थविषयता कबोधजनकतया नोक्तनियम इत्यत आह ',
 'अन्यन्तयाह पौरेतरध्नमुल्कापसव्यकरणं दिवाकरहिमांश्वोः',
 'स्वगुणागज्या व्यासस्तत्र भलिप्ताः स्फुटः परिधिः']

In [None]:
sentence_noisy

['ते ल म्बनावनती कुतो हेतोः कुतः पृथिव्याः साधिते',
 ' घटानधिकरणमित्या दितत्पुलपस्यापि स्वचटकनामद्वयोपस्थाप्यार्थद्वयावच्छिन्नस्वघटकनामान्तरार्थविषयता कवोधजनकतया नोक्तनियम हत्यत आह घ',
 'अन्यन्तयाह पौरेतरध्नमुल्कापसव्यकरणं दिवाकरहिमांश्बोः',
 'स्वगुणागज्या व्यासस्तत्र भलिप्ताः स्फुटः परिधिः']

In [None]:
sentence_len = lit_bind.bind.tokenizer.batch_encode_char(sentence, {})[0].shape[1]
sentence_noisy_len = lit_bind.bind.tokenizer.batch_encode_char(sentence_noisy, {})[0].shape[1]

In [13]:
sentence_len, sentence_noisy_len

(350, 350)

In [None]:
sentence_len
from tqdm.auto import tqdm
for batch in tqdm(train_dl):
    sentence_noisy_len = lit_bind.bind.tokenizer.batch_encode_char(batch['sentence_noisy'], {})[0].shape[1]
    sentence_len = lit_bind.bind.tokenizer.batch_encode_char(batch['sentence'], {})[0].shape[1]
    if sentence_noisy_len!=sentence_noisy_len:
        raise ValueError()

  1%|          | 168/16009 [00:02<04:31, 58.43it/s]


KeyboardInterrupt: 

In [None]:
lit_bind.bind.tokenizer.batch_encode_char(batch['sentence_noisy'])[0].shape

torch.Size([4, 386])

In [None]:
batch

{'sentence_noisy': ['अत्र किं पूर्वं साधुरुत साधुषु चरतीति संदेहः',
  'व्याख्यायते सतां तुष्टयै शब्दशक्तिप्रकाशिका',
  'किं तदिं विषयलक्षणमित्यत्राह अवभासमानतैवेति',
  'पुनरबजदनिष्पत्तितुल्या जदरेखासरेखानिष्पत्तिः कल्पिता'],
 'sentence': ['अत्र किं पूर्वं साधुरुत साधुषु चरतीति संदेहः',
  'व्याख्यायते सतां तुष्टयै शब्दशक्तिप्रकाशिका',
  'किं तदिं विषयलक्षणमित्यत्राह अवभासमानतैवेति',
  'पुनरबजदनिष्पत्तितुल्या जदरेखासरेखानिष्पत्तिः कल्पिता']}