In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import lightning as L

from lightning.pytorch.callbacks import ModelCheckpoint

from src.model.modeling_char_encoder import LitCharEncoder
from src.data.dataset import get_train_dataloader, get_dev_dataloader, get_test_dataloader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
SEED=42
DATASET_NAME = 'jwengr/C-LLM'
SPACE_TOKEN = '[SEP]'
UNK_TOKEN = '[UNK]'
PAD_TOKEN = '[PAD]'
MINI_BATCH_SIZE=32
N_BATCH = 1
BASE_MODEL_NAME='google-bert/bert-base-multilingual-cased'
EPOCHS=10
LEARNING_RATE = 5e-5
TRAIN_MAX_LENGTH=128
VALID_MAX_LENGTH=128
INFERENCE_SENTENCE_MAX_LENGTH=64
INFERENCE_SENTENCE_MIN_LENGTH=32
INFERENCE_SENTENCE_N_OVERLAP=3

In [5]:
L.seed_everything(SEED)

Seed set to 42


42

In [6]:
train_dl = get_train_dataloader(DATASET_NAME, batch_size=MINI_BATCH_SIZE, max_length=TRAIN_MAX_LENGTH)
dev_dl = get_dev_dataloader(DATASET_NAME, batch_size=MINI_BATCH_SIZE, max_length=VALID_MAX_LENGTH)
test_dl = get_test_dataloader(DATASET_NAME, batch_size=1)

In [7]:
lit_char_encoder = LitCharEncoder(
    base_model_name=BASE_MODEL_NAME,
    space_token=SPACE_TOKEN,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    lr=LEARNING_RATE,
    epochs=EPOCHS,
    inference_sentence_max_length=INFERENCE_SENTENCE_MAX_LENGTH,
    inference_sentence_min_length=INFERENCE_SENTENCE_MIN_LENGTH,
    inference_sentence_n_overlap=INFERENCE_SENTENCE_N_OVERLAP
)

Some weights of the model checkpoint at google-bert/bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints/charencoder',
    filename=f"{DATASET_NAME.split('/')[1]}/{BASE_MODEL_NAME.split('/')[1]}"+"-{epoch:02d}-{valid_loss:.4f}",
    every_n_epochs=1,
    save_top_k=-1,
)

In [9]:
trainer = L.Trainer(
    callbacks=[checkpoint_callback],
    precision='bf16',
    max_epochs=EPOCHS,
    enable_checkpointing=True,
    accumulate_grad_batches=N_BATCH
)

/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/fabric/connector.py:571: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead!
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(lit_char_encoder, train_dl, dev_dl)

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:231: Precision bf16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.

  | Name    | Type        | Params | Mode 
------------------------------------------------
0 | encoder | CharEncoder | 177 M  | train
------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.898   Total estimated model params size (MB)
1         Modul

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


                                                                           

/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


Epoch 5:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 9483/10354 [18:36<01:42,  8.49it/s, v_num=53] 

In [10]:
lit_char_encoder = LitCharEncoder.load_from_checkpoint(
    'checkpoints/charencoder/C-LLM/bert-base-multilingual-cased-epoch=00-valid_loss=0.0197.ckpt',
    base_model_name=BASE_MODEL_NAME,
    space_token=SPACE_TOKEN,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    lr=LEARNING_RATE,
    epochs=EPOCHS,
    inference_sentence_max_length=INFERENCE_SENTENCE_MAX_LENGTH,
    inference_sentence_min_length=INFERENCE_SENTENCE_MIN_LENGTH,
    inference_sentence_n_overlap=INFERENCE_SENTENCE_N_OVERLAP
)

Some weights of the model checkpoint at google-bert/bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
trainer = L.Trainer()

üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
preds = trainer.predict(lit_char_encoder, test_dl)

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]
/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


Predicting DataLoader 0:  10%|‚ñà         | 2785/26731 [00:23<03:20, 119.19it/s]‚ÄãËØ¥Âà∞Ëøô,ËΩ¶Âæ∑Èí¢ËßâÂæóÊúâÂøÖË¶Å‰∏∫Â§ßÂÆ∂ËøõË°å‰∏Ä‰∏ãÁÆÄÂçïÁöÑÁßëÊôÆ,Ë∂äÈáéËΩ¶‰∏äÁöÑÊâÄË∞ì‚ÄúÂ∑ÆÈÄüÈîÅ‚ÄùÂØπ‰∫éË∂äÈáéÊÄßËÉΩÁ©∂Á´üÊúâÁùÄÊÄéÊ†∑ÁöÑÊèêÂçá„ÄÇ
ËØ¥Âà∞ËøôÔºåËΩ¶Âæ∑Èí¢ËßâÂæóÊúâÂøÖË¶Å‰∏∫Â§ßÂÆ∂ËøõË°å‰∏Ä‰∏ãÁÆÄÂçïÁöÑÁßëÊôÆ,Ë∂äÈáéËΩ¶‰∏äÁöÑÊâÄË∞ì‚ÄãÂ∑ÆÈÄüÈîÅ‚ÄãÂØπ‰∫éË∂äÈáéÊÄßËÉΩÁ©∂Á´üÊúâÁùÄÊÄéÊ†∑ÁöÑÊèêÂçáÔºå
Predicting DataLoader 0:  11%|‚ñà         | 2807/26731 [00:23<03:20, 119.22it/s]ÊìçÊéßÂô®ÊñπÈù¢,ÊñπÂêëÁõòÊØîËæÉËΩª,ÁÅµÊïèÂ∫¶‰πüÓ∞ØÈ´ò,ËΩ¨Âêë‰πüÈùûÂ∏∏ÁöÑÁ≤æÂáÜ,Â§©Ê∞îÂÜ∑ÁöÑËØùËøòÂèØ‰ª•ÊèêÂâçËøõË°åËøúÁ®ãÂºÄÂêØÁ©∫Ë∞É,ËΩ¶ÂÜÖÁöÑ‰∏≠ÊéßÓ≥ΩÂ±èÓ∞ØÊòØÊñπ‰æø,ÁâπÂà´ÁöÑÊô∫ËÉΩÂåñ,ËΩ¶ÂÜÖÁöÑ360ÂÖ®ÊôØÂΩ±ÂÉèÁîªÈù¢Ó∞ØÊ∏ÖÊô∞,ÂºÄËµ∑Êù•‰ø°ÂøÉÈùûÂ∏∏ÁöÑÂº∫,‰∏äÊâã‰πüÊØîËæÉÂø´„ÄÇ
ÊìçÊéßÂô®ÊñπÈù¢ÔºåÊñπÂêëÁõòÊØîËæÉËΩª,ÁÅµÊïèÂ∫¶‰πüÈ´ò,ËΩ¨Âêë‰πüÈùûÂ∏∏ÁöÑÁ≤æÂáÜÔºåÂ§©Ê∞îÂÜ∑ÁöÑËØùËøòÂèØ‰ª•ÊèêÂâçËøõË°åËøúÁ®ãÂºÄÂêØÁ©∫Ë∞É,ËΩ¶ÂÜÖÁöÑ‰∏≠ÊéßÂ±èÊòØÊñπ‰æø,ÁâπÂà´ÁöÑÊô∫ËÉΩÂåñ,ËΩ¶ÂÜÖÁöÑ360ÂÖ®ÊôØÂΩ±ÂÉèÁîªÈù¢Ê∏ÖÊô∞,ÂºÄËµ∑Êù•‰ø°ÂøÉÈùûÂ∏∏ÁöÑÂº∫

: 

In [11]:
for batch in test_dl:
    break

In [16]:
tokenizer.base_tokenizer.decode?

[31mSignature:[39m
tokenizer.base_tokenizer.decode(
    token_ids: Union[int, list[int], ForwardRef([33m'np.ndarray'[39m), ForwardRef([33m'torch.Tensor'[39m), ForwardRef([33m'tf.Tensor'[39m)],
    skip_special_tokens: bool = [38;5;28;01mFalse[39;00m,
    clean_up_tokenization_spaces: Optional[bool] = [38;5;28;01mNone[39;00m,
    **kwargs,
) -> str
[31mDocstring:[39m
Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
tokens and clean up tokenization spaces.

Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

Args:
    token_ids (`Union[int, list[int], np.ndarray, torch.Tensor, tf.Tensor]`):
        List of tokenized input ids. Can be obtained using the `__call__` method.
    skip_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not to remove special tokens in the decoding.
    clean_up_tokenization_spaces (`bool`, *optional*):
        Whether or not to 

In [10]:
preds[0]

['Âú®ÊúÄËøëÁöÑÁ¶èÁâπËµÑÊú¨Â∏ÇÂú∫Êó•ÊäïËµÑËÄÖ‰ªãÁªç‰ºö‰∏äÔºåÈ¶ñÂ∏≠‰∫ßÂìÅÂπ≥Âè∞ÂíåËøêËê•Âπ≤']