In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import pandas as pd
import lightning as L

from tqdm.auto import tqdm
from lightning.pytorch.callbacks import ModelCheckpoint

from src.model.modeling_char_encoder import LitCharEncoder
from src.data.dataset import get_train_dataloader, get_dev_dataloader, get_test_dataloader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
SEED=42
DATASET_NAME = 'jwengr/PeOcrSanskritPreproc'
SPACE_TOKEN = '[SEP]'
UNK_TOKEN = '[UNK]'
PAD_TOKEN = '[PAD]'
MINI_BATCH_SIZE=32
N_BATCH = 1
BASE_MODEL_NAME='google-bert/bert-base-multilingual-cased'
EPOCHS=10
LEARNING_RATE = 5e-5
TRAIN_MAX_LENGTH=128
VALID_MAX_LENGTH=128

In [5]:
L.seed_everything(SEED)

Seed set to 42


42

In [6]:
train_dl = get_train_dataloader(DATASET_NAME, batch_size=MINI_BATCH_SIZE, max_length=TRAIN_MAX_LENGTH)
dev_dl = get_dev_dataloader(DATASET_NAME, batch_size=MINI_BATCH_SIZE, max_length=VALID_MAX_LENGTH)
test_dl = get_test_dataloader(DATASET_NAME, batch_size=1)

In [8]:
lit_char_encoder = LitCharEncoder.load_from_checkpoint(
    'checkpoints/charencoder/PeOcrSanskritPreproc-bert-base-multilingual-cased-epoch=02-valid_loss=0.5334.ckpt',
    base_model_name=BASE_MODEL_NAME,
    space_token=SPACE_TOKEN,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    lr=LEARNING_RATE,
    epochs=EPOCHS,
)

Some weights of the model checkpoint at google-bert/bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
trainer = L.Trainer()

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [10]:
preds = trainer.predict(lit_char_encoder, test_dl)

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2]
/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 2362/2362 [00:20<00:00, 115.91it/s]


In [None]:
# Collect predictions
prediction = []
for pred in tqdm(preds):
    prediction.extend(pred)

100%|██████████| 2362/2362 [00:00<00:00, 1225500.50it/s]


In [15]:
len(prediction)

2362

In [12]:
# Build dataframe
categories, inputs, true = [], [], []
for batch in test_dl:
    true.extend(batch["sentence"])
    inputs.extend(batch["sentence_noisy"])
    if batch.get("category") is None:
        category = "none"
    else:
        category = batch["category"]
    categories.extend(category)

result_df = pd.DataFrame({
    "input": inputs,
    "pred": prediction,
    "true": true,
    "category": categories,
})

ValueError: All arrays must be of the same length

In [13]:
lit_char_encoder = LitCharEncoder.load_from_checkpoint(
    'checkpoints/charencoder/C-LLM/bert-base-chinese-epoch=00-valid_loss=0.0559.ckpt',
    base_model_name=BASE_MODEL_NAME,
    space_token=SPACE_TOKEN,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    lr=LEARNING_RATE,
    epochs=EPOCHS,
    inference_sentence_max_length=INFERENCE_SENTENCE_MAX_LENGTH,
    inference_sentence_min_length=INFERENCE_SENTENCE_MIN_LENGTH,
    inference_sentence_n_overlap=INFERENCE_SENTENCE_N_OVERLAP,
    target_chars='hanzi'
)

Some weights of the model checkpoint at google-bert/bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using hanzi chars as target chars.


In [14]:
trainer = L.Trainer()

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [15]:
preds = trainer.predict(lit_char_encoder, test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2]
/home/jjw1214/.conda/envs/jjw1214_py312/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 26731/26731 [03:15<00:00, 136.87it/s]


In [16]:
prediction = []

for pred in tqdm(preds):
    prediction.extend(pred)

100%|██████████| 26731/26731 [00:00<00:00, 1738774.84it/s]


In [17]:
categories = []
inputs = []
true = []
for batch in test_dl:
    true.extend(batch['sentence'])
    inputs.extend(batch['sentence_noisy'])
    categories.extend(batch['category'])

In [22]:
result_df = pd.DataFrame()
result_df['input'] = inputs
result_df['pred'] = prediction
result_df['true'] = true
result_df['category'] = categories

In [23]:
from src.metrics.metric import calculate_metric

In [None]:
for cat in set(result_df['category']):
    cat_df = result_df[result_df['category']==cat].copy()
    result, result_list = calculate_metric(cat_df['input'].tolist(), cat_df['true'].tolist(), cat_df['pred'].tolist())
    print(cat, result)

gam {'S_D_p': 26.667, 'S_D_r': 21.622, 'S_D_f1': 23.881, 'S_C_p': 20.0, 'S_C_r': 16.216, 'S_C_f1': 17.91, 'C_D_p': 30.986, 'C_D_r': 26.829, 'C_D_f1': 28.758, 'C_C_p': 21.127, 'C_C_r': 18.293, 'C_C_f1': 19.608}
new {'S_D_p': 52.507, 'S_D_r': 26.352, 'S_D_f1': 35.092, 'S_C_p': 40.108, 'S_C_r': 20.129, 'S_C_f1': 26.805, 'C_D_p': 55.666, 'C_D_r': 29.08, 'C_D_f1': 38.203, 'C_C_p': 41.691, 'C_C_r': 21.779, 'C_C_f1': 28.612}
mec {'S_D_p': 65.594, 'S_D_r': 36.022, 'S_D_f1': 46.505, 'S_C_p': 57.545, 'S_C_r': 31.602, 'S_C_f1': 40.799, 'C_D_p': 71.593, 'C_D_r': 36.143, 'C_D_f1': 48.036, 'C_C_p': 61.42, 'C_C_r': 31.008, 'C_C_f1': 41.211}
cscd {'S_D_p': 72.548, 'S_D_r': 54.277, 'S_D_f1': 62.096, 'S_C_p': 66.686, 'S_C_r': 49.891, 'S_C_f1': 57.079, 'C_D_p': 76.343, 'C_D_r': 55.657, 'C_D_f1': 64.379, 'C_C_p': 69.669, 'C_C_r': 50.791, 'C_C_f1': 58.751}
cot {'S_D_p': 72.632, 'S_D_r': 46.939, 'S_D_f1': 57.025, 'S_C_p': 58.246, 'S_C_r': 37.642, 'S_C_f1': 45.73, 'C_D_p': 78.738, 'C_D_r': 48.765, 'C_D_f1': 

: 