In [1]:
# Re-import edited modules automatically before each cell executes, so changes to
# the project code imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.profilers import PyTorchProfiler

import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably called before the project imports below so that any
# settings they read from the environment at import time are already set — confirm.
load_dotenv()
# Register tqdm's progress-bar variants of the pandas apply methods (e.g. `progress_apply`).
tqdm.pandas()

from src.feedback_prize_english_language_learning.lib.data.data_utils import (
    load_datasets, 
    preprocessing_datasets,
    select_features_split_datasets,
)
from src.feedback_prize_english_language_learning.lib.data.data_module import NLPDataModule
from src.feedback_prize_english_language_learning.lib.models.BertRegression import BertRegression
from src.feedback_prize_english_language_learning.lib.config import Config, DataModuleConfig, ModuleConfig
from src.feedback_prize_english_language_learning.lib.utils import create_dirs

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Lift pandas' display truncation for both columns and rows.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

# Working directories used by this run (cache, logs, checkpoints, profiler, perf).
# create_dirs is a project helper; presumably it creates any that are missing — confirm.
output_dirs = [
    Config.cache_dir,
    Config.log_dir,
    Config.ckpt_dir,
    Config.prof_dir,
    Config.perf_dir,
]
create_dirs(output_dirs)

# Allow reduced-precision float32 matmuls in exchange for speed on supporting GPUs.
torch.set_float32_matmul_precision("medium")

In [4]:
# Read the raw frames; `data` is indexed by split name ('train' / 'test').
data: dict[str, pd.DataFrame] = load_datasets(Config.data_dir)
raw_train_df, raw_predict_df = data['train'], data['test']

# Preprocess both frames together (the model name is forwarded to the helper),
# then split the preprocessed training frame into train / validation / test.
preprocessed_train_df, predict_df = preprocessing_datasets(
    raw_train_df,
    raw_predict_df,
    ModuleConfig.model_name,
)
train_df, val_df, test_df = select_features_split_datasets(
    preprocessed_train_df,
    DataModuleConfig.test_size,
)



In [5]:
# Row counts in order: train, validation, test, predict.
tuple(map(len, (train_df, val_df, test_df, predict_df)))

(2825, 499, 587, 3)

In [6]:
# Peek at a single training row to check the columns produced by the split step.
train_df.head(n=1)

Unnamed: 0,text_id,full_text,cohesion_scaled,syntax_scaled,vocabulary_scaled,phraseology_scaled,grammar_scaled,conventions_scaled
0,AB75D2DC92B7,Schools were made for people who wants to lear...,0.5,0.375,0.375,0.25,0.25,0.375


In [7]:
# Loader/tokenizer settings, taken from the project config objects.
datamodule_options = dict(
    pretrained_model_name=ModuleConfig.model_name,
    batch_size=DataModuleConfig.batch_size,
    num_workers=DataModuleConfig.num_workers,
    seed=Config.seed,
)

# Positional arguments: label column, then the train / validation / test frames.
lit_datamodule = NLPDataModule(
    DataModuleConfig.label_column,
    train_df,
    val_df,
    test_df,
    **datamodule_options,
)



In [8]:
# Instantiate the regression module with the configured pretrained model and learning rate.
lit_model = BertRegression(
    pretrained_model=ModuleConfig.model_name,
    learning_rate=ModuleConfig.learning_rate,
)

In [9]:
# Optional smoke test, disabled by default: set up the datamodule, pull a single
# training batch, and run one forward pass through the model. Uncomment to check
# that batches and the model's forward signature line up before a full training run.
# lit_datamodule.setup()
# batch = next(iter(lit_datamodule.train_dataloader()))
# inputs, label = batch
# lit_model(**inputs)

In [10]:
callbacks = [
    # Stop once validation RMSE stops improving (Lightning's default patience applies).
    EarlyStopping(monitor="val-RMSE", mode="min"),
    # Monitor the same metric as early stopping so the checkpoint on disk is the
    # best-scoring epoch. Without `monitor`, ModelCheckpoint keeps whichever epoch
    # ran last, which after early stopping is typically a degraded one.
    ModelCheckpoint(
        dirpath=Config.ckpt_dir,
        filename="model",
        monitor="val-RMSE",
        mode="min",
        save_top_k=1,
    ),
]

# Metrics are written as CSV under <log_dir>/csv-logs/.
logger = CSVLogger(
    save_dir=Config.log_dir,
    name="csv-logs",
)

lit_trainer = pl.Trainer(
    # Let Lightning pick the hardware, device count, and distribution strategy.
    accelerator="auto",
    devices="auto",
    strategy="auto",
    # Automatic mixed precision with 16-bit compute.
    precision="16-mixed",
    max_epochs=5,
    # Request deterministic algorithms for reproducible runs.
    deterministic=True,
    logger=logger,
    callbacks=callbacks,
)

Trainer will use only 1 of 8 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=8)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /mnt/storage_2/scratch/pl0145-01/jsmok/feedback_priz ...
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [11]:
# Run training; the datamodule supplies the train and validation loaders.
lit_trainer.fit(lit_model, datamodule=lit_datamodule)

Seed set to 42
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]

  | Name      | Type           | Params | Mode 
-----------------------------------------------------
0 | bert      | DebertaV2Model | 183 M  | eval 
1 | regressor | Sequential     | 1.2 M  | train
-----------------------------------------------------
185 M     Trainable params
0         Non-trainable params
185 M     Total params
740.057   Total estimated model params size (MB)


Epoch 1:  26%|██▌       | 23/89 [01:31<04:23,  0.25it/s, v_num=1, val-RMSE=0.481]