In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before each
# cell executes), so edits to the imported project modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger
from lightning.pytorch.profilers import PyTorchProfiler

import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()
tqdm.pandas()

from src.feedback_prize_english_language_learning.lib.data.data_utils import (
    load_datasets, 
    preprocessing_datasets,
    select_features_split_datasets,
)
from src.feedback_prize_english_language_learning.lib.data.data_module import NLPDataModule
from src.feedback_prize_english_language_learning.lib.models.BertRegression import BertRegression
from src.feedback_prize_english_language_learning.lib.config import Config, DataModuleConfig, ModuleConfig
from src.feedback_prize_english_language_learning.lib.utils import create_dirs

In [3]:
# Notebook-wide configuration: RNG seeding, pandas display, output directories
# and matmul precision.

# Seed the Python, NumPy and torch RNGs (and dataloader workers) before any model
# or datamodule is built. The Trainer below uses `deterministic=True`, which
# selects deterministic algorithms but does not set a seed by itself.
pl.seed_everything(Config.seed, workers=True)

# Do not truncate columns or rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Prepare the cache / log / checkpoint / profiler / perf directories
# (presumably created if missing -- see `create_dirs`).
create_dirs([Config.cache_dir, Config.log_dir, Config.ckpt_dir, Config.prof_dir, Config.perf_dir])

# Trade float32 matmul precision for speed on hardware that supports it.
torch.set_float32_matmul_precision("medium")

In [4]:
# Load the raw frames; `data` is indexed by split name ('train' / 'test').
data: dict[str, pd.DataFrame] = load_datasets(Config.data_dir)

# Preprocess the labelled frame and the prediction frame together, passing the
# pretrained model name through to the preprocessing helper.
train_df: pd.DataFrame
predict_df: pd.DataFrame
train_df, predict_df = preprocessing_datasets(
    data['train'],
    data['test'],
    ModuleConfig.model_name,
)

# Select the feature columns and split the labelled data into train / val / test.
train_df, val_df, test_df = select_features_split_datasets(
    train_df,
    DataModuleConfig.test_size,
)



In [5]:
# Row counts of the train / val / test / predict frames, in that order.
tuple(map(len, (train_df, val_df, test_df, predict_df)))

(2502, 626, 783, 3)

In [6]:
# Peek at the first training row to check the columns after preprocessing.
train_df.iloc[:1]

Unnamed: 0,text_id,full_text,cohesion_scaled,syntax_scaled,vocabulary_scaled,phraseology_scaled,grammar_scaled,conventions_scaled
0,0F8436260047,Some people say that first impressions are eas...,0.75,0.75,0.5,0.75,0.875,0.75


In [7]:
# Wrap the three labelled splits in the project's Lightning datamodule.
# Positional arguments: label column(s), then the train / val / test frames.
lit_datamodule = NLPDataModule(
    DataModuleConfig.label_column, train_df, val_df, test_df,
    # Pretrained model name and loader settings all come from the config classes.
    pretrained_model_name=ModuleConfig.model_name,
    batch_size=DataModuleConfig.batch_size,
    num_workers=DataModuleConfig.num_workers,
    seed=Config.seed,
)



In [8]:
# Build the regression model on the same pretrained backbone name that the
# datamodule was given, with the learning rate taken from ModuleConfig.
lit_model = BertRegression(
    pretrained_model=ModuleConfig.model_name,
    learning_rate=ModuleConfig.learning_rate,
)

In [9]:
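# Optional smoke test, left disabled: push one training batch through the model
# to check that the datamodule's batches match the model's forward() arguments.
# lit_datamodule.setup()
# batch = next(iter(lit_datamodule.train_dataloader()))
# inputs, label = batch
# lit_model(**inputs)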

In [12]:
# Callbacks, loggers and the Trainer for the fine-tuning run.

callbacks = [
    # Stop once val-RMSE has not improved for 10 consecutive validation checks.
    EarlyStopping(monitor="val-RMSE", mode="min", verbose=True, patience=10),
    # Keep the single checkpoint with the lowest val-RMSE. Without `monitor`
    # the callback would just keep the most recent epoch, which after early
    # stopping is `patience` epochs past the best one.
    ModelCheckpoint(
        dirpath=Config.ckpt_dir,
        filename="model",
        monitor="val-RMSE",
        mode="min",
        save_top_k=1,
    ),
    # Log the learning rate at every optimizer step.
    LearningRateMonitor(logging_interval='step'),
]

loggers = [
    CSVLogger(
        save_dir=Config.log_dir,
        name="csv-logs",
    ),
    TensorBoardLogger(
        Config.log_dir / "tb_logs",
        name="my_model"
    ),
]

# `PyTorchProfiler` takes `dirpath` / `filename`; the former `output_filename`
# keyword is not a parameter in Lightning 2.x and is silently dropped, so the
# report never reached Config.prof_dir. Lightning appends the `.txt` extension.
profiler = PyTorchProfiler(dirpath=Config.prof_dir, filename="profiler")

lit_trainer = pl.Trainer(
    precision="16-mixed",
    max_epochs=25,
    deterministic=True,
    logger=loggers,
    callbacks=callbacks,
    log_every_n_steps=10,
    profiler=profiler,
    accelerator="auto",
    # Explicit device selection; this replaces the `gpus` argument that
    # Trainer no longer accepts ("auto" is also the default).
    devices="auto",
    # Notebook-compatible DDP launcher: runs one process per selected device.
    strategy="ddp_notebook",
)

TypeError: Trainer.__init__() got an unexpected keyword argument 'gpus'

In [13]:
# Train the model; EarlyStopping may end the run before `max_epochs` is reached.
lit_trainer.fit(model=lit_model, datamodule=lit_datamodule)

# Evaluate the weights stored in the checkpoint that ModelCheckpoint recorded as
# best (`best_model_path`) rather than the in-memory weights of the final epoch.
# NOTE(review): under DDP the test set is sharded with a distributed sampler, which
# can duplicate samples; for reported numbers consider a single-device Trainer.
lit_trainer.test(model=lit_model, datamodule=lit_datamodule, ckpt_path="best")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /mnt/storage_2/scratch/pl0145-01/jsmok/feedback_priz ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The cur

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('val-loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('val-RMSE', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
STAGE:2024-07-29 10:18:43 56064:56064 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-07-29 10:18:43 56023:56023 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-07-29 10:18:43 55927:55927 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-07

Training: |          | 0/? [00:00<?, ?it/s]

W0729 10:18:58.755000 140350625892160 torch/multiprocessing/spawn.py:145] Terminating process 55814 via signal SIGTERM
W0729 10:18:58.785000 140350625892160 torch/multiprocessing/spawn.py:145] Terminating process 55851 via signal SIGTERM
W0729 10:18:58.786000 140350625892160 torch/multiprocessing/spawn.py:145] Terminating process 55885 via signal SIGTERM
W0729 10:18:58.786000 140350625892160 torch/multiprocessing/spawn.py:145] Terminating process 55927 via signal SIGTERM
W0729 10:18:58.787000 140350625892160 torch/multiprocessing/spawn.py:145] Terminating process 55975 via signal SIGTERM
W0729 10:18:58.791000 140350625892160 torch/multiprocessing/spawn.py:145] Terminating process 56023 via signal SIGTERM
W0729 10:18:58.792000 140350625892160 torch/multiprocessing/spawn.py:145] Terminating process 56064 via signal SIGTERM
[rank: 2] Received SIGTERM: 15
[rank: 3] Received SIGTERM: 15
Process ForkProcess-12:
Process ForkProcess-11:
Process ForkProcess-15:
[rank: 6] Received SIGTERM: 15
[r

ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 75, in _wrap
    fn(i, *args)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py", line 173, in _wrapping_function
    results = function(*args, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 579, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 986, in _run
    results = self._run_stage()
              ^^^^^^^^^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 1030, in _run_stage
    self.fit_loop.run()
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py", line 205, in run
    self.advance()
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py", line 363, in advance
    self.epoch_loop.run(self._data_fetcher)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 140, in run
    self.advance(data_fetcher)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 250, in advance
    batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 190, in run
    self._optimizer_step(batch_idx, closure)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 268, in _optimizer_step
    call._call_lightning_module_hook(
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py", line 159, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/core/module.py", line 1308, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/core/optimizer.py", line 153, in step
    step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py", line 270, in optimizer_step
    optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py", line 238, in optimizer_step
    return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/amp.py", line 77, in optimizer_step
    closure_result = closure()
                     ^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 144, in __call__
    self._result = self.closure(*args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 138, in closure
    self._backward_fn(step_output.closure_loss)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 239, in backward_fn
    call._call_strategy_hook(self.trainer, "backward", loss, optimizer)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py", line 311, in _call_strategy_hook
    output = fn(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py", line 212, in backward
    self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/precision.py", line 72, in backward
    model.backward(tensor, *args, **kwargs)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/lightning/pytorch/core/module.py", line 1103, in backward
    loss.backward(*args, **kwargs)
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/torch/_tensor.py", line 525, in backward
    torch.autograd.backward(
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/torch/autograd/__init__.py", line 267, in backward
    _engine_run_backward(
  File "/mnt/storage_2/scratch/pl0145-01/jsmok/feedback_prize_english_language_learning/.venv/lib/python3.11/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.54 GiB. GPU 
