In [7]:
import os

In [17]:
# Switch to the parent directory, which later cells rely on for `from src...`
# imports. The guard makes the cell idempotent: once `src` is visible from the
# current directory, re-running the cell no longer climbs another level up.
if not os.path.isdir("src"):
    os.chdir("../")

In [23]:
# Sanity check: display the working directory that the relative config and
# artifact paths used by later cells will be resolved against.
os.getcwd()

'd:\\Dropbox\\Self-Development\\Coding_Projects\\TextSummarizer'

In [19]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction. Dataclasses do not validate field types at runtime; the
    annotations below are documentation for readers and type checkers.
    """

    # Directory for this stage's outputs.
    root_dir: Path
    # Location of the dataset to train on.
    data_path: Path
    # Checkpoint identifier handed to `from_pretrained`. The recorded run uses a
    # Hugging Face Hub id ("google/pegasus-cnn_dailymail"), which is a plain
    # string rather than a filesystem path -- converting it to `Path` would
    # rewrite the separator on Windows -- hence `str`.
    model_ckpt: str
    # The remaining fields carry the names of Hugging Face `TrainingArguments`
    # options and are populated from the "TrainingArguments" params section.
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: int
    gradient_accumulation_steps: int

In [20]:
from src.constants import *
from src.utils.exception import CustomException
from src.utils.logger import logger
from src.utils.utils import load_yaml, create_directories


class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects.

    Args:
        config_filepath: YAML file holding paths and stage sections
            (defaults to ``CONFIG_FILE_PATH``).
        params_filepath: YAML file holding hyperparameters
            (defaults to ``PARAMS_FILE_PATH``).
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        self.config = load_yaml(config_filepath)
        self.params = load_yaml(params_filepath)

        # Make sure the top-level artifacts directory exists before any stage runs.
        artifacts_root = self.config["artifacts_root"]
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the model-trainer settings from the loaded YAML content.

        Reads the ``model_trainer`` section of the config file and the
        ``TrainingArguments`` section of the params file, and creates the
        stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: populated with the values read from both sections.
        """
        stage_section = self.config["model_trainer"]
        training_section = self.params["TrainingArguments"]

        create_directories([stage_section["root_dir"]])

        # Keys taken from the stage section of the config file.
        stage_keys = ("root_dir", "data_path", "model_ckpt")
        # Keys taken from the hyperparameter section of the params file.
        training_keys = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
        )

        return ModelTrainerConfig(
            **{key: stage_section[key] for key in stage_keys},
            **{key: training_section[key] for key in training_keys},
        )

In [21]:
import os
import sys
from src.utils.exception import CustomException
from src.utils.logger import logger
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_from_disk
import torch

class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint with the Hugging Face ``Trainer``.

    Args:
        config: Paths, checkpoint identifier and training hyperparameters.
    """

    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    @staticmethod
    def _normalize_steps(value):
        """Return a step setting as ``int`` when it is a whole number, else ``float``.

        Accepts ints, floats and numeric strings. Some YAML loaders (PyYAML,
        for example) read a literal such as ``1e6`` as a string, so the value
        is parsed through ``float`` first. Fractional values are returned
        unchanged as floats so that the training-arguments class can apply its
        own validation to them.

        Raises:
            TypeError, ValueError: if ``value`` cannot be parsed as a number.
        """
        number = float(value)
        return int(number) if number.is_integer() else number

    def train(self):
        """Load tokenizer, model and dataset, run training, then save the results.

        The model is written to ``<root_dir>/trained-model`` and the tokenizer
        to ``<root_dir>/tokenizer``.

        Raises:
            CustomException: wraps any exception raised during setup, training
                or saving; the original exception is attached as ``__cause__``.
        """
        try:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
            logger.info("Tokenizer initialized")
            model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
            logger.info("Model initialized")
            data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
            logger.info("Data Collator initialized")

            # Load the dataset; it must provide "train" and "validation" splits.
            dataset_transformed = load_from_disk(self.config.data_path)
            logger.info("Dataset loaded")

            # NOTE(review): the recorded run of this notebook ended in a CUDA
            # out-of-memory error on an 8 GiB GPU once training started; the
            # batch-size settings in the params file are the first thing to
            # revisit if that recurs.
            trainer_args = TrainingArguments(
                output_dir=self.config.root_dir,
                num_train_epochs=self.config.num_train_epochs,
                warmup_steps=self.config.warmup_steps,
                per_device_train_batch_size=self.config.per_device_train_batch_size,
                per_device_eval_batch_size=self.config.per_device_eval_batch_size,
                weight_decay=self.config.weight_decay,
                logging_steps=self.config.logging_steps,
                evaluation_strategy=self.config.evaluation_strategy,
                eval_steps=self.config.eval_steps,
                # Previously hard-coded to 1e6, which silently ignored the
                # configured value. Normalised because this setting's runtime
                # type has not been exercised before (see _normalize_steps).
                save_steps=self._normalize_steps(self.config.save_steps),
                gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            )

            trainer = Trainer(
                model=model,
                args=trainer_args,
                tokenizer=tokenizer,
                data_collator=data_collator,
                train_dataset=dataset_transformed["train"],
                eval_dataset=dataset_transformed["validation"],
            )
            logger.info("Trainer initialized")

            trainer.train()
            logger.info("Training process finished")

            # Save model
            model.save_pretrained(os.path.join(self.config.root_dir, "trained-model"))
            logger.info("Trained model saved")

            # Save tokenizer
            tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))
            logger.info("Tokenizer saved")

        except Exception as e:
            # Chain explicitly so the original traceback stays attached.
            raise CustomException(e, sys) from e

In [24]:
# Build the trainer settings from the YAML files and run fine-tuning.
# The former `try/except Exception as e: raise e` wrapper only re-raised the
# same exception, so errors are simply left to propagate. The trainer gets its
# own name instead of overwriting `model_trainer_config` with a different type.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2023-08-07 00:17:27,344: INFO: utils: yaml file: config\config.yaml loaded successfully]
[2023-08-07 00:17:27,347: INFO: utils: yaml file: params.yaml loaded successfully]
[2023-08-07 00:17:27,349: INFO: utils: created directory at: artifacts]
[2023-08-07 00:17:27,349: INFO: utils: created directory at: artifacts/model_trainer]
[2023-08-07 00:17:28,095: INFO: 1905566313: Tokenizer initialized]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2023-08-07 00:17:37,672: INFO: 1905566313: Model initialized]
[2023-08-07 00:17:37,673: INFO: 1905566313: Data Collator initialized]
[2023-08-07 00:17:37,700: INFO: 1905566313: Dataset loaded]
[2023-08-07 00:17:38,813: INFO: 1905566313: Trainer initialized]




[2023-08-07 00:17:39,453: ERROR: jupyter: Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.]


[34m[1mwandb[0m: Currently logged in as: [33mjjjoooo[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/102 [00:00<?, ?it/s]You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


CustomException: Error occurred in script name [C:\Users\jinou\AppData\Local\Temp\ipykernel_122116\1905566313.py] line number [43] error message [CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 8.00 GiB total capacity; 6.91 GiB already allocated; 0 bytes free; 7.12 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF]