In [None]:
import os


In [2]:
# Display the kernel's current working directory before it is changed below.
%pwd


'c:\\Github\\Text-Summarization\\research'

In [3]:
os.chdir("c:\\Github\\Text-Summarization")
%pwd


'c:\\Github\\Text-Summarization'

In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings consumed by the model-training stage.

    The first three fields locate inputs and outputs; the remaining fields are
    training hyperparameters. Instances are frozen, so attributes cannot be
    reassigned after construction.
    """

    # Directory that receives the training artifacts.
    root_dir: Path
    # Location of the dataset that is loaded from disk for training.
    data_path: Path
    # Checkpoint handed to `from_pretrained`.
    # NOTE(review): the recorded run loads "google/pegasus-cnn_dailymail", which
    # looks like a model id rather than a filesystem path — confirm the type.
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: int
    gradient_accumulation_steps: int
    # Evaluation batch size. Declared last with a default so that existing
    # constructor calls (which do not pass it) keep working unchanged.
    per_device_eval_batch_size: int = 1


In [None]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories


In [None]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: YAML file with the pipeline/stage settings.
            params_filepath: YAML file with the training hyperparameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Path-like settings are read from the ``model_trainer`` section of the
        config file and hyperparameters from the ``TrainingArguments`` section
        of the params file. The stage's ``root_dir`` is created as a side effect.

        Returns:
            ModelTrainerConfig: the populated configuration object.
        """
        stage_settings = self.config.model_trainer
        hyperparams = self.params.TrainingArguments

        create_directories([stage_settings.root_dir])

        # Field names grouped by the section they are read from.
        stage_fields = ("root_dir", "data_path", "model_ckpt")
        hyperparam_fields = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
        )

        field_values = {name: getattr(stage_settings, name)
                        for name in stage_fields}
        field_values.update((name, getattr(hyperparams, name))
                            for name in hyperparam_fields)
        return ModelTrainerConfig(**field_values)


In [None]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch


  from .autonotebook import tqdm as notebook_tqdm


[2024-12-09 18:15:31,514: INFO: config PyTorch version 2.5.1 available.]


In [None]:
from dataclasses import dataclass
from pathlib import Path


class ModelTrainer:
    """Fine-tunes a sequence-to-sequence checkpoint and saves the result.

    The checkpoint, dataset location, output directory and training
    hyperparameters all come from the config object passed to the constructor.
    """

    def __init__(self, config: "ModelTrainerConfig"):
        # The annotation is quoted so it is resolved lazily; defining this class
        # does not require the config class to be defined first.
        self.config = config

    @staticmethod
    def _normalise_steps(value):
        """Return a step setting as an int when it is a whole number.

        Values such as ``1e6`` (a float) or ``"1e6"`` (which some YAML loaders
        return as a string) become ``1000000``. Fractional values are returned
        as floats so ratio-style settings are left intact.
        """
        if isinstance(value, int):
            return value
        number = float(value)
        return int(number) if number.is_integer() else number

    def _training_arguments_kwargs(self) -> dict:
        """Build the keyword arguments for ``TrainingArguments`` from the config."""
        cfg = self.config
        return {
            "output_dir": str(cfg.root_dir),
            "num_train_epochs": cfg.num_train_epochs,
            "warmup_steps": cfg.warmup_steps,
            "per_device_train_batch_size": cfg.per_device_train_batch_size,
            # Fall back to 1 (the value previously hard-coded here) when the
            # config object does not carry an evaluation batch size.
            "per_device_eval_batch_size": getattr(
                cfg, "per_device_eval_batch_size", 1),
            "weight_decay": cfg.weight_decay,
            "logging_steps": self._normalise_steps(cfg.logging_steps),
            "evaluation_strategy": cfg.evaluation_strategy,
            "eval_steps": self._normalise_steps(cfg.eval_steps),
            "save_steps": self._normalise_steps(cfg.save_steps),
            "gradient_accumulation_steps": cfg.gradient_accumulation_steps,
        }

    def train(self, train_split: str = "test", eval_split: str = "validation"):
        """Train the model and save the model and tokenizer under ``root_dir``.

        Loads the tokenizer and model from ``config.model_ckpt``, moves the model
        to the GPU when one is available, loads the dataset from
        ``config.data_path``, runs ``Trainer.train()`` and finally writes the
        model and tokenizer into sub-folders of ``config.root_dir``.

        Args:
            train_split: Name of the dataset split to train on. Defaults to
                ``"test"``, which was chosen only to smoke-test the code; pass
                ``"train"`` for a real training run.
            eval_split: Name of the dataset split used for evaluation.

        Returns:
            None
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(
            self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(
            tokenizer, model=model_pegasus)
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        trainer_args = TrainingArguments(**self._training_arguments_kwargs())

        # NOTE(review): the recorded run shows a warning pointing at this call;
        # newer transformers releases may prefer `processing_class=` over
        # `tokenizer=` — confirm against the installed version.
        trainer = Trainer(model=model_pegasus,
                          args=trainer_args,
                          tokenizer=tokenizer,
                          data_collator=seq2seq_data_collator,
                          train_dataset=dataset_samsum_pt[train_split],
                          eval_dataset=dataset_samsum_pt[eval_split])

        trainer.train()

        # NOTE(review): "pegaus-samsum-model" looks like a misspelling of
        # "pegasus"; kept as-is because later stages may load from this exact
        # directory name — confirm before renaming.
        model_pegasus.save_pretrained(os.path.join(
            self.config.root_dir, "pegaus-samsum-model"))
        tokenizer.save_pretrained(os.path.join(
            self.config.root_dir, 'tokenizer'))


In [None]:
# Build the configuration, create the trainer from it, and run training.
# Each object gets its own name so the config is not overwritten by the trainer.
# No try/except wrapper: one that only re-raises adds nothing, and any failure
# still surfaces with its full traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()


[2024-12-09 18:22:45,795: INFO: common yaml file:config\config.yaml loaded successfully]
[2024-12-09 18:22:45,798: INFO: common yaml file:params.yaml loaded successfully]
[2024-12-09 18:22:45,799: INFO: common created directory at: artifacts]
[2024-12-09 18:22:45,801: INFO: common created directory at: artifacts/model_trainer]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model_pegasus,
  0%|          | 0/51 [00:00<?, ?it/s]

In [None]:

# !pip install --upgrade accelerate
# !pip uninstall -y transformers accelerate
# !pip install transformers accelerate
