In [1]:
# Print the kernel's starting working directory; the following cell changes
# directory relative to it.
!pwd

/home/kushal/TextSummarizer/research


In [2]:
import os
# Step up from research/ to the project root; later cells import `src.*` and
# load YAML files by relative path. The guard makes this cell idempotent:
# an unconditional chdir('..') climbs one level higher on every re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [3]:
from dataclasses import dataclass
from pathlib import Path

In [4]:
@dataclass
class ModelTrainerConfig:
    """Settings consumed by ModelTrainer for one fine-tuning run.

    The first three fields locate inputs and outputs; every remaining field is
    forwarded unchanged to `transformers.TrainingArguments` under the same name.
    """

    # Used as TrainingArguments.output_dir and as the parent directory for the
    # saved model and tokenizer.
    root_dir: Path
    # Location handed to datasets.load_from_disk.
    data_path: Path
    # Passed to AutoTokenizer / AutoModelForSeq2SeqLM.from_pretrained.
    # NOTE(review): the recorded run loads "google/pegasus-cnn_dailymail", which
    # looks like a Hub model id rather than a filesystem path - confirm whether
    # `str` is the more accurate annotation.
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    eval_strategy: str
    eval_steps: int
    # ConfigurationManager casts the YAML value with float() before storing it.
    save_steps: float
    gradient_accumulation_steps: int

In [5]:
from src.textSummarizer.constants import CONFIG_FILEPATH, PARAMS_FILEPATH
from src.textSummarizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Read the project YAML files and turn their sections into config objects."""

    def __init__(self, config_path = CONFIG_FILEPATH, params_path = PARAMS_FILEPATH):
        """Load both YAML files and ensure the artifacts root exists.

        Args:
            config_path: Path of the main configuration YAML.
            params_path: Path of the hyper-parameter YAML.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)
        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the model-trainer configuration.

        Paths come from the `model_trainer` section of the config file and the
        training hyper-parameters from the `TrainingArguments` section of the
        params file. The trainer's root directory is created as a side effect.

        Returns:
            A populated ModelTrainerConfig.
        """
        trainer_section = self.config.model_trainer
        hyper_params = self.params.TrainingArguments

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            data_path=trainer_section.data_path,
            model_ckpt=trainer_section.model_ckpt,
            num_train_epochs=hyper_params.num_train_epochs,
            warmup_steps=hyper_params.warmup_steps,
            per_device_train_batch_size=hyper_params.per_device_train_batch_size,
            per_device_eval_batch_size=hyper_params.per_device_eval_batch_size,
            weight_decay=hyper_params.weight_decay,
            logging_steps=hyper_params.logging_steps,
            eval_strategy=hyper_params.eval_strategy,
            eval_steps=hyper_params.eval_steps,
            # Stored as float regardless of how the YAML value was parsed.
            save_steps=float(hyper_params.save_steps),
            gradient_accumulation_steps=hyper_params.gradient_accumulation_steps,
        )

In [7]:
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Trainer, TrainingArguments
import torch
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


[2025-02-16 18:32:14,847: INFO: config: PyTorch version 2.6.0 available.]


In [8]:
class ModelTrainer:
    """Fine-tune the configured seq2seq checkpoint and save the result to disk."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the trainer settings for later use by `train`."""
        self.config = config

    def train(self):
        """Run one fine-tuning job end to end.

        Loads the tokenizer and model from `config.model_ckpt`, loads the
        dataset from `config.data_path`, trains with the hyper-parameters held
        in the config, then writes the model and tokenizer under
        `config.root_dir`. Returns nothing.
        """
        cfg = self.config

        # Use the GPU when torch can see one, otherwise stay on the CPU.
        target_device = "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(cfg.model_ckpt)
        model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_ckpt)
        model = model.to(target_device)
        collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        # Dataset previously written to disk; indexed below by split name.
        dataset_samsum_pt = load_from_disk(cfg.data_path)

        training_args = TrainingArguments(
            output_dir=cfg.root_dir,
            num_train_epochs=cfg.num_train_epochs,
            warmup_steps=cfg.warmup_steps,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            per_device_eval_batch_size=cfg.per_device_eval_batch_size,
            weight_decay=cfg.weight_decay,
            logging_steps=cfg.logging_steps,
            eval_strategy=cfg.eval_strategy,
            eval_steps=cfg.eval_steps,
            save_steps=cfg.save_steps,
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        )

        # The "test" split is used for training on purpose: it is the smaller
        # split, which keeps the run short.
        trainer = Trainer(
            model=model,
            args=training_args,
            processing_class=tokenizer,
            data_collator=collator,
            train_dataset=dataset_samsum_pt["test"],
            eval_dataset=dataset_samsum_pt["validation"],
        )
        trainer.train()

        model_dir = os.path.join(cfg.root_dir, "pegasus-finetuned-model")
        tokenizer_dir = os.path.join(cfg.root_dir, "tokenizer")
        model.save_pretrained(model_dir)
        tokenizer.save_pretrained(tokenizer_dir)

In [13]:
# Driver cell: build the trainer configuration from the YAML files and run
# fine-tuning. Note that `config` here is the ConfigurationManager itself.
# NOTE(review): the execution count jumps from 8 to 13 and the stored output
# ends in a NameError - re-run with Restart & Run All to confirm the current
# cells work from a fresh kernel.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(model_trainer_config)
# Starts training and, on success, saves the model and tokenizer.
model_trainer.train()

[2025-02-16 18:32:44,345: INFO: common: yaml file: config/config.yml loaded successfully]
[2025-02-16 18:32:44,347: INFO: common: yaml file: params.yml loaded successfully]
[2025-02-16 18:32:44,348: INFO: common: Directory created at: {path}]
[2025-02-16 18:32:44,349: INFO: common: Directory created at: {path}]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'dataset_samsum_pt' is not defined