## Dependencies

In [1]:
import os
from dataclasses import dataclass
from pathlib import Path

from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%pwd

'c:\\Projects\\python\\text-summarizer\\research'

In [3]:
# Step up from research/ to the project root so the relative config and
# artifacts paths used by the later cells resolve.
# Guarded on the directory name so that re-running this cell is a no-op
# instead of climbing one level higher each time.
if Path.cwd().name == "research":
    os.chdir('../')

In [4]:
%pwd

'c:\\Projects\\python\\text-summarizer'

In [None]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml,create_directories

## Entity

In [5]:
from typing import List
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings consumed by the model-training stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    Field order is significant for positional construction and is kept as is.
    """

    # --- locations -------------------------------------------------------
    root_dir: Path      # output directory of the training stage
    data_path: Path     # dataset location handed to load_from_disk
    model_ckpt: Path    # checkpoint identifier handed to from_pretrained

    # --- core training hyperparameters -----------------------------------
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

    # --- later additions: optimiser, checkpointing and reporting ---------
    learning_rate: float
    save_total_limit: int
    load_best_model_at_end: bool
    metric_for_best_model: str
    fp16: bool
    report_to: List[str]
  


## Configuration Manager

In [6]:

class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects.

    Args:
        config_file_path: Path of the main configuration YAML; defaults to
            ``CONFIG_FILE_PATH``.
        params_file_path: Path of the hyperparameter YAML; defaults to
            ``PARAMS_FILE_PATH``.
    """

    def __init__(self,
                 config_file_path=CONFIG_FILE_PATH,
                 params_file_path=PARAMS_FILE_PATH):
        # Parse both files once. The parsed objects are read with attribute
        # access below (the original note attributes this to the ConfigBox setup).
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        # Make sure the top-level artifacts directory named in the config exists.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the settings for the model-training stage.

        Paths come from the ``model_trainer`` section of the config file and
        hyperparameters from the ``TrainingArguments`` section of the params
        file. The stage's ``root_dir`` is created before the object is built.

        Returns:
            A ``ModelTrainerConfig`` populated from both files.
        """
        stage_section = self.config.model_trainer
        hyperparams = self.params.TrainingArguments
        create_directories([stage_section.root_dir])

        # Path-like settings first, in the same order the fields are declared.
        trainer_kwargs = {
            "root_dir": stage_section.root_dir,
            "data_path": stage_section.data_path,
            "model_ckpt": stage_section.model_ckpt,
        }

        # Every remaining field is copied one-to-one from the params section.
        forwarded_names = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
            "learning_rate",
            "save_total_limit",
            "load_best_model_at_end",
            "metric_for_best_model",
            "fp16",
            "report_to",
        )
        for field_name in forwarded_names:
            trainer_kwargs[field_name] = getattr(hyperparams, field_name)

        return ModelTrainerConfig(**trainer_kwargs)

## Components

In [7]:
import os
from textSummarizer.entity import ModelTrainerConfig

from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_from_disk
import torch

class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint and saves the resulting model and tokenizer.

    Args:
        config: ``ModelTrainerConfig`` carrying the checkpoint identifier, the
            on-disk dataset path, the output directory and the hyperparameters
            forwarded to ``TrainingArguments``.
    """

    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self, train_split: str = "train", eval_split: str = "validation"):
        """Run fine-tuning, then write the final model and tokenizer under ``root_dir``.

        Args:
            train_split: Key of the dataset split to optimise on. Defaults to
                ``"train"`` so that the held-out test split is not consumed
                during training. Pass ``"test"`` explicitly to reproduce the
                earlier behaviour of training on the test split.
            eval_split: Key of the dataset split used for evaluation during
                training.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        datacollator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

        # Load the dataset that an earlier stage wrote to disk.
        # Presumably it is already tokenized and keyed by split name - verify
        # against the stage that produces `data_path`.
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=self.config.num_train_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            # No separate eval batch size is read from the config; the train
            # batch size is reused for evaluation.
            per_device_eval_batch_size=self.config.per_device_train_batch_size,
            weight_decay=self.config.weight_decay,
            logging_steps=self.config.logging_steps,
            evaluation_strategy=self.config.evaluation_strategy,
            eval_steps=self.config.eval_steps,
            save_steps=self.config.save_steps,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            # Cast defensively: presumably the YAML value can arrive as a
            # string (e.g. scientific notation) - verify against params.yaml.
            learning_rate=float(self.config.learning_rate),
            save_total_limit=self.config.save_total_limit,
            load_best_model_at_end=self.config.load_best_model_at_end,
            metric_for_best_model=self.config.metric_for_best_model,
            fp16=self.config.fp16,
            report_to=self.config.report_to
        )

        # NOTE(review): the recorded cell output shows a warning pointing at
        # this call; newer transformers releases may deprecate `tokenizer=` in
        # favour of `processing_class=` - confirm against the installed version.
        trainer = Trainer(
            model=model,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=datacollator,
            train_dataset=dataset_samsum_pt[train_split],
            eval_dataset=dataset_samsum_pt[eval_split]
        )

        trainer.train()

        # Save final model and tokenizer
        model.save_pretrained(os.path.join(self.config.root_dir, "distilbart_model_01"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "distilbart_tokenizer_01"))


In [8]:
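# NOTE: superseded earlier draft of ModelTrainer, kept commented out for reference only.
# It differs from the active class in the previous cell by hard-coding save_steps=1e6 and
# by not passing learning_rate, save_total_limit, load_best_model_at_end,
# metric_for_best_model, fp16 or report_to to TrainingArguments.
# class ModelTrainer: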
#     def __init__(self, config: ModelTrainerConfig):
#         self.config = config

#     def train(self):
#         device = "cuda" if torch.cuda.is_available() else "cpu"
#         tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
#         model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
#         datacollator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)

#         # loading data
#         dataset_samsum_pt = load_from_disk(self.config.data_path)

#         trainer_args = TrainingArguments(
#             output_dir=self.config.root_dir,
#             num_train_epochs=self.config.num_train_epochs,
#             warmup_steps=self.config.warmup_steps,
#             per_device_train_batch_size=self.config.per_device_train_batch_size,
#             per_device_eval_batch_size=self.config.per_device_train_batch_size,
#             weight_decay=self.config.weight_decay, 
#             logging_steps=self.config.logging_steps,
#             evaluation_strategy=self.config.evaluation_strategy, 
#             eval_steps=self.config.eval_steps, 
#             save_steps=1e6,
#             gradient_accumulation_steps=self.config.gradient_accumulation_steps
#         )

#         trainer = Trainer(
#             model=model,
#             args=trainer_args,
#             tokenizer=tokenizer,
#             data_collator=datacollator,
#             train_dataset=dataset_samsum_pt['test'],
#             eval_dataset=dataset_samsum_pt['validation']
#         )

#         trainer.train()

#         model.save_pretrained(os.path.join(self.config.root_dir,"distilbart_model_01"))

#         tokenizer.save_pretrained(os.path.join(self.config.root_dir,"distilbart_tokenizer_01"))

## Test

In [9]:
# Smoke-run the training stage end to end: load the YAML configuration,
# build the stage config, then fine-tune.
# Errors propagate unchanged; the previous try/except only re-raised them.
config_manager = ConfigurationManager()
model_trainer_config = config_manager.get_model_trainer_config()
# Keep the trainer under its own name instead of rebinding the config variable.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2025-01-25 17:25:03,440: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-01-25 17:25:03,443: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-25 17:25:03,444: INFO: common: created directory at: artifacts]
[2025-01-25 17:25:03,445: INFO: common: created directory at: artifacts/model]


  trainer = Trainer(


Step,Training Loss,Validation Loss


KeyboardInterrupt: 