In [3]:
import os

In [4]:
%pwd

'd:\\TextSummarizationProject\\End-to-end-Text-Summarizer-Project\\research'

In [5]:
os.chdir("../")

In [6]:
%pwd

'd:\\TextSummarizationProject\\End-to-end-Text-Summarizer-Project'

In [7]:
import torch

from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir ='pegasus-samsum', num_train_epochs=20,warmup_steps=500,
    per_device_train_batch_size=2, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy="steps", eval_steps=500, save_steps = 1e6,
    gradient_accumulation_steps=16
)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
"""README > 3. Update entity """

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingEntity:
    root_dir: Path 
    data_path: Path 
    checkpoints: Path 
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int



In [9]:
"""README > 4. Update the configuration manager in src config """
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

class ConfigurationManager:
    def __init__( self, config_filepath = FILE_PATH_CONFIG, params_filepath = FILE_PATH_PARAMS):
        
        #common.py dosyasındaki read_yaml() fonksiyonu ile yaml dosyasını okuyoruz.
        self.config = read_yaml(config_filepath) #Aslında Path("config/config.yaml")
        self.params = read_yaml(params_filepath) #Aslında Path("params.yaml")

        create_directories([self.config.artifacts_root]) #artifacts_root: artifacts. Bu method ile "artifacts" isimli folder otomatik olarak oluşturulur.
        #"." kullanarak çağırmayı "ConfigBox"a borçluyuz.

    def get_config_model_trainer(self) -> ModelTrainingEntity:
        
        parameters_of_the_model = self.params.TrainingParameters

        create_directories([self.config.model_training.root_dir])

        """ConfigBox kullanmadan böyle de attributelar çağrılabilir."""
        return ModelTrainingEntity(
            root_dir= self.config.model_training.root_dir, 
            data_path=self.config.model_training.data_path,
            checkpoints=self.config.model_training.model_checkpoints,
            num_train_epochs= parameters_of_the_model.num_train_epochs,
            warmup_steps=parameters_of_the_model.warmup_steps,
            per_device_train_batch_size= parameters_of_the_model.per_device_train_batch_size,
            weight_decay = parameters_of_the_model.weight_decay,
            logging_steps = parameters_of_the_model.logging_steps,
            evaluation_strategy = parameters_of_the_model.evaluation_strategy,
            eval_steps = parameters_of_the_model.eval_steps,
            save_steps = parameters_of_the_model.save_steps,
            gradient_accumulation_steps = parameters_of_the_model.gradient_accumulation_steps
        )

In [10]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [12]:
dataset_samsum_pt = load_from_disk("artifacts/data_receiver/samsum_dataset")
dataset_samsum_pt["train"]

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 14732
})

In [13]:
split_lengths = [len(dataset_samsum_pt[split])for split in dataset_samsum_pt]

print(f"Split lenghts: {split_lengths}")
print(f"Features: {dataset_samsum_pt['train'].column_names}")
print("\Dialogue:")

print(dataset_samsum_pt["test"][1]["dialogue"])

print("\nSummary:")

print(dataset_samsum_pt["test"][1]["dialogue"])

Split lenghts: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']
\Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)


In [14]:
"""README > 5. Update components """


class ModelTrainer:
    def __init__(self, config: ModelTrainingEntity):
        self.config = config


    def train(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.checkpoints)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.checkpoints).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
        
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        """ trainer_args = TrainingArguments(
            output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size,
            weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps,
            evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps
         ) """ 
        
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir, num_train_epochs=1, warmup_steps=500,
            per_device_train_batch_size=1, per_device_eval_batch_size=1,
            weight_decay=0.01, logging_steps=10,
            evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
            gradient_accumulation_steps=16
        ) 

        trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"], 
                  eval_dataset=dataset_samsum_pt["validation"])
        
        trainer.train()

        model_pegasus.save_pretrained(os.path.join(self.config.root_dir,"pegasus-samsum-model"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir,"tokenizer"))
       

In [16]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_config_model_trainer()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    model_trainer_config.train()
except Exception as e:
    raise e

[2024-05-23 09:30:44,742: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-23 09:30:44,745: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-23 09:30:44,747: INFO: common: Created directory at artifacts]
[2024-05-23 09:30:44,749: INFO: common: Created directory at artifacts/model_training]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 3/920 [11:40<61:46:33, 242.52s/it]