In [2]:
import os

In [3]:
%pwd

'c:\\Users\\hp\\end-to-end-text-summarization\\research'

In [4]:
os.chdir("../")

In [5]:
%pwd

'c:\\Users\\hp\\end-to-end-text-summarization'

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    data_path: Path
    tokenizer_path : Path
    metric_file_path: Path

In [7]:
from textsummarizer.constants import *
from textsummarizer.utils.common import read_yaml, create_directories

In [8]:
class configurationManager:
    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
        
            self.config = read_yaml(config_filepath)
            self.params = read_yaml(params_filepath)

            create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation

        create_directories([config.root_dir])

        model_evaluation_config = ModelEvaluationConfig(
            root_dir = config.root_dir,
            data_path = config.data_path,
            tokenizer_path = config.tokenizer_path,
            metric_file_name = config.metric_file_name
            )
        
        return model_evaluation_config

In [9]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import pandas as pd
import tqdm as tqdm
from datasets import load_dataset, load_from_disk
import evaluate


  from .autonotebook import tqdm as notebook_tqdm


[2024-10-15 16:32:03,477: INFO:config: PyTorch version 2.4.1 available.]


In [10]:
class ModelEvaluation:
    def __init__(self,config: ModelEvaluationConfig):
        self.config = config

    # Evaluation

def generate_batch_sized_chuncks(list_of_elements, batch_size):
  """Split the dataset into smaller batches that we can process simultaneously
  yield successive batched-sized chunks from list_of_elements."""
  for i in range(0,len(list_of_elements),batch_size):
    yield list_of_elements[i : i + batch_size]


def calculate_metric_on_test_ds(dataset,metric,model,tokenizer,
                                batch_size=16,device="cuda",
                                column_summary="article",
                                column_texts="highlights"):
    article_batches = list(generate_batch_sized_chuncks(dataset[column_texts], batch_size))
    target_batches = list(generate_batch_sized_chuncks(dataset[column_summary], batch_size))

    for articles_batch, target_batch in tqdm(
      zip(article_batches,target_batches), total=len(article_batches)):

      inputs = tokenizer(articles_batch, max_length=1024,truncation=True,
                         padding="max_length",return_tensors="pt")

      summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                 attention_mask=inputs["attention_mask"].to(device),
                                 length_penalty=0.8, num_beams=8, max_length=128)
      """parameter for length penalty ensures that the model does not generate sequences"""
      # Finally, we decode the generated texts,
      # replace the token, and add the decoded texts with the references to the metric

      decoded_summaries = [tokenizer.decode(s,skip_special_tokens=True,
                                            clean_up_tokenization_spaces=True)
      for s in summaries]

      decoded_summaries = [d.replace("", " ") for d in decoded_summaries]

      metric.add_batch(predictions=decoded_summaries,references=target_batch)

      # Finally compute and return the ROUGE scores.
      score = metric.compute()
      return score
    
def evaluate(self):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
    model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)

    #loading data
    dataset_sansum_pt = load_from_disk(self.config.data_path)   

    rouge_names  = ['rouge1','rouge2','rougeL']

    score = self.calculate_metric_on_test_ds(
        dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size=2, column_texts= 'dialogue', column_summary= 'summary'
        )

    rouge_dict = dict((rn, score[rn]) for rn in rouge_names)

    df = pd.DataFrame(rouge_dict, index = [f'pegasus'])
    df.to_csv(self.config.metric_file_name,index=False)


In [11]:
try:
    config = configurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    model_evaluation_config = ModelEvaluation(config=model_evaluation_config)
    model_evaluation_config.evaluate()
except Exception as e:
    raise e

[2024-10-15 16:32:05,970: INFO:common: yaml file:config\config.yaml loaded successfully]


[2024-10-15 16:32:05,983: INFO:common: yaml file:params.yaml loaded successfully]
[2024-10-15 16:32:05,984: INFO:common: directory created at: artifacts]


AttributeError: 'NoneType' object has no attribute 'root_dir'