In [1]:
import os 
%pwd

'e:\\TextSummarization\\research'

In [2]:
# Later cells import from the `src` package and read config files through
# relative paths, so the working directory must be the project root.
# The guard makes this cell idempotent: once `src` is visible from the
# current directory, re-running the cell no longer climbs another level up.
if not os.path.isdir('src'):
    os.chdir('../')
%pwd

'e:\\TextSummarization'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Attributes:
        root_dir: Directory of this stage; the transformed dataset is
            saved beneath it.
        data_path: Location the input dataset is loaded from.
        tokenizer_name: Identifier handed to ``AutoTokenizer.from_pretrained``.
    """

    root_dir: Path
    data_path: Path
    tokenizer_name: str

In [4]:
from src.TextSummarizer.constants import *
from src.TextSummarizer.utils.common import read_yaml, create_directories

In [5]:
class ConfigManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path to the main configuration YAML.
            params_filepath: Path to the parameters YAML.
        """
        config_path = Path(config_filepath)
        self.config = read_yaml(config_path)

        params_path = Path(params_filepath)
        self.params = read_yaml(params_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings, creating the stage directory.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the loaded config.
        """
        stage_cfg = self.config.data_transformation

        # The stage directory is created before the config object is returned.
        create_directories([stage_cfg.root_dir])

        return DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
            tokenizer_name=stage_cfg.tokenizer_name,
        )


In [6]:
import os 
import urllib.request as request
from zipfile import ZipFile
from src.TextSummarizer.utils.common import get_size
from src.TextSummarizer.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset , load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


2025-07-08 16:22:30,783 - datasets - INFO - PyTorch version 2.7.1 available.


In [7]:
class DataTransformation:
    """Tokenizes the dialogue/summary dataset and writes the result to disk."""

    # Truncation/padding lengths (in tokens) for the dialogue input and the
    # summary target. Kept as class attributes so they can be overridden
    # without touching the tokenization code.
    max_input_length = 1024
    max_target_length = 128

    def __init__(self, config: "DataTransformationConfig"):
        """Store the config and load the tokenizer it names.

        Args:
            config: Settings providing ``tokenizer_name``, ``data_path`` and
                ``root_dir``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
        logger.info(f"Tokenizer name from config: {repr(config.tokenizer_name)}")

    def convert_examples_to_features(self, example_batch):
        """Tokenize one batch of examples into model inputs and labels.

        Args:
            example_batch: Mapping with a ``'dialogue'`` and a ``'summary'``
                entry, each holding the batch's texts.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` from the
            dialogue encoding and ``'labels'`` from the summary encoding.
        """
        input_encodings = self.tokenizer(
            example_batch['dialogue'],
            max_length=self.max_input_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # `text_target=` replaces the deprecated `as_target_tokenizer()`
        # context manager for tokenizing labels.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=self.max_target_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # NOTE(review): labels are padded to max_target_length and the pad
        # positions are not masked here — confirm the training step ignores
        # them when computing the loss.
        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def convert_and_save(self):
        """Load the dataset, tokenize it in batches, and save it under ``root_dir``."""
        # Build the destination once so the save call and the log message
        # report the same path with consistent separators.
        output_dir = os.path.join(self.config.root_dir, 'samsum_dataset')

        dataset_samsum = load_from_disk(self.config.data_path)
        dataset_tokenized = dataset_samsum.map(self.convert_examples_to_features, batched=True)
        dataset_tokenized.save_to_disk(output_dir)
        logger.info(f"Saved the dataset to {output_dir}")


In [8]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage config, then tokenize and save the dataset.
try:
    config=ConfigManager()
    data_transformation_config=config.get_data_transformation_config()
    data_transformation=DataTransformation(config=data_transformation_config)
    # convert_and_save() has no return statement, so `result` is None.
    result=data_transformation.convert_and_save()

except Exception as e:
    print(f"An error occurred: {e}")
    # Bare `raise` re-raises the active exception unchanged, without adding
    # an extra traceback entry for this line.
    raise


2025-07-08 16:22:31,372 - TextSummarizerLogger - INFO - YAML file config\config.yaml loaded successfully.
2025-07-08 16:22:31,375 - TextSummarizerLogger - INFO - YAML file params\params.yaml loaded successfully.
2025-07-08 16:22:31,376 - TextSummarizerLogger - INFO - Created Directory at : artifacts
2025-07-08 16:22:31,377 - TextSummarizerLogger - INFO - Created Directory at : artifacts/data_transformation


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Tokenizer name from config: 'google/pegasus-cnn_dailymail'


Map: 100%|██████████| 14732/14732 [00:05<00:00, 2530.34 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 2572.69 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 2239.00 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 97434.59 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 76260.07 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 60364.56 examples/s]

2025-07-08 16:22:42,224 - TextSummarizerLogger - INFO - Saved the dataset to artifacts\data_transformation/samsum_dataset



