In [1]:
import os

In [None]:
# Remember the kernel's working directory in `curr_dir` and echo it.
print(curr_dir := os.getcwd())

In [3]:
# Step up to the parent directory exactly once per kernel session. The
# sentinel keeps a re-run of this cell from climbing one level higher each
# time (a kernel restart resets both the sentinel and the working directory).
if "_parent_dir_entered" not in globals():
    os.chdir("../")
    _parent_dir_entered = os.getcwd()

In [None]:
# Refresh `curr_dir` and echo it to confirm where the kernel is running now.
print(curr_dir := os.getcwd())

In [5]:
from dataclasses import dataclass 
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation (tokenization) stage."""

    # Directory the stage creates and writes its output under.
    root_dir: Path
    # Location of the dataset that was previously saved to disk.
    data_path: Path
    # Identifier handed to AutoTokenizer.from_pretrained. This is a model
    # name such as "facebook/bart-base", not a filesystem path, hence `str`.
    tokenizer_name: str


In [6]:
from src.summarizer.constants.constant import * 
from src.summarizer.utils.common import create_directories,read_yaml
from src.summarizer.entity.config_entity import DataTransformationConfig

class ConfigurationManager:
    """Reads the project's YAML config/params files and hands out stage configs."""

    def __init__(self, config_path=CONFIG_FILE_PATH, params_path=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_path: Location of the main configuration YAML.
            params_path: Location of the parameters YAML.
        """
        self.config_path, self.params_path = config_path, params_path

        # Config is read first, then params, each via the shared helper.
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation config from the loaded YAML.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the config file.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )
        

In [None]:
import os 
from src.summarizer.logging.logger import logger 
from transformers import AutoTokenizer 
from datasets import load_dataset,load_from_disk

class DataTransformation:
    """Tokenize a dataset saved on disk and write the tokenized copy back to disk."""

    # Maximum token lengths used (with truncation and max-length padding) for
    # the source articles and for the target highlights, respectively.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    def __init__(self,
                 config: "DataTransformationConfig"):
        """Load the tokenizer named in ``config`` and ensure the output directory exists.

        Args:
            config: Stage settings; ``tokenizer_name``, ``data_path`` and
                ``root_dir`` are read from it.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)
        create_directories([self.config.root_dir])

    def _tokenize_batch(self, batch):
        """Tokenize one batch of examples.

        Args:
            batch: Mapping with ``'article'`` and ``'highlights'`` entries.

        Returns:
            The tokenizer output for the articles, with an added ``'labels'``
            entry holding the ``input_ids`` of the tokenized highlights.
        """
        inputs = self.tokenizer(batch['article'],
                                max_length=self.MAX_INPUT_LENGTH,
                                truncation=True,
                                padding='max_length')

        targets = self.tokenizer(batch['highlights'],
                                 max_length=self.MAX_TARGET_LENGTH,
                                 truncation=True,
                                 padding='max_length')

        # NOTE(review): with padding='max_length' the labels include padding
        # ids; whether the training step masks them is not visible here - confirm.
        inputs['labels'] = targets["input_ids"]
        return inputs

    def transform(self):
        """Load the dataset from ``config.data_path``, tokenize it, and save it.

        The result is written to ``<config.root_dir>/cnn_dailymail``.
        """
        logger.info("Loading the dataset from local disk")
        dataset = load_from_disk(self.config.data_path)

        logger.info("Tokenizing the dataset")

        # Map with the batch tokenizer this class actually defines; the
        # previously referenced method name does not exist on the class.
        # The column list is taken from the 'train' split - presumably every
        # split shares the same columns; verify against the stored dataset.
        tokenized_dataset = dataset.map(self._tokenize_batch,
                                        batched=True,
                                        remove_columns=dataset['train'].column_names)

        output_path = os.path.join(self.config.root_dir, "cnn_dailymail")

        logger.info(f"Saving tokenized dataset to {output_path}")
        tokenized_dataset.save_to_disk(output_path)
        

In [11]:
# Run the stage end to end: load configuration, build the transformer, tokenize and save.
# Any failure propagates as-is; a catch-and-re-raise wrapper would add nothing.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.transform()

[2026-01-28 19:00:08,170: INFO: common: yaml file:<_io.TextIOWrapper name='config\\config.yaml' mode='r' encoding='utf-8'> loaded successfully]
[2026-01-28 19:00:08,170: INFO: common: yaml file:<_io.TextIOWrapper name='params.yaml' mode='r' encoding='utf-8'> loaded successfully]
[2026-01-28 19:00:08,170: INFO: common: created directory at: artifacts]
[2026-01-28 19:00:08,176: INFO: common: created directory at: artifacts/data_transformation]
[2026-01-28 19:00:08,443: INFO: _client: HTTP Request: HEAD https://huggingface.co/facebook/bart-base/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"]
[2026-01-28 19:00:08,443: INFO: _client: HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/facebook/bart-base/aadd2ab0ae0c8268c7c9693540e9904811f36177/config.json "HTTP/1.1 200 OK"]
[2026-01-28 19:00:08,697: INFO: _client: HTTP Request: HEAD https://huggingface.co/facebook/bart-base/resolve/main/tokenizer_config.json "HTTP/1.1 404 Not Found"]
[2026-01-28 19:00:08,935: INFO

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

[2026-01-28 19:03:36,366: INFO: 4281784986: Saving tokenized dataset to artifacts/data_transformation\cnn_dailymail]


Saving the dataset (0/4 shards):   0%|          | 0/287113 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/13368 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11490 [00:00<?, ? examples/s]