In [1]:
import os

In [None]:
# Capture and show the directory the kernel is currently running in.
print(curr_dir := os.getcwd())

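Change the working directory to the project root so that relative paths such as `config/config.yaml` and `params.yaml` resolve correctly.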

In [3]:
# Move up to the project root, but only once. The run log shows params.yaml
# being opened by its bare relative name, so if it is already visible from the
# current directory we are at the root and must not climb another level
# (re-running this cell would otherwise walk out of the project).
if not os.path.exists("params.yaml"):
    os.chdir("../")

In [None]:
# Re-read the working directory to confirm where the kernel is running now.
print(curr_dir := os.getcwd())

In [5]:
# %%writefile src/summarizer/entity/config_entity.py

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DatasetConfig:
    """Immutable description of the dataset to ingest.

    Built by ``ConfigurationManager.get_data_ingestion_config`` from the
    ``dataset`` section of the loaded configuration.
    """

    # Dataset name as given in the configuration (not read by the code in this notebook).
    name: str
    # Origin of the data; ``DataIngestion.ingest`` rejects anything but "huggingface".
    source: str
    # Repository identifier passed as the first argument to ``load_dataset``.
    hf_repo: str
    # Passed as the second argument to ``load_dataset``; may be None.
    version: str | None
    # Presumably the column holding the input text -- TODO confirm against consumers.
    text_column: str
    # Presumably the column holding the reference summary -- TODO confirm against consumers.
    summary_column: str


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage."""

    # Stage directory; created in ``ConfigurationManager.get_data_ingestion_config``.
    root_dir: Path
    # Destination handed to ``dataset.save_to_disk`` by ``DataIngestion.ingest``.
    local_data_dir: Path
    # Describes which dataset to load and from where.
    dataset: DatasetConfig


In [6]:
# %%writefile src/summarizer/config/configuration.py

from src.summarizer.constants.constant import *
from src.summarizer.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Reads the project YAML files and builds typed per-stage configurations."""

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Assemble the data-ingestion settings from the loaded configuration.

        Creates the stage's root directory as a side effect.

        Returns:
            A ``DataIngestionConfig`` whose directories are ``Path`` objects and
            whose ``dataset`` is a ``DatasetConfig`` copied field by field from
            the ``dataset`` section.
        """
        stage_section = self.config.data_ingestion
        dataset_section = self.config.dataset

        create_directories([stage_section.root_dir])

        # Copy the dataset fields across in declaration order.
        dataset_fields = (
            "name",
            "source",
            "hf_repo",
            "version",
            "text_column",
            "summary_column",
        )
        dataset_entity = DatasetConfig(
            **{field: getattr(dataset_section, field) for field in dataset_fields}
        )

        stage_root = Path(stage_section.root_dir)
        local_copy_dir = Path(stage_section.local_data_dir)
        return DataIngestionConfig(
            root_dir=stage_root,
            local_data_dir=local_copy_dir,
            dataset=dataset_entity,
        )
        

In [None]:
# %%writefile src/summarizer/components/data_ingestion.py


from datasets import load_dataset 
from src.summarizer.logging.logger import logger 
from src.summarizer.entity.config_entity import DataIngestionConfig

class DataIngestion:
    """Loads the configured dataset and stores a copy on local disk."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for later use by ``ingest``.

        Args:
            config: Settings naming the dataset and the local destination.
        """
        self.config = config

    def ingest(self):
        """Load the dataset with ``load_dataset`` and save it to ``local_data_dir``.

        Raises:
            ValueError: If ``config.dataset.source`` is not "huggingface".
        """
        dataset_cfg = self.config.dataset
        if dataset_cfg.source != "huggingface":
            raise ValueError("Only HuggingFace datasets are supported")

        logger.info(f"Loading dataset {dataset_cfg.hf_repo} "
                    f"(version={dataset_cfg.version})")

        # The second positional argument of load_dataset selects the dataset
        # configuration; the configured version string is used for that.
        dataset = load_dataset(dataset_cfg.hf_repo, dataset_cfg.version)

        dataset.save_to_disk(self.config.local_data_dir)
        logger.info(f"Dataset saved to {self.config.local_data_dir}")

In [9]:
# Run the ingestion stage end to end. There is deliberately no try/except:
# a handler that only re-raises adds nothing, and letting errors propagate
# keeps the original traceback intact.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.ingest()

[2026-01-27 20:59:47,675: INFO: common: yaml file:<_io.TextIOWrapper name='config\\config.yaml' mode='r' encoding='utf-8'> loaded successfully]
[2026-01-27 20:59:47,677: INFO: common: yaml file:<_io.TextIOWrapper name='params.yaml' mode='r' encoding='utf-8'> loaded successfully]
[2026-01-27 20:59:47,678: INFO: common: created directory at: artifacts]
[2026-01-27 20:59:47,679: INFO: common: created directory at: artifacts/data_ingestion]
[2026-01-27 20:59:47,679: INFO: 3639872059: Loaing dataset abisee/cnn_dailymail(version=3.0.0)]
[2026-01-27 20:59:48,161: INFO: _client: HTTP Request: HEAD https://huggingface.co/datasets/abisee/cnn_dailymail/resolve/main/README.md "HTTP/1.1 307 Temporary Redirect"]
[2026-01-27 20:59:48,619: INFO: _client: HTTP Request: HEAD https://huggingface.co/api/resolve-cache/datasets/abisee/cnn_dailymail/96df5e686bee6baa90b8bee7c28b81fa3fa6223d/README.md "HTTP/1.1 200 OK"]
[2026-01-27 20:59:48,887: INFO: _client: HTTP Request: GET https://huggingface.co/api/resol



[2026-01-27 20:59:52,366: INFO: _client: HTTP Request: GET https://datasets-server.huggingface.co/info?dataset=abisee/cnn_dailymail "HTTP/1.1 200 OK"]
[2026-01-27 20:59:52,628: INFO: _client: HTTP Request: GET https://huggingface.co/api/datasets/abisee/cnn_dailymail/tree/96df5e686bee6baa90b8bee7c28b81fa3fa6223d/1.0.0?recursive=true&expand=false "HTTP/1.1 200 OK"]
[2026-01-27 20:59:52,877: INFO: _client: HTTP Request: GET https://huggingface.co/api/datasets/abisee/cnn_dailymail/tree/96df5e686bee6baa90b8bee7c28b81fa3fa6223d?recursive=false&expand=false "HTTP/1.1 200 OK"]
[2026-01-27 20:59:53,135: INFO: _client: HTTP Request: HEAD https://huggingface.co/datasets/abisee/cnn_dailymail/resolve/96df5e686bee6baa90b8bee7c28b81fa3fa6223d/dataset_infos.json "HTTP/1.1 404 Not Found"]
[2026-01-27 20:59:53,401: INFO: _client: HTTP Request: GET https://huggingface.co/api/datasets/abisee/cnn_dailymail/tree/96df5e686bee6baa90b8bee7c28b81fa3fa6223d/3.0.0?recursive=true&expand=false "HTTP/1.1 200 OK"]
[2

Generating train split: 100%|██████████| 287113/287113 [00:01<00:00, 218054.87 examples/s]
Generating validation split: 100%|██████████| 13368/13368 [00:00<00:00, 224630.04 examples/s]
Generating test split: 100%|██████████| 11490/11490 [00:00<00:00, 201993.22 examples/s]
Saving the dataset (3/3 shards): 100%|██████████| 287113/287113 [00:00<00:00, 378084.02 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 13368/13368 [00:00<00:00, 371707.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 11490/11490 [00:00<00:00, 351093.90 examples/s]

[2026-01-27 21:00:20,436: INFO: 3639872059: Dataset saved to artifacts\data_ingestion\cnn_dailymail]



