In [1]:
import os
# Display the kernel's current working directory.
%pwd

'e:\\TextSummarization\\research'

In [2]:
os.chdir('../')
%pwd

'e:\\TextSummarization'

In [3]:
from dataclasses import dataclass
from pathlib import Path

# Return type of ConfigManager.get_data_ingestion_config
@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only settings for the data-ingestion step.

    ``frozen=True`` makes instances immutable: assigning to a field after
    construction raises ``dataclasses.FrozenInstanceError``.
    """
    # Directory that ConfigManager creates before building this config
    root_dir: Path
    # URL that DataIngestion.download_file fetches
    source_url: str
    # Path the download is saved to; also the archive that gets extracted
    local_data_file: Path
    # Directory the archive is extracted into
    unzip_dir: Path

In [4]:
from src.TextSummarizer.constants import *
from src.TextSummarizer.utils.common import read_yaml, create_directories

In [5]:
class ConfigManager:
    """Reads the project's config and params YAML files and hands out
    typed config objects for individual pipeline steps."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH,params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config YAML file.
            params_filepath: Location of the params YAML file.
        """
        config_path = Path(config_filepath)
        self.config = read_yaml(config_path)

        params_path = Path(params_filepath)
        self.params = read_yaml(params_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the DataIngestionConfig from the ``data_ingestion`` section.

        The section's ``root_dir`` is created before the config object is
        assembled.

        Returns:
            DataIngestionConfig with the path-like entries wrapped in ``Path``
            and ``source_url`` passed through unchanged.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        # Read the entries one by one, in the order the dataclass declares them.
        root_dir = Path(section.root_dir)
        source_url = section.source_url
        local_data_file = Path(section.local_data_file)
        unzip_dir = Path(section.unzip_dir)

        return DataIngestionConfig(
            root_dir=root_dir,
            source_url=source_url,
            local_data_file=local_data_file,
            unzip_dir=unzip_dir,
        )

In [6]:
import os 
import urllib.request as request
from zipfile import ZipFile
from src.TextSummarizer.utils.common import get_size
from src.TextSummarizer.logging import logger

In [7]:
class DataIngestion:
    """Downloads the source archive and unpacks it, driven by a
    DataIngestionConfig."""

    def __init__(self,config:DataIngestionConfig):
        """Store the configuration that the download/extract steps read from."""
        self.config = config

    def download_file(self):
        """Download ``config.source_url`` to ``config.local_data_file``.

        Nothing is downloaded when the destination file already exists.
        Otherwise the data is first written to a sibling ``<name>.part`` file
        and only moved to the final path once the transfer has finished, so an
        interrupted or failed download never leaves a truncated file that a
        later run would mistake for a complete one.

        Raises:
            Exception: any error from the download or the file system is
                logged and re-raised unchanged.
        """
        try:
            target = self.config.local_data_file
            if os.path.exists(target):
                logger.info(f"File already exists: {target} of size: {get_size(target)}")
                return

            url = self.config.source_url
            logger.info(f"Downloading file from: {url}")

            # urlretrieve does not create missing directories.
            parent_dir = os.path.dirname(target)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            partial = f"{target}.part"
            try:
                request.urlretrieve(url, partial)
                # Same-directory rename: the final path appears only when complete.
                os.replace(partial, target)
            finally:
                # Only still present if the download or the rename failed.
                if os.path.exists(partial):
                    os.remove(partial)

            logger.info(f"File downloaded: {target} of size: {get_size(target)}")
        except Exception as e:
            logger.exception(e)
            raise

    def extract_zip_file(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The target directory is created if it does not exist yet.

        Raises:
            Exception: any error raised while creating the directory or
                reading/extracting the archive is logged and re-raised
                unchanged.
        """
        try:
            unzip_dir = self.config.unzip_dir
            archive = self.config.local_data_file
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(unzip_dir, exist_ok=True)
            logger.info(f"Extracting file: {archive} to {unzip_dir}")
            with ZipFile(archive, 'r') as zip_ref:
                zip_ref.extractall(unzip_dir)
            logger.info(f"File extracted to: {unzip_dir}")
        except Exception as e:
            logger.exception(e)
            raise
        
        

In [8]:
# Run the data-ingestion step end to end: build the configuration, then
# download the archive and extract it.
try:
    config = ConfigManager()  # uses the default config/params file paths
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()

except Exception as e:
    # Log the traceback, then re-raise so the cell still fails visibly.
    # Errors raised inside download_file/extract_zip_file have already been
    # logged there, so they show up twice in the log.
    logger.exception(e)
    raise e

2025-07-08 01:20:41,458 - TextSummarizerLogger - INFO - YAML file config\config.yaml loaded successfully.
2025-07-08 01:20:41,460 - TextSummarizerLogger - INFO - YAML file params\params.yaml loaded successfully.
2025-07-08 01:20:41,462 - TextSummarizerLogger - INFO - Created Directory at : artifacts
2025-07-08 01:20:41,465 - TextSummarizerLogger - INFO - Created Directory at : artifacts/data_ingestion
2025-07-08 01:20:41,466 - TextSummarizerLogger - INFO - Downloading file from: https://github.com/entbappy/Branching-tutorial/raw/refs/heads/master/summarizer-data.zip
2025-07-08 01:21:29,558 - TextSummarizerLogger - INFO - File downloaded: artifacts\data_ingestion\samsumdata.zip of size: 7718 KB
2025-07-08 01:21:29,560 - TextSummarizerLogger - INFO - Extracting file: artifacts\data_ingestion\samsumdata.zip to artifacts\data_ingestion
2025-07-08 01:21:29,735 - TextSummarizerLogger - INFO - File extracted to: artifacts\data_ingestion
