In [None]:
import os
# Step up one directory so that relative paths used later in this file
# (e.g. "config/config.yaml") resolve from the parent of the start directory.
os.chdir(os.pardir)

In [None]:
# Bare expression: evaluates to the current working directory. When run as a
# notebook cell the value is echoed as the cell output; it is not stored.
os.getcwd()

# constants

In [None]:
from text_summarization.utils import read_yaml
from dataclasses import dataclass
from dotenv import load_dotenv
import os


# Must run before the DataIngestionConstants class body below, which calls
# os.getenv("SOURCE_URI") at class-definition time.
# NOTE(review): presumably loads variables from a local .env file - confirm.
load_dotenv()
# Project configuration read from a path relative to the current working
# directory; values are accessed below by attribute (e.g. CONFIG.DATA.ROOT_DIR_NAME).
CONFIG = read_yaml("config/config.yaml")

@dataclass(frozen=True)
class DataIngestionConstants:
    """Names of directories/files and the source URI used by data ingestion.

    Every value is resolved once, when this class body executes: the names
    come from the module-level ``CONFIG`` object and ``SOURCE_URI`` comes from
    the process environment. The attributes carry no annotations, so they are
    plain class attributes (read as ``DataIngestionConstants.<NAME>``) rather
    than dataclass fields.
    """

    # DIR'S
    ARITFACTS_ROOT_DIR_NAME = CONFIG.ARITFACTS_ROOT_DIR_NAME
    DATA_ROOT_DIR_NAME = CONFIG.DATA.ROOT_DIR_NAME
    INGESTION_ROOT_DIR_NAME = CONFIG.DATA.INGESTION.ROOT_DIR_NAME
    FEATURE_STORE_ROOT_DIR_NAME = CONFIG.DATA.INGESTION.FEATURE_STORE.ROOT_DIR_NAME
    INGESTED_ROOT_DIR_NAME = CONFIG.DATA.INGESTION.INGESTED.ROOT_DIR_NAME

    # FILES
    ZIP_FILE_NAME = CONFIG.DATA.INGESTION.FEATURE_STORE.ZIP_FILE_NAME

    # URI'S
    # os.getenv returns None when SOURCE_URI is not set in the environment.
    SOURCE_URI = os.getenv("SOURCE_URI")




In [None]:
# Print every ingestion constant as "NAME: value", in declaration order.
for _constant_name in (
    "ARITFACTS_ROOT_DIR_NAME",
    "DATA_ROOT_DIR_NAME",
    "INGESTION_ROOT_DIR_NAME",
    "FEATURE_STORE_ROOT_DIR_NAME",
    "INGESTED_ROOT_DIR_NAME",
    "ZIP_FILE_NAME",
    "SOURCE_URI",
):
    print(f"{_constant_name}:", getattr(DataIngestionConstants, _constant_name))

# entity

In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionArtifacts:
    """Immutable record of the locations involved in a data-ingestion run.

    Instances are frozen: fields are set once through the constructor (in the
    order declared below) and cannot be reassigned afterwards.
    """

    # DIR'S
    ARITFACTS_ROOT_DIR_PATH: Path
    DATA_ROOT_DIR_PATH: Path
    INGESTION_ROOT_DIR_PATH: Path
    FEATURE_STORE_ROOT_DIR_PATH: Path
    INGESTED_ROOT_DIR_PATH: Path

    # FILES
    ZIP_FILE_PATH: str

    # URI'S
    SOURCE_URI: str




# configuration

In [None]:
from dataclasses import dataclass
from datetime import datetime
import os


@dataclass(frozen=True)
class DataIngestionConfig:
    """Concrete paths for one data-ingestion run.

    All values are class attributes computed once, when this class body
    executes, by joining the names in ``DataIngestionConstants`` beneath a
    timestamped artifacts root. Each path is built from the one above it, so
    the order of the assignments matters.
    """

    # DIR'S
    # Captured at class-definition time; the double underscore makes the name
    # private (mangled to _DataIngestionConfig__timestamp).
    __timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    ARITFACTS_ROOT_DIR_PATH = os.path.join(
        DataIngestionConstants.ARITFACTS_ROOT_DIR_NAME,
        __timestamp,
    )
    DATA_ROOT_DIR_PATH = os.path.join(
        ARITFACTS_ROOT_DIR_PATH,
        DataIngestionConstants.DATA_ROOT_DIR_NAME,
    )
    INGESTION_ROOT_DIR_PATH = os.path.join(
        DATA_ROOT_DIR_PATH,
        DataIngestionConstants.INGESTION_ROOT_DIR_NAME,
    )
    # The feature store and the ingested directory are siblings under ingestion.
    FEATURE_STORE_ROOT_DIR_PATH = os.path.join(
        INGESTION_ROOT_DIR_PATH,
        DataIngestionConstants.FEATURE_STORE_ROOT_DIR_NAME,
    )
    INGESTED_ROOT_DIR_PATH = os.path.join(
        INGESTION_ROOT_DIR_PATH,
        DataIngestionConstants.INGESTED_ROOT_DIR_NAME,
    )

    # FILES
    ZIP_FILE_PATH = os.path.join(
        FEATURE_STORE_ROOT_DIR_PATH,
        DataIngestionConstants.ZIP_FILE_NAME,
    )

    # URI'S
    SOURCE_URI = DataIngestionConstants.SOURCE_URI




In [None]:
# Print every resolved ingestion path/URI as "NAME: value", in declaration order.
for _config_name in (
    "ARITFACTS_ROOT_DIR_PATH",
    "DATA_ROOT_DIR_PATH",
    "INGESTION_ROOT_DIR_PATH",
    "FEATURE_STORE_ROOT_DIR_PATH",
    "INGESTED_ROOT_DIR_PATH",
    "ZIP_FILE_PATH",
    "SOURCE_URI",
):
    print(f"{_config_name}:", getattr(DataIngestionConfig, _config_name))

# components

In [None]:
from text_summarization.exception import CustomException
from text_summarization.logger import logging
from text_summarization.utils import create_dirs
from dataclasses import dataclass
from urllib.request import urlretrieve
from  zipfile import ZipFile
import sys


@dataclass
class DataIngestionComponents:
    """Download the source zip archive and extract it for later stages.

    The single field holds the configuration object whose ``*_DIR_PATH``,
    ``ZIP_FILE_PATH`` and ``SOURCE_URI`` attributes are read by
    ``start_data_ingestion``. Its name starts with a double underscore, so it
    is name-mangled; pass it positionally when constructing the component.
    """

    __data_ingestion_config: DataIngestionArtifacts

    @staticmethod
    def __download(source_uri: str, zip_file_path: str) -> None:
        """Description: downloads the data zip file and saves locally

        Args:
            source_uri (str): uri for downloading
            zip_file_path (str): path to save file locally

        Raises:
            CustomException: if the download fails for any reason
        """
        try:
            logging.info("Downloading........")
            urlretrieve(source_uri, zip_file_path)
            logging.info("Download complete.")
        except Exception as e:
            logging.error(e)
            # The exception must be raised, not merely constructed: otherwise
            # a failed download is silently ignored and the run continues on
            # to extract a missing or partial zip file.
            raise CustomException(e, sys) from e

    @staticmethod
    def __extract(zip_file_path: str, raw_data_dir: str) -> None:
        """Description: extracts the zip file into given path

        Args:
            zip_file_path (str): path of zip file needed to be extracted
            raw_data_dir (str): path of directory for extraction

        Raises:
            CustomException: if the archive cannot be opened or extracted
        """
        try:
            with ZipFile(zip_file_path, "r") as zip_ref:
                zip_ref.extractall(raw_data_dir)
                logging.info("zip extraction comleted.")
        except Exception as e:
            logging.error(e)
            raise CustomException(e, sys) from e

    def start_data_ingestion(self) -> DataIngestionArtifacts:
        """Runs Data ingestion

        Prepares the artifact directories, downloads the zip from
        ``SOURCE_URI`` to ``ZIP_FILE_PATH`` and extracts it into
        ``INGESTED_ROOT_DIR_PATH``.

        Returns:
            DataIngestionArtifacts: the configuration object this component
            was constructed with, returned unchanged.

        Raises:
            CustomException: if any step of the ingestion fails
        """
        try:
            config = self.__data_ingestion_config

            # create required dir's, parents before children
            # NOTE(review): create_dirs is presumed to create the given
            # directory if it does not exist - confirm against its definition.
            create_dirs(config.ARITFACTS_ROOT_DIR_PATH)
            create_dirs(config.DATA_ROOT_DIR_PATH)
            create_dirs(config.INGESTION_ROOT_DIR_PATH)
            create_dirs(config.FEATURE_STORE_ROOT_DIR_PATH)
            create_dirs(config.INGESTED_ROOT_DIR_PATH)

            # get required variables
            uri = config.SOURCE_URI
            zip_file_path = config.ZIP_FILE_PATH
            ingested_data_dir = config.INGESTED_ROOT_DIR_PATH

            # run process: the zip must be fully downloaded before extraction
            self.__download(uri, zip_file_path)
            self.__extract(zip_file_path, ingested_data_dir)

            return config
        except Exception as e:
            logging.exception(e)
            raise CustomException(e, sys) from e
        



# pipeline

In [None]:
from dataclasses import dataclass


@dataclass
class DataIngestionPipeline:
    """Pipeline stage that wires the ingestion config into the component and runs it."""

    def main(self) -> None:
        """Build the ingestion component and run the data-ingestion stage.

        The ``DataIngestionConfig`` class itself (not an instance) is handed
        to the component, which is also kept on ``self.data_ingestion``. The
        value returned by ``start_data_ingestion`` is discarded.
        """
        component = DataIngestionComponents(DataIngestionConfig)
        self.data_ingestion = component
        component.start_data_ingestion()





# Human-readable stage label used in the start/finish banners below.
STAGE_NAME = "Data Ingestion"

# Run the stage only when executed directly (script or notebook), not on import.
if __name__ == "__main__":
    print(f"\n>>>>>>>>>>>>>>>>>>>>> {STAGE_NAME} initiated <<<<<<<<<<<<<<<<<<<<<")

    obj = DataIngestionPipeline()
    obj.main()

    # Reached only if main() did not raise.
    print(f"\n>>>>>>>>>>>>>>>>>>>>> {STAGE_NAME} completed <<<<<<<<<<<<<<<<<<<<<")


