In [1]:
import logging
import os

In [2]:
%pwd

'C:\\Users\\kisho\\MyWork\\RedWineQuality\\research'

In [3]:
# The notebook is started from research/, but the `src` imports below resolve
# from the project root. Only step up when `src` is not already visible from
# the working directory, so re-running this cell does not climb one level
# further each time.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
%pwd

'C:\\Users\\kisho\\MyWork\\RedWineQuality'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Immutable bundle of the settings used by the data ingestion step.

    Attributes:
        root_dir (Path): Directory created for the ingestion step's artifacts.
        source_URL (str): URL the dataset archive is downloaded from.
        local_data_file (Path): Location the downloaded archive is saved to.
        unzip_dir (Path): Directory the archive contents are extracted into.
    """

    # Field order is part of the constructor signature; keep it stable.
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

In [6]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """
    Loads the project's YAML files and builds per-stage configuration objects.

    Attributes:
        config: Parsed contents of the main configuration file.
        schema: Parsed contents of the schema file.
        params: Parsed contents of the parameters file.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """
        Reads the three YAML files and creates the artifacts root directory.

        Args:
            config_filepath: Path to the main configuration YAML.
            schema_filepath: Path to the schema YAML.
            params_filepath: Path to the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Builds the data ingestion settings from the `data_ingestion` section.

        The ingestion root directory is created as a side effect.

        Returns:
            DataIngestionConfig: Settings for the data ingestion step, with
            filesystem locations converted to `Path` objects.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # The YAML values are passed through Path() so the instance actually
        # matches the Path annotations declared on DataIngestionConfig.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )
    
            

In [11]:
import os
from urllib import request
import zipfile
from src.logging import logger
from src.utils.common import get_size

In [12]:
# Scratch check of the get_size helper against the run log; not part of the ingestion flow.
get_size(Path("logs/running_logs.log"))

'~ 4 KB'

In [13]:
class DataIngestion:
    """
    A class to handle data ingestion tasks, including downloading and extracting files.

    Attributes:
        config (DataIngestionConfig): Configuration object containing paths and URLs for data ingestion.
    """

    def __init__(self, config: DataIngestionConfig) -> None:
        """
        Initializes the DataIngestion class with the provided configuration.

        Args:
            config (DataIngestionConfig): Configuration object for data ingestion.
        """
        self.config = config

    def download_file(self) -> None:
        """
        Downloads the file from the source URL specified in the configuration.

        The downloaded file is saved to the path specified in `self.config.local_data_file`.
        If the file already exists, it logs the file size instead of re-downloading.

        The data is first written to a sibling `<local_data_file>.part` file and only
        moved into place once the download has completed, so a failed or truncated
        download never leaves a file that a later run would mistake for a finished one.

        Raises:
            Exception: If the download fails due to network issues or invalid URLs.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            file_size = get_size(Path(target))
            logger.info(f"File already exists: {target} (Size: {file_size})")
            return

        partial = f"{target}.part"
        try:
            # Make sure the destination directory exists; dirname is empty
            # when the target is a bare filename in the working directory.
            parent = os.path.dirname(target)
            if parent:
                os.makedirs(parent, exist_ok=True)

            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial
            )
            # Promote the completed download to its final name in one step.
            os.replace(partial, target)
            logger.info(f"File downloaded successfully: {target}")
            logger.debug(f"Download headers: {headers}")
        except Exception as e:
            logger.error(f"Failed to download file from {self.config.source_URL}: {e}")
            # Best-effort cleanup of the incomplete file; a cleanup failure
            # must not hide the original download error.
            try:
                if os.path.exists(partial):
                    os.remove(partial)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove partial download {partial}: {cleanup_error}")
            raise

    def extract_zip_file(self) -> None:
        """
        Extracts the contents of the zip file specified in `self.config.local_data_file`.

        The contents are extracted to the directory specified in `self.config.unzip_dir`.
        If the extraction directory does not exist, it is created.

        Raises:
            zipfile.BadZipFile: If the archive is not a valid zip file.
            Exception: If the extraction fails for any other reason (e.g. I/O errors).
        """
        unzip_path = self.config.unzip_dir
        try:
            # Create the extraction directory if it doesn't exist
            os.makedirs(unzip_path, exist_ok=True)
            logger.info(f"Extracting zip file to: {unzip_path}")

            # Extract the zip file
            with zipfile.ZipFile(self.config.local_data_file, "r") as zip_ref:
                zip_ref.extractall(path=unzip_path)
            logger.info(f"Successfully extracted zip file: {self.config.local_data_file}")
        except zipfile.BadZipFile:
            # Logged separately so a corrupt archive is distinguishable from other I/O failures.
            logger.error(f"Invalid zip file: {self.config.local_data_file}")
            raise
        except Exception as e:
            logger.error(f"Failed to extract zip file: {e}")
            raise

In [14]:
# Run the data ingestion stage end to end: load the configuration, download
# the archive (skipped when it is already present) and extract it.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    # logger.exception records the traceback alongside the message; the bare
    # `raise` re-raises the active exception unchanged so the cell still fails.
    logger.exception(f"Data ingestion failed: {e}")
    raise

[2025-03-20 22:08:57,696: INFO: common: config\config.yaml file loaded successfully]
[2025-03-20 22:08:57,732: INFO: common: schema.yaml file loaded successfully]
[2025-03-20 22:08:57,738: INFO: common: params.yaml file loaded successfully]
Directory 'artifacts' already exists.
Directory 'artifacts/data_ingestion' already exists.
[2025-03-20 22:08:59,889: INFO: 3957638715: File downloaded successfully: artifacts/data_ingestion/data.zip]
[2025-03-20 22:08:59,920: INFO: 3957638715: Extracting zip file to: artifacts/data_ingestion]
[2025-03-20 22:08:59,946: INFO: 3957638715: Successfully extracted zip file: artifacts/data_ingestion/data.zip]
