In [1]:
import os

In [2]:
%pwd # Present Working Directory

'd:\\Projects\\Red-Wine-Quality-Prediction\\research'

In [3]:
# Step up one level from the notebook's folder so the project root becomes the working directory.
# NOTE: this is relative to the current directory, so re-running the cell moves up one more
# level each time; restart the kernel before running it again.
os.chdir("../")

In [4]:
%pwd

'd:\\Projects\\Red-Wine-Quality-Prediction'

#### Entity

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True) # frozen=True makes instances immutable: assigning to a field after creation raises FrozenInstanceError
class DataIngestionConfig:
    """Settings consumed by the data-ingestion stage.

    Instances are produced by ConfigurationManager.get_data_ingestion_config
    and passed to DataIngestion.
    """
    # Directory created for this stage before the config object is built.
    root_dir: Path
    # URL the data archive is downloaded from.
    source_URL: str
    # Location on disk where the downloaded archive is saved.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

#### Configuration Manager

In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects.

    This is the single place that provides the file paths, source URL, and
    other settings the end-to-end project needs.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH):
        """Read the config, params, and schema YAML files.

        Also creates the directory named by ``artifacts_root`` in the config
        file.

        Args:
            config_filepath: Path to the main configuration YAML file.
            params_filepath: Path to the parameters YAML file.
            schema_filepath: Path to the schema YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A DataIngestionConfig whose path fields are ``pathlib.Path``
            objects, matching the types the dataclass declares.
        """
        stage = self.config.data_ingestion

        create_directories([stage.root_dir])

        # The YAML values are plain strings; convert the path-like ones so the
        # resulting object honours DataIngestionConfig's Path annotations.
        return DataIngestionConfig(
            root_dir=Path(stage.root_dir),
            source_URL=stage.source_URL,
            local_data_file=Path(stage.local_data_file),
            unzip_dir=Path(stage.unzip_dir),
        )

#### Data Ingestion Components

In [8]:
import os
import urllib.request as request # With the help of request package we will download the data from the url
import zipfile # It will help to unzip the data
from mlProject import logger
from mlProject.utils.common import get_size # It will help to get the size of the data

In [9]:
class DataIngestion:
    """Download the source archive and unpack it, driven by a DataIngestionConfig."""

    def __init__(self, config: DataIngestionConfig):
        """Store the stage configuration used by the methods below."""
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless it already exists.

        The data is first written to ``<local_data_file>.part`` and moved into
        place only after the download finishes. An interrupted download
        therefore never leaves a truncated file at ``local_data_file`` that a
        later run would mistake for a complete one.

        Returns:
            None

        Raises:
            Whatever ``urllib.request.urlretrieve`` or the file operations
            raise; the temporary file is removed before the error propagates.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(f"File already exists of size: {get_size(Path(target))}")
            return

        partial = f"{target}.part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial,
            )
            # Publish the file only once it has been fully written.
            os.replace(partial, target)
        finally:
            # After a successful replace the temporary path no longer exists;
            # after a failure this discards the incomplete download.
            if os.path.exists(partial):
                os.remove(partial)

        logger.info(f"{target} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """Extract the downloaded archive into ``unzip_dir``.

        Takes no arguments: the archive path (``local_data_file``) and the
        destination (``unzip_dir``) both come from the stored configuration.
        The destination directory is created if it does not exist.

        Returns:
            None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

#### Pipeline

The pipeline is the step that actually calls the methods defined above. It fixes the order of the calls, i.e. which function runs first and which runs last.

In [14]:
# Run the data-ingestion stage end to end: load the configuration, download
# the archive, then extract it.
try:
    config=ConfigurationManager()
    data_ingestion_config=config.get_data_ingestion_config()
    data_ingestion=DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception:
    # Record the failure through the project logger, then re-raise with a bare
    # `raise` so the original traceback is kept unchanged.
    logger.exception("Data ingestion stage failed")
    raise

[2024-02-21 15:06:14,231: INFO: common: yaml file: config\config.yaml loaded successfully]


[2024-02-21 15:06:14,345: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-21 15:06:14,365: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-21 15:06:14,375: INFO: common: created directory at: artifacts]
[2024-02-21 15:06:14,375: INFO: common: created directory at: artifacts/data_ingestion]
[2024-02-21 15:06:15,936: INFO: 3084199856: artifacts/data_ingestion/data.zip download! with following info: 
Connection: close
Content-Length: 23329
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "c69888a4ae59bc5a893392785a938ccd4937981c06ba8a9d6a21aa52b4ab5b6e"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 1712:27554:1A1ED6:219AC4:65D5C40F
Accept-Ranges: bytes
Date: Wed, 21 Feb 2024 09:36:15 GMT
Via: 1.1 varnish
X-Served-By: cache-bom4751-BOM
X-Cache: MISS
X-Cache