In [15]:
import os

In [16]:
# os.chdir('../')
%pwd
# os.chdir('../')

'/workspaces/mlproject_wine_quality'

In [17]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings bundle for the data-ingestion stage.

    Built by ``ConfigurationManager.get_data_ingestion_config`` from the
    ``data_ingestion`` section of the config and consumed by ``DataIngestion``.
    ``frozen=True`` makes instances read-only after construction.
    """

    # Directory created for this stage; also handed to the Kaggle CLI as the download target (-p).
    root_dir: Path
    # Dataset identifier handed to the Kaggle CLI (-d).
    kaggle_dataset: str
    # Path of the downloaded zip archive; its existence decides whether a download is needed.
    local_data_file: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path


In [18]:
from pathlib import Path
from typing import Any, Dict

from wine_quality_predictor.utils.common import read_yaml, make_directory
from wine_quality_predictor.constants import *


class ConfigurationManager:
    """Loads the project configuration and hands out per-stage config objects.

    On construction the main config file is read and the top-level artifacts
    directory named by ``artifacts_root`` is passed to ``make_directory``.
    """

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
        schema_filepath: Path = SCHEMA_FILE_PATH
    ):
        """Read the config file and prepare the artifacts root directory.

        Args:
            config_filepath: Location of the main YAML config file.
            params_filepath: Accepted for interface compatibility; not read
                by this stage.
            schema_filepath: Accepted for interface compatibility; not read
                by this stage.
        """
        self.config_filepath = config_filepath

        # Only the main config is needed for data ingestion; the params and
        # schema files are intentionally not loaded here.
        self.config = read_yaml(Path(self.config_filepath))

        make_directory(Path(self.config.artifacts_root))

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` config section.

        The stage's root directory is passed to ``make_directory`` as a side
        effect. Path-like values are converted to ``Path`` so the returned
        object matches the field types declared on ``DataIngestionConfig``
        (dataclass annotations are not enforced at runtime).

        Returns:
            A populated ``DataIngestionConfig``.
        """
        ingestion = self.config.data_ingestion

        # Wrap once and reuse, mirroring how __init__ calls make_directory.
        root_dir = Path(ingestion.root_dir)
        make_directory(root_dir)

        return DataIngestionConfig(
            root_dir=root_dir,
            kaggle_dataset=ingestion.kaggle_dataset,
            local_data_file=Path(ingestion.local_data_file),
            unzip_dir=Path(ingestion.unzip_dir),
        )


In [19]:
import os
import subprocess
import zipfile
from wine_quality_predictor.utils.common import get_file_size
import logging


class DataIngestion:
    """Fetches a Kaggle dataset archive and unpacks it according to a config object."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config: Provides ``root_dir``, ``kaggle_dataset``,
                ``local_data_file`` and ``unzip_dir``.
        """
        self.config = config

    def download_data(self):
        """Download the dataset archive via the ``kaggle`` CLI unless it is already present.

        The download is skipped when ``local_data_file`` already exists.
        Otherwise the CLI is invoked with ``root_dir`` as the target directory,
        and the expected archive path is verified afterwards.

        Raises:
            subprocess.CalledProcessError: The CLI exited with a non-zero status.
            FileNotFoundError: The ``kaggle`` executable is not on PATH, or the
                CLI succeeded but no file exists at ``local_data_file``
                (typically the configured file name does not match the
                archive name the CLI produced).
        """
        archive_path = self.config.local_data_file

        if os.path.exists(archive_path):
            logging.info("Dataset already downloaded.")
            return

        logging.info("Downloading dataset from Kaggle...")
        command = [
            "kaggle", "datasets", "download",
            "-d", self.config.kaggle_dataset,
            "-p", str(self.config.root_dir),
            # Per the Kaggle CLI docs, forces the download even if the CLI
            # considers a local copy up to date.
            "--force",
        ]
        # Argument list (no shell) so config values are never shell-interpreted.
        subprocess.run(command, check=True)

        # The CLI picks the archive name itself; confirm it landed where the
        # config says before claiming success, so a mismatch fails here with a
        # clear message instead of later during extraction.
        if not os.path.exists(archive_path):
            raise FileNotFoundError(
                f"Kaggle download finished but {archive_path} does not exist; "
                f"check that local_data_file matches the archive name written "
                f"to {self.config.root_dir}"
            )

        logging.info(f"Downloaded dataset to {self.config.local_data_file}")

    def extract_zip_file(self):
        """Extract the downloaded archive into ``unzip_dir``, creating it if needed.

        Raises:
            FileNotFoundError: ``local_data_file`` does not exist.
            zipfile.BadZipFile: ``local_data_file`` is not a valid zip archive.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)

        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

        logging.info(f"Extracted zip file to: {unzip_path}")
        logging.info(f"Files extracted: {os.listdir(unzip_path)}")


In [20]:
# Run the data-ingestion stage end to end: load config, download the archive, unpack it.
try:
    logging.info(">>>>>> Stage Data Ingestion started <<<<<<")

    # NOTE: despite its name, this variable holds the ConfigurationManager;
    # the actual DataIngestionConfig is `config` below.
    data_ingestion_config = ConfigurationManager()
    config = data_ingestion_config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config)
    data_ingestion.download_data()
    data_ingestion.extract_zip_file()

    logging.info(">>>>>> Stage Data Ingestion completed <<<<<<\n")

except Exception as e:
    # Record the failure with its traceback, then propagate it unchanged so the
    # cell still fails visibly. Bare `raise` re-raises the active exception
    # without adding this line as an extra traceback frame.
    logging.exception(f"Error occurred in stage Data Ingestion: {e}")
    raise

[2025-04-09 18:16:57,506] INFO - 16066906 - >>>>>> Stage Data Ingestion started <<<<<<
[2025-04-09 18:16:57,508] INFO - common - Loaded YAML file from: config/config.yaml
[2025-04-09 18:16:57,510] INFO - common - Created directory: artifacts
[2025-04-09 18:16:57,512] INFO - common - Created directory: artifacts/data_ingestion
[2025-04-09 18:16:57,515] INFO - 1950516215 - Downloading dataset from Kaggle...


Dataset URL: https://www.kaggle.com/datasets/sumit17125/red-wine-quality-dataset
License(s): CC0-1.0


100%|██████████| 24.0k/24.0k [00:00<00:00, 16.3MB/s]
[2025-04-09 18:16:59,700] INFO - 1950516215 - Downloaded dataset to artifacts/data_ingestion/red-wine-quality-dataset.zip
[2025-04-09 18:16:59,703] INFO - 1950516215 - Extracted zip file to: artifacts/data_ingestion/unzipped_data
[2025-04-09 18:16:59,704] INFO - 1950516215 - Files extracted: ['winequality-red.csv']
[2025-04-09 18:16:59,705] INFO - 16066906 - >>>>>> Stage Data Ingestion completed <<<<<<



Downloading red-wine-quality-dataset.zip to artifacts/data_ingestion

