In [5]:
%pwd
import os

In [7]:
# os.chdir('../')
%pwd

'/workspaces/mlproject_wine_quality'

In [22]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataValidationConfig:
    """Path settings consumed by the data-validation stage.

    ``ConfigurationManager.get_data_validation_config`` builds an instance
    from the ``data_validation`` section of the loaded configuration and
    wraps every value in ``Path``.
    """
    # Value of ``data_validation.root_dir`` from the configuration.
    root_dir: Path
    # Dataset file to validate; the validators only read it when its name ends in ".csv".
    unzip_data_path: Path
    # File that ``DataValidation.save_validation_status`` writes the final status line to.
    STATUS_FILE: Path
    # YAML schema file; its "columns" mapping drives the column and dtype checks.
    schema_file_path: Path


In [23]:
from wine_quality_predictor.constants import *
from wine_quality_predictor.utils.common import read_yaml, make_directory

class ConfigurationManager:
    """Loads the project's YAML settings and hands out per-stage configs.

    Constructing an instance reads the config, params and schema files (in
    that order) with ``read_yaml`` and then calls ``make_directory`` on the
    configured ``artifacts_root``.
    """

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
        schema_filepath: Path = SCHEMA_FILE_PATH
    ):
        """Remember the three file locations and load their contents.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config_filepath = config_filepath
        self.params_filepath = params_filepath
        self.schema_filepath = schema_filepath

        # Each location is coerced to Path before being handed to read_yaml.
        self.config, self.params, self.schema = (
            read_yaml(Path(location))
            for location in (config_filepath, params_filepath, schema_filepath)
        )

        artifacts_root = Path(self.config.artifacts_root)
        make_directory(artifacts_root)

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the ``DataValidationConfig`` from the ``data_validation`` section.

        Returns:
            A config object whose four fields are ``Path`` objects created
            from the section's ``root_dir``, ``unzip_data_path``,
            ``status_file`` and ``schema_file_path`` entries.
        """
        section = self.config.data_validation

        root_dir = Path(section.root_dir)
        dataset_path = Path(section.unzip_data_path)
        status_path = Path(section.status_file)
        schema_path = Path(section.schema_file_path)

        return DataValidationConfig(
            root_dir=root_dir,
            unzip_data_path=dataset_path,
            STATUS_FILE=status_path,
            schema_file_path=schema_path,
        )

In [64]:
import os
import pandas as pd
# from src.<your_project>.entity.config_entity import DataValidationConfig
from wine_quality_predictor.utils.common import read_yaml, save_json
from wine_quality_predictor import logger


class DataValidation:
    """Schema-driven sanity checks for the ingested dataset.

    Each ``validate_*`` method returns ``True`` when its check passes and
    ``False`` when it fails or raises; failures are logged rather than
    propagated. When the configured dataset path does not end in ".csv",
    nothing is read and the checks report success.
    """

    # One set of parsing options for every check, so that all of them inspect
    # the same table (the dataset is parsed as ';'-delimited with '"' quoting).
    _CSV_READ_OPTIONS = {"delimiter": ";", "quotechar": '"'}

    def __init__(self, config: DataValidationConfig):
        """Store the stage config and load the schema it points to.

        Args:
            config: Paths for the dataset, the schema file and the status file.
        """
        self.config = config
        self.schema = read_yaml(self.config.schema_file_path)

    def _load_csv(self) -> "pd.DataFrame | None":
        """Read the configured dataset, or return None if it is not a .csv file."""
        path = str(self.config.unzip_data_path)
        if not path.endswith(".csv"):
            return None
        return pd.read_csv(path, **self._CSV_READ_OPTIONS)

    def validate_all_columns(self) -> bool:
        """Check that the dataset's columns equal the schema's, in the same order.

        Returns:
            True if the column lists match (or the path is not a .csv file),
            False on a mismatch or if reading/comparing raises.
        """
        try:
            df = self._load_csv()
            if df is not None:
                df_columns = df.columns.tolist()
                schema_columns = list(self.schema["columns"].keys())

                if df_columns != schema_columns:
                    logger.error("Validation error: Schema mismatch: columns do not match.")
                    logger.error(f"Expected columns {schema_columns}, found {df_columns}")
                    return False

            logger.info("All columns are valid.")
            return True
        except Exception as e:
            logger.error(f"Validation error: {e}")
            return False

    def validate_null_values(self) -> bool:
        """Check that the dataset contains no missing (null) values.

        Returns:
            True if no nulls are present (or the path is not a .csv file),
            False if any null is found or if reading raises.
        """
        try:
            df = self._load_csv()
            if df is not None and df.isnull().values.any():
                logger.warning(f"Missing values found in {self.config.unzip_data_path}")
                return False
            logger.info("No missing values detected.")
            return True
        except Exception as e:
            logger.error(f"Null value validation error: {e}")
            return False

    def validate_data_types(self) -> bool:
        """Check that every schema column has the dtype the schema declares.

        Returns:
            True if all listed columns match (or the path is not a .csv
            file), False on the first mismatch or if a lookup/read raises
            (for example when a schema column is absent from the dataset).
        """
        try:
            df = self._load_csv()
            if df is not None:
                for col, expected_type in self.schema["columns"].items():
                    if df[col].dtype != expected_type:
                        logger.warning(
                            f"Data type mismatch: Column '{col}' in "
                            f"{self.config.unzip_data_path} has incorrect data type."
                        )
                        return False
            logger.info("All columns have correct data types.")
            return True
        except Exception as e:
            logger.error(f"Data type validation error: {e}")
            return False

    def validate_duplicates(self) -> bool:
        """Placeholder for a duplicate-row check; currently always passes.

        The check is disabled, so this method reads nothing and returns True
        unconditionally.
        TODO: decide whether duplicate rows (``df.duplicated().any()``)
        should fail validation and implement accordingly.
        """
        return True

    def save_validation_status(self, status: bool):
        """Write ``Validation status: <status>`` to the configured status file.

        Args:
            status: Combined outcome of the validation checks.
        """
        status_path = str(self.config.STATUS_FILE)
        # Create the target directory if needed so the write cannot fail on a
        # missing parent folder.
        parent_dir = os.path.dirname(status_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(status_path, 'w') as f:
            f.write(f"Validation status: {status}")
        logger.info(f"Validation status saved to {self.config.STATUS_FILE}")


In [48]:
# from src.<your_project>.config.configuration import ConfigurationManager
# from src.<your_project>.components.data_validation import DataValidation

from wine_quality_predictor import logger

STAGE_NAME = "Data Validation"

def main():
    """Run the data-validation stage and persist its combined verdict.

    Builds the stage config, runs the four checks, and saves the conjunction
    of their results through ``save_validation_status``. Any exception is
    reported with ``logger.exception`` and then re-raised to the caller.
    """
    try:
        logger.info(f">>>>>> Stage {STAGE_NAME} started <<<<<<")

        stage_config = ConfigurationManager().get_data_validation_config()
        validator = DataValidation(stage_config)

        # All four checks run, in this order, before the results are combined,
        # so a failing check does not prevent the later ones from executing.
        outcomes = (
            validator.validate_all_columns(),
            validator.validate_null_values(),
            validator.validate_data_types(),
            validator.validate_duplicates(),
        )
        validator.save_validation_status(all(outcomes))

        logger.info(f">>>>>> Stage {STAGE_NAME} completed <<<<<<\n")

    except Exception as err:
        logger.exception(f"Error in stage {STAGE_NAME}: {err}")
        raise err


In [63]:
# Ad-hoc check: does the dtype pandas infers for 'fixed acidity' equal the schema entry?
schema = read_yaml(Path(SCHEMA_FILE_PATH))
value = schema["columns"].get('fixed acidity')  # schema entry for this column
file = "artifacts/data_ingestion/unzipped_data/winequality-red.csv"
# Parsed with ';' as the delimiter, the same options validate_all_columns uses.
df = pd.read_csv(file, delimiter=';', quotechar='"')
# Last expression of the cell, so its value is what the notebook displays.
df['fixed acidity'].dtype == value

[2025-04-10 14:47:47,151] INFO - common - Loaded YAML file from: schema.yaml


True