Data validation is used to validate the columns, i.e. to check whether all the expected columns are present in our dataset.

In [1]:
import os

In [2]:
%pwd

'd:\\Projects\\Red-Wine-Quality-Prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\Projects\\Red-Wine-Quality-Prediction'

#### Entity

In [7]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True) # frozen=True makes instances read-only: assigning to a field after creation raises FrozenInstanceError
class DataValidationConfig:
    """Settings consumed by the data validation stage."""
    root_dir: Path  # stage directory; ConfigurationManager passes it to create_directories
    STATUS_FILE: str  # path of the text file DataValidation writes the validation status to
    unzip_data_dir: Path  # path DataValidation hands to pd.read_csv (a CSV file, despite the "dir" name)
    all_schema: dict # filled from the schema's COLUMNS section; its keys are the accepted column names (values presumably dtypes -- confirm against schema.yaml)

#### Configuration Manager

In [8]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [9]:
# ConfigurationManager provides all the configuration needed for the end-to-end project (file paths, source URLs, etc.)
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH):
        """Read the config, params and schema files, then prepare the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Files are read in the order config -> params -> schema.
        yaml_paths = (config_filepath, params_filepath, schema_filepath)
        self.config, self.params, self.schema = map(read_yaml, yaml_paths)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data validation stage.

        Returns:
            A DataValidationConfig populated from the ``data_validation``
            section of the config file and the ``COLUMNS`` section of the
            schema file.
        """
        stage_cfg = self.config.data_validation
        column_schema = self.schema.COLUMNS

        # Prepare the stage's own directory before handing the config out.
        create_directories([stage_cfg.root_dir])

        return DataValidationConfig(
            root_dir=stage_cfg.root_dir,
            STATUS_FILE=stage_cfg.STATUS_FILE,
            unzip_data_dir=stage_cfg.unzip_data_dir,
            all_schema=column_schema,
        )

#### Data Validation Components

In [10]:
import os
import pandas as pd
from mlProject import logger

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [11]:
class DataValidation:
    """Check a dataset's columns against the project schema."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the stage settings.

        Args:
            config: Object exposing ``unzip_data_dir``, ``STATUS_FILE`` and
                ``all_schema``.
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Validate the dataset's columns and record the outcome.

        Reads the CSV at ``config.unzip_data_dir`` and checks that every one
        of its columns is a key of ``config.all_schema``. The outcome is
        written once to ``config.STATUS_FILE`` in the form
        ``Validation status: <True|False>``.

        Returns:
            True when the dataset has at least one column and all of its
            columns are declared in the schema; False otherwise.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file propagates unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        dataset_columns = list(data.columns)

        # A set gives constant-time membership tests for the loop below.
        schema_columns = set(self.config.all_schema.keys())

        # Collect every offending column so one unknown column fails the
        # whole check, regardless of where it sits in the dataset.
        unexpected_columns = [
            col for col in dataset_columns if col not in schema_columns
        ]
        # NOTE(review): schema columns that are absent from the dataset are
        # not detected here (same as before) -- confirm whether they should be.
        validation_status = bool(dataset_columns) and not unexpected_columns

        # Write the status a single time, after the final verdict is known.
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status

#### Pipeline

In [12]:
# Run the data validation stage: load the configuration, build the stage
# settings, then validate the dataset's columns. Errors are left to propagate
# directly; wrapping the calls in `except Exception as e: raise e` changed
# nothing except adding a frame to the traceback.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
data_validation.validate_all_columns()

[2024-02-21 17:20:57,820: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-21 17:20:57,820: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-21 17:20:57,837: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-21 17:20:57,837: INFO: common: created directory at: artifacts]
[2024-02-21 17:20:57,853: INFO: common: created directory at: artifacts/data_validation]
