In [1]:
import os

In [2]:
%pwd

In [3]:
os.chdir("../")

In [4]:
%pwd

In [5]:
from pathlib import Path
from dataclasses import dataclass

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data-validation step.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    and passed to ``DataValidation``.
    """

    # Directory for this stage; ConfigurationManager passes it to create_directories().
    root_dir: Path
    # Location handed to pd.read_csv() by DataValidation (i.e. the CSV to validate).
    unzip_data_dir: str
    # File that DataValidation opens to write the validation status lines.
    STATUS_FILE: Path
    # Mapping of column name -> expected dtype, taken from the schema's COLUMNS entry.
    all_schema: dict

In [6]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [28]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load the config, schema and params files via ``read_yaml``.

        Args:
            config_filepath: Location of the main configuration YAML.
            schema_filepath: Location of the schema YAML.
            params_filepath: Location of the parameters YAML.

        Side effects:
            Passes ``config.artifacts_root`` to ``create_directories``.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root is set up as soon as the manager is constructed.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the ``DataValidationConfig`` from the ``data_validation`` section.

        Side effects:
            Passes the section's ``root_dir`` to ``create_directories``.

        Returns:
            A ``DataValidationConfig`` whose ``all_schema`` is the schema
            file's ``COLUMNS`` entry.
        """
        section = self.config.data_validation
        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=section.root_dir,
            unzip_data_dir=section.unzip_data_dir,
            STATUS_FILE=section.STATUS_FILE,
            all_schema=self.schema.COLUMNS,
        )
    
    

In [29]:
# Sanity check: the dtype declared in the schema for a column should equal the
# dtype pandas infers for that *same* column.
import pandas as pd  # imported in-cell so this cell runs on a fresh kernel

temp = ConfigurationManager()
config = temp.get_data_validation_config()

# Load the data here: `df` was otherwise only defined by a later cell, so a
# top-to-bottom run raised NameError at this point.
df = pd.read_csv("research/winequality-red.csv")

# Compare like with like (previously the 'fixed acidity' schema entry was
# checked against the dtype of the unrelated `alcohol` column).
config.all_schema['fixed acidity'] == df['fixed acidity'].dtype

In [21]:
import pandas as pd

# Read the raw wine-quality CSV (path is relative to the current working directory).
df = pd.read_csv("research/winequality-red.csv")

# True when the schema declares exactly the columns found in the data:
# no extras and none missing, order ignored.
set(config.all_schema.keys()) == set(df.columns)

In [36]:
import pandas as pd
from src.logging import logger

class DataValidation:
    """Checks a CSV file against the column schema held in the config.

    The CSV at ``config.unzip_data_dir`` is read once at construction time.
    ``validate_all_columns`` (re)creates the status file and
    ``validata_dtype_of_columns`` appends to it, so call them in that order.
    """

    # Quoted annotation: DataValidationConfig is defined in an earlier cell.
    def __init__(self, config: "DataValidationConfig"):
        """Store ``config`` and load the CSV it points to.

        Args:
            config: Object exposing ``unzip_data_dir``, ``STATUS_FILE`` and
                ``all_schema`` (column name -> expected dtype).
        """
        self.config = config
        self.dataframe = pd.read_csv(self.config.unzip_data_dir)
        self.df_cols = set(self.dataframe.columns)

    def validate_all_columns(self) -> bool:
        """Check that the data has exactly the columns named in the schema.

        Overwrites ``config.STATUS_FILE`` with the outcome.

        Returns:
            True when the set of data columns equals the set of schema keys.

        Raises:
            Exception: Any error is logged and re-raised unchanged.
        """
        try:
            schema_cols = set(self.config.all_schema.keys())
            validation_status = self.df_cols == schema_cols

            with open(self.config.STATUS_FILE, "w") as status_file:
                status_file.write(f"Column Validation Status: {validation_status}\n")

            return validation_status
        except Exception as e:
            logger.exception(f"{e}")
            raise

    def validata_dtype_of_columns(self) -> bool:
        """Check that every data column has the dtype declared in the schema.

        Every offending column is logged (not just the first). A data column
        that has no schema entry counts as a failure instead of raising
        ``KeyError``. The outcome is appended to ``config.STATUS_FILE``.

        Returns:
            True when all data columns match their schema dtype.

        Raises:
            Exception: Any error is logged and re-raised unchanged.
        """
        try:
            schema = self.config.all_schema
            validation_status = True

            for col in self.df_cols:
                if col not in schema:
                    validation_status = False
                    logger.error(f"Column {col} is present in data but has no entry in Schema")
                    continue

                actual_dtype = self.dataframe[col].dtype
                if actual_dtype != schema[col]:
                    validation_status = False
                    logger.error(f"In Schema {col} data type is {schema[col]} but in data the datatype is {actual_dtype}")

            with open(self.config.STATUS_FILE, "a") as status_file:
                status_file.write(f"Column Data Type Validation Status: {validation_status}\n")

            return validation_status
        except Exception as e:
            logger.exception(f"{e}")
            raise

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    validate_dtype_of_columns = validata_dtype_of_columns
        

In [37]:
# Run the validation stage end to end: load settings, build the validator,
# then run both checks.
# NOTE(review): `config` is rebound here to a ConfigurationManager, whereas an
# earlier cell binds the same name to a DataValidationConfig.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
# Constructing DataValidation reads the CSV at data_validation_config.unzip_data_dir.
data_validation = DataValidation(data_validation_config)
# Column check first: it opens the status file in "w" mode, and the dtype
# check below appends to it.
data_validation.validate_all_columns()
data_validation.validata_dtype_of_columns()