In [None]:
import os

In [None]:
# Display the kernel's working directory before the chdir cell below runs.
os.getcwd()

In [None]:
# Step up one directory so the relative paths used later in this notebook
# ("config/config.yaml", "schema/schema.yaml") resolve.
# Guarded on the config file being absent: an unconditional chdir("..") climbs
# one level further on every re-run of this cell and breaks those paths.
if not os.path.isfile(os.path.join("config", "config.yaml")):
    os.chdir("..")

In [None]:
# Display the working directory again to confirm where the chdir cell above left us.
os.getcwd()

# constants

In [None]:
from dataclasses import dataclass 
from churn_modelling.utils import load_yaml 


# Load the project configuration once. The path is relative to the current
# working directory, so this depends on the earlier chdir cell having run.
# NOTE(review): load_yaml is a project helper; the attribute-style access used
# on CONFIG below implies it returns an attribute-accessible mapping — confirm.
CONFIG = load_yaml("config/config.yaml")

@dataclass
class DataValidationConstants:
    """Raw data-validation settings copied out of ``CONFIG``.

    All values are plain class attributes (none is an annotated dataclass
    field), so they are read straight from the class without instantiating it.
    """

    ROOT_DIR = CONFIG.ROOT_DIR
    DATA_ROOT_DIR = CONFIG.DATA.ROOT_DIR

    # Shared config node for the remaining lookups; deleted at the end of the
    # class body so it never becomes an attribute of the class.
    _validation = CONFIG.DATA.VALIDATION

    VALIDATION_ROOT_DIR = _validation.ROOT_DIR
    VALID_ROOT_DIR = _validation.VALID.ROOT_DIR
    INVALID_ROOT_DIR = _validation.INVALID.ROOT_DIR
    TRAIN_DATA = _validation.TRAIN_DATA
    TEST_DATA = _validation.TEST_DATA
    REPORT_FILE = _validation.REPORT_FILE

    # The schema location is fixed here rather than read from CONFIG.
    SCHEMA_FILE_PATH = 'schema/schema.yaml'

    del _validation

In [None]:
# Echo every constant as "NAME:value", one per line, to eyeball the loaded config.
print("\n".join(
    f"{attr}:{getattr(DataValidationConstants, attr)}"
    for attr in (
        "ROOT_DIR",
        "DATA_ROOT_DIR",
        "VALIDATION_ROOT_DIR",
        "VALID_ROOT_DIR",
        "INVALID_ROOT_DIR",
        "TRAIN_DATA",
        "TEST_DATA",
        "REPORT_FILE",
        "SCHEMA_FILE_PATH",
    )
))

# entity

In [None]:
from dataclasses import dataclass 
from typing import ClassVar
from pathlib import Path


@dataclass
class DataValidation:
    """Declares the path attributes a data-validation config is expected to expose.

    Every member is annotated as ``ClassVar[Path]`` and has no value, so the
    dataclass has no instance fields; the class only names the attributes.
    """

    # Directory hierarchy
    ROOT_DIR_PATH: ClassVar[Path]
    DATA_ROOT_DIR_PATH: ClassVar[Path]
    VALIDATION_ROOT_DIR_PATH: ClassVar[Path]

    # Destination for data whose validation status is true
    VALID_ROOT_DIR_PATH: ClassVar[Path]
    VALID_TRAIN_DATA_FILE_PATH: ClassVar[Path]
    VALID_TEST_DATA_FILE_PATH: ClassVar[Path]

    # Destination for data whose validation status is false
    INVALID_ROOT_DIR_PATH: ClassVar[Path]
    INVALID_TRAIN_DATA_FILE_PATH: ClassVar[Path]
    INVALID_TEST_DATA_FILE_PATH: ClassVar[Path]

    # Report output and schema input
    REPORT_FILE_FILE_PATH: ClassVar[Path]
    SCHEMA_FILE_PATH: ClassVar[Path]

# configuration

In [None]:
from dataclasses import dataclass 
from pathlib import Path
import os 


@dataclass
class DataValidationConfig:
    """Concrete ``Path`` objects for the validation stage, built from ``DataValidationConstants``.

    Each path is a class attribute derived from the one above it, giving the
    layout ``ROOT / DATA / VALIDATION / {VALID, INVALID} / {train, test}``,
    with the report file placed directly under the validation directory.
    """

    # Directory hierarchy
    ROOT_DIR_PATH = Path(DataValidationConstants.ROOT_DIR)
    DATA_ROOT_DIR_PATH = ROOT_DIR_PATH / DataValidationConstants.DATA_ROOT_DIR
    VALIDATION_ROOT_DIR_PATH = DATA_ROOT_DIR_PATH / DataValidationConstants.VALIDATION_ROOT_DIR

    # Files for data that passed validation
    VALID_ROOT_DIR_PATH = VALIDATION_ROOT_DIR_PATH / DataValidationConstants.VALID_ROOT_DIR
    VALID_TRAIN_DATA_FILE_PATH = VALID_ROOT_DIR_PATH / DataValidationConstants.TRAIN_DATA
    VALID_TEST_DATA_FILE_PATH = VALID_ROOT_DIR_PATH / DataValidationConstants.TEST_DATA

    # Files for data that failed validation
    INVALID_ROOT_DIR_PATH = VALIDATION_ROOT_DIR_PATH / DataValidationConstants.INVALID_ROOT_DIR
    INVALID_TRAIN_DATA_FILE_PATH = INVALID_ROOT_DIR_PATH / DataValidationConstants.TRAIN_DATA
    INVALID_TEST_DATA_FILE_PATH = INVALID_ROOT_DIR_PATH / DataValidationConstants.TEST_DATA

    # Report output and schema input (the schema path is not under ROOT_DIR_PATH)
    REPORT_FILE_FILE_PATH = VALIDATION_ROOT_DIR_PATH / DataValidationConstants.REPORT_FILE
    SCHEMA_FILE_PATH = Path(DataValidationConstants.SCHEMA_FILE_PATH)

In [None]:
# Echo every configured path as "NAME:path", one per line.
print("\n".join(
    f"{attr}:{getattr(DataValidationConfig, attr)}"
    for attr in (
        "ROOT_DIR_PATH",
        "DATA_ROOT_DIR_PATH",
        "VALIDATION_ROOT_DIR_PATH",
        "VALID_ROOT_DIR_PATH",
        "VALID_TRAIN_DATA_FILE_PATH",
        "VALID_TEST_DATA_FILE_PATH",
        "INVALID_ROOT_DIR_PATH",
        "INVALID_TRAIN_DATA_FILE_PATH",
        "INVALID_TEST_DATA_FILE_PATH",
        "REPORT_FILE_FILE_PATH",
        "SCHEMA_FILE_PATH",
    )
))

In [None]:
# Echo the runtime type of every configured path as "NAME:<class ...>", one per line.
print("\n".join(
    f"{attr}:{type(getattr(DataValidationConfig, attr))}"
    for attr in (
        "ROOT_DIR_PATH",
        "DATA_ROOT_DIR_PATH",
        "VALIDATION_ROOT_DIR_PATH",
        "VALID_ROOT_DIR_PATH",
        "VALID_TRAIN_DATA_FILE_PATH",
        "VALID_TEST_DATA_FILE_PATH",
        "INVALID_ROOT_DIR_PATH",
        "INVALID_TRAIN_DATA_FILE_PATH",
        "INVALID_TEST_DATA_FILE_PATH",
        "REPORT_FILE_FILE_PATH",
        "SCHEMA_FILE_PATH",
    )
))

# components

In [None]:
from churn_modelling.utils import load_yaml, dump_json, create_dirs
from churn_modelling.entity import DataIngestion 
from churn_modelling.exception import CustomException 
from churn_modelling.logger import logging 
from dataclasses import dataclass 
import pandas as pd 
import sys 



@dataclass
class DataValidationComponents:
    """Checks the ingested train/test CSV files against a stored schema.

    Attributes:
        data_ingestion_config: supplies ``TRAIN_DATA_FILE_PATH`` and
            ``TEST_DATA_FILE_PATH``.
        data_validation_config: supplies the schema path, the valid/invalid
            output paths, the report path and the directories to create.

    Besides the two dataclass fields, ``load_data`` sets ``train_data_path``,
    ``train_data``, ``test_data_path`` and ``test_data`` on the instance, and
    ``generate_report`` sets ``validation_report``; ``save_outputs`` reads them.
    """
    data_ingestion_config: DataIngestion
    data_validation_config: DataValidation

    def load_data(self):
        """Read the train and test CSV files into DataFrames kept on ``self``.

        Raises:
            CustomException: wraps any exception raised while loading.
        """
        try:
            logging.info('In load_data')

            ingestion = self.data_ingestion_config

            # train split
            self.train_data_path = ingestion.TRAIN_DATA_FILE_PATH
            self.train_data = pd.read_csv(self.train_data_path)
            logging.info(f'loaded train data from {{{self.train_data_path}}}')

            # test split
            self.test_data_path = ingestion.TEST_DATA_FILE_PATH
            self.test_data = pd.read_csv(self.test_data_path)
            logging.info(f'loaded test data from {{{self.test_data_path}}}')

            logging.info('Out load_data')
        except Exception as e:
            logging.exception(e)
            raise CustomException(e, sys)

    def validate_data(self, data:pd.DataFrame) -> dict[str, bool | dict]:
        """Compare the schema derived from ``data`` with the schema stored on disk.

        The derived schema has two entries: ``columns`` (column name -> dtype
        name) and ``numerical_columns`` (every column whose dtype is not object).

        Args:
            data (pd.DataFrame): frame to validate.

        Returns:
            dict: keys[status, loaded_schema, generated_schema]
            - status: True when the derived schema equals the stored one, else False
            - loaded_schema: the stored schema, converted with ``dict()``
            - generated_schema: the schema derived from ``data``

        Raises:
            CustomException: wraps any exception raised during validation.
        """
        try:
            logging.info('In validate_data')

            # stored (reference) schema
            expected_schema = load_yaml(self.data_validation_config.SCHEMA_FILE_PATH)

            # schema derived from the frame itself
            column_dtypes = {name: str(data[name].dtype) for name in data.columns}
            non_object_columns = [name for name in data.columns if data[name].dtype != "O"]
            observed_schema = {
                "columns": column_dtypes,
                "numerical_columns": non_object_columns,
            }

            matches = observed_schema == expected_schema

            logging.info('Out validate_data')
            return {
                'status': matches,
                'loaded_schema': dict(expected_schema),
                'generated_schema': observed_schema,
            }
        except Exception as e:
            logging.exception(e)
            raise CustomException(e, sys)

    def generate_report(self, train_validation_info:dict, test_validation_info:dict):
        """Combine both validation results into ``self.validation_report``.

        Args:
            train_validation_info (dict): result of ``validate_data`` for the train data
            test_validation_info (dict): result of ``validate_data`` for the test data

        Raises:
            CustomException: wraps any exception raised while building the report.
        """
        try:
            logging.info('In generate_report')

            self.validation_report = dict(
                train=train_validation_info,
                test=test_validation_info,
            )

            logging.info('Out generate_report')
        except Exception as e:
            logging.exception(e)
            raise CustomException(e, sys)

    def save_outputs(self):
        """Write each split to its valid/invalid location and dump the report.

        A split goes to the "valid" path when its report entry has a truthy
        ``status`` and to the "invalid" path otherwise. The report itself is
        passed to ``dump_json`` together with the configured report path.

        Raises:
            CustomException: wraps any exception raised while saving.
        """
        try:
            logging.info('In save_outputs')

            cfg = self.data_validation_config
            destinations = {
                'valid': {
                    'train': cfg.VALID_TRAIN_DATA_FILE_PATH,
                    'test': cfg.VALID_TEST_DATA_FILE_PATH,
                },
                'invalid': {
                    'train': cfg.INVALID_TRAIN_DATA_FILE_PATH,
                    'test': cfg.INVALID_TEST_DATA_FILE_PATH,
                },
            }
            frames = {'train': self.train_data, 'test': self.test_data}

            for split, info in self.validation_report.items():
                bucket = 'valid' if info['status'] else 'invalid'

                # route the split to the location matching its validation status
                frame = frames[split]
                target = destinations[bucket][split]
                frame.to_csv(target, index=False)
                logging.info(f'saved {split} data at {{{target}}}')

            # persist the report next to the validated data
            report_path = cfg.REPORT_FILE_FILE_PATH
            dump_json(self.validation_report, report_path)
            logging.info(f'saved validation report to {{{report_path}}}')

            logging.info('Out save_outputs')
        except Exception as e:
            logging.exception(e)
            raise CustomException(e, sys)

    def main(self):
        """Run the whole stage: create directories, load, validate, report, save."""
        # create_dirs is called once per directory, outermost first
        for attr in (
            "ROOT_DIR_PATH",
            "DATA_ROOT_DIR_PATH",
            "VALIDATION_ROOT_DIR_PATH",
            "VALID_ROOT_DIR_PATH",
            "INVALID_ROOT_DIR_PATH",
        ):
            create_dirs(getattr(self.data_validation_config, attr))

        self.load_data()
        train_info = self.validate_data(self.train_data)
        test_info = self.validate_data(self.test_data)
        self.generate_report(train_info, test_info)
        self.save_outputs()

# pipeline

In [None]:
from churn_modelling.configuration import DataIngestionConfig 
from dataclasses import dataclass 


@dataclass
class DataValidationPipeline:
    """Entry point that wires the ingestion and validation configs into the component."""

    def run(self):
        """Build ``DataValidationComponents`` and execute its ``main`` workflow.

        The config classes themselves (not instances) are passed in.
        """
        DataValidationComponents(
            data_ingestion_config=DataIngestionConfig,
            data_validation_config=DataValidationConfig,
        ).main()


# Run the validation pipeline when this code is executed as the main module.
# In a notebook kernel __name__ is "__main__" as well, so running this cell
# executes the pipeline and leaves data_validation_pipeline bound in the namespace.
if __name__ == '__main__':
    data_validation_pipeline = DataValidationPipeline()
    data_validation_pipeline.run()

