In [None]:
import os
# Move the working directory one level up from wherever the notebook started.
# NOTE(review): presumably so the relative paths used in later cells
# (e.g. "config/config.yaml") resolve from the project root - confirm.
os.chdir("..")
# Bare expression: in a notebook this displays the new working directory.
os.getcwd()

In [None]:
from package.utils import read_yaml
from dataclasses import dataclass
from pathlib import Path


# Load the configuration file once at import time; it is read relative to the
# current working directory and accessed by attribute in the code below.
CONFIG = read_yaml("config/config.yaml")

@dataclass
class DataValidationConstants:
    """Directory and file names used by the data-validation stage.

    All values except ``SCHEMA_FILE_PATH`` are copied from ``CONFIG`` when the
    class body is executed. They are plain class attributes (no annotations),
    so they are read directly from the class, e.g.
    ``DataValidationConstants.VALID_ROOT_DIR_NAME``.
    """

    # Short-lived aliases for the nested config sections; removed at the end
    # of the class body so they never become attributes of the class.
    _data = CONFIG.DATA
    _validation = _data.VALIDATION
    _valid = _validation.VALID
    _invalid = _validation.INVALID
    _drift_report = _validation.DRIFT_REPORT

    # root directories, from the outermost to the validation directory
    ARITFACTS_ROOT_DIR_NAME = CONFIG.ARITFACTS_ROOT_DIR_NAME
    DATA_ROOT_DIR_NAME = _data.ROOT_DIR_NAME
    VALIDATION_ROOT_DIR_NAME = _validation.ROOT_DIR_NAME

    # names used for data that matched the schema
    VALID_ROOT_DIR_NAME = _valid.ROOT_DIR_NAME
    VALID_TRAIN_FILE_NAME = _valid.TRAIN_FILE_NAME
    VALID_TEST_FILE_NAME = _valid.TEST_FILE_NAME

    # names used for data that did not match the schema
    INVALID_ROOT_DIR_NAME = _invalid.ROOT_DIR_NAME
    INVALID_TRAIN_FILE_NAME = _invalid.TRAIN_FILE_NAME
    INVALID_TEST_FILE_NAME = _invalid.TEST_FILE_NAME

    # names used for the report
    DRIFT_REPORT_ROOT_DIR_NAME = _drift_report.ROOT_DIR_NAME
    DRIFT_REPORT_FILE_NAME = _drift_report.FILE_NAME

    # the schema location is fixed here rather than read from CONFIG
    SCHEMA_FILE_PATH = Path("schema/schema.yaml")

    del _data, _validation, _valid, _invalid, _drift_report



In [None]:
# Print every configured name as "NAME:  value" for a quick visual check.
for _constant_name in (
    "ARITFACTS_ROOT_DIR_NAME",
    "DATA_ROOT_DIR_NAME",
    "VALIDATION_ROOT_DIR_NAME",
    "VALID_ROOT_DIR_NAME",
    "VALID_TRAIN_FILE_NAME",
    "VALID_TEST_FILE_NAME",
    "INVALID_ROOT_DIR_NAME",
    "INVALID_TRAIN_FILE_NAME",
    "INVALID_TEST_FILE_NAME",
    "DRIFT_REPORT_ROOT_DIR_NAME",
    "DRIFT_REPORT_FILE_NAME",
):
    print(f"{_constant_name}: ", getattr(DataValidationConstants, _constant_name))

In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass
class DataValidationConfigEntity:
    """Typed record of the paths used by the data-validation stage.

    Every attribute is declared with a type annotation so that ``@dataclass``
    registers it as a field. Written as ``NAME = Path`` the decorator sees no
    fields at all and the attribute simply holds the ``Path``/``str`` type
    object. All fields are required; none has a default.
    """

    # root directories
    ARITFACTS_ROOT_DIR_PATH: Path
    DATA_ROOT_DIR_PATH: Path
    VALIDATION_ROOT_DIR_PATH: Path

    # destination for data that matched the schema
    VALID_ROOT_DIR_PATH: Path
    VALID_TRAIN_FILE_PATH: str
    VALID_TEST_FILE_PATH: str

    # destination for data that did not match the schema
    INVALID_ROOT_DIR_PATH: Path
    INVALID_TRAIN_FILE_PATH: str
    INVALID_TEST_FILE_PATH: str

    # destination for the report
    DRIFT_REPORT_ROOT_DIR_PATH: Path
    DRIFT_REPORT_FILE_PATH: str

    # schema the data is compared against
    SCHEMA_FILE_PATH: Path



In [None]:
from dataclasses import dataclass
from pathlib import Path
import os

@dataclass
class DataValidationConfig:
    """Concrete paths for the data-validation stage.

    Each path is built when the class body runs by joining the names from
    ``DataValidationConstants`` onto their parent directory. The values are
    plain class attributes and are read directly from the class.
    ``ARITFACTS_ROOT_DIR_PATH`` is a ``Path``; the joined values are whatever
    ``os.path.join`` returns; ``SCHEMA_FILE_PATH`` is taken over unchanged.
    """

    # Temporary shorthands, deleted below so they do not stay on the class.
    _names = DataValidationConstants
    _join = os.path.join

    # artifacts root -> data -> validation
    ARITFACTS_ROOT_DIR_PATH = Path(_names.ARITFACTS_ROOT_DIR_NAME)
    DATA_ROOT_DIR_PATH = _join(ARITFACTS_ROOT_DIR_PATH, _names.DATA_ROOT_DIR_NAME)
    VALIDATION_ROOT_DIR_PATH = _join(DATA_ROOT_DIR_PATH, _names.VALIDATION_ROOT_DIR_NAME)

    # validation -> valid -> train/test files
    VALID_ROOT_DIR_PATH = _join(VALIDATION_ROOT_DIR_PATH, _names.VALID_ROOT_DIR_NAME)
    VALID_TRAIN_FILE_PATH = _join(VALID_ROOT_DIR_PATH, _names.VALID_TRAIN_FILE_NAME)
    VALID_TEST_FILE_PATH = _join(VALID_ROOT_DIR_PATH, _names.VALID_TEST_FILE_NAME)

    # validation -> invalid -> train/test files
    INVALID_ROOT_DIR_PATH = _join(VALIDATION_ROOT_DIR_PATH, _names.INVALID_ROOT_DIR_NAME)
    INVALID_TRAIN_FILE_PATH = _join(INVALID_ROOT_DIR_PATH, _names.INVALID_TRAIN_FILE_NAME)
    INVALID_TEST_FILE_PATH = _join(INVALID_ROOT_DIR_PATH, _names.INVALID_TEST_FILE_NAME)

    # validation -> report directory -> report file
    DRIFT_REPORT_ROOT_DIR_PATH = _join(VALIDATION_ROOT_DIR_PATH, _names.DRIFT_REPORT_ROOT_DIR_NAME)
    DRIFT_REPORT_FILE_PATH = _join(DRIFT_REPORT_ROOT_DIR_PATH, _names.DRIFT_REPORT_FILE_NAME)

    SCHEMA_FILE_PATH = _names.SCHEMA_FILE_PATH

    del _names, _join



In [None]:
# Print every derived path, one per line, for a quick visual check.
for _path_attr in (
    "ARITFACTS_ROOT_DIR_PATH",
    "DATA_ROOT_DIR_PATH",
    "VALIDATION_ROOT_DIR_PATH",
    "VALID_ROOT_DIR_PATH",
    "VALID_TRAIN_FILE_PATH",
    "VALID_TEST_FILE_PATH",
    "INVALID_ROOT_DIR_PATH",
    "INVALID_TRAIN_FILE_PATH",
    "INVALID_TEST_FILE_PATH",
    "DRIFT_REPORT_ROOT_DIR_PATH",
    "DRIFT_REPORT_FILE_PATH",
    "SCHEMA_FILE_PATH",
):
    print(getattr(DataValidationConfig, _path_attr))

In [None]:
from dataclasses import dataclass
from package.entity import DataIngestionConfigEntity
from package.exception import CustomException
from package.utils import create_dirs, read_yaml, save_yaml
from pathlib import Path
import pandas as pd
import sys


@dataclass
class DataValidationComponents:
    """Check ingested train/test data against a schema and store the outcome.

    Attributes:
        data_ingestion_config: object exposing ``TRAIN_FILE_PATH`` and
            ``TEST_FILE_PATH``, the CSV files read by :meth:`validate`.
        data_validation_config: object exposing the directory, file, report
            and schema paths used by :meth:`validate`.
    """

    data_ingestion_config: DataIngestionConfigEntity
    data_validation_config: DataValidationConfigEntity

    @staticmethod
    def _infer_schema(data: pd.DataFrame) -> dict:
        """Describe ``data`` in the layout that is compared with the schema file.

        Returns:
            dict: ``{"columns": {column: dtype name}, "numerical_columns": [...]}``
            where every column whose dtype is not ``"O"`` (object) is listed
            as numerical, in column order.
        """
        dtypes = data.dtypes
        return {
            "columns": {col: str(dtype) for col, dtype in dtypes.items()},
            "numerical_columns": [col for col, dtype in dtypes.items() if dtype != "O"],
        }

    @staticmethod
    def get_report(train_data: pd.DataFrame, test_data: pd.DataFrame, schema_org_path: Path) -> dict[str, dict[str, bool]]:
        """Compare the columns and dtypes of both data sets with the schema.

        Args:
            train_data (pd.DataFrame): train data to compare with the schema
            test_data (pd.DataFrame): test data to compare with the schema
            schema_org_path (Path): YAML file holding the expected schema

        Returns:
            dict[str, dict[str, bool]]: ``{"result": {"Train Data": bool, "Test Data": bool}}``
                True: the data set's inferred schema equals the schema file
                False: it does not

        Raises:
            CustomException: wraps any error raised while reading the schema
                or inspecting the data.
        """
        try:
            schema_org = read_yaml(schema_org_path)
            data_dict = {"Train Data": train_data, "Test Data": test_data}

            # bool() keeps the stored status a plain True/False regardless of
            # what the equality check of the loaded schema object returns
            result = {
                data_type_name: bool(DataValidationComponents._infer_schema(data) == schema_org)
                for data_type_name, data in data_dict.items()
            }
            return {"result": result}
        except Exception as e:
            raise CustomException(e, sys) from e

    def validate(self) -> None:
        """Create the required directories, then save the report and the data.

        Each data set is written to its "valid" path when it matches the
        schema and to its "invalid" path otherwise. The report returned by
        :meth:`get_report` is saved to ``DRIFT_REPORT_FILE_PATH``.

        Raises:
            CustomException: wraps any error raised during the steps above.
        """
        try:
            config = self.data_validation_config

            # create required directories
            for dir_path in (
                config.ARITFACTS_ROOT_DIR_PATH,
                config.DATA_ROOT_DIR_PATH,
                config.VALIDATION_ROOT_DIR_PATH,
                config.VALID_ROOT_DIR_PATH,
                config.INVALID_ROOT_DIR_PATH,
                config.DRIFT_REPORT_ROOT_DIR_PATH,
            ):
                create_dirs(dir_path)

            # load the ingested data, keyed by the names used in the report
            ingested_data = {
                "Train Data": pd.read_csv(self.data_ingestion_config.TRAIN_FILE_PATH, index_col=False),
                "Test Data": pd.read_csv(self.data_ingestion_config.TEST_FILE_PATH, index_col=False),
            }
            output = self.get_report(
                ingested_data["Train Data"],
                ingested_data["Test Data"],
                config.SCHEMA_FILE_PATH,
            )

            # valid and invalid destinations for train and test data
            valid_path_dict = {
                "Train Data": config.VALID_TRAIN_FILE_PATH,
                "Test Data": config.VALID_TEST_FILE_PATH,
            }
            invalid_path_dict = {
                "Train Data": config.INVALID_TRAIN_FILE_PATH,
                "Test Data": config.INVALID_TEST_FILE_PATH,
            }

            # save validation report
            save_yaml(output, config.DRIFT_REPORT_FILE_PATH)

            # save each data set to the destination matching its status
            for data_type_name, status in output["result"].items():
                destinations = valid_path_dict if status else invalid_path_dict
                ingested_data[data_type_name].to_csv(
                    destinations[data_type_name], index=False, header=True
                )

        except Exception as e:
            raise CustomException(e, sys) from e
        


In [None]:
from dataclasses import  dataclass
from package.configuration import DataIngestionConfig


@dataclass
class DataValidationPipeline:
    """Entry point that runs the data-validation stage."""

    def main(self) -> None:
        """Run the full data validation pipeline.

        Builds ``DataValidationComponents`` from the ``DataIngestionConfig``
        and ``DataValidationConfig`` classes and calls its ``validate`` method.
        """
        DataValidationComponents(
            data_ingestion_config=DataIngestionConfig,
            data_validation_config=DataValidationConfig,
        ).validate()





# Label interpolated into the start/end banners printed below.
STAGE_NAME = "Data Validation"

# Run the stage only when this file is executed directly, not when imported.
if __name__=="__main__":
    print(f"\n>>>>>>>>>>>>>>>>>>>>> {STAGE_NAME} initiated <<<<<<<<<<<<<<<<<<<<<")
    obj = DataValidationPipeline()
    obj.main()
    # Reached only if main() returned without raising.
    print(f"\n>>>>>>>>>>>>>>>>>>>>> {STAGE_NAME} completed <<<<<<<<<<<<<<<<<<<<<")
