In [28]:
%pwd

'C:\\Users\\kural\\Desktop\\Projects\\End_To_End_MLops'

In [29]:
import os
from pathlib import Path
# Make the project root the working directory for the rest of the notebook; the
# log output further down shows files being loaded via relative paths such as
# config\config.yaml.
# NOTE(review): this absolute path only exists on one machine — confirm whether
# it should be derived from the notebook's location or a setting instead.
os.chdir(Path("C:\\Users\\kural\\Desktop\\Projects\\End_To_End_MLops\\"))

In [30]:
%pwd

'C:\\Users\\kural\\Desktop\\Projects\\End_To_End_MLops'

In [31]:
# from software_defect_prediction.constants import *
from software_defect_prediction.utils.common import *
# from software_defect_prediction.entity.config_entity import DataIngestionConfig
# from software_defect_prediction.config.configuration import ConfigurationManager

In [32]:
from dataclasses import dataclass
from pathlib import Path
from software_defect_prediction import logger

@dataclass(frozen=True)
class DataValidationConfig:
    """Read-only settings consumed by the data-validation stage.

    Instances are frozen: attributes cannot be reassigned after construction.
    """

    # Working directory of the stage; the input file is staged and read here.
    root_dir: Path
    # File that is copied into ``root_dir`` before validation.
    source_file_path: str
    # Name of the file inside ``root_dir`` that is loaded for validation.
    input_file_name: str
    # File the validation outcome is written to.
    STATUS_FILE: Path

In [33]:
from venv import create
from software_defect_prediction.constants import *
from software_defect_prediction.entity.config_entity import DataIngestionConfig
from software_defect_prediction.utils.common import read_yaml, create_directories
from box import ConfigBox
# from software_defect_prediction.entity.config_entity import (DataIngestionConfig)

class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        ``create_directories`` is called on the section's ``root_dir`` before
        the settings object is returned.
        """
        ingestion = self.config.data_ingestion
        create_directories([ingestion.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion.root_dir,
            source_URL=ingestion.source_URL,
            source_URL_file_name=ingestion.source_URL_file_name,
            source_URL_unzip_file_name=ingestion.source_URL_unzip_file_name,
            local_data_file=ingestion.local_data_file,
            unzip_dir=ingestion.unzip_dir,
        )

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation settings from the ``data_validation`` section.

        ``create_directories`` is called on the section's ``root_dir`` before
        the settings object is returned.
        """
        validation = self.config.data_validation
        create_directories([validation.root_dir])

        return DataValidationConfig(
            root_dir=validation.root_dir,
            input_file_name=validation.input_file_name,
            source_file_path=validation.source_file_path,
            STATUS_FILE=validation.STATUS_FILE,
        )

    def get_data_schema(self) -> ConfigBox:
        """Return the schema that was loaded from ``schema_filepath``."""
        return self.schema
    

In [34]:
from sys import exception
from software_defect_prediction import logger

from pathlib import Path
import shutil
import pandas as pd

class Data_Validation():
    """Stage an input CSV and check it against the expected column schema.

    The stage copies the source file into the validation directory, compares
    the (column name, dtype) pairs pandas infers from that file with the pairs
    declared under ``COLUMNS`` in the schema, and writes the outcome to the
    status file.
    """

    def __init__(self, data_validation_config: DataValidationConfig, data_schema: ConfigBox) -> None:
        """Store the stage settings and the expected schema.

        Args:
            data_validation_config: Paths used by this stage (``root_dir``,
                ``source_file_path``, ``input_file_name``, ``STATUS_FILE``).
            data_schema: Schema object whose ``COLUMNS`` mapping gives the
                expected dtype name for each column.
        """
        self.config = data_validation_config
        self.data_schema = data_schema
        self.VALIDATION_STATUS = False

    def _input_file_path(self) -> Path:
        """Return the location of the staged input file inside ``root_dir``."""
        return Path(self.config.root_dir) / self.config.input_file_name

    def prepare_files(self) -> None:
        """Copy the source file to ``root_dir / input_file_name``.

        Any previously staged copy is removed first. The destination is given
        explicitly so the staged file always has the name that
        ``validate_all_columns`` reads, even when the source file is named
        differently.
        """
        destination_file_path = self._input_file_path()
        if destination_file_path.exists():
            destination_file_path.unlink()

        shutil.copy(self.config.source_file_path, destination_file_path)

    def validate_all_columns(self) -> bool:
        """Check that every schema column exists in the input with the expected dtype.

        Only schema entries missing from the input (by name or by dtype) count
        as mismatches; additional columns present in the input are not flagged.
        The result is stored in ``self.VALIDATION_STATUS`` and written to the
        status file.

        Returns:
            True when every (column, dtype) pair of the schema is found in the
            input file, False otherwise.

        Raises:
            Exception: Any error raised while reading the file, comparing the
                schemas or writing the status file is logged and re-raised.
        """
        try:
            input_data = pd.read_csv(self._input_file_path())

            # Compare dtypes by their string names, e.g. "int64" or "object".
            input_data_schema = pd.DataFrame({
                "column": list(input_data.columns),
                "data_type": list(input_data.dtypes),
            }).astype(str)

            expected_columns = self.data_schema.COLUMNS
            original_data_schema = pd.DataFrame({
                "column": list(expected_columns.keys()),
                "data_type": list(expected_columns.values()),
            }).astype(str)

            # Outer-merge on both name and dtype; the indicator column records
            # which side each (column, dtype) pair came from.
            merged_df = input_data_schema.merge(
                original_data_schema,
                on=["column", "data_type"],
                how="outer",
                indicator=True,
            )

            # 'right_only' rows are schema entries with no matching
            # (column, dtype) pair in the input data.
            mismatched_columns = merged_df[merged_df["_merge"] == "right_only"]
            self.VALIDATION_STATUS = bool(mismatched_columns.empty)

            if self.VALIDATION_STATUS:
                logger.info("Columns and data types match between the two DataFrames")
            else:
                # Build the full message up front so the column names are
                # actually part of the logged text.
                offending = ", ".join(mismatched_columns["column"].unique())
                logger.error(f"Columns causing the mismatch: {offending}")

            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {self.VALIDATION_STATUS}")

            return self.VALIDATION_STATUS
        except Exception:
            # Catch the built-in Exception class: the lowercase `exception`
            # imported from `sys` in this cell is a function, and using it in
            # an except clause raises TypeError instead of the real error.
            logger.exception("Data validation failed")
            raise


In [35]:
# Load the YAML-backed settings, then pull out what the validation stage needs:
# its own config object and the expected data schema.
configuration_obj = ConfigurationManager()
data_validation_conf =  configuration_obj.get_data_validation_config()
data_schema = configuration_obj.get_data_schema()

# Stage the input file, then check its columns against the schema. The boolean
# returned by the last call is what the cell displays as its output.
step_data_val = Data_Validation(data_validation_config=data_validation_conf,data_schema = data_schema)
step_data_val.prepare_files()
step_data_val.validate_all_columns()

[32m2024-05-26 23:57:30.486[0m | [1mINFO    [0m | [36msoftware_defect_prediction.utils.common[0m:[36mread_yaml[0m:[36m31[0m - [1myaml file: config\config.yaml loaded successfully[0m
[32m2024-05-26 23:57:30.487[0m | [1mINFO    [0m | [36msoftware_defect_prediction.utils.common[0m:[36mread_yaml[0m:[36m31[0m - [1myaml file: params.yaml loaded successfully[0m
[32m2024-05-26 23:57:30.493[0m | [1mINFO    [0m | [36msoftware_defect_prediction.utils.common[0m:[36mread_yaml[0m:[36m31[0m - [1myaml file: schema.yaml loaded successfully[0m
[32m2024-05-26 23:57:30.494[0m | [1mINFO    [0m | [36msoftware_defect_prediction.utils.common[0m:[36mcreate_directories[0m:[36m51[0m - [1mcreated directory at: artifacts[0m
[32m2024-05-26 23:57:30.496[0m | [1mINFO    [0m | [36msoftware_defect_prediction.utils.common[0m:[36mcreate_directories[0m:[36m51[0m - [1mcreated directory at: artifacts/data_validation[0m
[32m2024-05-26 23:57:30.681[0m | [1mINFO   

True