In [1]:
%pwd

'c:\\Users\\Dzuels Foundation\\mlproject\\notebook'

In [2]:
%cd ..\

c:\Users\Dzuels Foundation\mlproject


In [6]:
import requests
import os
import zipfile
import sys
import pandas as pd
from pathlib import Path
from dataclasses import dataclass

from src.constant import *
from src.logger import logging
from src.exception import CustomException
from src.utils import read_yaml,create_directory


In [7]:
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data validation step.

    Attributes:
        root_dir: Working directory of the step; ``ConfigurationManager``
            calls ``create_directory`` on it before building this object.
        data_file: CSV file that ``DataValidation.validate_data`` reads.
        status: Path of the text file the validation outcome is written to.
        all_schema: Mapping whose keys are the expected column names.
    """

    root_dir: Path
    data_file: Path
    status: str
    all_schema: dict
    

In [8]:
class ConfigurationManager:
    """Loads the configuration and schema YAML files and builds step configs."""

    def __init__(self,
                 config_file_path=CONFIG_FILE_PATH,
                 schema_file_path=SCHEMA_FILE_PATH
                 ):
        """Read both YAML files and call ``create_directory`` on the artifact root.

        Args:
            config_file_path: Location of the main configuration YAML.
            schema_file_path: Location of the schema YAML.
        """
        self.config = read_yaml(config_file_path)
        self.schema = read_yaml(schema_file_path)

        artifact_root = self.config.artifact_root
        create_directory(artifact_root)

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the ``DataValidationConfig`` from the loaded YAML contents.

        Reads the ``data_validation`` section of the configuration and the
        ``COLUMNS`` section of the schema, and calls ``create_directory`` on
        the section's ``root_dir`` before constructing the config object.

        Returns:
            A ``DataValidationConfig`` populated from those two sections.
        """
        validation_section = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        create_directory(validation_section.root_dir)

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            data_file=validation_section.data_file,
            status=validation_section.status,
            all_schema=expected_columns,
        )
        
        


In [9]:
class DataValidation:
    """Checks that the columns of the input CSV match the expected schema."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Object providing ``data_file``, ``status`` and
                ``all_schema`` (the attributes this class reads).
        """
        self.config = config

    def validate_data(self) -> bool:
        """Compare the CSV's column names with the schema keys and record the result.

        The ``Id`` column is dropped (when present) before the comparison.
        The outcome is written to the status file as
        ``validation_status:<True|False>`` and is also returned, so callers
        do not have to re-read the file.

        Returns:
            True when the remaining columns are exactly the schema keys,
            False when any column is missing or unexpected.

        Raises:
            CustomException: Wraps any error raised while reading the CSV,
                comparing columns, or writing the status file.
        """
        try:
            logging.info("Data Validation started")
            data = pd.read_csv(self.config.data_file)
            # `columns=` already selects the axis; `errors="ignore"` tolerates a missing "Id".
            data = data.drop(columns=["Id"], errors="ignore")
            actual_columns = set(data.columns)
            expected_columns = set(self.config.all_schema.keys())

            missing_columns = expected_columns - actual_columns
            extra_columns = actual_columns - expected_columns

            # Report both kinds of mismatch; one must not hide the other.
            if missing_columns:
                logging.warning(f"Missing columns: {missing_columns}")
            if extra_columns:
                logging.warning(f"Unexpected extra columns: {extra_columns}")

            validation_status = not (missing_columns or extra_columns)

            with open(self.config.status, "w", encoding="utf-8") as f:
                f.write(f"validation_status:{validation_status}")

            if validation_status:
                logging.info("Data Validation completed successfully")
            else:
                logging.warning("Data Validation completed: schema mismatch detected")

            return validation_status

        except Exception as e:
            logging.error("Error in data validation")
            logging.error(e)
            # Chain the original exception so its traceback is preserved.
            raise CustomException(e, sys) from e

In [10]:
# Run the data validation step end to end: load the configuration, build the
# step config, and validate the data file.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_data()
except Exception as e:
    # Report failures at ERROR level so they are not lost among routine INFO messages.
    logging.error("Error in data validation")
    logging.error(e)
    logging.error("Data validation failed")
    # Chain the original exception so its traceback is preserved.
    raise CustomException(e, sys) from e

[2025-04-07 14:37:40,356] : root : INFO: yaml file: config\config.yaml loaded successfully
[2025-04-07 14:37:40,414] : root : INFO: yaml file: schema.yaml loaded successfully
[2025-04-07 14:37:40,416] : root : INFO: Directory 'artifact' already exists.
[2025-04-07 14:37:40,419] : root : INFO: Directory 'artifact/data_validation' created.
[2025-04-07 14:37:40,419] : root : INFO: Data Validation started
[2025-04-07 14:37:40,658] : root : INFO: Data Validation completed successfully
