In [6]:
import os

In [7]:
%pwd

'c:\\Users\\Admin\\Desktop\\code\\python\\Mlops_End_To_End_Project_Using_Github_Action\\research'

In [8]:
# Move from the notebook's folder up to the project root so that relative
# paths such as config/config.yaml resolve. The guard makes the cell safe to
# re-run: once the project's `config` directory is visible from the current
# working directory we are already at the root and must not climb further.
if not os.path.isdir("config"):
    os.chdir("../")

In [9]:
%pwd

'c:\\Users\\Admin\\Desktop\\code\\python\\Mlops_End_To_End_Project_Using_Github_Action'

In [10]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """Read-only settings consumed by the data-validation stage.

    ``ConfigurationManager.get_data_validation_config`` fills these fields
    from the ``data_validation`` section of the config YAML and from the
    schema YAML. ``frozen=True`` makes instances immutable after creation.
    """

    # Output directory of the stage; the configuration manager creates it
    # before building this object.
    # NOTE(review): annotated as Path, but the manager passes the raw config
    # value through without converting it - confirm the actual runtime type.
    root_dir: Path
    # Location of the training CSV (read with pd.read_csv).
    train_data: str
    # Location of the test CSV (read with pd.read_csv).
    test_data: str
    # Where the HTML drift report is saved.
    drift_report_path: str
    # Where the overall validation status is written as YAML.
    validation_status_path: str
    # Schema mapping of column name -> expected dtype string, compared against
    # str(df[col].dtype) during validation.
    columns: dict
    # Taken from the schema's `numerical_columns` entry.
    numerical_columns: list[str]
    # Taken from the schema's `categorical_columns` entry.
    categorical_columns: list[str]
    

    

 

In [11]:
from us_visa.constants import *
from us_visa.utils.common import read_yaml, create_directories

In [12]:
class ConfigurationManager:
    """Loads the project config and schema YAML files and builds stage configs."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, schema_filepath=SCHEMA_FILe_PATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the ``DataValidationConfig`` for the data-validation stage.

        Creates the stage's ``root_dir`` as a side effect, then combines the
        ``data_validation`` section of the config with the column information
        from the schema.

        Returns:
            A populated ``DataValidationConfig``.
        """
        stage_cfg = self.config.data_validation
        create_directories([stage_cfg.root_dir])

        # Paths come from the stage section; column metadata comes from the schema.
        return DataValidationConfig(
            root_dir=stage_cfg.root_dir,
            train_data=stage_cfg.train_data,
            test_data=stage_cfg.test_data,
            validation_status_path=stage_cfg.validation_status_path,
            drift_report_path=stage_cfg.drift_report_path,
            numerical_columns=self.schema.numerical_columns,
            categorical_columns=self.schema.categorical_columns,
            columns=self.schema.columns,
        )

In [13]:
import pandas as pd
from us_visa import logger
from us_visa.utils.common import read_yaml,write_yaml



In [14]:
from evidently import Report
from evidently.presets import DataDriftPreset






In [17]:



class DataValidation:
    """Validates the train/test CSV splits against the schema and for drift.

    The stage performs three checks - equal column counts, schema column
    names/dtypes, and an Evidently data-drift report - and writes the
    combined verdict to ``config.validation_status_path``.
    """

    def __init__(self, config: "DataValidationConfig"):
        """Store the stage configuration.

        Args:
            config: Settings object providing the data paths, report paths
                and the schema ``columns`` mapping used by the checks.
        """
        self.config = config

    @staticmethod
    def load_data(config: "DataValidationConfig"):
        """Read the train and test CSV files named in ``config``.

        Args:
            config: Object exposing ``train_data`` and ``test_data`` paths.

        Returns:
            A ``(train_df, test_df)`` tuple of DataFrames.
        """
        train_df = pd.read_csv(config.train_data)
        test_df = pd.read_csv(config.test_data)
        return train_df, test_df

    def validate_number_of_columns(self, train_df: pd.DataFrame, test_df: pd.DataFrame) -> bool:
        """Return True when both frames have the same number of columns."""
        return train_df.shape[1] == test_df.shape[1]

    def validate_column_names_and_types(self, train_df: pd.DataFrame, test_df: pd.DataFrame) -> bool:
        """Check every schema column for presence and dtype in both frames.

        Each problem is reported through ``logger.warning``; the method keeps
        going after a failure so that one run lists every mismatch instead of
        only the first one.

        Args:
            train_df: Training split.
            test_df: Test split.

        Returns:
            True when every column of ``self.config.columns`` exists in both
            frames with the expected dtype string, False otherwise.
        """
        expected_columns = self.config.columns  # dict from schema.yml
        frames = (("Train", train_df), ("Test", test_df))

        schema_ok = True
        for col, expected_dtype in expected_columns.items():
            # Presence check first: dtype lookup below would raise on a missing column.
            missing_in = [label.lower() for label, df in frames if col not in df.columns]
            if missing_in:
                logger.warning(f"Missing column: {col} (not found in {' and '.join(missing_in)} data)")
                schema_ok = False
                continue

            # Compare the pandas dtype name with the dtype string from the schema.
            for label, df in frames:
                actual_dtype = str(df[col].dtype)
                if actual_dtype != expected_dtype:
                    logger.warning(f"{label} dtype mismatch for {col}: inside Schema {expected_dtype}, but in the raw dataset {actual_dtype}")
                    schema_ok = False

        return schema_ok

    def check_data_drift(self, train_df: pd.DataFrame, test_df: pd.DataFrame) -> bool:
        """Run an Evidently drift report with train as reference and test as current.

        The HTML report is saved to ``self.config.drift_report_path``.

        Args:
            train_df: Reference data.
            test_df: Current data.

        Returns:
            True when every entry of the run's ``tests_results`` has status
            ``SUCCESS``, False otherwise.
        """
        report = Report(metrics=[DataDriftPreset()], include_tests=True)
        result = report.run(reference_data=train_df, current_data=test_df)

        # Save HTML report for visualization
        result.save_html(self.config.drift_report_path)

        # NOTE(review): all() over an empty sequence is True, so a run that
        # produced no test results would be reported as "no drift" - confirm
        # that include_tests=True always yields at least one test.
        success_list = result.tests_results
        is_data_clean = all(test.status.name == 'SUCCESS' for test in success_list)

        # Report the overall verdict and return it
        if is_data_clean:
            print("✅ Overall Status: SUCCESS (No Drift)")
            return True
        else:
            print("Overall Status: FAIL (Drift Detected)")
            return False

    def run_validation(self):
        """Run all checks, persist the overall status and return the details.

        All three checks are always executed, and the overall status is True
        only if each of them passed. The status is written as YAML to
        ``self.config.validation_status_path``.

        Returns:
            A dict with the keys ``column_count_ok``, ``schema_ok``,
            ``validation_status`` and ``no data_drift``.
        """
        train_df, test_df = self.load_data(self.config)

        col_count_ok = self.validate_number_of_columns(train_df, test_df)
        schema_ok = self.validate_column_names_and_types(train_df, test_df)
        check_drift = self.check_data_drift(train_df, test_df)

        validation_status = col_count_ok and schema_ok and check_drift

        # Write YAML file
        write_yaml(
            self.config.validation_status_path,
            {"validation_status": validation_status}
        )

        return {
            "column_count_ok": col_count_ok,
            "schema_ok": schema_ok,
            "validation_status": validation_status,
            "no data_drift": check_drift
        }
     


In [18]:
# Run the data-validation stage end to end and report its outcome.
try:
    config_manager = ConfigurationManager()
    data_validation_config = config_manager.get_data_validation_config()
    data_validator = DataValidation(config=data_validation_config)
    validation_results = data_validator.run_validation()
except Exception:
    # Record the failure with its traceback, then re-raise the original
    # exception unchanged (bare `raise` adds no extra traceback frame).
    logger.exception("Data Validation failed")
    raise

print(validation_results)
# The stage finishing without an exception does not mean the data passed:
# log according to the actual verdict.
if validation_results["validation_status"]:
    logger.info("Data Validation Successful")
else:
    logger.warning("Data Validation finished, but one or more checks failed")


    

[2026-02-12 13:24:18,425: INFO: common: yaml file: config\config.yaml loaded successfully]
[2026-02-12 13:24:18,429: INFO: common: yaml file: config\schema.yaml loaded successfully]
[2026-02-12 13:24:18,431: INFO: common: created directory at: artifacts]
[2026-02-12 13:24:18,433: INFO: common: created directory at: artifacts/data_validation]
✅ Overall Status: SUCCESS (No Drift)
[2026-02-12 13:24:21,784: INFO: common: yaml file saved at: artifacts/data_validation/validation_status.yml]
{'column_count_ok': True, 'schema_ok': True, 'validation_status': True, 'no data_drift': True}
[2026-02-12 13:24:21,784: INFO: 3164758657: Data Validation Successful]
