In [None]:
import os

In [2]:
# Show the kernel's current working directory (IPython line magic).
%pwd

'C:\\Users\\kisho\\Desktop\\PodcastListeningTimePrediction\\PodcastListeningTimePrediction'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable set of filesystem paths consumed by the data-validation stage.

    Instances are frozen: attributes cannot be reassigned after construction.
    """

    # Directory for this stage; created by ConfigurationManager before use.
    root_dir: Path
    # Dataset file that DataValidation loads with pandas.read_csv.
    unzip_data_path: Path
    # YAML schema file whose `Columns` section lists the expected columns.
    schema_file_path: Path
    # Destination of the YAML validation report.
    report_file_path: Path


In [4]:
from PodcastListeningTimePrediction.constants import *
from PodcastListeningTimePrediction.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
            self,
            config_file_path=CONFIG_FILE_PATH,
            schema_file_path=SCHEMA_FILE_PATH,
            params_file_path=PARAMS_FILE_PATH):
        """Read the config, schema and params YAML files (in that order).

        Args:
            config_file_path: Path of the main configuration YAML.
            schema_file_path: Path of the schema YAML.
            params_file_path: Path of the parameters YAML.
        """
        yaml_paths = (config_file_path, schema_file_path, params_file_path)
        self.config, self.schema, self.params = (read_yaml(path) for path in yaml_paths)

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the DataValidationConfig from the `data_validation` config section.

        Creates the stage's root directory as a side effect.

        Returns:
            DataValidationConfig with every path wrapped in `pathlib.Path`.
        """
        section = self.config.data_validation

        # Make sure the stage directory exists before anything writes into it.
        create_directories([section.root_dir], verbose=True)

        path_fields = (
            "root_dir",
            "unzip_data_path",
            "schema_file_path",
            "report_file_path",
        )
        return DataValidationConfig(
            **{field: Path(getattr(section, field)) for field in path_fields}
        )


In [5]:
import pandas as pd
from PodcastListeningTimePrediction.utils.common import write_yaml, read_yaml
from PodcastListeningTimePrediction import logger

class DataValidation:
    """Validates a CSV dataset against the schema and writes a YAML report."""

    def __init__(self, config: DataValidationConfig):
        """Store the paths (dataset, schema, report) used by `validate_data`."""
        self.config = config

    def validate_data(self) -> bool:
        """Check that the dataset has every schema column with the expected dtype.

        The outcome is always written to `config.report_file_path`:
        `data_validation_status` plus, on failure, `missing_columns`,
        `wrong_types` or `error`.

        Returns:
            True when all schema columns are present with matching dtypes;
            False when columns are missing or dtypes differ. Dtypes are only
            checked once no column is missing.

        Raises:
            Exception: any error raised while reading or validating is
                re-raised after a best-effort attempt to record it in the
                report.
        """
        try:
            schema = read_yaml(self.config.schema_file_path)
            schema_cols = schema.Columns

            df = pd.read_csv(self.config.unzip_data_path)
            present_columns = set(df.columns)
            missing_columns = [col for col in schema_cols if col not in present_columns]

            if missing_columns:
                report = {
                    "data_validation_status": False,
                    "missing_columns": missing_columns,
                }
                write_yaml(self.config.report_file_path, report)
                logger.error(f"{missing_columns} columns are missing in the data")
                return False

            # Data type validation
            wrong_types = []
            for column in schema_cols:
                expected_dtype = schema_cols[column]
                actual_dtype = str(df[column].dtype)
                # Substring match: a schema entry such as "int" accepts "int64".
                # NOTE(review): it also accepts "uint64"; tighten to equality
                # if the schema always lists full dtype names.
                if expected_dtype not in actual_dtype:
                    wrong_types.append({column: {"expected": expected_dtype, "actual": actual_dtype}})
            if wrong_types:
                report = {
                    "data_validation_status": False,
                    "wrong_types": wrong_types,
                }
                write_yaml(self.config.report_file_path, report)
                logger.error(f"Wrong data types: {wrong_types}")
                return False

            write_yaml(self.config.report_file_path, {"data_validation_status": True})
            logger.info("Data validation passed.")
            return True

        except Exception as e:
            logger.error(f"{e}")
            # Writing the failure report is best-effort: if it fails too (e.g.
            # the report location is unwritable), log that and still surface
            # the original error instead of masking it.
            try:
                write_yaml(self.config.report_file_path, {"data_validation_status": False, "error": str(e)})
            except Exception as report_error:
                logger.error(f"Could not write failure report: {report_error}")
            # Bare raise re-raises the original exception with its traceback.
            raise


[2025-07-22 18:56:36,137] - INFO:  NumExpr defaulting to 4 threads.


In [7]:
# Run the data-validation stage: load configuration, build the stage config,
# then validate the dataset. Any failure is logged and re-raised so the cell
# visibly fails instead of continuing silently.
try:
    config_manager = ConfigurationManager()
    data_validation_config = config_manager.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_data()
except Exception as e:
    logger.error(f"Data validation failed: {e}")
    # Bare raise re-raises the active exception with its original traceback.
    raise


[2025-07-22 18:57:45,129] - INFO:  YAML file config\config.yaml loaded successfully.
[2025-07-22 18:57:45,135] - INFO:  YAML file schema.yaml loaded successfully.
[2025-07-22 18:57:45,145] - INFO:  YAML file params.yaml loaded successfully.
[2025-07-22 18:57:45,149] - INFO:  Created directory: artifacts/data_validation
[2025-07-22 18:57:45,157] - INFO:  YAML file schema.yaml loaded successfully.


[2025-07-22 18:57:45,348] - INFO:  YAML file saved at artifacts\data_validation\report.yaml
[2025-07-22 18:57:45,350] - INFO:  Data validation passed.
