In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
%pwd

'/home/jatin/Projects/customer_churn_prediction/resarch'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/home/jatin/Projects/customer_churn_prediction'

In [None]:
# Write the entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """
    Storing configuration related to data validation
    """
    root_dir: Path
    local_data_file: str
    status_file: str
    status_message_file: str
    all_schema: dict

In [6]:
# Write the configuration manager

from customer_churn_prediction.constants import *
from customer_churn_prediction.utils.common import read_yaml
from customer_churn_prediction.utils.common import create_directory

class ConfigurationManager:
    def __init__(
            self,
            config_path=CONFIG_FILE_PATH,
            schema_path=SCHEMA_FILE_PATH,
            params_path=PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_path)
        self.schema = read_yaml(schema_path)
        self.params = read_yaml(params_path)

        create_directory([self.config.artifacts_root])

    def get_data_validation_config(self)-> DataValidationConfig:
        config = self.config.data_validation
        schema = self.schema.COLUMNS
        create_directory([config.root_dir])

        data_validation_config = DataValidationConfig(
            root_dir=config.root_dir,
            local_data_file=config.local_data_file,
            status_file=config.status_file,
            status_message_file=config.status_message_file,
            all_schema=schema
        )
        return data_validation_config

In [7]:
data  = pd.read_csv(f"artifacts/data_ingstion/WA_Fn-UseC_-Telco-Customer-Churn.csv")
dataset_columns = data.dtypes.astype(str).to_dict()
dataset_columns


{'customerID': 'object',
 'gender': 'object',
 'SeniorCitizen': 'int64',
 'Partner': 'object',
 'Dependents': 'object',
 'tenure': 'int64',
 'PhoneService': 'object',
 'MultipleLines': 'object',
 'InternetService': 'object',
 'OnlineSecurity': 'object',
 'OnlineBackup': 'object',
 'DeviceProtection': 'object',
 'TechSupport': 'object',
 'StreamingTV': 'object',
 'StreamingMovies': 'object',
 'Contract': 'object',
 'PaperlessBilling': 'object',
 'PaymentMethod': 'object',
 'MonthlyCharges': 'float64',
 'TotalCharges': 'object',
 'Churn': 'object'}

In [8]:
schema = read_yaml(Path("schema.yaml"))
all_schema = schema.COLUMNS.items()
all_schema

[2025-09-27 20:03:47,584]:INFO:common.py:Yaml file: schema.yaml is loaded successfully


dict_items([('gender', 'str'), ('SeniorCitizen', 'int64'), ('Partner', 'str'), ('Dependents', 'str'), ('tenure', 'int64'), ('PhoneService', 'str'), ('MultipleLines', 'str'), ('InternetService', 'str'), ('OnlineSecurity', 'str'), ('OnlineBackup', 'str'), ('DeviceProtection', 'str'), ('TechSupport', 'str'), ('StreamingTV', 'str'), ('StreamingMovies', 'str'), ('Contract', 'str'), ('PaperlessBilling', 'str'), ('PaymentMethod', 'str'), ('MonthlyCharges', 'float64'), ('TotalCharges', 'str')])

In [9]:
for column_name, datatype in all_schema:
    print(f"== {column_name} >>>> {datatype}")
    if dataset_columns.get('Gender'):
        print(f"<<<<<<< {dataset_columns.get('Gender')}")

== gender >>>> str
== SeniorCitizen >>>> int64
== Partner >>>> str
== Dependents >>>> str
== tenure >>>> int64
== PhoneService >>>> str
== MultipleLines >>>> str
== InternetService >>>> str
== OnlineSecurity >>>> str
== OnlineBackup >>>> str
== DeviceProtection >>>> str
== TechSupport >>>> str
== StreamingTV >>>> str
== StreamingMovies >>>> str
== Contract >>>> str
== PaperlessBilling >>>> str
== PaymentMethod >>>> str
== MonthlyCharges >>>> float64
== TotalCharges >>>> str


In [16]:
from customer_churn_prediction import logger

class DataValidation:
    def __init__(self, config: DataValidationConfig):
        self.config = config

    def validate_all_columns(self)-> bool:
        try:
            data = pd.read_csv(self.config.local_data_file)
            dataset_columns = data.dtypes.astype(str).to_dict()
            all_schema = self.config.all_schema.items()
            msg = ""
            valiation_status = True
            for column_name,datatype in all_schema:
                if dataset_columns.get(column_name):
                    valiation_status = True if valiation_status else False
                    if dataset_columns.get(column_name) == datatype:
                        # valiation_status = True if valiation_status else False
                        msg += f"{column_name} validated along with datatype"
                    else:
                        # valiation_status = False
                        msg += f"{column_name} validated without datatype"
                else:
                    valiation_status = False
                    msg += f"{column_name} not validated"
                msg += "\n"
            with open(self.config.status_message_file,'w+') as f:
                f.write(msg)
            with open(self.config.status_file,'w+') as f:
                f.write(f"Validation status: {valiation_status}")
        except Exception:
            logger.exception(f"Exception occured while validating the columns")
            raise

In [17]:
# Create the pipeline

try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(data_validation_config)
    data_validation.validate_all_columns()
except Exception:
    logger.exception(f"Exception occured while executing the data validation pipeline")
    raise

[2025-09-27 20:06:39,578]:INFO:common.py:Yaml file: config/config.yaml is loaded successfully
[2025-09-27 20:06:39,586]:INFO:common.py:Yaml file: schema.yaml is loaded successfully
[2025-09-27 20:06:39,590]:INFO:common.py:Yaml file: params.yaml is loaded successfully
[2025-09-27 20:06:39,591]:INFO:common.py:Directory created at: artifacts
[2025-09-27 20:06:39,592]:INFO:common.py:Directory created at: artifacts/data_validation
