In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
%pwd

'/home/jatin/Projects/customer_churn_prediction/resarch'

In [3]:
# Move from the notebook folder up to the project root so that the relative
# paths used below (schema.yaml, artifacts/...) resolve.
# Guarded on a file this notebook later loads from the project root, so that
# re-running the cell does not climb one more directory level each time.
if not os.path.isfile("schema.yaml"):
    os.chdir("../")

In [4]:
%pwd

'/home/jatin/Projects/customer_churn_prediction'

In [5]:
# Write the entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """
    Immutable (frozen) settings bundle for the data validation stage.

    Built by ``ConfigurationManager.get_data_validation_config`` and read by
    ``DataValidation``.

    Attributes:
        root_dir: Directory of the validation stage; it is created before the
            config object is built.
        local_data_file: Path of the CSV file that is loaded with
            ``pd.read_csv`` and validated.
        status_file: File that receives the single line
            ``Validation status: <True|False>``.
        status_message_file: File that receives one message line per schema
            column.
        all_schema: Mapping of expected column name to expected dtype name
            (e.g. ``'int64'``), taken from the ``COLUMNS`` section of the
            schema file.
    """
    # NOTE(review): annotated as Path, but the configuration manager passes the
    # value read from the YAML config through unchanged -- confirm actual type.
    root_dir: Path
    local_data_file: str
    status_file: str
    status_message_file: str
    all_schema: dict

In [6]:
# Write the configuration manager

from customer_churn_prediction.constants import *
from customer_churn_prediction.utils.common import read_yaml
from customer_churn_prediction.utils.common import create_directory

class ConfigurationManager:
    """
    Loads the project's YAML files and builds per-stage config entities.
    """

    def __init__(
        self,
        config_path=CONFIG_FILE_PATH,
        schema_path=SCHEMA_FILE_PATH,
        params_path=PARAMS_FILE_PATH,
    ):
        """
        Read the config, schema and params YAML files (in that order) and make
        sure the artifacts root directory exists.
        """
        yaml_paths = (config_path, schema_path, params_path)
        self.config, self.schema, self.params = (
            read_yaml(yaml_path) for yaml_path in yaml_paths
        )

        artifacts_root = self.config.artifacts_root
        create_directory([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the data validation entity from the ``data_validation`` section
        of the config and the ``COLUMNS`` section of the schema.

        The stage's root directory is created as a side effect.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        # The stage directory must exist before anything is written into it.
        create_directory([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            local_data_file=validation_section.local_data_file,
            status_file=validation_section.status_file,
            status_message_file=validation_section.status_message_file,
            all_schema=column_schema,
        )

In [9]:
# Load the ingested Telco churn CSV and look at the dtype pandas inferred for
# each column; `DataFrame.dtypes` is a Series indexed by column name.
data = pd.read_csv("artifacts/data_ingestion/WA_Fn-UseC_-Telco-Customer-Churn.csv")
dataset_columns = data.dtypes
print(type(dataset_columns))
print(dataset_columns)

<class 'pandas.core.series.Series'>
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [11]:
# Re-read the CSV so this cell does not depend on an earlier one, then flatten
# the dtypes into a plain {column name: dtype name} dict of strings so they can
# be compared directly with the dtype names stored in the schema file.
data = pd.read_csv("artifacts/data_ingestion/WA_Fn-UseC_-Telco-Customer-Churn.csv")
dataset_columns = data.dtypes.astype(str).to_dict()
dataset_columns


{'customerID': 'object',
 'gender': 'object',
 'SeniorCitizen': 'int64',
 'Partner': 'object',
 'Dependents': 'object',
 'tenure': 'int64',
 'PhoneService': 'object',
 'MultipleLines': 'object',
 'InternetService': 'object',
 'OnlineSecurity': 'object',
 'OnlineBackup': 'object',
 'DeviceProtection': 'object',
 'TechSupport': 'object',
 'StreamingTV': 'object',
 'StreamingMovies': 'object',
 'Contract': 'object',
 'PaperlessBilling': 'object',
 'PaymentMethod': 'object',
 'MonthlyCharges': 'float64',
 'TotalCharges': 'object',
 'Churn': 'object'}

In [12]:
# Load the expected schema from the project root (the working directory at
# this point) and take its COLUMNS section as (column name, dtype name) pairs.
# NOTE(review): the recorded output lists 19 columns and contains neither
# `customerID` nor `Churn`, both of which exist in the CSV -- confirm that
# leaving them out of the schema is intentional.
schema = read_yaml(Path("schema.yaml"))
all_schema = schema.COLUMNS.items()
all_schema

[2025-12-25 12:02:39,003]:INFO:common.py:Yaml file: schema.yaml is loaded successfully


dict_items([('gender', 'object'), ('SeniorCitizen', 'int64'), ('Partner', 'object'), ('Dependents', 'object'), ('tenure', 'int64'), ('PhoneService', 'object'), ('MultipleLines', 'object'), ('InternetService', 'object'), ('OnlineSecurity', 'object'), ('OnlineBackup', 'object'), ('DeviceProtection', 'object'), ('TechSupport', 'object'), ('StreamingTV', 'object'), ('StreamingMovies', 'object'), ('Contract', 'object'), ('PaperlessBilling', 'object'), ('PaymentMethod', 'object'), ('MonthlyCharges', 'float64'), ('TotalCharges', 'object')])

In [15]:
# Walk the schema and, for every column also present in the CSV, print the
# dtype pandas inferred right under the dtype the schema expects.
for column_name, datatype in all_schema:
    print(f"== {column_name} >>>> {datatype}")
    actual_datatype = dataset_columns.get(column_name)
    if actual_datatype:
        print(f"<<<<<<< {actual_datatype}")

== gender >>>> object
<<<<<<< object
== SeniorCitizen >>>> int64
<<<<<<< int64
== Partner >>>> object
<<<<<<< object
== Dependents >>>> object
<<<<<<< object
== tenure >>>> int64
<<<<<<< int64
== PhoneService >>>> object
<<<<<<< object
== MultipleLines >>>> object
<<<<<<< object
== InternetService >>>> object
<<<<<<< object
== OnlineSecurity >>>> object
<<<<<<< object
== OnlineBackup >>>> object
<<<<<<< object
== DeviceProtection >>>> object
<<<<<<< object
== TechSupport >>>> object
<<<<<<< object
== StreamingTV >>>> object
<<<<<<< object
== StreamingMovies >>>> object
<<<<<<< object
== Contract >>>> object
<<<<<<< object
== PaperlessBilling >>>> object
<<<<<<< object
== PaymentMethod >>>> object
<<<<<<< object
== MonthlyCharges >>>> float64
<<<<<<< float64
== TotalCharges >>>> object
<<<<<<< object


In [16]:
from customer_churn_prediction import logger

class DataValidation:
    """
    Checks that a CSV file contains every column named in the schema and that
    pandas infers the dtype the schema expects for each of them.
    """

    # The annotation is quoted (a forward reference) so that defining this
    # class does not require the entity to be resolvable at definition time.
    def __init__(self, config: "DataValidationConfig"):
        """Store the data validation settings used by the checks."""
        self.config = config

    def validate_all_columns(self) -> bool:
        """
        Validate the dataset against the schema and persist the outcome.

        For each schema column one line is written to ``status_message_file``:
        ``<name> validated along with datatype`` (present, dtype matches),
        ``<name> validated without datatype`` (present, dtype differs) or
        ``<name> not validated`` (missing from the dataset). The overall
        result is written to ``status_file`` as ``Validation status: <bool>``.

        Columns that exist in the dataset but not in the schema are ignored.

        Returns:
            True when every schema column is present with the expected dtype,
            otherwise False.

        Raises:
            Exception: any error raised while reading the data or writing the
                reports is logged with its traceback and re-raised.
        """
        try:
            data = pd.read_csv(self.config.local_data_file)
            # Compare dtype *names* because the schema stores plain strings.
            dataset_columns = data.dtypes.astype(str).to_dict()

            validation_status = True
            report_lines = []
            for column_name, expected_dtype in self.config.all_schema.items():
                actual_dtype = dataset_columns.get(column_name)
                if actual_dtype is None:
                    validation_status = False
                    report_lines.append(f"{column_name} not validated")
                elif actual_dtype == expected_dtype:
                    report_lines.append(f"{column_name} validated along with datatype")
                else:
                    validation_status = False
                    report_lines.append(f"{column_name} validated without datatype")

            # Every message line, including the last one, ends with a newline.
            msg = "".join(f"{line}\n" for line in report_lines)

            with open(self.config.status_message_file, "w") as f:
                f.write(msg)
            with open(self.config.status_file, "w") as f:
                f.write(f"Validation status: {validation_status}")

            return validation_status
        except Exception:
            logger.exception("Exception occured while validating the columns")
            raise

In [17]:
# Create the pipeline

try:
    # Build the stage settings from the YAML files, then run the column and
    # dtype check; the result is persisted to the configured status files.
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(data_validation_config)
    data_validation.validate_all_columns()
except Exception:
    # Log with the traceback and re-raise so the failure stays visible in the
    # notebook instead of being swallowed.
    logger.exception("Exception occured while executing the data validation pipeline")
    raise

[2025-12-25 12:04:29,046]:INFO:common.py:Yaml file: config/config.yaml is loaded successfully
[2025-12-25 12:04:29,050]:INFO:common.py:Yaml file: schema.yaml is loaded successfully
[2025-12-25 12:04:29,056]:INFO:common.py:Yaml file: params.yaml is loaded successfully
[2025-12-25 12:04:29,058]:INFO:common.py:Directory created at: artifacts
[2025-12-25 12:04:29,060]:INFO:common.py:Directory created at: artifacts/data_validation
