In [3]:
import os

In [4]:
# Show the kernel's current working directory.
%pwd

'/Users/harshbhatt/Desktop/Projects/walmart-sales-forecast-mlops/research'

In [5]:
# Move the working directory up one level so that the relative paths used
# later in the notebook (e.g. "artifacts/...") resolve from the parent folder.
# The target is relative, so re-running this cell moves up one more directory
# each time it is executed.
os.chdir("../")
%pwd

'/Users/harshbhatt/Desktop/Projects/walmart-sales-forecast-mlops'

In [6]:
import pandas as pd

In [26]:
# Load artifacts/data_ingestion/test.csv and print a summary of its columns,
# non-null counts and dtypes.
# NOTE(review): the frame is bound to the name `features` although the file
# being read is test.csv; a later cell reads this same name.
features = pd.read_csv("artifacts/data_ingestion/test.csv")
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115064 entries, 0 to 115063
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Store      115064 non-null  int64 
 1   Dept       115064 non-null  int64 
 2   Date       115064 non-null  object
 3   IsHoliday  115064 non-null  bool  
dtypes: bool(1), int64(2), object(1)
memory usage: 2.7+ MB


In [27]:
# Collect the column names and their dtypes from `features`, then print one
# "name: dtype" line per column.
cols = features.dtypes.index.tolist()
col_dtypes = features.dtypes.values.tolist()
for col_name, col_dtype in zip(cols, col_dtypes):
    print(f"{col_name}: {col_dtype}")

Store: int64
Dept: int64
Date: object
IsHoliday: bool


In [29]:
from dataclasses import dataclass, field
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data-validation step.

    Attributes:
        root_dir: Directory for this step (created by the configuration
            manager before the config object is built).
        STATUS_FILE: Path of the text file the validation report is written to.
        all_schema: Schema mapping; indexed as ``all_schema[name]["COLUMNS"]``
            by the validation code.
        data_dirs: Mapping of dataset name to the CSV path to validate. When
            omitted, every known dataset name maps to ``None``.
    """
    root_dir: Path
    STATUS_FILE: str
    all_schema: dict
    # A factory is used so each instance gets its own dict rather than
    # sharing one mutable default across instances.
    data_dirs: dict = field(
        default_factory=lambda: dict.fromkeys(('features', 'stores', 'train', 'test'))
    )

In [30]:
from projectFiles.constants import *
from projectFiles.utils.common import read_yaml, create_directories

In [61]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-step config objects.

    The loaded config, params and schema are kept on the instance as
    ``self.config``, ``self.params`` and ``self.schema``.
    """

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH, schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the config object for the data-validation step.

        Reads the ``data_validation`` section of the loaded config, creates
        that section's ``root_dir`` directory, and returns a
        ``DataValidationConfig`` carrying the whole loaded schema. ``root_dir``
        and ``STATUS_FILE`` are passed through exactly as read from the config.

        Returns:
            DataValidationConfig: settings for the validation step.
        """
        stage_cfg = self.config.data_validation

        # Copy the name -> path entries into a plain dict.
        csv_paths = {name: path for name, path in stage_cfg.data_dirs.items()}

        create_directories([stage_cfg.root_dir])

        return DataValidationConfig(
            root_dir = stage_cfg.root_dir,
            STATUS_FILE = stage_cfg.STATUS_FILE,
            all_schema = self.schema,
            data_dirs = csv_paths
        )

In [None]:
# cm = ConfigurationManager()
# val_config = cm.get_data_validation_config()

[2025-03-01 19:40:57,771: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-03-01 19:40:57,772: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-01 19:40:57,774: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-01 19:40:57,775: INFO: common: created directory at: artifacts]
[2025-03-01 19:40:57,776: INFO: common: created directory at: artifacts/data_validation]


In [None]:
# val_config.all_schema["features"]["COLUMNS"].keys()

dict_keys(['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday'])

In [31]:
from projectFiles import logger

In [98]:
class DataValidation:
    """Checks that every column of each configured CSV is declared in the schema.

    A plain-text report is written to ``config.STATUS_FILE``.
    """

    # The annotation is quoted so that defining this class does not require
    # DataValidationConfig to already exist in the namespace.
    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Object exposing ``STATUS_FILE``, ``data_dirs`` and
                ``all_schema``.
        """
        self.config = config

    def validate_data(self) -> bool:
        """Validate the column names of every CSV listed in ``config.data_dirs``.

        For each ``name -> path`` entry the CSV is read with pandas and each of
        its columns is looked up in ``config.all_schema[name]["COLUMNS"]``.
        The status file is overwritten with a header line, one status line per
        column, and a final status line per file.

        NOTE(review): only columns present in the CSV are checked; a schema
        column that is absent from the CSV is not reported. Confirm whether
        that is intended.

        Returns:
            bool: ``True`` when every column of every CSV is declared in the
            schema, ``False`` otherwise.

        Raises:
            Any exception raised while reading a CSV, looking up the schema or
            writing the status file propagates unchanged.
        """
        full_status = True

        # Open the report once; text written before an error is still flushed
        # when the ``with`` block exits.
        with open(self.config.STATUS_FILE, 'w') as sfile:
            sfile.write("Initializing validation tests")

            for name, csv_path in self.config.data_dirs.items():
                df = pd.read_csv(csv_path)
                schema_cols = self.config.all_schema[name]["COLUMNS"].keys()
                df_status = True

                sfile.write(f"\n\nValidating columns for {name}.csv")
                for col in df.columns:
                    col_ok = col in schema_cols
                    df_status = df_status and col_ok
                    # Report the column being checked, not the CSV path.
                    sfile.write(f"\nStatus of {col}: {col_ok}")
                sfile.write(f"\n{name}.csv final validation status: {df_status}")

                full_status = full_status and df_status

        return full_status

In [99]:
# Run the data-validation step end to end: load the configuration, build the
# validation settings, and validate the configured CSV files. Exceptions
# propagate unchanged. The call is the last expression of the cell so the
# overall validation result is displayed.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config = data_validation_config)
data_validation.validate_data()

[2025-03-01 23:52:57,525: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-03-01 23:52:57,526: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-01 23:52:57,528: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-01 23:52:57,529: INFO: common: created directory at: artifacts]
[2025-03-01 23:52:57,529: INFO: common: created directory at: artifacts/data_validation]
