-logging

In [15]:
import logging
import os

from from_root import from_root
from datetime import datetime

# One log file per process start, named after the launch timestamp.
LOG_FILE = datetime.now().strftime("%m_%d_%Y_%H_%M_%S") + ".log"

# Log files are collected in a "logs" directory relative to the working directory.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)

logs_path = os.path.join(log_dir, LOG_FILE)

# Send every record from DEBUG upwards to the file named above.
logging.basicConfig(
    level=logging.DEBUG,
    format="[ %(asctime)s ] %(name)s - %(levelname)s - %(message)s",
    filename=logs_path,
)

#Exception

In [16]:
import os
import sys

def error_message_detail(error, error_detail: sys):
    """
    Build a one-line description of an error and the place it was raised.

    :param error: the exception (or message) to describe; rendered with ``str()``
    :param error_detail: object exposing ``exc_info()``, normally the ``sys`` module
    :return: message holding the script file name, line number and error text
    """
    _, _, exc_tb = error_detail.exc_info()
    if exc_tb is None:
        # Not called while an exception is being handled: fall back to the
        # traceback carried by the exception object itself, if there is one.
        exc_tb = getattr(error, "__traceback__", None)

    if exc_tb is None:
        # No location information is available; keep the message shape intact.
        file_name, line_number = "<unknown>", "<unknown>"
    else:
        file_name = exc_tb.tb_frame.f_code.co_filename
        line_number = exc_tb.tb_lineno

    error_message = "Error occurred python script name [{0}] line number [{1}] error message [{2}]".format(
        file_name, line_number, str(error)
    )

    return error_message

class Credit_card_Exception(Exception):
    """Project exception whose text names the failing script and line."""

    def __init__(self, error_message, error_detail):
        """
        :param error_message: the original error (exception or string)
        :param error_detail: object exposing ``exc_info()``, normally the ``sys`` module
        """
        super().__init__(error_message)
        # Expand the bare error into the located message returned by __str__.
        detailed_message = error_message_detail(error_message, error_detail=error_detail)
        self.error_message = detailed_message

    def __str__(self):
        """Return the located error message built in ``__init__``."""
        return self.error_message

# Utility

In [17]:
import os
import sys

import numpy as np
import dill
import yaml
from pandas import DataFrame

from Demo_project.exception import Credit_card_Exception
from Demo_project.logger import logging


def read_yaml_file(file_path: str) -> dict:
    """
    Parse the YAML document stored at ``file_path`` with ``yaml.safe_load``.

    :param file_path: location of the YAML file
    :return: the parsed content
    :raises Credit_card_Exception: wrapping any error raised while opening or parsing
    """
    try:
        with open(file_path, "rb") as stream:
            parsed = yaml.safe_load(stream)
        return parsed
    except Exception as err:
        raise Credit_card_Exception(err, sys) from err


def write_yaml_file(file_path: str, content: object, replace: bool = False) -> None:
    """
    Dump ``content`` as YAML to ``file_path``, creating parent directories as needed.

    :param file_path: destination file
    :param content: object handed to ``yaml.dump``
    :param replace: when True, an existing file is removed before writing
    :raises Credit_card_Exception: wrapping any error raised while writing
    """
    try:
        if replace and os.path.exists(file_path):
            os.remove(file_path)

        # A bare file name has an empty dirname, and os.makedirs("") raises;
        # only create the parent when there is one.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(file_path, "w") as file:
            yaml.dump(content, file)
    except Exception as e:
        raise Credit_card_Exception(e, sys) from e


def load_object(file_path: str) -> object:
    """
    Read the file at ``file_path`` with ``dill.load`` and return the result.

    :param file_path: location of the serialized object
    :return: the deserialized object
    :raises Credit_card_Exception: wrapping any error raised while loading
    """
    logging.info("Entered the load_object method of utils")

    try:
        with open(file_path, "rb") as source:
            restored = dill.load(source)

        logging.info("Exited the load_object method of utils")
        return restored
    except Exception as err:
        raise Credit_card_Exception(err, sys) from err

def save_numpy_array_data(file_path: str, array: np.ndarray):
    """
    Save numpy array data to file
    file_path: str location of file to save
    array: np.ndarray data to save
    Raises Credit_card_Exception wrapping any error raised while saving.
    """
    try:
        # A bare file name has an empty dirname, and os.makedirs("") raises;
        # only create the parent directory when there is one.
        dir_path = os.path.dirname(file_path)
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        with open(file_path, 'wb') as file_obj:
            np.save(file_obj, array)
    except Exception as e:
        raise Credit_card_Exception(e, sys) from e


def load_numpy_array_data(file_path: str) -> np.array:
    """
    Read numpy data back from a file.
    file_path: str location of the file to read
    return: whatever ``np.load`` yields for that file
    Raises Credit_card_Exception wrapping any error raised while loading.
    """
    try:
        with open(file_path, 'rb') as source:
            loaded = np.load(source)
        return loaded
    except Exception as err:
        raise Credit_card_Exception(err, sys) from err


def save_object(file_path: str, obj: object) -> None:
    """
    Serialize ``obj`` to ``file_path`` with ``dill.dump``, creating parent
    directories as needed.

    :param file_path: destination file
    :param obj: object to serialize
    :raises Credit_card_Exception: wrapping any error raised while saving
    """
    logging.info("Entered the save_object method of utils")

    try:
        # A bare file name has an empty dirname, and os.makedirs("") raises;
        # only create the parent directory when there is one.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, "wb") as file_obj:
            dill.dump(obj, file_obj)

        logging.info("Exited the save_object method of utils")

    except Exception as e:
        raise Credit_card_Exception(e, sys) from e


def drop_columns(df: DataFrame, cols: list)-> DataFrame:
    """
    Return a copy of ``df`` without the listed columns.
    df: pandas DataFrame
    cols: list of column names to remove
    Raises Credit_card_Exception wrapping any error raised by the drop.
    """
    logging.info("Entered drop_columns methon of utils")

    try:
        # DataFrame.drop returns a new frame; the caller's frame is left untouched.
        trimmed = df.drop(columns=cols, axis=1)
        logging.info("Exited the drop_columns method of utils")
        return trimmed
    except Exception as err:
        raise Credit_card_Exception(err, sys) from err

# Data_Ingestion 

1.constant_file

In [None]:
# Contents of the constants module (Demo_project/constants)
import os
from datetime import date

from dotenv import load_dotenv
load_dotenv()


# MongoDB settings.
DATABASE_NAME = "demo_project_DB"
COLLECTION_NAME = "fraud_data"
MONGODB_URL_KEY = "MONGODB_URL"

# Pipeline-wide names.
PIPELINE_NAME: str = "Demo_project"  # the pipeline's name, not a source directory
ARTIFICAT_DIR: str = "artificat"

# Model and preprocessing artefacts.
MODEL_FILE_NAME = "model.pkl"
TARGET_COLUMN = "default payment next month"
PREPROCESSING_OBJECT_FILE_NAME = "preprocessing.pkl"

# Data file names.
FILE_NAME: str = "default of credit card data.xls"
TRAIN_FILE_NAME: str = "train.xls"
TEST_FILE_NAME: str = "test.xls"

# Data Ingestion related constants; each name starts with DATA_INGESTION.
DATA_INGESTION_COLLECTION_NAME: str = "fraud_data"
DATA_INGESTION_DIR_NAME: str = "data_ingestion"
DATA_INGESTION_FEATURE_STORE_DIR: str = "feature_store"
DATA_INGESTION_INGESTED_DIR: str = "ingested"
DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO: float = 0.2

# Creating MongoDBClient

In [19]:
# Contents of Demo_project/configuration/mongo_db_connection.py

import sys
import os
import pymongo
import certifi
from Demo_project.exception import Credit_card_Exception
from Demo_project.logger import logging
from Demo_project.constants import DATABASE_NAME, MONGODB_URL_KEY

ca = certifi.where()

class MongoDBClient:
    """
    Class Name  :   MongoDBClient
    Description :   Creates the pymongo client once, shares it between all
                    instances, and exposes a handle to the requested database.

    Output      :   connection to mongodb database
    On Failure  :   raises an exception
    """
    # Shared by every instance so the client is constructed only once.
    client = None

    def __init__(self, database_name=DATABASE_NAME) -> None:
        """
        :param database_name: name of the database to expose as ``self.database``
        :raises Credit_card_Exception: when the URL environment variable is
            missing or the client cannot be created
        """
        try:
            if MongoDBClient.client is None:
                mongo_db_url = os.getenv(MONGODB_URL_KEY)
                # An empty value is as unusable as a missing one.
                if not mongo_db_url:
                    raise Exception(f"Environment key: {MONGODB_URL_KEY} not set.")
                MongoDBClient.client = pymongo.MongoClient(mongo_db_url, tlsCAFile=ca)
            self.client = MongoDBClient.client
            self.database = self.client[database_name]
            self.database_name = database_name
            logging.info("Connected to MongoDB database successfull")
        except Exception as e:
            # Chain the cause, matching the utility functions' error handling.
            raise Credit_card_Exception(e, sys) from e

In [20]:
from Demo_project.configuration.mongo_db_connection import MongoDBClient

def main():
    """
    Smoke-test the MongoDB connection: connect with the default database,
    print the connection details, then print every document of one collection.
    Any failure is reported on stdout instead of being raised.
    """
    try:
        # Initialize MongoDB client
        mongo_client = MongoDBClient()  # Default uses DATABASE_NAME

        # Print connection details for confirmation.
        # MongoDBClient stores these as ``database_name`` / ``database``.
        print(f"Connected to MongoDB database: {mongo_client.database_name}")
        print(f"Client: {mongo_client.client}")

        # Accessing a specific collection (example)
        collection_name = "fraud_data"  # Replace with your collection name
        collection = mongo_client.database[collection_name]
        print(f"Connected to collection: {collection_name}")

        # Fetching and displaying documents from the collection
        documents = collection.find()
        print("Documents in the collection:")
        for doc in documents:
            print(doc)

    except Exception as e:
        print(f"Error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Error occurred: Error occurred python script name [c:\demo\demo_project\Demo_project\configuration\mongo_db_connection.py] line number[28] error message [Environment key: MONGODB_URL not set.]


# Data_Transformation

In [None]:
import sys
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from us_visa.constants import TARGET_COLUMN, SCHEMA_FILE_PATH, CURRENT_YEAR
from us_visa.entity.config_entity import DataTransformationConfig
from us_visa.entity.artifact_entity import DataTransformationArtifact, DataIngestionArtifact, DataValidationArtifact
from us_visa.exception import USvisaException
from us_visa.logger import logging
from us_visa.utils.main_utils import save_object, save_numpy_array_data, read_yaml_file, drop_columns
from us_visa.entity.estimator import TargetValueMapping

class DataTransformation:
    """
    Data-transformation stage: cleans the ingested train/test files, fits the
    preprocessing ColumnTransformer on the training data and saves the
    transformed arrays together with the fitted preprocessor.
    """

    def __init__(self, data_ingestion_artifact: DataIngestionArtifact,
                 data_transformation_config: DataTransformationConfig,
                 data_validation_artifact: DataValidationArtifact):
        """
        :param data_ingestion_artifact: Output reference of data ingestion artifact stage
        :param data_transformation_config: Configuration for data transformation
        :param data_validation_artifact: Output reference of data validation stage;
            its ``validation_status`` gates the transformation
        """
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_transformation_config = data_transformation_config
            self.data_validation_artifact = data_validation_artifact
            self._schema_config = read_yaml_file(file_path=SCHEMA_FILE_PATH)
        except Exception as e:
            raise USvisaException(e, sys) from e

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Read the CSV file at ``file_path`` into a DataFrame."""
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise USvisaException(e, sys) from e

    @staticmethod
    def replace_to_zero(df, columns):
        """
        Replace the values -2, -1 and 0 in the specified columns with 0.

        The frame is modified in place and also returned.
        """
        try:
            for col in columns:
                fil = df[col].eq(-2) | df[col].eq(-1) | df[col].eq(0)
                df.loc[fil, col] = 0
            return df
        except Exception as e:
            raise USvisaException(e, sys) from e

    @staticmethod
    def remove_outliers(df, columns):
        """
        Remove outliers using IQR method for specified columns.

        Columns are processed in order, so each column's quartiles are computed
        on the rows that survived the previous columns.
        """
        try:
            for col in columns:
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
            return df
        except Exception as e:
            raise USvisaException(e, sys) from e

    @staticmethod
    def calculate_vif(df, numerical_columns):
        """
        Calculate Variance Inflation Factor (VIF) for numerical columns.

        :return: DataFrame with one row per column and the columns
            ``Feature`` and ``VIF``
        """
        try:
            X = df[numerical_columns]
            vif_data = pd.DataFrame()
            vif_data["Feature"] = X.columns
            vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
            return vif_data
        except Exception as e:
            raise USvisaException(e, sys) from e

    def get_data_transformer_object(self, available_columns=None):
        """
        Method to create and return a data transformer object for the data.

        :param available_columns: optional iterable of column names present in
            the frame to be transformed. When given, every schema column list is
            restricted to these names so the ColumnTransformer never references
            a column that was removed earlier (e.g. by the VIF filter). When
            omitted, the schema lists are used unchanged.
        :return: unfitted ``ColumnTransformer``
        """
        logging.info("Entered get_data_transformer_object method of DataTransformation class")

        try:
            available = None if available_columns is None else set(available_columns)

            def _present(columns):
                # Keep the schema's order; drop names absent from the frame.
                if available is None:
                    return columns
                return [col for col in (columns or []) if col in available]

            # Pipeline for numerical columns
            numeric_transformer = Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="median")),  # Impute missing values for numerical columns
                ("scaler", StandardScaler())  # Scaling numerical features
            ])

            # Pipeline for categorical columns
            oh_transformer = Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),  # Impute missing values for categorical columns
                ("one_hot_encoder", OneHotEncoder()),  # One-hot encoding for categorical features
                ("scaler", StandardScaler(with_mean=False))  # Scaling categorical features
            ])

            # Ordinal encoding for specified columns
            ordinal_encoder = OrdinalEncoder()

            # Get columns from schema configuration
            oh_columns = _present(self._schema_config['oh_columns'])
            or_columns = _present(self._schema_config['or_columns'])
            transform_columns = _present(self._schema_config['transform_columns'])
            num_features = _present(self._schema_config['num_features'])

            # Power Transformer for feature transformation
            transform_pipe = Pipeline(steps=[('transformer', PowerTransformer(method='yeo-johnson'))])

            # Combining all transformations using ColumnTransformer
            preprocessor = ColumnTransformer([
                ("OneHotEncoder", oh_transformer, oh_columns),
                ("Ordinal_Encoder", ordinal_encoder, or_columns),
                ("Transformer", transform_pipe, transform_columns),
                ("StandardScaler", numeric_transformer, num_features)
            ])

            logging.info("Created preprocessor object from ColumnTransformer")
            return preprocessor
        except Exception as e:
            raise USvisaException(e, sys) from e

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """
        Method to initiate the data transformation component for the pipeline.

        :return: artifact holding the paths of the saved preprocessor and of the
            transformed train and test arrays
        :raises USvisaException: when validation failed upstream or any step fails
        """
        try:
            if not self.data_validation_artifact.validation_status:
                raise Exception(self.data_validation_artifact.message)

            logging.info("Starting data transformation")

            # Read and clean data
            train_df = DataTransformation.read_data(file_path=self.data_ingestion_artifact.trained_file_path)
            test_df = DataTransformation.read_data(file_path=self.data_ingestion_artifact.test_file_path)

            # Replace invalid values in specified columns
            target_columns = self._schema_config['replace_invalid_columns']
            train_df = self.replace_to_zero(train_df, target_columns)
            test_df = self.replace_to_zero(test_df, target_columns)

            # Remove duplicates from both train and test data
            train_df = train_df.drop_duplicates()
            test_df = test_df.drop_duplicates()

            # Remove outliers from numerical columns
            # NOTE(review): the test split is filtered with bounds computed on the
            # test split itself — confirm this is intended.
            numerical_columns = self._schema_config['num_features']
            train_df = self.remove_outliers(train_df, numerical_columns)
            test_df = self.remove_outliers(test_df, numerical_columns)

            # VIF Analysis for Multicollinearity Check
            vif_data = self.calculate_vif(train_df, numerical_columns)
            vif_threshold = 5.0
            selected_features = vif_data[vif_data["VIF"] <= vif_threshold]["Feature"].tolist()

            # VIF is computed over the numerical columns only, so it can only
            # reject numerical columns. Drop just those; keeping nothing but the
            # surviving numericals would also discard the categorical columns
            # that the preprocessor is configured to encode.
            rejected_features = [col for col in numerical_columns if col not in selected_features]
            logging.info(f"Dropping numerical features with VIF above {vif_threshold}: {rejected_features}")
            train_df = train_df.drop(columns=rejected_features)
            test_df = test_df.drop(columns=rejected_features)

            # Separating features and target
            input_feature_train_df = train_df.drop(columns=[TARGET_COLUMN], axis=1)
            target_feature_train_df = train_df[TARGET_COLUMN]

            input_feature_test_df = test_df.drop(columns=[TARGET_COLUMN], axis=1)
            target_feature_test_df = test_df[TARGET_COLUMN]

            # Getting preprocessor object, limited to the columns that survived
            preprocessor = self.get_data_transformer_object(
                available_columns=input_feature_train_df.columns
            )

            # Fit on train only; the test split is transformed with the fitted object
            input_feature_train_arr = preprocessor.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessor.transform(input_feature_test_df)

            # Apply SMOTEENN to handle class imbalance — training data only.
            # Resampling the test split would replace real held-out samples with
            # synthetic ones and distort evaluation.
            smt = SMOTEENN(sampling_strategy="minority")
            input_feature_train_final, target_feature_train_final = smt.fit_resample(
                input_feature_train_arr, target_feature_train_df
            )

            # Create final train and test arrays (target is the last column)
            train_arr = np.c_[input_feature_train_final, np.array(target_feature_train_final)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            # Save preprocessed data and objects
            save_object(self.data_transformation_config.transformed_object_file_path, preprocessor)
            save_numpy_array_data(self.data_transformation_config.transformed_train_file_path, array=train_arr)
            save_numpy_array_data(self.data_transformation_config.transformed_test_file_path, array=test_arr)

            # Return the transformation artifact
            return DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path
            )

        except Exception as e:
            raise USvisaException(e, sys) from e


- DataValidation.py:

In [4]:
import json
import sys
import os
import yaml
import pandas as pd
from pandas import DataFrame
from pandas.api.types import is_dtype_equal
from typing import Dict, Any ,Tuple
from evidently.model_profile import Profile
from evidently.model_profile.sections import DataDriftProfileSection

from Demo_project.exception import Credit_card_Exception
from Demo_project.logger import logging
from Demo_project.utils.main_utils import read_yaml_file, write_yaml_file
from Demo_project.entity.artifact_entity import DataIngestionArtifact, DataValidationArtifact
from Demo_project.entity.config_entity import DataValidationConfig
from Demo_project.constants import SCHEMA_FILE_PATH

In [5]:
class DataValidation:
    """Data-validation stage; holds the ingestion artifact, its own config and the schema."""

    def __init__(self, data_ingestion_artifact: DataIngestionArtifact, data_validation_config: DataValidationConfig):
        """
        :param data_ingestion_artifact: Output reference of data ingestion artifact stage
        :param data_validation_config: configuration for data validation
        :raises Credit_card_Exception: wrapping any error raised while reading the schema file
        """
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_config = data_validation_config
            # The schema is read once from SCHEMA_FILE_PATH and kept on the instance.
            schema = read_yaml_file(file_path=SCHEMA_FILE_PATH)
            self._schema_config = schema
        except Exception as err:
            raise Credit_card_Exception(err, sys)

    @staticmethod
    def read_data(file_path) -> DataFrame:
        """Read the CSV file at ``file_path`` into a DataFrame."""
        try:
            frame = pd.read_csv(file_path)
            return frame
        except Exception as err:
            raise Credit_card_Exception(err, sys)

In [6]:
def check_data_types(self, dataframe: DataFrame) -> bool:
    """
    Method Name :   check_data_types
    Description :   This method validates that every column listed under the
                    schema's "dtypes" key is present in the dataframe and has
                    the data type the schema declares for that column
    Output      :   Returns True when nothing is missing or mismatched, else False
    On Failure  :   Write an exception log and then raise an exception
    """
    try:
        schema_col_dtype = self._schema_config.get("dtypes", {}) or {}
        logging.info(f"schema dtypes: {schema_col_dtype}")

        df_col = set(dataframe.columns)
        mismatched_columns = []
        mismatched_dtypes = []

        # Compare column by column: a dtype merely occurring somewhere in the
        # frame says nothing about whether *this* column has the right type.
        for column, expected_dtype in schema_col_dtype.items():
            if column not in df_col:
                mismatched_columns.append(f"{column} is missing in the DataFrame.")
                logging.info(f"Column not found in dataframe: {column}")
                continue

            logging.info(f"schema column: {column}  is present in dataframe")

            found_dtype = dataframe[column].dtype.name
            if found_dtype == str(expected_dtype):
                logging.info(f"schema dtype : {expected_dtype}  is present in dataframe dtype")
            else:
                mismatched_dtypes.append(
                    f"Data type mismatch for column: {column} , Expected: {expected_dtype} , Found: {found_dtype}"
                )

        if mismatched_columns:
            logging.info(f"schema data column mismatched with dataframe_columns.")

        if mismatched_dtypes:
            logging.info(f"schema data type mismatched with dataframe_dtypes.")

        return not (mismatched_columns or mismatched_dtypes)

    except Exception as e:
        raise Credit_card_Exception(e, sys)

In [8]:
# Provided schema and dataframe metadata
# Expected schema: every listed column is declared as int64.
schema_data = dict.fromkeys(
    [
        'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
        'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
        'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
        'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
        'default payment next month',
    ],
    'int64',
)

# Column names as reported for the dataframe.
df_col = [
    'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
    'default payment next month',
]

# One dtype name per entry of df_col, in the same order.
df_dtype = ['int64'] * 25

# Pair each column name with its dtype name.
dataframe_schema = {name: kind for name, kind in zip(df_col, df_dtype)}
print(dataframe_schema)
print(dataframe_schema.items())

# Compare an expected schema with an observed one
def compare_schemas(schema1, schema2):
    """
    Compare two ``{column: dtype}`` mappings.

    :param schema1: the expected schema
    :param schema2: the schema to check against it
    :return: tuple ``(mismatched_columns, missing_columns, extra_columns)`` —
        messages for columns whose dtype differs, columns of ``schema1`` absent
        from ``schema2``, and columns of ``schema2`` absent from ``schema1``
    """
    missing_columns = [name for name in schema1 if name not in schema2]

    mismatched_columns = [
        f"{name}: Expected {expected}, but got {schema2[name]}"
        for name, expected in schema1.items()
        if name in schema2 and schema2[name] != expected
    ]

    extra_columns = [name for name in schema2 if name not in schema1]

    return mismatched_columns, missing_columns, extra_columns


# Run the comparison of the expected schema against the dataframe's schema
mismatched, missing, extra = compare_schemas(schema_data, dataframe_schema)

# Print each non-empty category under its own heading
for heading, entries in (
    ("Mismatched columns:", mismatched),
    ("\nMissing columns:", missing),
    ("\nExtra columns:", extra),
):
    if entries:
        print(heading)
        for entry in entries:
            print(entry)

if not (mismatched or missing or extra):
    print("Schemas match perfectly!")


{'ID': 'int64', 'LIMIT_BAL': 'int64', 'SEX': 'int64', 'EDUCATION': 'int64', 'MARRIAGE': 'int64', 'AGE': 'int64', 'PAY_0': 'int64', 'PAY_2': 'int64', 'PAY_3': 'int64', 'PAY_4': 'int64', 'PAY_5': 'int64', 'PAY_6': 'int64', 'BILL_AMT1': 'int64', 'BILL_AMT2': 'int64', 'BILL_AMT3': 'int64', 'BILL_AMT4': 'int64', 'BILL_AMT5': 'int64', 'BILL_AMT6': 'int64', 'PAY_AMT1': 'int64', 'PAY_AMT2': 'int64', 'PAY_AMT3': 'int64', 'PAY_AMT4': 'int64', 'PAY_AMT5': 'int64', 'PAY_AMT6': 'int64', 'default payment next month': 'int64'}


TypeError: get expected at least 1 argument, got 0

In [16]:
# Expected schema: every listed column is declared as int64.
schema_data = dict.fromkeys(
    [
        'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
        'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
        'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
        'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
        'default payment next month',
    ],
    'int64',
)

# Column names as reported for the dataframe.
df_col = [
    'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
    'default payment next month',
]

# One dtype name per entry of df_col, in the same order.
df_dtype = ['int64'] * 25

# Pair each column name with its dtype name.
dataframe_schema = {name: kind for name, kind in zip(df_col, df_dtype)}
print()

# Schema columns absent from the dataframe, and columns whose dtype differs.
missing_columns = [name for name in schema_data if name not in dataframe_schema]
mismatched_columns = [
    f"{name}: Expected {expected}, but got {dataframe_schema[name]}"
    for name, expected in schema_data.items()
    if name in dataframe_schema and dataframe_schema[name] != expected
]
extra_columns = []
print(mismatched_columns)
print(missing_columns)
print(extra_columns)  # still empty here: extras are collected only below

# Dataframe columns the schema does not mention.
extra_columns.extend(name for name in dataframe_schema if name not in schema_data)

print(extra_columns)


[]
[]
[]
[]


In [17]:
# Expected schema: every listed column is declared as int64.
schema_col_dtype = dict.fromkeys(
    [
        'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
        'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
        'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
        'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
        'default payment next month',
    ],
    'int64',
)

# Column names as reported for the dataframe.
df_col = [
    'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
    'default payment next month',
]

# One dtype name per entry of df_col, in the same order.
df_dtype = ['int64'] * 25

# Pair each column name with its dtype name.
dataframe_schema = {name: kind for name, kind in zip(df_col, df_dtype)}
def check_data_types(self, dataframe: DataFrame) -> bool:
    """
    Method Name :   check_data_types
    Description :   This method validates if the data types and columns of the dataframe match the schema.
                    The schema is taken from ``self._schema_config["dtypes"]`` when the
                    instance provides one; otherwise the cell-level ``schema_col_dtype``
                    mapping is used. Columns and dtypes are always read from the
                    ``dataframe`` argument itself.
    Output      :   Returns bool value based on validation results
    On Failure  :   Write an exception log and then raise an exception
    """
    try:
        # Prefer the instance's schema; fall back to the module-level mapping so
        # ad-hoc calls without a configured instance keep working.
        instance_schema = getattr(self, "_schema_config", None)
        if instance_schema is not None:
            expected_dtypes = instance_schema.get("dtypes", {}) or {}
        else:
            expected_dtypes = schema_col_dtype
        logging.info(f"Schema data types: {expected_dtypes}")

        # Inspect the dataframe that was actually passed in, not cell-level lists.
        dataframe_columns = list(dataframe.columns)
        dataframe_dtypes = [dtype.name for dtype in dataframe.dtypes]

        logging.info(f"DataFrame columns: {dataframe_columns}")
        logging.info(f"DataFrame data types: {dataframe_dtypes}")

        # Validate column presence
        missing_columns = [col for col in expected_dtypes if col not in dataframe_columns]
        if missing_columns:
            logging.info(f"Missing columns in DataFrame: {missing_columns}")

        # Validate data types (only for columns that are present)
        mismatched_dtypes = [
            f"Column: {col}, Expected: {expected_dtype}, Found: {dataframe[col].dtypes}"
            for col, expected_dtype in expected_dtypes.items()
            if col in dataframe_columns and str(dataframe[col].dtypes) != expected_dtype
        ]
        if mismatched_dtypes:
            logging.info(f"Data type mismatches: {mismatched_dtypes}")

        # Return False if any validation errors are found
        if missing_columns or mismatched_dtypes:
            logging.info("Schema validation failed.")
            return False

        # If no issues, validation is successful
        logging.info("Schema validation passed.")
        return True

    except Exception as e:
        raise Credit_card_Exception(e, sys)


In [None]:
# checking data transformation


import sys
import os
import numpy as np
import pandas as pd
from pandas import DataFrame
from imblearn.combine import SMOTEENN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
#from statsmodels.stats.outliers_influence import variance_inflation_factor

from Demo_project.constants import TARGET_COLUMN, SCHEMA_FILE_PATH
from Demo_project.entity.config_entity import DataTransformationConfig
from Demo_project.entity.artifact_entity import DataTransformationArtifact, DataIngestionArtifact, DataValidationArtifact
from Demo_project.exception import Credit_card_Exception
from Demo_project.logger import logging
from Demo_project.utils.main_utils import save_object, save_numpy_array_data, read_yaml_file, drop_columns
#from Demo_project.entity.estimator import TargetValueMapping


class DataTransformation:
    """
    Data transformation stage of the pipeline.

    Reads the ingested train/test CSV files, applies the value clean-up steps,
    fits a ColumnTransformer on the training features, resamples with SMOTEENN
    and saves the preprocessor plus the resulting train/test arrays.
    """

    def __init__(self, data_ingestion_artifact: DataIngestionArtifact,
                 data_transformation_config: DataTransformationConfig,
                 data_validation_artifact: DataValidationArtifact):
        """
        :param data_ingestion_artifact: Output reference of data ingestion artifact stage
        :param data_transformation_config: Configuration for data transformation
        :param data_validation_artifact: Output reference of data validation artifact stage
        """
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_transformation_config = data_transformation_config
            self.data_validation_artifact = data_validation_artifact
            self._schema_config = read_yaml_file(file_path=SCHEMA_FILE_PATH)
        except Exception as e:
            raise Credit_card_Exception(e, sys) from e

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """
        Load the CSV file at ``file_path`` into a DataFrame.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise Credit_card_Exception(e, sys) from e

    @staticmethod
    def replace_to_zero(df, columns):
        """
        Replace the values -2, -1 and 0 in the specified columns with 0.

        The DataFrame is modified in place and also returned.
        """
        try:
            for col in columns:
                df.loc[df[col].isin([-2, -1, 0]), col] = 0
            return df
        except Exception as e:
            raise Credit_card_Exception(e, sys) from e

    @staticmethod
    def replace_education_values(df):
        """
        Replace values 4, 5, 6 with 0 in the EDUCATION column.
        """
        try:
            df["EDUCATION"] = df["EDUCATION"].replace({4: 0, 5: 0, 6: 0})
            return df
        except Exception as e:
            raise Credit_card_Exception(e, sys) from e

    @staticmethod
    def replace_marriage_values(df):
        """
        Replace value 0 with 3 in the MARRIAGE column.

        A missing MARRIAGE column raises ValueError, which is re-raised
        wrapped in Credit_card_Exception like every other failure.
        """
        try:
            if "MARRIAGE" not in df.columns:
                raise ValueError("Column 'MARRIAGE' not found in DataFrame.")
            df["MARRIAGE"] = df["MARRIAGE"].replace({0: 3})
            return df
        except Exception as e:
            raise Credit_card_Exception(e, sys) from e

    # NOTE: an IQR-based remove_outliers step used to be sketched here as
    # commented-out code; it was never enabled and is not part of the pipeline.

    @staticmethod
    def remove_duplicates(df):
        """
        Remove duplicate rows and return the de-duplicated DataFrame.

        The input DataFrame itself is left untouched.
        """
        try:
            before_count = len(df)
            # drop_duplicates(inplace=True) returns None, so the returned
            # copy must be used instead of assigning the in-place result.
            df = df.drop_duplicates()
            logging.info(f"Removed {before_count - len(df)} duplicate rows.")
            return df
        except Exception as e:
            raise Credit_card_Exception(e, sys) from e

    def get_data_transformer_object(self) -> ColumnTransformer:
        """
        Method Name :   get_data_transformer_object
        Description :   This method creates and returns a data transformer object for the data

        Output      :   data transformer object is created and returned
        On Failure  :   Write an exception log and then raise an exception
        """
        logging.info(
            "Entered get_data_transformer_object method of DataTransformation class"
        )

        try:
            # Column groups come from the schema file loaded in __init__.
            num_features = self._schema_config['num_features']
            oh_columns = self._schema_config['oh_columns']
            or_columns = self._schema_config['or_columns']
            transform_columns = self._schema_config['transform_columns']
            logging.info("Got numerical cols from schema config")

            # If the data ever contains missing values, wrap these in a
            # Pipeline that starts with a SimpleImputer (median for numeric
            # columns, most_frequent for categorical columns).
            numeric_transformer = StandardScaler()
            oh_transformer = OneHotEncoder()
            ordinal_encoder = OrdinalEncoder()
            logging.info("Initialized StandardScaler, OneHotEncoder, OrdinalEncoder")

            logging.info("Initialize PowerTransformer")
            transform_pipe = Pipeline(steps=[
                ('transformer', PowerTransformer(method='yeo-johnson'))
            ])

            # Combining all transformations using ColumnTransformer
            preprocessor = ColumnTransformer(
                [
                    ("OneHotEncoder", oh_transformer, oh_columns),
                    ("Ordinal_Encoder", ordinal_encoder, or_columns),
                    ("Transformer", transform_pipe, transform_columns),
                    ("StandardScaler", numeric_transformer, num_features),
                ]
            )
            logging.info("Created preprocessor object from ColumnTransformer")

            logging.info("Exited get_data_transformer_object method of DataTransformation class")

            return preprocessor
        except Exception as e:
            raise Credit_card_Exception(e, sys) from e

    def _clean_dataframe(self, df):
        """
        Apply the value-replacement steps and duplicate removal to one DataFrame.
        """
        invalid_value_columns = self._schema_config['replace_invalid_values_in_columns']
        df = self.replace_to_zero(df, invalid_value_columns)
        df = self.replace_education_values(df)
        df = self.replace_marriage_values(df)
        return self.remove_duplicates(df)

    def _split_features_and_target(self, df):
        """
        Split one DataFrame into (input features, target column).

        The schema's 'drop_columns' are passed to the drop_columns helper for
        the input features.
        """
        input_feature_df = df.drop(columns=[TARGET_COLUMN])
        target_feature_df = df[TARGET_COLUMN]
        input_feature_df = drop_columns(
            df=input_feature_df, cols=self._schema_config['drop_columns']
        )
        return input_feature_df, target_feature_df

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """
        Method Name :   initiate_data_transformation
        Description :   This method initiates the data transformation component for the pipeline

        Output      :   data transformer steps are performed and preprocessor object is created
        On Failure  :   Write an exception log and then raise an exception
        """

        try:
            # Refuse to transform data that failed the validation stage.
            if not self.data_validation_artifact.validation_status:
                raise Exception(self.data_validation_artifact.message)

            logging.info("Starting data transformation")
            preprocessor = self.get_data_transformer_object()
            logging.info("Got the preprocessor object")

            train_df = DataTransformation.read_data(file_path=self.data_ingestion_artifact.trained_file_path)
            test_df = DataTransformation.read_data(file_path=self.data_ingestion_artifact.test_file_path)

            train_df = self._clean_dataframe(train_df)
            test_df = self._clean_dataframe(test_df)
            logging.debug(
                "Cleaned train shape: %s, cleaned test shape: %s",
                train_df.shape, test_df.shape,
            )

            # If the target column ever becomes categorical, map it to
            # numerical values here before resampling.
            input_feature_train_df, target_feature_train_df = self._split_features_and_target(train_df)
            logging.info("Got train features and test features of Training dataset")
            logging.info("drop the columns in drop_cols of Training dataset")

            input_feature_test_df, target_feature_test_df = self._split_features_and_target(test_df)
            logging.info("drop the columns in drop_cols of Test dataset")
            logging.info("Got train features and test features of Testing dataset")

            logging.info(
                "Applying preprocessing object on training dataframe and testing dataframe"
            )

            # Fit on the training features only; the test features are
            # transformed with the already-fitted preprocessor.
            input_feature_train_arr = preprocessor.fit_transform(input_feature_train_df)

            logging.info(
                "Used the preprocessor object to fit transform the train features"
            )

            input_feature_test_arr = preprocessor.transform(input_feature_test_df)

            logging.info("Used the preprocessor object to transform the test features")

            logging.info("Applying SMOTEENN on Training dataset")

            smt = SMOTEENN(sampling_strategy="minority")

            input_feature_train_final, target_feature_train_final = smt.fit_resample(
                input_feature_train_arr, target_feature_train_df
            )

            logging.info("Applied SMOTEENN on training dataset")

            logging.info("Applying SMOTEENN on testing dataset")

            # NOTE(review): resampling the test split changes the distribution
            # the model is evaluated on. Kept as-is because later stages may
            # rely on it - confirm whether the test set should stay untouched.
            input_feature_test_final, target_feature_test_final = smt.fit_resample(
                input_feature_test_arr, target_feature_test_df
            )

            logging.info("Applied SMOTEENN on testing dataset")

            logging.info("Created train array and test array")

            # The target is appended as the last column of each array.
            train_arr = np.c_[
                input_feature_train_final, np.array(target_feature_train_final)
            ]

            test_arr = np.c_[
                input_feature_test_final, np.array(target_feature_test_final)
            ]

            save_object(self.data_transformation_config.transformed_object_file_path, preprocessor)
            save_numpy_array_data(self.data_transformation_config.transformed_train_file_path, array=train_arr)
            save_numpy_array_data(self.data_transformation_config.transformed_test_file_path, array=test_arr)

            logging.info("Saved the preprocessor object")

            logging.info(
                "Exited initiate_data_transformation method of Data_Transformation class"
            )

            return DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path
            )

        except Exception as e:
            raise Credit_card_Exception(e, sys) from e


                   
        

    


            