In [1]:
import os
# Move the working directory up one level so that project-relative paths
# (e.g. the 'config/config.yaml' loaded later in this notebook) resolve.
# NOTE(review): the path is relative, so re-running this cell climbs one more
# directory each time; run it exactly once per kernel session.
os.chdir('../')
# Show the resulting working directory as the cell output.
%pwd

'/home/utpal108/dev/Python/PW_Skills_Projects/Home-Loan-Approval-Prediction'

In [2]:
from dataclasses import dataclass
from pathlib import Path

In [3]:
@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable (frozen) bundle of file paths used by the data-preprocessing stage."""
    # CSV file read as the training split.
    train_data_path: Path
    # CSV file read as the test split.
    test_data_path: Path
    # Destination where the preprocessor object is saved.
    preprocessor_path: Path

In [4]:
import pandas as pd
from homeLoan.constants import *
from homeLoan.utils import read_yaml, save_object, create_directories

In [5]:
# Configuration Manager
class ConfigurationManager:
    """Loads the project YAML files and hands out typed stage configurations."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Read the main configuration file and the parameters file.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Create the preprocessing root directory and return the stage's paths.

        Returns:
            DataPreprocessingConfig: train/test CSV paths taken from the
            ``data_ingestion`` section and the preprocessor path taken from
            the ``data_preprocessor`` section.
        """
        settings = self.config

        # The output directory is ensured before any path is resolved.
        create_directories([settings.data_preprocessor.root_dir])

        return DataPreprocessingConfig(
            train_data_path=Path(settings.data_ingestion.train_data_path),
            test_data_path=Path(settings.data_ingestion.test_data_path),
            preprocessor_path=Path(settings.data_preprocessor.preprocessor_path),
        )

In [6]:
from sklearn.impute import SimpleImputer # For Handling Missing Values
from sklearn.preprocessing import StandardScaler # For Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # For Ordinal Encoding

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import numpy as np

In [7]:
# Components
class DataPreprocessing:
    """Builds, fits and saves the feature preprocessor for the loan dataset.

    The preprocessor is fitted on the training split only and then applied
    unchanged to the test split, so no test-set statistics leak into the
    transformation and the saved object matches what the model was trained on.
    """

    def __init__(self, config: DataPreprocessingConfig):
        """Store the stage configuration (train/test CSV paths, preprocessor path)."""
        self.config = config

    def _data_preprocessor(self, numerical_features, categorical_features, scalled_features):
        '''
        Build the (unfitted) column-wise preprocessor for the raw dataset.

        Args:
            numerical_features: Columns that are median-imputed only.
            categorical_features: Columns that are mode-imputed and then
                ordinal-encoded. Each name must have a known category order.
            scalled_features: Columns that are median-imputed and standardised.

        Returns:
            ColumnTransformer: Unfitted transformer; columns not listed in any
            group are passed through unchanged.

        Raises:
            ValueError: If a categorical column has no defined category order.
        '''

        # Explicit ranking for each ordinal column, keyed by column name so the
        # encoder's category lists always line up with `categorical_features`,
        # whatever order (or subset) the caller passes.
        category_orders = {
            'Gender': ['Female', 'Male'],
            'Married': ['No', 'Yes'],
            'Dependents': ['0', '1', '2', '3+'],
            'Education': ['Not Graduate', 'Graduate'],
            'Self_Employed': ['No', 'Yes'],
            'Property_Area': ['Rural', 'Semiurban', 'Urban'],
        }

        unknown_columns = [col for col in categorical_features if col not in category_orders]
        if unknown_columns:
            raise ValueError(
                f"No category order defined for categorical feature(s): {unknown_columns}"
            )

        # Numerical Pipeline
        num_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='median'))
            ]
        )

        # Categorical Pipeline
        cat_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('ordinalencoder', OrdinalEncoder(
                    categories=[category_orders[col] for col in categorical_features]
                ))
            ]
        )

        # Scaling Pipeline
        scaling_pipeline = Pipeline(
            steps = [
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ('num_pipeline', num_pipeline, numerical_features),
                ('cat_pipeline', cat_pipeline, categorical_features),
                ('scaling_pipeline', scaling_pipeline, scalled_features)
            ],
            remainder='passthrough'
        )

        return preprocessor


    def initiate_data_preprocessing(self) -> tuple:
        """Transform the train and test CSVs and save the fitted preprocessor.

        Reads both splits, maps the ``Loan_Status`` target ('Y' -> 1, 'N' -> 0),
        drops ``Loan_ID``, fits the preprocessor on the training features and
        applies the same fitted preprocessor to the test features.

        Returns:
            tuple: ``(train_arr, test_arr)`` NumPy arrays holding the
            transformed features with the target appended as the last column.

        Raises:
            Exception: Any error from reading, transforming or saving
            propagates to the caller unchanged.
        """
        train_df = pd.read_csv(self.config.train_data_path)
        test_df = pd.read_csv(self.config.test_data_path)

        target_mapping = {'Y':1, 'N':0}
        train_df['Loan_Status'] = train_df['Loan_Status'].map(target_mapping)
        test_df['Loan_Status'] = test_df['Loan_Status'].map(target_mapping)

        input_features_train_df = train_df.drop(['Loan_ID', 'Loan_Status'], axis=1)
        target_features_train_df = train_df['Loan_Status']

        input_features_test_df = test_df.drop(['Loan_ID', 'Loan_Status'], axis=1)
        target_features_test_df = test_df['Loan_Status']

        # Categorical & Numerical Features
        categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
        scalled_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
        numerical_features = ['Loan_Amount_Term', 'Credit_History']

        preprocessor = self._data_preprocessor(numerical_features=numerical_features, categorical_features=categorical_features, scalled_features=scalled_features)

        # Learn imputation values and scaling statistics from the training
        # split only, then reuse them for the test split. Re-fitting on the
        # test data would leak test statistics and leave the saved
        # preprocessor fitted on the wrong split.
        train_features = preprocessor.fit_transform(input_features_train_df)
        test_features = preprocessor.transform(input_features_test_df)

        train_arr = np.c_[train_features, np.array(target_features_train_df)]
        test_arr = np.c_[test_features, np.array(target_features_test_df)]

        save_object(self.config.preprocessor_path, preprocessor)

        return train_arr, test_arr
    

In [8]:
# Pipeline
# Run the preprocessing stage end to end: load the configuration, build the
# component and produce the transformed train/test arrays.
# No try/except wrapper: any failure should surface with its original traceback.
config = ConfigurationManager()
data_preprocessing_config = config.get_data_preprocessing_config()
data_preprocessing = DataPreprocessing(config=data_preprocessing_config)
train_arr, test_arr = data_preprocessing.initiate_data_preprocessing()

2023-12-02 21:25:51,116 : homeLoan.logger - INFO - YAML file: config/config.yaml loaded successfully
2023-12-02 21:25:51,118 : homeLoan.logger - INFO - YAML file: params.yaml loaded successfully
2023-12-02 21:25:51,119 : homeLoan.logger - INFO - created directory at: artifacts/preprocessor


[[ 3.60000000e+02  1.00000000e+00  1.00000000e+00 ... -9.06211248e-01
   1.24306463e+00  1.00000000e+00]
 [ 3.60000000e+02  1.00000000e+00  1.00000000e+00 ... -9.06211248e-01
  -8.40012678e-02  1.00000000e+00]
 [ 3.60000000e+02  1.00000000e+00  1.00000000e+00 ...  2.99389655e-02
   2.95160418e-01  1.00000000e+00]
 ...
 [ 4.80000000e+02  1.00000000e+00  1.00000000e+00 ...  2.86134877e-01
  -1.78791689e-01  1.00000000e+00]
 [ 3.60000000e+02  1.00000000e+00  1.00000000e+00 ...  2.84193999e-01
  -3.68372532e-01  1.00000000e+00]
 [ 1.80000000e+02  1.00000000e+00  1.00000000e+00 ... -9.06211248e-01
   4.79093469e+00  1.00000000e+00]]
