In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Venks\\Desktop\\Project\\MLOPS_Bootcamp\\Loan_Risk_Assessment_Platform\\research'

In [3]:
# Move from the research/ folder up to the project root. Guarded on the current
# directory name so that re-running this cell does not keep climbing the tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Venks\\Desktop\\Project\\MLOPS_Bootcamp\\Loan_Risk_Assessment_Platform'

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Path is relative to the project root (the working directory after the
# os.chdir cell above), so the notebook is not tied to one machine's
# absolute user directory.
df_cleaned = pd.read_csv(os.path.join('artifacts', 'data_validation', 'data_cleaned.csv'))


In [7]:
# Report the (rows, columns) shape, then preview the first five rows.
print(f"Dataset shape: {df_cleaned.shape}")
df_cleaned.head()

Dataset shape: (404800, 27)


Unnamed: 0,age,gender,marital_status,education,monthly_salary,employment_type,years_of_employment,company_type,house_type,monthly_rent,...,existing_loans,current_emi_amount,credit_score,bank_balance,emergency_fund,emi_scenario,requested_amount,requested_tenure,emi_eligibility,max_monthly_emi
0,38.0,Female,Married,Professional,82600.0,Private,0.9,Mid-Size,Rented,20000.0,...,Yes,23700.0,660.0,303200.0,70200.0,Personal Loan Emi,850000.0,15,Not_Eligible,500.0
1,38.0,Female,Married,Graduate,21500.0,Private,7.0,Mnc,Family,0.0,...,Yes,4100.0,714.0,92500.0,26900.0,E-Commerce Shopping Emi,128000.0,19,Not_Eligible,700.0
2,38.0,Male,Married,Professional,86100.0,Private,5.8,Startup,Own,0.0,...,No,0.0,650.0,672100.0,324200.0,Education Emi,306000.0,16,Eligible,27775.0
3,58.0,Female,Married,High School,66800.0,Private,2.2,Mid-Size,Own,0.0,...,No,0.0,685.0,440900.0,178100.0,Vehicle Emi,304000.0,83,Eligible,16170.0
4,48.0,Female,Married,Professional,57300.0,Private,3.4,Mid-Size,Family,0.0,...,No,0.0,770.0,97300.0,28200.0,Home Appliances Emi,252000.0,7,Not_Eligible,500.0


In [8]:
# Count missing values in each column.
df_cleaned.isna().sum()

age                       0
gender                    0
marital_status            0
education                 0
monthly_salary            0
employment_type           0
years_of_employment       0
company_type              0
house_type                0
monthly_rent              0
family_size               0
dependents                0
school_fees               0
college_fees              0
travel_expenses           0
groceries_utilities       0
other_monthly_expenses    0
existing_loans            0
current_emi_amount        0
credit_score              0
bank_balance              0
emergency_fund            0
emi_scenario              0
requested_amount          0
requested_tenure          0
emi_eligibility           0
max_monthly_emi           0
dtype: int64

entity

In [9]:
# Count rows that are exact duplicates of an earlier row.
df_cleaned.duplicated().sum()

np.int64(0)

In [16]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the EDA / data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded config.
    """
    # Stage output directory; created before this object is constructed.
    root_dir: Path
    # CSV file read by EDA_and_DataTransformation.load_data.
    read_data_path: Path
    # Directory that receives the EDA chart PNG files.
    eda_report_path: Path
    # Destination CSV for the feature-engineered dataset.
    # NOTE(review): the three fields below are annotated ``str`` while the other
    # path fields use ``Path``; dataclass annotations are not enforced at runtime.
    fe_file_path: str
    # Destination CSV for the training split.
    transformed_train_data: str
    # Destination CSV for the test split.
    transformed_test_data: str

In [17]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

configuration manager

In [19]:
class ConfigurationManager:
    """Load the config, params and schema YAML files and hand out stage configs."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the DataTransformationConfig from the ``data_transformation`` section.

        The stage's ``root_dir`` is created before the config object is built.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        # Every dataclass field is copied verbatim from the same-named config key.
        field_names = (
            "root_dir",
            "read_data_path",
            "eda_report_path",
            "fe_file_path",
            "transformed_train_data",
            "transformed_test_data",
        )
        return DataTransformationConfig(
            **{name: getattr(section, name) for name in field_names}
        )

EDA and data transformation component

In [20]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [21]:
import os
import numpy as np
import pandas as pd
from mlProject import logger
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

class EDA_and_DataTransformation:
    """Produce EDA charts, engineer features and write the train/test split.

    ``config`` must expose the attributes the methods read:
    ``read_data_path``, ``eda_report_path``, ``fe_file_path``,
    ``transformed_train_data`` and ``transformed_test_data``.
    """

    def __init__(self, config):
        self.config = config

    def load_data(self):
        """Read the CSV at ``config.read_data_path`` and return it as a DataFrame."""
        data = pd.read_csv(self.config.read_data_path)
        logger.info(f"Loaded data from {self.config.read_data_path}")
        return data

    # --- Internal helpers ---

    @staticmethod
    def _iqr_bounds(series, k=1.5):
        """Return the ``(lower, upper)`` fences ``Q1 - k*IQR`` and ``Q3 + k*IQR``."""
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        return q1 - k * iqr, q3 + k * iqr

    def _save_current_figure(self, filename):
        """Save the active matplotlib figure under ``eda_report_path`` and close it."""
        # Create the directory here so each plotting method also works when it
        # is called on its own, not only through run_eda_and_transformation.
        os.makedirs(self.config.eda_report_path, exist_ok=True)
        plt.savefig(os.path.join(self.config.eda_report_path, filename))
        plt.close()

    # --- EDA Functions ---

    def univariate_analysis(self, df):
        """Save a histogram per numeric column and a bar chart per object column."""
        # Numerical
        for col in df.select_dtypes(include=np.number).columns:
            plt.figure()
            sns.histplot(df[col], kde=True)
            plt.title(f'Distribution of {col}')
            self._save_current_figure(f'{col}_hist.png')
        # Categorical
        for col in df.select_dtypes(include='object').columns:
            plt.figure()
            df[col].value_counts().plot(kind='bar')
            plt.title(f'Value counts of {col}')
            self._save_current_figure(f'{col}_bar.png')
        logger.info("Univariate EDA charts saved.")

    def bivariate_analysis(self, df, target='emi_eligibility'):
        """Save one box plot per numeric column, grouped by ``target``."""
        for col in df.select_dtypes(include=np.number).columns:
            plt.figure()
            sns.boxplot(x=df[target], y=df[col])
            plt.title(f'{col} by {target}')
            self._save_current_figure(f'{col}_by_{target}_box.png')
        logger.info("Bivariate EDA charts saved.")

    def correlation_analysis(self, df):
        """Save a heatmap of the correlation matrix of the numeric columns."""
        corr = df.select_dtypes(include=np.number).corr()
        plt.figure(figsize=(12,10))
        sns.heatmap(corr, annot=False, cmap='coolwarm')
        plt.title('Correlation Heatmap')
        self._save_current_figure('correlation_heatmap.png')
        logger.info("Correlation heatmap saved.")

    def outlier_summary(self, df, cols):
        """Count, per column in ``cols``, the values outside the 1.5*IQR fences.

        Returns:
            dict mapping column name to a plain ``int`` count, so the summary
            logs and serialises without numpy scalar reprs.
        """
        summary = {}
        for col in cols:
            lower, upper = self._iqr_bounds(df[col])
            outside = (df[col] < lower) | (df[col] > upper)
            summary[col] = int(outside.sum())
        logger.info(f"Outlier summary: {summary}")
        return summary

    # --- Feature Engineering / Data Transformation ---
    def feature_engineering(self, df):
        """Add ratio features, cap and scale numeric columns, encode categoricals.

        The input frame is left untouched; the transformed copy is written to
        ``config.fe_file_path`` and returned.
        """
        # Work on a copy so the caller's DataFrame is not modified in place.
        df = df.copy()

        # The +1 keeps the ratios finite when monthly_salary is 0.
        income = df['monthly_salary'] + 1
        expense_cols = [
            'monthly_rent','school_fees','college_fees','travel_expenses',
            'groceries_utilities','other_monthly_expenses'
        ]
        total_expenses = df[expense_cols].sum(axis=1)

        df['debt_to_income'] = df['current_emi_amount'] / income
        df['expense_to_income'] = total_expenses / income
        df['affordability_score'] = (df['monthly_salary'] - total_expenses - df['current_emi_amount']) / income

        def credit_risk(score):
            if score >= 750: return 'Excellent'
            elif score >= 700: return 'Good'
            elif score >= 650: return 'Fair'
            else: return 'Poor'
        df['credit_risk_category'] = df['credit_score'].apply(credit_risk)
        df['employment_stability'] = df['years_of_employment'] / (df['age'] + 1)
        df['income_credit_interaction'] = df['monthly_salary'] * df['credit_score']
        df['loan_to_income'] = df['requested_amount'] / (income * 12)
        logger.info("Feature engineering calculated.")

        # Outlier capping and scaling.
        # NOTE(review): the fences and the scaler are fitted on the full dataset
        # before the train/test split, so test rows influence the transform.
        # Consider fitting on the training split only.
        iqr_cols = ['monthly_salary','years_of_employment','monthly_rent','college_fees',
                    'travel_expenses','groceries_utilities','other_monthly_expenses',
                    'current_emi_amount','credit_score','bank_balance','emergency_fund',
                    'requested_amount','requested_tenure','max_monthly_emi']
        for col in iqr_cols:
            lower, upper = self._iqr_bounds(df[col])
            df[col] = df[col].clip(lower=lower, upper=upper)
        logger.info("IQR capping complete.")

        scaler = StandardScaler()
        df[iqr_cols] = scaler.fit_transform(df[iqr_cols])
        logger.info("Standard scaling complete.")

        # Encoding
        for col in ['gender', 'marital_status', 'existing_loans']:
            df[col+'_enc'] = LabelEncoder().fit_transform(df[col])
        df = pd.get_dummies(df, columns=['education','employment_type','company_type','house_type','emi_scenario','credit_risk_category'], drop_first=True)
        logger.info("Encoding complete.")

        # Save feature engineered data
        df.to_csv(self.config.fe_file_path, index=False)
        logger.info(f"Feature engineered data saved to {self.config.fe_file_path}")
        return df

    # --- Train/Test Splitting ---
    def train_test_spliting(self, df):
        """Split 75/25 with a fixed seed, write both CSVs, and return ``(train, test)``."""
        train, test = train_test_split(df, test_size=0.25, random_state=42)
        train.to_csv(self.config.transformed_train_data, index=False)
        test.to_csv(self.config.transformed_test_data, index=False)
        logger.info(f"Train shape: {train.shape}, Test shape: {test.shape}")
        return train, test

    # --- Main runner (for pipeline) ---
    def run_eda_and_transformation(self):
        """Run load -> EDA charts -> outlier summary -> feature engineering -> split."""
        df = self.load_data()
        os.makedirs(self.config.eda_report_path, exist_ok=True)
        self.univariate_analysis(df)
        self.bivariate_analysis(df)
        self.correlation_analysis(df)
        self.outlier_summary(df, ['monthly_salary', 'years_of_employment', 'monthly_rent'])
        df_fe = self.feature_engineering(df)
        self.train_test_spliting(df_fe)
        logger.info("EDA and data transformation pipeline complete.")



pipeline

In [25]:
# Build the stage config, run the EDA + transformation stage, and report the outcome.
try:
    config = ConfigurationManager()
    data_trans_config = config.get_data_transformation_config()
    transformer = EDA_and_DataTransformation(config=data_trans_config)
    transformer.run_eda_and_transformation()
    print("EDA, charts, feature engineered data, train/test files saved under appropriate artifact paths.")
except Exception as e:
    print("Error in EDA/data transformation:", str(e))
    # Bare raise re-raises the active exception with its original traceback.
    raise


[2025-11-12 20:44:32,553: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-11-12 20:44:32,555: INFO: common: yaml file: params.yaml loaded successfully]
[2025-11-12 20:44:32,558: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-11-12 20:44:32,560: INFO: common: created directory at: artifacts]


[2025-11-12 20:44:32,561: INFO: common: created directory at: artifacts/data_transformation]
[2025-11-12 20:44:33,266: INFO: 3905771186: Loaded data from artifacts/data_validation/data_cleaned.csv]
[2025-11-12 20:45:18,800: INFO: 3905771186: Univariate EDA charts saved.]
[2025-11-12 20:45:27,477: INFO: 3905771186: Bivariate EDA charts saved.]
[2025-11-12 20:45:27,914: INFO: 3905771186: Correlation heatmap saved.]
[2025-11-12 20:45:27,956: INFO: 3905771186: Outlier summary: {'monthly_salary': np.int64(12304), 'years_of_employment': np.int64(22891), 'monthly_rent': np.int64(12720)}]
[2025-11-12 20:45:28,119: INFO: 3905771186: Feature engineering calculated.]
[2025-11-12 20:45:28,280: INFO: 3905771186: IQR capping complete.]
[2025-11-12 20:45:28,405: INFO: 3905771186: Standard scaling complete.]
[2025-11-12 20:45:28,810: INFO: 3905771186: Encoding complete.]
[2025-11-12 20:45:36,767: INFO: 3905771186: Feature engineered data saved to artifacts/data_transformation/data_feature_engineered.c