In [12]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, RobustScaler, OrdinalEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import f_classif, SelectKBest, mutual_info_classif 
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix 
from imblearn.over_sampling import SMOTE 
import joblib
import seaborn as sns 
import warnings 
warnings.filterwarnings("ignore")
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier

In [13]:
# Remote locations of the home-loan practice CSVs (fetched over HTTP on every run).
train_csv_url = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_train.csv"
test_csv_url = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_test.csv"

train_df = pd.read_csv(train_csv_url)
test_df = pd.read_csv(test_csv_url)

# Work on a copy so the frame as loaded stays available in train_df.
train_copy = train_df.copy()

In [14]:
def data_cleaning(data, discrete_cols=("Loan_Amount_Term", "Credit_History")):
    """Impute missing values and add log-transformed copies of continuous columns.

    The frame is modified IN PLACE and the very same object is returned, so the
    caller's variable and the return value alias each other.

    Steps:
      1. Columns containing NaN are split in two groups. Continuous numeric
         columns (dtype int64/float64 and not listed in ``discrete_cols``) are
         filled with the column median; every other column is filled with its
         most frequent value.
      2. For each continuous numeric column ``c`` a companion column ``c_log``
         holding ``np.log1p(c)`` is added, and the skewness of the original and
         of the transformed columns is printed.

    Calling the function again on its own output is safe: columns named
    ``<c>_log`` whose base column ``<c>`` exists in the frame are recognised as
    previously derived and are not log-transformed a second time.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. Mutated in place.
    discrete_cols : iterable of str, optional
        Numeric-typed columns that should be handled like categorical ones
        (mode imputation, no log transform). Names that are absent from
        ``data`` are ignored.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with NaNs imputed and ``*_log`` columns appended.
    """
    if isinstance(discrete_cols, str):
        # A bare string would otherwise be split into single characters.
        discrete_cols = (discrete_cols,)
    discrete = set(discrete_cols)
    numeric_dtypes = ["int64", "float64"]
    log_suffix = "_log"

    def _is_continuous(col):
        # Numeric dtype and not explicitly declared discrete by the caller.
        return col not in discrete and data[col].dtype in numeric_dtypes

    def _is_derived_log(col):
        # True for a "<base>_log" column produced by an earlier call.
        return col.endswith(log_suffix) and col[: -len(log_suffix)] in data.columns

    # for missing values
    missing_cols = [x for x in data.columns if data[x].isna().any()]
    if len(missing_cols) > 0:
        print("Features with missing values")
        print(missing_cols)
        print(data.isna().sum())
        # Filtering by membership (rather than list.remove) avoids a ValueError
        # when a discrete column happens to have no missing values.
        num_missing_cols = [x for x in missing_cols if _is_continuous(x)]
        obj_missing_cols = [x for x in missing_cols if x not in num_missing_cols]

        # handle missing values in numerical features
        for i in num_missing_cols:
            median = data[i].median()
            data[i] = data[i].fillna(median)

        # handle missing values in categorical features
        for i in obj_missing_cols:
            counts = data[i].value_counts()
            if counts.empty:
                # Column is entirely NaN: there is no mode to impute with.
                continue
            data[i] = data[i].fillna(counts.index[0])
        print(obj_missing_cols)
        print(num_missing_cols)
    else:
        print("No missing values")

    # handling skewed data
    # The list is built before any column is added, so the loop below never
    # iterates over the columns it creates.
    num_cols = [
        x for x in data.columns if _is_continuous(x) and not _is_derived_log(x)
    ]

    print("missing values count after cleaning")
    print(data.isna().sum())

    # applying log transform
    for i in num_cols:
        data[f"{i}_log"] = np.log1p(data[i])

    print("Skewness before cleaning")
    dic = {i: data[i].skew() for i in num_cols}
    print(dic)

    print("Skewness after cleaning")
    dic = {i: data[f"{i}_log"].skew() for i in num_cols}
    print(dic)

    return data

In [15]:
# NOTE: data_cleaning assigns into the frame it receives and returns that same
# object, so after this call cleaned_data and train_copy refer to one DataFrame.
cleaned_data = data_cleaning(train_copy)

Features with missing values
['Gender', 'Married', 'Dependents', 'Self_Employed', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']
['LoanAmount']
missing values count after cleaning
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64
Skewness before cleaning
{'ApplicantIncome': np.float64(6.539513113994625), 'Coapplican

In [16]:
# Bare last expression: the notebook renders the cleaned frame as this cell's output.
cleaned_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,ApplicantIncome_log,CoapplicantIncome_log,LoanAmount_log
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y,8.674197,0.000000,4.859812
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,8.430327,7.319202,4.859812
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,8.006701,0.000000,4.204693
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,7.857094,7.765993,4.795791
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,8.699681,0.000000,4.955827
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y,7.972811,0.000000,4.276666
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y,8.320448,0.000000,3.713572
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y,8.996280,5.484797,5.537334
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y,8.933796,0.000000,5.236442
