In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

#Load the loan dataset from BankLoanApproval.csv into a DataFrame
path = "BankLoanApproval.csv"
df = pd.read_csv(path)

#Data Understanding
#print the (rows, columns) shape first, then the dtype of every column
for summary in (df.shape, df.dtypes):
    print(summary)

#df.shape returns the dimensions of the dataset
#(255327, 18) means that there are 255327 observations and 18 different variables

#Data Cleaning & Preprocessing
#missing values: True if any cell in the frame is null
#(this printed False for the dataset, i.e. no missing values)
has_missing_values = df.isnull().values.any()
print(has_missing_values)

#duplicates: True if any row is an exact repeat of an earlier row
#(this printed False for the dataset, i.e. no duplicated rows)
has_duplicate_rows = df.duplicated().any()
print(has_duplicate_rows)

#keep a reference to the full frame, which still contains LoanID
df_withID = df

#working frame without the LoanID identifier column
df = df_withID.drop(columns=['LoanID'])

#separate the target (y = Default) from the features (x = every other column)
y = df['Default']
x = df.drop(columns=['Default'])

#handle categorical data (one-hot encoding)
#every object-typed feature gets the same treatment: dummy columns prefixed
#with the source column name, first level dropped, values cast to int
categorical_columns = ['Education','EmploymentType','MaritalStatus','HasMortgage',
                       'HasDependents','LoanPurpose','HasCoSigner']
encoded_frames = [
    pd.get_dummies(x[column], prefix=column, drop_first=True).astype(int)
    for column in categorical_columns
]

#keep each encoded frame reachable under its own name
(education, employmentType, maritalStatus, hasMortgage,
 hasDependents, loanPurpose, hasCoSigner) = encoded_frames

#replace the raw object columns in x with their encoded counterparts
#(numeric columns first, then the dummy columns in the order listed above)
x = pd.concat([x.drop(columns=categorical_columns), *encoded_frames], axis=1)
print(x.dtypes)

print(loanPurpose)

print(x)

#Splitting Data
#two-stage split using the same fixed seed for both stages
#NOTE(review): neither call passes stratify, so the share of Default=1 rows is
#not guaranteed to match across the three sets - confirm this is intended
split_seed = 42

#stage 1: hold out 20% of all rows as the test set, leaving 80%
x_trainval, x_test, y_trainval, y_test = train_test_split(
    x, y, test_size=0.2, random_state=split_seed
)

#stage 2: take 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2 of all rows),
#so the final ratio is train:validation:test = 6:2:2
x_train, x_valid, y_train, y_valid = train_test_split(
    x_trainval, y_trainval, test_size=0.25, random_state=split_seed
)

(255327, 18)
LoanID             object
Age                 int64
Income              int64
LoanAmount          int64
CreditScore         int64
MonthsEmployed      int64
NumCreditLines      int64
InterestRate      float64
LoanTerm            int64
DTIRatio          float64
Education          object
EmploymentType     object
MaritalStatus      object
HasMortgage        object
HasDependents      object
LoanPurpose        object
HasCoSigner        object
Default             int64
dtype: object
False
False
Age                               int64
Income                            int64
LoanAmount                        int64
CreditScore                       int64
MonthsEmployed                    int64
NumCreditLines                    int64
InterestRate                    float64
LoanTerm                          int64
DTIRatio                        float64
Education_High School             int32
Education_Master's                int32
Education_PhD                     int32
EmploymentTyp

In [3]:

from scipy.stats import ttest_ind, f_oneway

# Load the data
data = pd.read_csv("BankLoanApproval.csv")

# Example: pairwise two-sample t-tests for 'Income' between levels of 'Education'
education_levels = data['Education'].unique()

# Slice the Income column once per education level. Filtering the whole frame
# again for every (level, other_level) pair inside the nested loop repeats the
# same boolean-mask work many times over.
income_by_level = {
    level: data.loc[data['Education'] == level, 'Income']
    for level in education_levels
}

# Caveats when reading the output:
#  - every unordered pair is reported twice (A vs. B and B vs. A); the p-value
#    is identical and the t-statistic only flips sign
#  - ttest_ind is called with its defaults, i.e. the equal-variance test
#  - the p-values are not adjusted for the number of comparisons made
for level in education_levels:
    print(f"T-test for {level} vs. others:")
    for other_level in education_levels[education_levels != level]:
        t_stat, p_value = ttest_ind(income_by_level[level], income_by_level[other_level])
        print(f"   - {level} vs. {other_level}: t-statistic = {t_stat}, p-value = {p_value}")

# Example: one-way ANOVA for 'Income' across the levels of 'MaritalStatus'
# The groups are taken from the column itself rather than from a hard-coded list
# of labels, so every status present in the data takes part in the test and none
# can be silently left out.
income_by_marital_status = [
    income for _, income in data.groupby('MaritalStatus')['Income']
]
f_stat, p_value_anova = f_oneway(*income_by_marital_status)

print(f"\nANOVA for 'Income' across MaritalStatus: F-statistic = {f_stat}, p-value = {p_value_anova}")

T-test for High School vs. others:
   - High School vs. PhD: t-statistic = 0.02622070822928057, p-value = 0.9790813398408851
   - High School vs. Master's: t-statistic = -2.486062851550059, p-value = 0.012917786276438607
   - High School vs. Bachelor's: t-statistic = -1.3298494706515056, p-value = 0.18357023760393051
T-test for PhD vs. others:
   - PhD vs. High School: t-statistic = -0.02622070822928057, p-value = 0.9790813398408851
   - PhD vs. Master's: t-statistic = -2.502638936264561, p-value = 0.012328368594103759
   - PhD vs. Bachelor's: t-statistic = -1.3509889993410236, p-value = 0.1767013466220685
T-test for Master's vs. others:
   - Master's vs. High School: t-statistic = 2.486062851550059, p-value = 0.012917786276438607
   - Master's vs. PhD: t-statistic = 2.502638936264561, p-value = 0.012328368594103759
   - Master's vs. Bachelor's: t-statistic = 1.1628451787467193, p-value = 0.24489449283094994
T-test for Bachelor's vs. others:
   - Bachelor's vs. High School: t-statistic