<a href="https://colab.research.google.com/github/karri-ten/bank_personal_loan_model/blob/main/bank_personal_loan_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries

In [147]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.formula.api import ols
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

## Importing the dataset


In [148]:
# Read the bank personal-loan CSV into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute and points at /content — confirm it exists
# in the environment this notebook is run in.
df = pd.read_csv(filepath_or_buffer='/content/Bank_Personal_Loan_Modelling.csv')
df.head(n=5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


## Preprocessing

In [149]:
# Normalise the column labels: lower-case them and turn spaces into underscores
# so the names can be used directly inside a formula string.
df.columns = df.columns.str.lower().str.replace(" ", "_", regex=False)
df.head(n=5)

Unnamed: 0,id,age,experience,income,zip_code,family,ccavg,education,mortgage,personal_loan,securities_account,cd_account,online,creditcard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [150]:
# Remove the `id` and `zip_code` columns before modelling.
# The result is rebound instead of dropped in place, and errors='ignore' is
# passed, so re-running this cell is harmless: a second execution no longer
# raises KeyError for columns that are already gone. `axis=1` is omitted
# because `columns=` already names the axis.
df = df.drop(columns=['id', 'zip_code'], errors='ignore')
df.head()

Unnamed: 0,age,experience,income,family,ccavg,education,mortgage,personal_loan,securities_account,cd_account,online,creditcard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [151]:
# Keep only the rows where personal_loan equals 1, i.e. the customers who took
# a personal loan.
df_filtered = df.loc[df['personal_loan'].eq(1)]

df_filtered.head(n=5)

Unnamed: 0,age,experience,income,family,ccavg,education,mortgage,personal_loan,securities_account,cd_account,online,creditcard
9,34,9,180,1,8.9,3,0,1,0,0,0,0
16,38,14,130,4,4.7,3,134,1,0,0,0,0
18,46,21,193,2,8.1,3,0,1,0,0,0,0
29,38,13,119,1,3.3,2,0,1,0,1,1,1
38,42,18,141,3,5.0,3,0,1,1,1,1,0


In [152]:
# Count the missing values in each column of the loan-taker subset.
df_filtered.isnull().sum(axis=0)


Unnamed: 0,0
age,0
experience,0
income,0
family,0
ccavg,0
education,0
mortgage,0
personal_loan,0
securities_account,0
cd_account,0


In [153]:
# Summary statistics of the loan-taker subset, one row per variable.
df_filtered.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,480.0,45.066667,11.590964,26.0,35.0,45.0,55.0,65.0
experience,480.0,19.84375,11.582443,0.0,9.0,20.0,30.0,41.0
income,480.0,144.745833,31.584429,60.0,122.0,142.5,172.0,203.0
family,480.0,2.6125,1.115393,1.0,2.0,3.0,4.0,4.0
ccavg,480.0,3.905354,2.097681,0.0,2.6,3.8,5.3475,10.0
education,480.0,2.233333,0.753373,1.0,2.0,2.0,3.0,3.0
mortgage,480.0,100.845833,160.847862,0.0,0.0,0.0,192.5,617.0
personal_loan,480.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
securities_account,480.0,0.125,0.331064,0.0,0.0,0.0,0.0,1.0
cd_account,480.0,0.291667,0.455004,0.0,0.0,0.0,1.0,1.0


## Fitting the model using statsmodels.OLS

In [154]:
# Response variable, and every other column as a candidate predictor.
y_target = 'personal_loan'
x_predictors = [name for name in df_filtered.columns.tolist() if not name == y_target]

In [155]:
# Build the regression formula "response ~ p1+p2+..." from all predictors.
formula_str = 'personal_loan ~ {}'.format('+'.join(x_predictors))
print(formula_str)

personal_loan ~ age+experience+income+family+ccavg+education+mortgage+securities_account+cd_account+online+creditcard


In [157]:
# Ordinary least squares on the full dataset `df`, using every predictor named
# in formula_str.
# NOTE(review): personal_loan looks like a 0/1 indicator in the previews above,
# so this is effectively a linear probability model — confirm OLS (rather than
# a logistic model) is intended.
model = ols(formula_str, df)
fitted = model.fit()

# Print the text summary of the fitted regression
print(fitted.summary())

                            OLS Regression Results                            
Dep. Variable:          personal_loan   R-squared:                       0.386
Model:                            OLS   Adj. R-squared:                  0.385
Method:                 Least Squares   F-statistic:                     285.3
Date:                Fri, 03 Jan 2025   Prob (F-statistic):               0.00
Time:                        00:56:30   Log-Likelihood:                 236.48
No. Observations:                5000   AIC:                            -449.0
Df Residuals:                    4988   BIC:                            -370.8
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             -0.2324      0

## Variable selection by correlation and significance

In [159]:
# Correlation of every column with the response variable (DataFrame.corr
# defaults to Pearson), ordered from the largest coefficient downwards.
corr = df.corr().loc[:, 'personal_loan'].sort_values(ascending=False)
corr

Unnamed: 0,personal_loan
personal_loan,1.0
income,0.502462
ccavg,0.366889
cd_account,0.316355
mortgage,0.142095
education,0.136722
family,0.061367
securities_account,0.021954
online,0.006278
creditcard,0.002802


In [160]:
# Pearson p-value of each predictor against the response, stored next to the
# correlation coefficient computed earlier.

# Every column in `corr` except the response itself
column_titles = [name for name in corr.index if not name == 'personal_loan']

# Maps predictor name -> {'Correlation_Coefficient': ..., 'P_Value': ...}
dict_cp = {}

for col in column_titles:
    # pearsonr returns (statistic, p-value); only the p-value is kept,
    # rounded to six decimals.
    test_result = pearsonr(df[col], df['personal_loan'])
    p_val = round(test_result[1], 6)
    dict_cp[col] = {
        'Correlation_Coefficient': corr[col],
        'P_Value': p_val,
    }

# One row per predictor, ordered by ascending p-value
df_cp = pd.DataFrame(dict_cp).transpose()
df_cp_sorted = df_cp.sort_values(by='P_Value')

# NOTE(review): this preview keeps p < 0.1, whereas the predictor selection
# further down uses p < 0.05 — confirm the two cutoffs are meant to differ.
df_cp_sorted.loc[df_cp_sorted['P_Value'].lt(0.1)]

Unnamed: 0,Correlation_Coefficient,P_Value
income,0.502462,0.0
ccavg,0.366889,0.0
cd_account,0.316355,0.0
mortgage,0.142095,0.0
education,0.136722,0.0
family,0.061367,1.4e-05


## Refitting the model using statsmodels.OLS on the selected predictors

In [None]:

# Every column except the response, as candidate predictors.
# NOTE(review): the formula built right after this section uses X_data, not
# x_predictors, so this cell looks redundant — confirm before removing.
x_predictors = list(filter(lambda name: name != y_target, df_filtered.columns))

In [162]:
# The dependent variable remains the same:
y_target ='personal_loan'
# Model building – Independent Variable (IV) DataFrame
X_data = list(df_cp[df_cp['P_Value'] < 0.05].index)

In [163]:
# Build the regression formula from the significance-selected predictors only.
formula_str = f"personal_loan ~ {'+'.join(X_data)}"
print(formula_str)

personal_loan ~ income+ccavg+cd_account+mortgage+education+family


In [164]:
# Refit ordinary least squares on the full dataset `df`, this time with the
# reduced formula. `model` and `fitted` from the earlier fit are overwritten.
model = ols(data=df, formula=formula_str)
fitted = model.fit()

# Print the text summary of the refitted regression
print(fitted.summary())

                            OLS Regression Results                            
Dep. Variable:          personal_loan   R-squared:                       0.377
Model:                            OLS   Adj. R-squared:                  0.376
Method:                 Least Squares   F-statistic:                     503.2
Date:                Fri, 03 Jan 2025   Prob (F-statistic):               0.00
Time:                        01:12:59   Log-Likelihood:                 198.54
No. Observations:                5000   AIC:                            -383.1
Df Residuals:                    4993   BIC:                            -337.5
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.3995      0.013    -30.760      0.0