<a href="https://colab.research.google.com/github/karansingla11223344/LoanTap-Business-Case/blob/main/LoanTap_business_case.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Business Case: LoanTap Logistic Regression

LoanTap provides personal loans to customers and faces the critical challenge of identifying borrowers who are likely to default versus those who will fully repay their loans. Incorrect classification can lead to financial losses or missed business opportunities.

In this business case, we aim to build a classification model using Logistic Regression to predict whether a loan applicant will default or repay the loan based on historical customer and loan-related features. The objective is to help the business make data-driven lending decisions and minimize credit risk.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

sns.set_style('whitegrid')

In [None]:
pd.set_option('display.max_columns',None)

In [None]:
# Load the loan dataset. The Colab upload location is tried first so the
# original workflow keeps working; if that file is absent, fall back to a copy
# in the current working directory instead of failing on the hard-coded path.
from pathlib import Path

DATA_PATH = Path("/content/logistic_regression.csv")
if not DATA_PATH.exists():
    DATA_PATH = Path("logistic_regression.csv")

data = pd.read_csv(DATA_PATH)
data.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/logistic_regression.csv'

In [None]:
data.columns

In [None]:
data.isna().sum(axis=0)

In [None]:
data.columns

In [None]:
# Keep only rows whose dti is below 1000 (rows with a missing dti fail the
# comparison and are dropped as well). .copy() makes `data` an independent
# frame, so the column assignments in later cells modify it directly rather
# than a slice of the frame returned by read_csv.
data = data[data['dti'] < 1000].copy()

In [None]:
data.info()

In [None]:
data.shape

In [None]:
data.columns

In [None]:
data.describe()

# data cleaning

In [None]:
# Keep only the first whitespace-separated token of `term`.
data['term'] = data['term'].str.split().map(lambda tokens: tokens[0])

In [None]:
# Keep the first whitespace-separated token of emp_length; values that did not
# split into a non-empty list (e.g. missing entries) pass through unchanged.
data['emp_length'] = data['emp_length'].str.split().map(
    lambda parts: parts[0] if (isinstance(parts, list) and parts) else parts
)

In [None]:
def func(x):
  """Map the leading token of an `emp_length` entry to a numeric-friendly value.

  Parameters
  ----------
  x : the first token of the raw emp_length text, or a missing value.

  Returns
  -------
  0.5 for "<", 13 for "10+", NaN for a missing float value, and `x` itself
  for anything else (the caller converts the column to numbers afterwards).
  """
  # NaN never compares equal to anything (including itself), so the previous
  # `x == np.nan` test could never be true; np.isnan is the reliable check.
  if isinstance(x, float) and np.isnan(x):
    return np.nan
  if x == "<":
    return 0.5
  if x == '10+':
    return 13
  return x




In [None]:
data['emp_length']=data['emp_length'].apply(func)

In [None]:
# Collapse every purpose outside the three listed ones into a single 'other' bucket.
imp_purposes = ['debt_consolidation', 'credit_card', 'home_improvement']
data['purpose'] = data['purpose'].where(data['purpose'].isin(imp_purposes), 'other')

In [None]:
# Ordinal encoding of the loan grade: 'A' -> 0 up to 'G' -> 6.
grade_mapping = {letter: rank for rank, letter in enumerate('ABCDEFG')}
data['grade_e'] = data['grade'].map(grade_mapping)

In [None]:
# Collapse verification_status to two levels: 'Not Verified' stays as is and
# every other value becomes 'Verified'. An explicit equality test is used:
# ('Not Verified') without a trailing comma is a plain string, so the previous
# `x in (...)` was a substring check that matched only by coincidence and
# raised on non-string (missing) values.
is_not_verified = data['verification_status'] == 'Not Verified'
data['verification_status'] = data['verification_status'].where(is_not_verified, 'Verified')

In [None]:
# Distinct sub-grades, then the same values in ascending (lexicographic) order.
subgrades = data['sub_grade'].unique()
subgrades_sort = list(subgrades)
subgrades_sort.sort()


In [None]:
# Ordinal encoding: each sub-grade maps to its position in the sorted list.
subgrade_mapping = dict(zip(subgrades_sort, range(len(subgrades_sort))))
data['sub_grade_en'] = data['sub_grade'].map(subgrade_mapping)

# new features

In [None]:
# Reduce both date columns to their calendar year. Entries that cannot be parsed
# with the '%b-%Y' format become missing because of errors='coerce'.
# NOTE(review): both columns are overwritten with the parsed year, so this cell
# is presumably not safe to run twice on the same frame — confirm before re-running.
data['earliest_cr_line'] = pd.to_datetime(data['earliest_cr_line'], format='%b-%Y',errors='coerce').dt.year
data['issue_d'] = pd.to_datetime(data['issue_d'], format='%b-%Y',errors='coerce').dt.year
# total_credit_line_years = issue year minus the year of the earliest credit line
data['total_credit_line_years'] = data['issue_d'] - data['earliest_cr_line']

In [None]:
# real_disposable_income = annual income minus dti percent of that income.
# NOTE(review): assumes dti is expressed in percent (it is divided by 100) — confirm.
data['real_disposable_income']=(data['annual_inc']-data['dti']*data['annual_inc']/100)
# Same quantity per month.
data['real_disposable_income_month']=data['real_disposable_income']/12
# install_salary_left = the installment as a percentage of the monthly disposable income
# NOTE(review): a monthly disposable income of 0 would make this ratio infinite — verify
# that no such rows remain after the dti filter.
data['install_salary_left']=(data['installment']/data['real_disposable_income_month'])*100

In [None]:
data['loan_amnt_int_rate']=data['int_rate']*data['loan_amnt']

In [None]:
data['total_debt_payment_ratio']=(data['installment']+data['revol_bal'])/data['real_disposable_income']

# fill the missing values

In [None]:
# Impute missing mort_acc by home ownership: 0 for renters, 1 for mortgage holders.
data.loc[(data['home_ownership'] == 'RENT') & data['mort_acc'].isna(), 'mort_acc'] = 0.0
data.loc[(data['home_ownership'] == 'MORTGAGE') & data['mort_acc'].isna(), 'mort_acc'] = 1.0

In [None]:
# Every other home-ownership category: missing mort_acc is filled with 1.
data.loc[~data['home_ownership'].isin(['MORTGAGE', "RENT"]) & data['mort_acc'].isna(), 'mort_acc'] = 1.0

In [None]:
data['pub_rec_bankruptcies']=data['pub_rec_bankruptcies'].fillna(0.0)

In [None]:
data['revol_util']=data['revol_util'].fillna(data['revol_util'].mean())

In [None]:
data['emp_length']=pd.to_numeric(data['emp_length'],errors='coerce')
data['emp_length'].mean()

In [None]:
data['emp_length']=data['emp_length'].fillna(data['emp_length'].median())

In [None]:
data['title']=data['title'].fillna(data['purpose'])

In [None]:
data['term']=data['term'].astype('int')

In [None]:
data.columns

# outliers treatment

In [None]:
# IQR fences (quartiles -/+ 1.5 * IQR) for install_salary_left.
per_25_ins, per_75_ins = np.percentile(data['install_salary_left'], [25, 75])
iqr_ins = per_75_ins - per_25_ins
lower_limit_ins = per_25_ins - iqr_ins * 1.5
upper_limit_ins = per_75_ins + iqr_ins * 1.5


In [None]:
# Raise values below the lower fence to the fence. Assigning the clipped column
# back replaces the chained `data[col].loc[mask] = ...` form, which writes to a
# temporary object (and is silently lost) when pandas copy-on-write is active.
data['install_salary_left'] = data['install_salary_left'].clip(lower=lower_limit_ins)

In [None]:
# Cap values above the upper fence at the fence. Assigning the clipped column
# back replaces the chained `data[col].loc[mask] = ...` form, which writes to a
# temporary object (and is silently lost) when pandas copy-on-write is active.
data['install_salary_left'] = data['install_salary_left'].clip(upper=upper_limit_ins)

In [None]:
# IQR fences (quartiles -/+ 1.5 * IQR) for dti.
per_25_dti, per_75_dti = np.percentile(data['dti'], [25, 75])
iqr_dti = per_75_dti - per_25_dti
lower_limit_dti = per_25_dti - iqr_dti * 1.5
upper_limit_dti = per_75_dti + iqr_dti * 1.5

In [None]:
# Winsorise dti to its IQR fences. clip() + assignment replaces the chained
# `data['dti'].loc[mask] = ...` form, which writes to a temporary object (and
# is silently lost) when pandas copy-on-write is active.
data['dti'] = data['dti'].clip(lower=lower_limit_dti, upper=upper_limit_dti)

In [None]:
#mort_acc

In [None]:
data['mort_acc']

In [None]:
# IQR fences (quartiles -/+ 1.5 * IQR) for mort_acc.
per_25_mort, per_75_mort = np.percentile(data['mort_acc'], [25, 75])
iqr_mort = per_75_mort - per_25_mort
lower_limit_mort = per_25_mort - iqr_mort * 1.5
upper_limit_mort = per_75_mort + iqr_mort * 1.5

In [None]:
# Winsorise mort_acc to its IQR fences. clip() + assignment replaces the chained
# `data['mort_acc'].loc[mask] = ...` form, which writes to a temporary object
# (and is silently lost) when pandas copy-on-write is active.
data['mort_acc'] = data['mort_acc'].clip(lower=lower_limit_mort, upper=upper_limit_mort)

In [None]:
data['mort_acc'].skew()

In [None]:
data['mort_acc_log']=np.log1p(data['mort_acc'])

In [None]:
data['mort_acc_log'].skew()

In [None]:
data.drop(columns='mort_acc',inplace=True)

In [None]:
data['sub_grade_en'].skew()

In [None]:
sns.histplot(x='sub_grade_en',data=data)

In [None]:
sns.boxplot(y='sub_grade_en',data=data)

In [None]:
# IQR fences (quartiles -/+ 1.5 * IQR) for the encoded sub-grade.
per_75_sub=np.percentile(data['sub_grade_en'],75)
per_25_sub=np.percentile(data['sub_grade_en'],25)
iqr_sub=per_75_sub - per_25_sub
lower_limit_sub=per_25_sub-(1.5*iqr_sub)
# The upper fence must start from the sub-grade's own 75th percentile; it was
# previously built from per_75_mort (the mort_acc quartile), which capped the
# sub-grade at a value unrelated to its own distribution.
upper_limit_sub=per_75_sub+(1.5*iqr_sub)

In [None]:
# Winsorise the encoded sub-grade to its fences. clip() + assignment replaces
# the chained `data[col].loc[mask] = ...` form (lost under pandas copy-on-write)
# and lets the integer column be promoted to float when a fence is fractional.
data['sub_grade_en'] = data['sub_grade_en'].clip(lower=lower_limit_sub, upper=upper_limit_sub)

# Univariate analysis

What percentage of customers have fully paid their Loan Amount?

In [None]:
# Share of each loan_status value, drawn as a pie chart with percentage labels.
status_counts = data['loan_status'].value_counts()
plt.pie(status_counts, labels=status_counts.index, autopct='%1.1f%%')
plt.show()

Observation

80.4% of borrowers fully repaid their loans, while 19.6% of loans were charged off.

Which home-ownership status do the majority of borrowers have?

In [None]:
sns.countplot(x='home_ownership',data=data)
plt.show()

In [None]:
# Share of each home_ownership category, drawn as a pie chart with percentage labels.
ownership_counts = data['home_ownership'].value_counts()
plt.pie(ownership_counts, labels=ownership_counts.index, autopct='%1.1f%%')
plt.show()

observation

Approximately 50.1% of borrowers live in mortgaged homes.

In [None]:
df=data.copy()

In [None]:
df.head()

What are the purposes for which people seek loans?

In [None]:
sns.countplot(data=df,x='purpose')
plt.show()

Observation

most people take the loan to pay off existing debts

In [None]:
sns.countplot(data=df,x='application_type')
plt.show()

# Bivariate analysis

does loan grade have an impact on loan status

In [None]:
sns.countplot(x='grade',hue='loan_status',data=data)
plt.title("impact of loan grade on loan status")
plt.show()

Observation (A grade is considered as best)

We can see that as the grade worsens (from A towards G), the ratio of fully paid to charged-off loans also decreases.

Grade A and Grade B borrowers have a high chance of repaying the loan.

Grade F and Grade G borrowers have a lower chance of repaying the loan.



Does loan term have impact on loan status

In [None]:
# Cast `term` to int on `data`. The same cast already ran in the missing-value
# section, and the plot in the next cell reads `df` rather than `data`.
data['term']=data['term'].astype('int')

In [None]:

sns.countplot(x='term',data=df,hue='loan_status')
plt.title('how term(in months) influence on loan_status')
plt.xlabel('term(in months)')
plt.show()

observation

The 36-month loan term has a better ratio of fully paid to charged-off loans than the 60-month loan term.

Does the interest rate have an impact on loan status?

In [None]:
df['int_rate_bins']=pd.cut(df['int_rate'],bins=(1,10,15,100),labels=('low','medium','high'))

In [None]:
sns.countplot(x='int_rate_bins',data=df,hue='loan_status')
plt.title('impact of interest rate on loan status')
plt.xlabel('interest_rate')
plt.show()

Observation

As the interest rate increases, the share of charged-off loans also increases.

Loans with an interest rate above 15% are risky for the company.

Does the debt-to-income ratio (dti) have any impact on loan status?

In [None]:
# Bucket dti into four labelled ranges. pd.cut intervals are open on the left
# by default, so include_lowest=True is needed for dti == 0 to fall into the
# 'low' bucket instead of becoming missing.
df['dti_bins']=pd.cut(df['dti'],bins=(0,20,30,40,80),labels=('low','moderate','high','veryhigh'),include_lowest=True)

In [None]:
sns.countplot(x='dti_bins',data=df,hue='loan_status')
plt.title('impact of debt to income(dti) on loan status')
plt.xlabel('debt to income (percentage )')
plt.show()

Observation

With a high debt-to-income ratio (more than 30%), the chance of a loan being charged off increases.

In [None]:
sns.histplot(data=df,x='dti',kde=True)
plt.show()

In [None]:
sns.boxplot(data=df,y='install_salary_left',hue='loan_status')
plt.show()

In [None]:
sns.boxplot(data=df,y='emp_length',hue='loan_status')
plt.show()

In [None]:
sns.boxplot(data=df,y='int_rate',hue='loan_status')
plt.show()

In [None]:
from scipy.stats import spearmanr

In [None]:
spearmanr(df['open_acc'],df['total_acc'])

In [None]:
data1=data.select_dtypes(include=np.number)
data1

In [None]:
# Annotated correlation heatmap over a hand-picked set of numeric features.
heatmap_cols = [
    'loan_amnt', 'term', 'emp_length', 'dti', 'open_acc',
    'pub_rec', 'revol_util', 'total_acc', 'installment',
    'sub_grade_en', 'total_credit_line_years', 'install_salary_left',
    'mort_acc_log',
]
corr_matrix = data1[heatmap_cols].corr()
plt.figure(figsize=(20,12))
sns.heatmap(data=corr_matrix, annot=True)
plt.show()

The installment and loan amount features are highly positively correlated with each other.

The open account and total account features also show some correlation with each other.

In [None]:
sns.scatterplot(x='installment',y='loan_amnt',data=df)

# drop

In [None]:
# Remove pub_rec_bankruptcies from the modelling frame.
data = data.drop(columns='pub_rec_bankruptcies')

In [None]:
# Remove the free-text / single-purpose columns and the intermediate income helpers.
data = data.drop(columns=['title', 'application_type', 'real_disposable_income', 'real_disposable_income_month'])

In [None]:
# Remove the address and the two year columns (total_credit_line_years was derived from them).
data = data.drop(columns=['address', 'earliest_cr_line', 'issue_d'])

In [None]:
# Remove the grade columns (sub_grade_en is kept as the encoded grade), home_ownership and emp_title.
data = data.drop(columns=['grade', 'home_ownership', 'emp_title', 'grade_e', 'sub_grade'])

# split of train data

In [None]:
# Encode the target: 'Fully Paid' -> 0, 'Charged Off' -> 1 (charged off is the positive class).
mapping_loan={'Fully Paid':0,'Charged Off':1}
# Series.map with a dict yields NaN for any value that is not a key, so this
# cell must run exactly once on the original labels.
data['loan_status']=data['loan_status'].map(mapping_loan)

In [None]:
# Target vector and feature matrix (everything except the target column).
y = data['loan_status']
X = data.drop(columns='loan_status')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% as the test set, then take 25% of the remainder as validation,
# i.e. a 60/20/20 train/validation/test split overall.
x_train_1, x_test, y_train_1, y_test = train_test_split(X, y, random_state=1, test_size=0.2)
x_train, x_val, y_train, y_val = train_test_split(x_train_1, y_train_1, random_state=1, test_size=0.25)



In [None]:
x_all=pd.concat([x_train,x_val,x_test])

In [None]:
cat_cols=['verification_status','purpose','initial_list_status']

In [None]:
x_all_encoded=pd.get_dummies(x_all,columns=cat_cols,drop_first=True)

In [None]:
x_all_encoded

In [None]:
# Split the one-hot encoded frame back into train / validation / test by
# selecting the row labels of each original split.
x_train_final=x_all_encoded.loc[x_train.index]
x_val_final=x_all_encoded.loc[x_val.index]
x_test_final=x_all_encoded.loc[x_test.index]

# Standarisation

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply them to it.
scaler = StandardScaler().fit(x_train_final)
x_train_final_scaled1 = pd.DataFrame(scaler.transform(x_train_final), columns=x_train_final.columns)

In [None]:
x_val_final_scaled1=pd.DataFrame(scaler.transform(x_val_final),columns=x_val_final.columns)
x_test_final_scaled1=pd.DataFrame(scaler.transform(x_test_final),columns=x_test_final.columns)

In [None]:
import statsmodels.api as sm

In [None]:
# Fit a statsmodels Logit on the scaled training features plus a constant
# column and print its summary table. Both operands get a fresh 0..n-1 index
# so that they share the same row labels.
X_const=sm.add_constant(x_train_final_scaled1.reset_index(drop=True))
model=sm.Logit(y_train.reset_index(drop=True),X_const)
result=model.fit()
print(result.summary())

In [None]:
#drop features which have high p>|z|

In [None]:
# The same three columns are removed from every split so that train,
# validation and test keep identical feature sets.
high_p_cols = ['installment', 'initial_list_status_w', 'total_debt_payment_ratio']
x_train_final_scaled = x_train_final_scaled1.drop(columns=high_p_cols)
x_val_final_scaled = x_val_final_scaled1.drop(columns=high_p_cols)
x_test_final_scaled = x_test_final_scaled1.drop(columns=high_p_cols)

# Multicollinearity

In [None]:
import statsmodels.api as sm

In [None]:
# VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Iterative VIF pruning on the training features: compute the VIF of every
# remaining column, and while the largest one is not below vif_thr, record that
# column in feats_removed and repeat without it.
vif_thr = 5
r2_thr = 0.85  # defined alongside vif_thr; the loop below does not read it
i = 0
feats_removed = []
cols2 = x_train_final_scaled.columns
while True:
    X_t = pd.DataFrame(x_train_final_scaled, columns=x_train_final_scaled.columns)[cols2]
    vif = pd.DataFrame()
    vif['Features'] = cols2
    vif['VIF'] = [variance_inflation_factor(X_t.values, j) for j in range(X_t.shape[1])]
    vif['VIF'] = vif['VIF'].round(2)
    vif = vif.sort_values(by="VIF", ascending=False)
    print(vif)

    # After sorting, the first row is the feature with the highest VIF.
    worst = vif.iloc[0]
    if not (worst['VIF'] < vif_thr):
        feats_removed.append(worst['Features'])
        i += 1
        cols2 = vif["Features"][1:].values
        continue

    print('Reached threshold')
    print('Highest vif:', worst)

    print('Features removed:', i)
    print('List of features removed:', feats_removed)
    break


In [None]:
# Drop the high-VIF features from the frames the model is actually trained and
# evaluated on. The VIF loop ran on x_train_final_scaled, and the resampling /
# classifier cells read x_train_final_scaled and x_val_final_scaled, so those
# are the frames that must lose the columns; dropping them from the *_scaled1
# frames (which are not used again) left the model inputs unchanged.
# All three splits are updated together so their feature sets stay identical.
x_train_final_scaled = x_train_final_scaled.drop(columns=feats_removed)
x_val_final_scaled = x_val_final_scaled.drop(columns=feats_removed)
x_test_final_scaled = x_test_final_scaled.drop(columns=feats_removed)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

# Create an instance of SMOTE. SMOTE generates synthetic minority samples at
# random, so it is seeded (with the same seed as the train/test splits) to make
# the resampled training set, and every metric derived from it, reproducible.
smt = SMOTE(random_state=1)


# Perform SMOTE on the training data
print('Before SMOTE')
print(y_train.value_counts())

X_sm, y_sm = smt.fit_resample(x_train_final_scaled, y_train)
print('After Oversampling')
print(y_sm.value_counts())

# L2-regularised logistic regression; it is only configured here and fitted in a later cell.
model = LogisticRegression(C= 100,class_weight='balanced', penalty= 'l2', solver = 'liblinear')

In [None]:
# From here on `x_train_final_scaled` / `y_train` refer to the SMOTE-resampled
# training set, so the training predictions and training accuracy computed in
# later cells are measured on resampled data.
# NOTE(review): re-running the SMOTE cell after this one would presumably
# resample the already resampled data — confirm before re-running out of order.
x_train_final_scaled=X_sm
y_train=y_sm

# logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model.fit(x_train_final_scaled,y_train)

In [None]:
# One row per feature with its fitted coefficient, largest first.
coef_frame = pd.DataFrame(model.coef_.T, columns=['coefficient'], index=x_train_final_scaled.columns)
weights = coef_frame.sort_values(by='coefficient', ascending=False)
weights

In [None]:
model.predict(x_train_final_scaled)

# Accuracy Metrics

In [None]:
y_predict_train=model.predict(x_train_final_scaled)
y_predict_train

In [None]:
y_predict_val=model.predict(x_val_final_scaled)
y_predict_val

In [None]:
np.sum(y_train==y_predict_train)

In [None]:
model.predict_proba(x_train_final_scaled)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_train=accuracy_score(y_train,y_predict_train)
accuracy_train

In [None]:
accuracy_val=accuracy_score(y_val,y_predict_val)
accuracy_val

# classification report

# precision/recall/f1_score

# Default 0.5 threshold used

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_pred = model.predict(x_val_final_scaled)

conf_matrix = confusion_matrix(y_val, y_pred)
conf_matrix

In [None]:
from matplotlib import pyplot as plt

In [None]:
# ax used here to control the size of confusion matrix
fig, ax = plt.subplots(figsize=(5,5))
ConfusionMatrixDisplay(conf_matrix).plot(ax = ax)

In [None]:
np.diag(conf_matrix).sum() / conf_matrix.sum()

In [None]:
fig, ax = plt.subplots(figsize=(5,5))
ConfusionMatrixDisplay(conf_matrix).plot(ax = ax)

In [None]:
from sklearn.metrics import precision_score

precision_score(y_val, y_pred)

In [None]:
from sklearn.metrics import recall_score

recall_score(y_val, y_pred)

In [None]:
from sklearn.metrics import f1_score

In [None]:
print(f'f1Score:{f1_score(y_val,y_pred)}')

In [None]:
y_probs=model.predict_proba(x_val_final_scaled)[:,1]


# different threshold used

In [None]:
# Predict the positive class only when its probability reaches the custom threshold.
threshold = 0.62
y_pred_custom = np.where(y_probs >= threshold, 1, 0)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay



conf_matrix1 = confusion_matrix(y_val, y_pred_custom)
conf_matrix1

In [None]:
# ax used here to control the size of confusion matrix
fig, ax = plt.subplots(figsize=(5,5))
ConfusionMatrixDisplay(conf_matrix1).plot(ax = ax)

In [None]:
np.diag(conf_matrix1).sum() / conf_matrix1.sum()

In [None]:
from sklearn.metrics import precision_score

precision_score(y_val, y_pred_custom)

In [None]:
# Store the validation precision at the custom threshold. The value is bound to
# the name `precision_score` (printed by the final summary cell), which hides
# the imported sklearn function; calling it through the module keeps this cell
# runnable again after that rebinding.
import sklearn.metrics
precision_score = sklearn.metrics.precision_score(y_val, y_pred_custom)

In [None]:
from sklearn.metrics import recall_score

recall_score(y_val, y_pred_custom)

In [None]:
# Store the validation recall at the custom threshold. The value is bound to
# the name `recall_score` (printed by the final summary cell), which hides the
# imported sklearn function; calling it through the module keeps this cell
# runnable again after that rebinding.
import sklearn.metrics
recall_score = sklearn.metrics.recall_score(y_val, y_pred_custom)

In [None]:
from sklearn.metrics import f1_score

In [None]:
print(f'f1Score:{f1_score(y_val,y_pred_custom)}')

In [None]:
# Store the validation F1 score at the custom threshold. The value is bound to
# the name `f1_score` (printed by the final summary cell), which hides the
# imported sklearn function; calling it through the module keeps this cell
# runnable again after that rebinding.
import sklearn.metrics
f1_score = sklearn.metrics.f1_score(y_val, y_pred_custom)

# AU ROC curve

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
probability = model.predict_proba(x_val_final_scaled)

In [None]:
probability

In [None]:
probabilites = probability[:,1]

In [None]:
fpr, tpr, thr = roc_curve(y_val,probabilites)

In [None]:
print(f'fpr:{fpr}')
print(f'tpr:{tpr}')

In [None]:
plt.plot(fpr,tpr)

#random model
plt.plot(fpr,fpr,'--',color='red' )
plt.title('ROC curve')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.show()

In [None]:
roc_auc_score(y_val,probabilites)

In [None]:
area_roc=roc_auc_score(y_val,probabilites)

# observation

The blue line shows the model's performance;
the red dashed line shows the performance of a dumb (random) classifier.

For the best possible model we want TPR = 1 and FPR = 0, but this model does not achieve that.

roc_auc_score = 0.7, which is decent for a model.

The area under the blue curve is larger than the area under the red line, which tells us our model is better than the dumb model.






# PR curve

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

In [None]:
precision, recall, thr = precision_recall_curve(y_val, probabilites)

In [None]:
plt.plot(recall, precision)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('PR curve')
plt.show()

In [None]:
auc(recall, precision)

In [None]:
area_auc_pre_recall=auc(recall, precision)

Observation

Precision drops sharply at the start of the curve, which means that even a small decrease of the threshold produces more positive-class predictions and therefore a higher chance of false positives.

Even after the sharp drop, as the threshold keeps decreasing, precision continues to fall without settling at a constant value, while the recall score keeps increasing.


To judge the model we want a high F1 score; we used a threshold of 0.62, where we get a decent F1 score of 0.4.



For a decent model the PR AUC should be around 0.7,
so this may not be a good model, but the data is very messy.

# model evaluation

In [None]:
# Summary of the validation metrics. At this point f1_score, precision_score
# and recall_score are the values stored in the custom-threshold section
# (the names were rebound there), not the sklearn functions.
print(f'f1_score:{f1_score}')
print(f'precision_score:{precision_score}')
print(f'recall_score:{recall_score}')
print(f'area of au roc curve:{area_roc}')
print(f'area of precision recall curve:{area_auc_pre_recall}')

The model's performance is not optimal because of the non-linearity present in the dataset.

# Business insights and recommendations

1. Even after adding new features such as installment per disposable income and loan amount × interest rate (to capture their compounding impact), the model's performance improved, but not by much.

2. Sub_grade and term get the highest positive weight coefficients, i.e. they are positively correlated with the target (charged off).

interest_rate and loan_amount get the largest negative weight coefficients, i.e. they are negatively correlated with the target.



Furthermore, the model's performance can be improved by adding features such as total savings and family wealth. Even in the worst situations, where dti is very high and installment per disposable income is high, the ratio between fully paid and charged-off loans is still about 2:1. So there is a lot of noise in the data; only by removing the noisy data can we get a decent model.