In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

In [5]:
!pip install -r req.txt

Collecting pandas (from -r req.txt (line 1))
  Using cached pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting matplotlib (from -r req.txt (line 3))
  Using cached matplotlib-3.9.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting pytz>=2020.1 (from pandas->-r req.txt (line 1))
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas->-r req.txt (line 1))
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib->-r req.txt (line 3))
  Downloading contourpy-1.2.1-cp312-cp312-win_amd64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotlib->-r req.txt (line 3))
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->-r req.txt (line 3))
  Downloading fonttools-4.53.0-cp312-cp312-win_amd64.whl.metadata (165 kB)
     ---------------------------------------- 0.0/165.5 kB ? eta -:--:--
     ---------------------

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('loan.csv')
df.head()

Unnamed: 0,Loan_ID,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
0,LP001002,0,Graduate,0.0,5849,,1.0,1
1,LP001003,1,Graduate,0.0,4583,128.0,1.0,0
2,LP001005,0,Graduate,,3000,66.0,1.0,1
3,LP001006,0,0t Graduate,0.0,2583,120.0,1.0,1
4,LP001008,0,Graduate,0.0,6000,141.0,1.0,1


Remove loan id column (irrelevant)

In [4]:
df.drop(["Loan_ID"], axis="columns", inplace=True)
df.dropna(inplace=True)

In [5]:
df

Unnamed: 0,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
1,1,Graduate,0.0,4583,128.0,1.0,0
3,0,0t Graduate,0.0,2583,120.0,1.0,1
4,0,Graduate,0.0,6000,141.0,1.0,1
6,0,0t Graduate,0.0,2333,95.0,1.0,1
7,3+,Graduate,0.0,3036,158.0,0.0,0
...,...,...,...,...,...,...,...
608,0,Graduate,0.0,3232,108.0,1.0,1
609,0,Graduate,0.0,2900,71.0,1.0,1
610,3+,Graduate,0.0,4106,40.0,1.0,1
611,1,Graduate,0.0,8072,253.0,1.0,1


In [6]:
df['Education'] = df['Education'].replace({'Graduate': 1, "Not Graduate": 0})
df["Credit_History"] = pd.to_numeric(df['Credit_History'], errors='coerce').astype(int)
df["LoanAmount"] = pd.to_numeric(df['LoanAmount'], errors='coerce').astype(int)

base on dataset source, loan amount are written in thousands so we will use the real number

In [7]:
df["LoanAmount"] = df.LoanAmount*1000

look at features correlation and remove unperformed features

In [8]:
df

Unnamed: 0,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
1,1,1,0.0,4583,128000,1,0
3,0,0t Graduate,0.0,2583,120000,1,1
4,0,1,0.0,6000,141000,1,1
6,0,0t Graduate,0.0,2333,95000,1,1
7,3+,1,0.0,3036,158000,0,0
...,...,...,...,...,...,...,...
608,0,1,0.0,3232,108000,1,1
609,0,1,0.0,2900,71000,1,1
610,3+,1,0.0,4106,40000,1,1
611,1,1,0.0,8072,253000,1,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 434 entries, 1 to 612
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Dependents       434 non-null    object 
 1   Education        434 non-null    object 
 2   Self_Employed    434 non-null    float64
 3   ApplicantIncome  434 non-null    int64  
 4   LoanAmount       434 non-null    int64  
 5   Credit_History   434 non-null    int64  
 6   Loan_Status      434 non-null    int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 27.1+ KB


In [10]:
df['Dependents'] = df['Dependents'].str.replace('3+', '3')
df['Education'] = df['Education'].str.replace('0t Graduate', '0')
df['Education'].fillna(1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Education'].fillna(1, inplace=True)


In [11]:
df

Unnamed: 0,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
1,1,1,0.0,4583,128000,1,0
3,0,0,0.0,2583,120000,1,1
4,0,1,0.0,6000,141000,1,1
6,0,0,0.0,2333,95000,1,1
7,3,1,0.0,3036,158000,0,0
...,...,...,...,...,...,...,...
608,0,1,0.0,3232,108000,1,1
609,0,1,0.0,2900,71000,1,1
610,3,1,0.0,4106,40000,1,1
611,1,1,0.0,8072,253000,1,1


In [12]:
df.drop(columns=["Self_Employed"], inplace=True)

In [13]:
df

Unnamed: 0,Dependents,Education,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
1,1,1,4583,128000,1,0
3,0,0,2583,120000,1,1
4,0,1,6000,141000,1,1
6,0,0,2333,95000,1,1
7,3,1,3036,158000,0,0
...,...,...,...,...,...,...
608,0,1,3232,108000,1,1
609,0,1,2900,71000,1,1
610,3,1,4106,40000,1,1
611,1,1,8072,253000,1,1


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 434 entries, 1 to 612
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Dependents       434 non-null    object
 1   Education        434 non-null    object
 2   ApplicantIncome  434 non-null    int64 
 3   LoanAmount       434 non-null    int64 
 4   Credit_History   434 non-null    int64 
 5   Loan_Status      434 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 23.7+ KB


In [15]:
df["Dependents"] = pd.to_numeric(df['Dependents'], errors='coerce').astype(int)
df["Education"] = pd.to_numeric(df['Education'], errors='coerce').astype(int)
df

Unnamed: 0,Dependents,Education,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
1,1,1,4583,128000,1,0
3,0,0,2583,120000,1,1
4,0,1,6000,141000,1,1
6,0,0,2333,95000,1,1
7,3,1,3036,158000,0,0
...,...,...,...,...,...,...
608,0,1,3232,108000,1,1
609,0,1,2900,71000,1,1
610,3,1,4106,40000,1,1
611,1,1,8072,253000,1,1


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 434 entries, 1 to 612
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Dependents       434 non-null    int64
 1   Education        434 non-null    int64
 2   ApplicantIncome  434 non-null    int64
 3   LoanAmount       434 non-null    int64
 4   Credit_History   434 non-null    int64
 5   Loan_Status      434 non-null    int64
dtypes: int64(6)
memory usage: 23.7 KB


In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
scaler = StandardScaler()
df[["ApplicantIncome", "LoanAmount"]] = scaler.fit_transform(df[["ApplicantIncome", "LoanAmount"]])

each column impact on loan status visualization

In [20]:
df[df.Loan_Status == 1].shape

(298, 6)

In [21]:
df[df.Loan_Status == 0].shape

(136, 6)

<b> 4. Data Preparation

In [22]:
#Extracting Independent and dependent Variable  
X = df.drop(["Loan_Status"], axis=1)
y = df['Loan_Status']

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=64)

<b> 5. Create LOGISTIC Regression model

#Fitting Logistic Regression to the training set  
from sklearn.linear_model import LogisticRegression  
classifier= LogisticRegression(random_state=0)  
classifier.fit(X_train, y_train)  


In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
classifier = LogisticRegression(C=1, penalty='l1', solver='liblinear')
classifier.fit(X_train, y_train)
classifier.score(X_test, y_test)

0.8473282442748091

In [28]:
#Predicting the test set result  
y_pred= classifier.predict(X_test)  

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm=confusion_matrix(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)

In [30]:
cm

array([[21, 18],
       [ 2, 90]])

In [31]:
acc

0.8473282442748091

In [32]:
import pickle

In [33]:
pickle.dump(classifier, open('logisticloannew.pkl','wb'))

In [34]:
pickle.dump(scaler, open('scalerloan.pkl','wb'))

In [131]:
X_train[:10]

Unnamed: 0,Dependents,Education,ApplicantIncome,LoanAmount,Credit_History
67,1,1,0.95108,2.118637,1
266,2,1,-0.068052,0.10115,1
17,0,1,-0.270124,-0.820418,0
272,0,1,-0.384316,-0.197737,1
281,0,1,-0.199786,-0.372088,1
431,0,1,0.234213,-0.459263,0
298,0,1,-0.018799,0.113603,1
375,0,1,-0.049666,-0.272459,1
177,3,1,0.068237,4.397649,0
368,1,1,0.204695,0.41249,1


In [132]:
y_train[:10]

67     1
266    1
17     0
272    1
281    1
431    0
298    0
375    1
177    0
368    1
Name: Loan_Status, dtype: int64

In [42]:
data = scaler.transform([[3500, 450000]])
p1 = data[0,0]
p2 = data[0,1]
classifier.predict([[1,1,p1,p2,1]])  



array([1])