In [172]:
import pandas as pd
import numpy as np

In [173]:
# Load the loan dataset from a CSV file in the current working directory.
df=pd.read_csv('loan_status.csv')

In [174]:
# Peek at five randomly chosen rows (no random_state, so the rows differ per run).
df.sample(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
183,LP001637,Male,Yes,1,Graduate,No,33846,0.0,260.0,360.0,1.0,Semiurban,N
227,LP001758,Male,Yes,2,Graduate,No,6250,1695.0,210.0,360.0,1.0,Semiurban,Y
311,LP002004,Male,No,0,Not Graduate,No,2927,2405.0,111.0,360.0,1.0,Semiurban,Y
40,LP001119,Male,No,0,Graduate,No,3600,0.0,80.0,360.0,1.0,Urban,N
215,LP001720,Male,Yes,3+,Not Graduate,No,3850,983.0,100.0,360.0,1.0,Semiurban,Y


In [175]:
# (rows, columns) of the raw frame.
df.shape

(614, 13)

In [176]:
# Per-column dtype and non-null count; shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [177]:
# Drop Loan_ID and three feature columns that are left out of the model.
# `columns=` already names the axis, so the redundant `axis=1` is omitted, and
# the result is rebound instead of mutating the frame with inplace=True.
df = df.drop(columns=['Loan_ID', 'CoapplicantIncome', 'Married', 'Self_Employed'])

In [178]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,592.0,600.0,564.0
mean,5403.459283,146.412162,342.0,0.842199
std,6109.041673,85.587325,65.12041,0.364878
min,150.0,9.0,12.0,0.0
25%,2877.5,100.0,360.0,1.0
50%,3812.5,128.0,360.0,1.0
75%,5795.0,168.0,360.0,1.0
max,81000.0,700.0,480.0,1.0


In [179]:
# Class balance of the target column.
df['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [180]:
# Count of missing values in each column.
df.isna().sum()

Gender              13
Dependents          15
Education            0
ApplicantIncome      0
LoanAmount          22
Loan_Amount_Term    14
Credit_History      50
Property_Area        0
Loan_Status          0
dtype: int64

In [181]:
# Number of distinct non-null values in each column.
df.nunique()

Gender                2
Dependents            4
Education             2
ApplicantIncome     505
LoanAmount          203
Loan_Amount_Term     10
Credit_History        2
Property_Area         3
Loan_Status           2
dtype: int64

In [182]:
# Count of missing LoanAmount entries.
df['LoanAmount'].isna().sum()

22

In [183]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

0

In [184]:
# Re-check (rows, columns) of the working frame.
df.shape

(614, 9)

In [185]:
# Make Dependents consistently numeric. The column is read as text because of
# the '3+' category, so replacing only '3+' with the int 3 would leave a
# str/int mix in one object column. Missing entries remain NaN.
df['Dependents'] = pd.to_numeric(df['Dependents'].replace('3+', '3'))

# Encode the target in one pass: 'Y' -> 0, 'N' -> 1.
# The explicit cast pins an integer dtype instead of relying on `replace`
# downcasting an object column, which is pandas-version dependent.
df['Loan_Status'] = df['Loan_Status'].replace({'Y': 0, 'N': 1}).astype('int64')

In [186]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [187]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['Loan_Status']),
    df['Loan_Status'],
    test_size=0.25,
    random_state=2,
)

# Sanity check: shapes of the four splits, in the order they were returned.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(460, 8) (154, 8) (460,) (154,)


In [188]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [189]:
# First row of the working frame, to check the cleaned columns and encoded target.
df.head(1)

Unnamed: 0,Gender,Dependents,Education,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,0,Graduate,5849,,360.0,1.0,Urban,0


In [190]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler that maps its input onto the interval [0, 7].
scaler = MinMaxScaler(feature_range=(0, 7))

# Rescale LoanAmount in `df`, fitting the scaler on every row of the frame.
# NOTE(review): x_train/x_test were split off from `df` in an earlier cell, so
# this assignment does not reach the data the pipeline is fitted on
# (x_train.head(1) further down still shows LoanAmount 120.0). If scaling is
# intended, it likely belongs inside the pipeline, fitted on the training
# split only -- confirm and move.
df['LoanAmount'] = scaler.fit_transform(df['LoanAmount'].values.reshape(-1, 1))


In [191]:
# Step 1: fill missing values in all eight feature columns with each column's
# most frequent value. The transformer outputs a plain array, so the next step
# addresses columns by position.
trf1 = ColumnTransformer([
    ('imputer', SimpleImputer(strategy='most_frequent'), [0, 1, 2, 3, 4, 5, 6, 7])
], remainder='passthrough')

# Step 2: one-hot encode Gender (0), Education (2) and Property_Area (7); the
# other columns pass through unchanged.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older installs.
try:
    onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

trf2 = ColumnTransformer([
    ('onehot', onehot, [0, 2, 7])
], remainder='passthrough')

# Step 3: the classifier, with default hyperparameters.
trf3 = LogisticRegression()

pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3)
])

# fit() returns the pipeline itself, so `model` and `pipe` are the same object.
model = pipe.fit(x_train, y_train)


In [192]:
# Predicted labels for the held-out rows (named x_pred, but these are y-values).
x_pred=model.predict(x_test)

In [193]:
from sklearn.metrics import accuracy_score 

In [194]:
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# Accuracy is symmetric in its arguments, so the score itself is unchanged.
acc = accuracy_score(y_test, x_pred)

In [195]:
# Display the test-set accuracy.
acc

0.7597402597402597

In [204]:
import pickle

# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
with open('loan_status.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [206]:
# Reload the pickled pipeline, closing the file handle once it has been read.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('loan_status.pkl', 'rb') as model_file:
    pipe = pickle.load(model_file)

In [207]:
# Show one training row; its column order is the layout a hand-built input must follow.
x_train.head(1)

Unnamed: 0,Gender,Dependents,Education,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
205,Female,0,Not Graduate,4408,120.0,360.0,1.0,Semiurban


In [210]:
# One hand-built sample as a (1, 8) object array, in x_train's column order:
# Gender, Dependents, Education, ApplicantIncome, LoanAmount,
# Loan_Amount_Term, Credit_History, Property_Area.
test_input = np.array(
    [['Female', 0, 'Not Graduate', 4408, 120, 360, 1, 'Semiurban']],
    dtype=object,
)

In [211]:
# Predict for the single hand-built row using the reloaded pipeline.
# Under the target encoding applied earlier, 0 stands for 'Y' and 1 for 'N'.
pipe.predict(test_input)

array([0], dtype=int64)

In [212]:
# First five encoded training labels, indexed by original row number.
y_train.head()

205    0
389    0
194    0
322    0
7      1
Name: Loan_Status, dtype: int64