#Import Library

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#Import Dataset

In [None]:
data = pd.read_csv('credit-approval_csv.csv')
data

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved,Approved_Status
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+,Approved
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+,Approved
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+,Approved
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+,Approved
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+,Approved
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-,Not Approved
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-,Not Approved
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-,Not Approved
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-,Not Approved


#Preprocessing Dataset

In [None]:
#Check if there is any null in the data
data.isnull().sum()

Gender             12
Age                12
Debt                0
Married             6
BankCustomer        6
EducationLevel      9
Ethnicity           9
YearsEmployed       0
PriorDefault        0
Employed            0
CreditScore         0
DriversLicense      0
Citizen             0
ZipCode            13
Income              0
Approved            0
Approved_Status     0
dtype: int64

In [None]:
# Impute the missing values with mean imputation for the numeric columns
# mean imputation is replacing the missing values with the mean of the data
data.fillna(data.mean(), inplace=True)

# Count the number of NaNs in the dataset and print the counts to verify
data.isnull().sum()

Gender             12
Age                 0
Debt                0
Married             6
BankCustomer        6
EducationLevel      9
Ethnicity           9
YearsEmployed       0
PriorDefault        0
Employed            0
CreditScore         0
DriversLicense      0
Citizen             0
ZipCode             0
Income              0
Approved            0
Approved_Status     0
dtype: int64

In [None]:
# Impute the missing values with most frequent imputation for the caterogical columns
# Iterate over each column of data
for col in data:
    # Check if the column is of object type
    if (data[col].dtypes == 'object'):
        # Impute with the most frequent value / mode
        data = data.fillna(data[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
data.isnull().sum()

Gender             0
Age                0
Debt               0
Married            0
BankCustomer       0
EducationLevel     0
Ethnicity          0
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode            0
Income             0
Approved           0
Approved_Status    0
dtype: int64

In [None]:
#Using labelEncoder to change the approved status to an interger 0/1
le = LabelEncoder()
data['Approved_Status'] = le.fit_transform(data['Approved_Status'])
data

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved,Approved_Status
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+,0
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+,0
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+,0
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+,0
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-,1
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-,1
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-,1
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-,1


In [None]:
#In this part, I will choose the categorical features to transform them using label encoder
#Making a new data_categorical to change the value
data_categorical = data[['Gender',
 'Age',
 'Married',
 'BankCustomer',
 'EducationLevel',
 'Ethnicity',
 'PriorDefault',
 'Employed',
 'DriversLicense',
 'Citizen',
 'ZipCode']]

#transform the catorigical data
data_categorical = data_categorical.apply(le.fit_transform)

In [None]:
#droping the old data
data = data.drop(data_categorical.columns, axis=1)
#adding the already changed data
data = pd.concat([data, data_categorical], axis=1)
data

Unnamed: 0,Debt,YearsEmployed,CreditScore,Income,Approved,Approved_Status,Gender,Age,Married,BankCustomer,EducationLevel,Ethnicity,PriorDefault,Employed,DriversLicense,Citizen,ZipCode
0,0.000,1.25,1,0,+,0,1,156,2,1,13,8,1,1,0,0,69
1,4.460,3.04,6,560,+,0,0,329,2,1,11,4,1,1,0,0,11
2,0.500,1.50,0,824,+,0,0,89,2,1,11,4,1,0,0,0,97
3,1.540,3.75,5,3,+,0,1,125,2,1,13,8,1,1,1,0,31
4,5.625,1.71,0,0,+,0,1,43,2,1,13,8,1,0,0,2,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,10.085,1.25,0,0,-,1,1,52,3,3,5,4,0,0,0,0,91
686,0.750,2.00,2,394,-,1,0,71,2,1,2,8,0,1,1,0,68
687,13.500,2.00,1,1,-,1,0,97,3,3,6,3,0,1,1,0,68
688,0.205,0.04,0,750,-,1,1,20,2,1,0,8,0,0,0,0,97


In [None]:
#Finding the corelation value with the dependent variable
Correlation = data.corr()
Correlation['Approved_Status'].abs().sort_values(ascending = False)

Approved_Status    1.000000
PriorDefault       0.720407
Employed           0.458301
CreditScore        0.406410
YearsEmployed      0.322475
Debt               0.206294
Married            0.191431
BankCustomer       0.187520
Income             0.175657
Age                0.158278
EducationLevel     0.130026
ZipCode            0.101224
Citizen            0.100867
DriversLicense     0.031625
Gender             0.028934
Ethnicity          0.000877
Name: Approved_Status, dtype: float64

In [None]:
#By seeing above corelation value, I'll be using feature with above 0.1 
data = data.drop(['Ethnicity', 'Gender','DriversLicense'], axis=1)

In [None]:
#By seeing above corelation value, I'll be using feature with above 0.1 
x_data = data[['Debt','YearsEmployed','CreditScore','Income','Age','Married','BankCustomer','EducationLevel','PriorDefault','Employed','Citizen','ZipCode']] #independet variable
y_data = data[['Approved_Status']] #dependent variable

In [None]:
#Normalize the dataset using StandardScaler, I am using StandardScaler as we don't know the min-max value
sc = StandardScaler()
x_data = sc.fit_transform(x_data)

In [None]:
#Spliting Dataset with 20% test size

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)

In [None]:
#Making the model using sklearn.linear_model 
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
#Insert the prediction to y_pred
y_pred = model.predict(x_test)

# Import confusion_matrix
from sklearn.metrics import confusion_matrix

#Evaluate the model on training accuary, testing accuracy
print("Training Accuracy:",model.score(x_train,y_train)*100)
print("Testing Accuracy:",model.score(x_test,y_test)*100)

#Evaluate the model on confusion matrix
print("Confusion Matrix\n", confusion_matrix(y_test, y_pred))

Training Accuracy: 88.40579710144928
Testing Accuracy: 85.5072463768116
Confusion Matrix
 [[57  7]
 [13 61]]
