In [47]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier

In [48]:
df = pd.read_csv('loan_data.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [49]:
df.shape

(381, 13)

In [50]:
df.isnull().sum()

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [51]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [52]:
le = LabelEncoder()
for column in df.columns:
    if column == 'Loan_Status':
        pass
    elif df[column].dtype == 'object':
        df[column] = le.fit_transform(df[column])

In [53]:
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N
1,1,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,Y
2,2,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,Y
3,3,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,Y
4,4,1,1,0,1,0,2333,1516.0,95.0,360.0,1.0,2,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,376,1,1,3,0,0,5703,0.0,128.0,360.0,1.0,2,Y
377,377,1,1,0,0,0,3232,1950.0,108.0,360.0,1.0,0,Y
378,378,0,0,0,0,0,2900,0.0,71.0,360.0,1.0,0,Y
379,379,1,1,3,0,0,4106,0.0,40.0,180.0,1.0,0,Y


In [54]:
from sklearn.impute import SimpleImputer

columns_with_missing = df.columns[df.isnull().any()]
imputer = SimpleImputer(strategy='mean')
df[columns_with_missing] = imputer.fit_transform(df[columns_with_missing])
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N
1,1,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,Y
2,2,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,Y
3,3,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,Y
4,4,1,1,0,1,0,2333,1516.0,95.0,360.0,1.0,2,Y


In [55]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [56]:
X = df.drop(columns=['Loan_Status']) 
y = df['Loan_Status'] 
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)

In [57]:
def perform_kfold_cv(model, X, Y, k):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, Y, cv=kf)
    mean_score = np.mean(scores)
    print(f"Scores for k={k}: {scores}")
    print(f"Mean score for k={k}: {mean_score}")
    return mean_score

# Function to find the best k and fit the model
def find_best_k_and_fit(model, X, Y, k_values):
    mean_scores = {}
    for k in k_values:
        mean_scores[k] = perform_kfold_cv(model, X, Y, k)
    
    best_k = max(mean_scores, key=mean_scores.get)
    print(f"Best k is {best_k} with a mean score of {mean_scores[best_k]}")
    
    # Fit the model using all the training data
    model.fit(X, Y)
    print("Model fitted with the entire dataset.")
    return model
k_values = [5,7,10]

In [58]:
nb_model = GaussianNB()
print("Gaussian Naive Bayes Model:")
best_nb_model = find_best_k_and_fit(nb_model, x_train, y_train, k_values)

Gaussian Naive Bayes Model:
Scores for k=5: [0.875      0.8        0.66666667 0.86666667 0.93333333]
Mean score for k=5: 0.8283333333333334
Scores for k=7: [0.81818182 0.72727273 0.81818182 0.72727273 1.         0.81818182
 0.9       ]
Mean score for k=7: 0.82987012987013
Scores for k=10: [0.875      0.875      0.75       0.75       0.75       0.875
 1.         0.71428571 1.         0.85714286]
Mean score for k=10: 0.8446428571428571
Best k is 10 with a mean score of 0.8446428571428571
Model fitted with the entire dataset.


In [59]:
y_pred = nb_model.predict(x_test) 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy:{accuracy}")
print(f"Precision:{precision}")
print(f"Recall:{recall}")
print(f"F1-Score:{f1}")
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

Accuracy:0.780327868852459
Precision:0.74467478128566
Recall:0.6981873266919996
F1-Score:0.7126727689430689


array([[ 45,  46],
       [ 21, 193]], dtype=int64)

In [60]:
dt_model = DecisionTreeClassifier(random_state=42)
print("Decision Tree Classifier:")
best_dt_model = find_best_k_and_fit(dt_model, x_train, y_train, k_values)

Decision Tree Classifier:
Scores for k=5: [0.8125     0.93333333 0.73333333 0.8        0.6       ]
Mean score for k=5: 0.7758333333333334
Scores for k=7: [0.72727273 0.81818182 0.81818182 0.63636364 1.         0.72727273
 0.7       ]
Mean score for k=7: 0.7753246753246754
Scores for k=10: [0.875      0.875      0.875      0.75       0.875      0.875
 1.         0.57142857 0.85714286 0.85714286]
Mean score for k=10: 0.8410714285714285
Best k is 10 with a mean score of 0.8410714285714285
Model fitted with the entire dataset.


In [61]:
y_pred = dt_model.predict(x_test) 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy:{accuracy}")
print(f"Precision:{precision}")
print(f"Recall:{recall}")
print(f"F1-Score:{f1}")
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

Accuracy:0.7934426229508197
Precision:0.7640103968555851
Recall:0.7138492348772723
F1-Score:0.7298266334837811


array([[ 47,  44],
       [ 19, 195]], dtype=int64)

In [62]:
rf_model = RandomForestClassifier(random_state=42)
print("Random Forest Classifier:")
best_rf_model = find_best_k_and_fit(rf_model, x_train, y_train, k_values)

Random Forest Classifier:
Scores for k=5: [0.9375     0.93333333 0.8        0.86666667 0.86666667]
Mean score for k=5: 0.8808333333333334
Scores for k=7: [1.         0.81818182 0.90909091 0.90909091 0.90909091 0.81818182
 0.8       ]
Mean score for k=7: 0.8805194805194806
Scores for k=10: [1.         0.875      0.875      0.875      1.         0.75
 1.         0.71428571 0.85714286 0.85714286]
Mean score for k=10: 0.8803571428571428
Best k is 5 with a mean score of 0.8808333333333334
Model fitted with the entire dataset.


In [63]:
y_pred = rf_model.predict(x_test) 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy:{accuracy}")
print(f"Precision:{precision}")
print(f"Recall:{recall}")
print(f"F1-Score:{f1}")
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

Accuracy:0.8229508196721311
Precision:0.8518472703282203
Recall:0.7190869877785766
F1-Score:0.7471444362564481


array([[ 42,  49],
       [  5, 209]], dtype=int64)