In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Load the loan-approval dataset from a CSV file in the working directory.
df = pd.read_csv("loan_approval_data.csv")

# info() prints its summary to stdout, so it can run mid-cell. head() is only
# rendered when it is the final expression of the cell, so it goes last;
# previously its result was silently discarded.
df.info()
#df.isnull().sum()
#df.describe()
df.head()

Handle Missing Values


In [None]:
# Split the column labels by dtype so that each group can be imputed with a
# strategy suited to it.
categorical_cols = df.select_dtypes(include="object").columns
numerical_cols = df.select_dtypes(include="number").columns
#categorical_cols
numerical_cols

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing numeric entries with the mean of their own column.
# NOTE(review): the imputer is fitted on the whole of `df`, not on a training
# subset only - confirm this is intended.
num_imp = SimpleImputer(strategy="mean")
num_imp.fit(df[numerical_cols])
df[numerical_cols] = num_imp.transform(df[numerical_cols])

In [None]:
# Preview the first five rows after numeric imputation.
df.head(5)

In [None]:
# Fill missing categorical entries with the most frequent value of their column.
cat_imp = SimpleImputer(strategy="most_frequent")
cat_imp.fit(df[categorical_cols])
df[categorical_cols] = cat_imp.transform(df[categorical_cols])

In [None]:
# Count the remaining missing values in each column.
df.isna().sum()

EDA - Exploratory Data Analysis

In [None]:
# How balanced are the target classes?

class_count = df["Loan_Approved"].value_counts()

# value_counts() orders its result by frequency, not by label, so a hard-coded
# ["No", "Yes"] list can end up attached to the wrong wedges. Taking the labels
# from the index keeps every wedge paired with its own class.
plt.pie(class_count, labels=class_count.index, autopct="%1.1f%%")
plt.title("Is Loan Approved or not?")

In [None]:
# Analyze categories: bar chart of how many rows fall into each Gender value.

gender_count = df["Gender"].value_counts()
# NOTE(review): the Series is passed positionally; how seaborn interprets a
# positional Series may depend on the installed version - confirm.
ax = sns.barplot(gender_count)
# Annotate the bars of the first container with their heights.
ax.bar_label(ax.containers[0])

In [None]:
# Income analysis: distribution of Applicant_Income across 20 bins.

sns.histplot(data=df, x="Applicant_Income", bins=20)

In [None]:
# Distribution of Coapplicant_Income across 20 bins.
sns.histplot(data=df, x="Coapplicant_Income", bins=20)

In [None]:
# Outlier check: box plot of Applicant_Income for each Loan_Approved value.

sns.boxplot(data=df, x="Loan_Approved", y="Applicant_Income")

In [None]:
# 2x2 grid of box plots: one numeric feature per panel, grouped by Loan_Approved.
fig, axes = plt.subplots(nrows=2, ncols=2)
box_features = ["Applicant_Income", "Credit_Score", "DTI_Ratio", "Savings"]

# axes.flat walks the grid row by row, so each feature lands in the same panel
# as before: [0,0], [0,1], [1,0], [1,1].
for panel, feature in zip(axes.flat, box_features):
    sns.boxplot(ax=panel, data=df, x="Loan_Approved", y=feature)

plt.tight_layout()

In [None]:
# Credit_Score distribution split by Loan_Approved, with the bars of each
# class drawn side by side ("dodge").

sns.histplot(data=df, x="Credit_Score", hue="Loan_Approved", bins=20, multiple="dodge")

In [None]:
# Remove the Applicant_ID column from the frame.

df = df.drop(columns="Applicant_ID")
df.head()

Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One encoder per column: a LabelEncoder only remembers the classes of its most
# recent fit, so reusing a single instance would discard the Education_Level
# mapping and make those integer codes impossible to translate back later.
# NOTE(review): LabelEncoder assigns codes in sorted label order, which may not
# match the intended ranking of education levels - confirm.
education_le = LabelEncoder()
df["Education_Level"] = education_le.fit_transform(df["Education_Level"])

# `le` stays bound to the target encoder, the same state it ended in before.
le = LabelEncoder()
df["Loan_Approved"] = le.fit_transform(df["Loan_Approved"])
df.head()




In [None]:
# Categorical columns to expand into indicator columns.
cols = [
    "Employment_Status",
    "Marital_Status",
    "Loan_Purpose",
    "Property_Area",
    "Gender",
    "Employer_Category",
]

# drop="first" omits the first category of each feature; sparse_output=False
# returns a dense array that can be wrapped in a DataFrame directly.
ohe = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
encoded = ohe.fit_transform(df[cols])

# Reuse df's index so the concat below aligns row by row.
encoded_df = pd.DataFrame(
    encoded,
    index=df.index,
    columns=ohe.get_feature_names_out(cols),
)

# Swap the original categorical columns for their encoded counterparts.
df = pd.concat([df.drop(columns=cols), encoded_df], axis=1)

In [None]:
# Preview the first five rows of the encoded frame.
df.head(5)

Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns.
# Despite its name, num_cols holds a DataFrame of those columns, not labels.
num_cols = df.select_dtypes(include="number")
corr_matrix = num_cols.corr()

# Wide figure so the annotated cells stay legible.
plt.figure(figsize=(18, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")


In [None]:
# Correlation of each numeric column with Loan_Approved, strongest positive first.
num_cols.corr().loc[:, "Loan_Approved"].sort_values(ascending=False)

Train-Test-Split + Feature Scaling

In [None]:
# Features are every column except the target; the target is Loan_Approved.
x = df.drop(columns="Loan_Approved")
y = df["Loan_Approved"]
y.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature scaling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the training features only, then apply that same fitted
# transform to the test features.
x_train_scaled = scaler.fit_transform(x_train)

# Fix: the scaled test matrix used to be assigned to `y_train_scaled`, so
# `x_test_scaled` was never defined and raised NameError here and in every
# model cell that predicts on it.
x_test_scaled = scaler.transform(x_test)
x_test_scaled

Train & Evaluate Models

In [None]:
# Logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

# fit() returns the estimator itself, so construction and training can be chained.
log_model = LogisticRegression().fit(x_train_scaled, y_train)

y_pred = log_model.predict(x_test_scaled)

print("Logistic Regression Model")
# Each metric is called as metric(y_true, y_pred) and printed under its label.
for metric_label, metric_fn in [
    ("Precision:", precision_score),
    ("recall:", recall_score),
    ("f1:", f1_score),
    ("Accuracy:", accuracy_score),
    ("CM:", confusion_matrix),
]:
    print(metric_label, metric_fn(y_test, y_pred))
      


In [None]:
# K-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

# Classifier that votes among the 5 nearest training points.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(x_train_scaled, y_train)

y_pred = knn_model.predict(x_test_scaled)

print("KNN Model")
# Each metric is called as metric(y_true, y_pred) and printed under its label.
for metric_label, metric_fn in [
    ("Precision:", precision_score),
    ("recall:", recall_score),
    ("f1:", f1_score),
    ("Accuracy:", accuracy_score),
    ("CM:", confusion_matrix),
]:
    print(metric_label, metric_fn(y_test, y_pred))
      


In [None]:
# Gaussian naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

# fit() returns the estimator itself, so construction and training can be chained.
nb_model = GaussianNB().fit(x_train_scaled, y_train)

y_pred = nb_model.predict(x_test_scaled)

print("naive bayes Model")
# Each metric is called as metric(y_true, y_pred) and printed under its label.
for metric_label, metric_fn in [
    ("Precision:", precision_score),
    ("recall:", recall_score),
    ("f1:", f1_score),
    ("Accuracy:", accuracy_score),
    ("CM:", confusion_matrix),
]:
    print(metric_label, metric_fn(y_test, y_pred))
      
