In [115]:
import pandas as pd
import numpy as np

In [116]:
df=pd.read_csv("loan_approved.csv")

In [117]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status (Approved)
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Loan_ID                 614 non-null    object 
 1   Gender                  601 non-null    object 
 2   Married                 611 non-null    object 
 3   Dependents              599 non-null    object 
 4   Education               614 non-null    object 
 5   Self_Employed           582 non-null    object 
 6   ApplicantIncome         614 non-null    int64  
 7   CoapplicantIncome       614 non-null    float64
 8   LoanAmount              592 non-null    float64
 9   Loan_Amount_Term        600 non-null    float64
 10  Credit_History          564 non-null    float64
 11  Property_Area           614 non-null    object 
 12  Loan_Status (Approved)  614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [119]:
df=df.drop(columns=['Loan_ID'],axis=1)

In [120]:
x = df.drop("Loan_Status (Approved)", axis=1)
y = df["Loan_Status (Approved)"]

In [121]:
num_cols = x.select_dtypes(include=["int64","float64"]).columns
cat_cols = x.select_dtypes(include=["object"]).columns

In [122]:
from sklearn.model_selection import train_test_split
x = df.drop("Loan_Status (Approved)", axis=1)
y = df["Loan_Status (Approved)"]

X_train, X_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=82,
    stratify=y
)

In [123]:
# ===================== MUTUAL INFORMATION (TRAIN ONLY) =====================
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import SimpleImputer
# Copy data for MI only
X_train_mi = X_train.copy()
X_test_mi  = X_test.copy()
# ---- 1. Handle missing values TEMPORARILY (for MI only) ----
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")
# Numeric columns
X_train_mi[num_cols] = num_imputer.fit_transform(X_train_mi[num_cols])
X_test_mi[num_cols]  = num_imputer.transform(X_test_mi[num_cols])
# Categorical columns
X_train_mi[cat_cols] = cat_imputer.fit_transform(X_train_mi[cat_cols])
X_test_mi[cat_cols]  = cat_imputer.transform(X_test_mi[cat_cols])
# ---- 2. Encode categorical columns (for MI only) ----
for col in cat_cols:
    le = LabelEncoder()
    X_train_mi[col] = le.fit_transform(X_train_mi[col])
    X_test_mi[col]  = X_test_mi[col].map(
        lambda x: le.transform([x])[0] if x in le.classes_ else -1
    )
# ---- 3. Compute MI on TRAIN ONLY ----
mi_scores = mutual_info_classif(
    X_train_mi,
    y_train,
    random_state=42
)
mi_df = pd.DataFrame({
    "feature": X_train.columns,
    "mi_score": mi_scores
}).sort_values(by="mi_score", ascending=False)

print(mi_df)

# ---- 4. Select important features ----
selected_features = mi_df[mi_df["mi_score"] > 0.01]["feature"]

X_train = X_train[selected_features]
X_test  = X_test[selected_features]

              feature  mi_score
9      Credit_History  0.132557
3           Education  0.042104
4       Self_Employed  0.024759
1             Married  0.019369
0              Gender  0.004106
8    Loan_Amount_Term  0.003066
10      Property_Area  0.001433
2          Dependents  0.000000
5     ApplicantIncome  0.000000
6   CoapplicantIncome  0.000000
7          LoanAmount  0.000000


In [124]:
num_cols = X_train.select_dtypes(include=["int64","float64"]).columns
cat_cols = X_train.select_dtypes(include=["object"]).columns

In [125]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols)
    ]
)

In [126]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights="uniform",
    metric="euclidean"
)

In [127]:
model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("knn", knn)
])

In [128]:
model.fit(X_train, y_train)

In [129]:
y_pred = model.predict(X_test)

In [130]:
from sklearn.metrics import accuracy_score

In [131]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7967479674796748


In [132]:
y_train_pred = model.predict(X_train)
print("Accuracy:", accuracy_score(y_train, y_train_pred))

Accuracy: 0.8126272912423625


In [133]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "knn__n_neighbors": range(3, 31, 2),
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["euclidean", "manhattan"]
}

grid = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)


Best params: {'knn__metric': 'euclidean', 'knn__n_neighbors': 5, 'knn__weights': 'uniform'}
Best CV accuracy: 0.812636569779427
