In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier


# Load the raw loan-approval data.
df = pd.read_csv("loan_approval_dataset.csv")

# Normalise header names: a CSV exported with padded headers (e.g. " loan_status")
# would otherwise raise KeyError on the lookups below. This is a no-op when the
# headers are already clean.
df.columns = df.columns.str.strip()

# Clean target labels once, in place, so every later use sees the same values.
df["loan_status"] = df["loan_status"].str.strip()

# Features: everything except the target and the loan_id identifier column.
X = df.drop(["loan_status", "loan_id"], axis=1)

# Encode the (already stripped) string labels as integers. `le` is kept so the
# integer predictions can be mapped back with le.inverse_transform / le.classes_.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df["loan_status"])


# Train-test split. X stays a DataFrame so columns can be selected by dtype below;
# stratify=y keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Partition the feature columns into numeric vs. everything else.
# Listing dtypes literally ("int64", "float64", "object") would leave any other
# dtype (int32, float32, bool, pandas string dtype, ...) in neither list, and
# ColumnTransformer's default remainder="drop" would then silently discard it.
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.select_dtypes(exclude="number").columns

# Preprocessor: standardise numeric features, one-hot encode the rest.
# handle_unknown="ignore" encodes categories unseen during fit as all-zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

# Candidate models and their hyperparameter grids.
# Each entry maps a display name to (unfitted estimator, param_grid). Grid keys
# use the "classifier__" prefix because the estimator is installed as the
# "classifier" step of a Pipeline before being passed to GridSearchCV.
models = {
    "LogisticRegression": (
        LogisticRegression(max_iter=1000),
        {"classifier__C": [0.1, 1, 10]}
    ),
    "RandomForest": (
        RandomForestClassifier(random_state=42),
        {"classifier__n_estimators": [100, 200],
         "classifier__max_depth": [5, 10, None]}
    ),
    "GradientBoosting": (
        GradientBoostingClassifier(random_state=42),
        {"classifier__n_estimators": [100, 200],
         "classifier__learning_rate": [0.05, 0.1, 0.2]}
    ),
    "XGBoost": (
        # NOTE(review): use_label_encoder is deprecated in recent XGBoost releases
        # and may only produce an "unused parameter" warning there -- confirm
        # against the installed version before removing it.
        xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
        {"classifier__n_estimators": [100, 200],
         "classifier__max_depth": [3, 5, 7]}
    ),
    "SVM": (
        # probability=True fits an internal cross-validated calibration that
        # shuffles the data; seed it like the other stochastic models so runs
        # are reproducible.
        SVC(probability=True, random_state=42),
        {"classifier__C": [0.1, 1, 10],
         "classifier__kernel": ["linear", "rbf"]}
    ),
    "KNN": (
        KNeighborsClassifier(),
        {"classifier__n_neighbors": [3, 5, 7, 9],
         "classifier__weights": ["uniform", "distance"]}
    )
}

# Train & evaluate models.
#
# For every candidate, a grid search (3-fold CV on the training split) tunes the
# hyperparameters. The overall winner is chosen by its cross-validated accuracy,
# NOT by its test accuracy: picking the model that happens to score best on the
# test split would use that split for model selection and make the reported
# number optimistically biased. The test split is used for reporting only.
#
# Outputs:
#   results     -- {model name: test accuracy of that model's tuned pipeline}
#   best_model  -- fitted Pipeline (preprocessor + classifier) of the winner
#   best_score  -- test accuracy of best_model
best_model = None
best_score = 0
best_cv_score = float("-inf")  # -inf so the first candidate is always accepted
results = {}

for name, (model, params) in models.items():
    # GridSearchCV clones the pipeline for every fit, so sharing the same
    # `preprocessor` object across candidates does not leak fitted state.
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])
    grid = GridSearchCV(pipe, params, cv=3, scoring="accuracy", n_jobs=1)
    grid.fit(X_train, y_train)

    # grid.predict delegates to the best estimator refit on the full training split.
    y_pred = grid.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print(f"\n{name} Accuracy: {acc:.4f}")
    print(f"{name} CV Accuracy: {grid.best_score_:.4f}")
    print("Best Params:", grid.best_params_)
    print(classification_report(y_test, y_pred))

    # Model selection on the cross-validated score only.
    if grid.best_score_ > best_cv_score:
        best_cv_score = grid.best_score_
        best_score = acc
        best_model = grid.best_estimator_

print("\n✅ Best Model:", best_model)
print("✅ Best Accuracy:", best_score)


Unexpected exception formatting exception. Falling back to standard exception


joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\hp\anaconda3\envs\loan_pred\lib\site-packages\joblib\externals\loky\process_executor.py", line 463, in _process_worker
    r = call_item()
  File "c:\Users\hp\anaconda3\envs\loan_pred\lib\site-packages\joblib\externals\loky\process_executor.py", line 291, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "c:\Users\hp\anaconda3\envs\loan_pred\lib\site-packages\joblib\parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
  File "c:\Users\hp\anaconda3\envs\loan_pred\lib\site-packages\joblib\parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
  File "c:\Users\hp\anaconda3\envs\loan_pred\lib\site-packages\sklearn\utils\parallel.py", line 147, in __call__
    return self.function(*args, **kwargs)
  File "c:\Users\hp\anaconda3\envs\loan_pred\lib\site-packages\sklearn\model_selection\_validation.py", line 847, in _fit_and_score
   

In [None]:
import joblib
from pathlib import Path

# Persist the selected pipeline (preprocessing + classifier) to disk.
model_path = "model/loan_model.pkl"

# joblib.dump does not create missing directories; make sure the target folder
# exists so the cell does not fail with FileNotFoundError on a fresh checkout.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, model_path)

['../model/loan_model.pkl']