In [4]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from tqdm.notebook import tqdm

import pandas as pd

In [5]:

# Loading preprocessed dataset
loanDF = pd.read_csv('../datasets/Loan_Default_cleaned_downsampled.csv')

# Split the dataset
X = loanDF.drop('status', axis=1)
y = loanDF.status
X_train, X_test, y_train, y_test = train_test_split(
                                            X, y, 
                                            test_size=0.2, 
                                            random_state=42
                                            )

In [7]:

# Define the models
models = {
    "Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(),
    "Decision Tree": DecisionTreeClassifier()
}

# Create pipelines for each model
pipelines = {name: Pipeline([('scaler', StandardScaler()), ('model', model)]) for name, model in models.items()}


In [8]:

# Evaluate each model using cross-validation
cv_scores = {}
for name, pipeline in tqdm(pipelines.items(), desc='Evaluating models'):
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy').mean()
    cv_scores[name] = score

# Print the performance
for model in cv_scores:
    print(f"{model}: {cv_scores[model]}")

# Select the best model based on CV score
best_model_name = max(cv_scores, key=cv_scores.get)
best_model = pipelines[best_model_name]


Evaluating models:   0%|          | 0/7 [00:00<?, ?it/s]

Logistic Regression: 0.7742320314109994
KNN: 0.8508096715265439
Random Forest: 1.0
Naive Bayes: 0.7183935914357912
SVM: 0.9558437134340082
XGBoost: 0.9999823788546255
Decision Tree: 0.9999823788546255


## If standardscaler is not used in the pipeline. while training LinearModel, max_iter reached error is observed. Need to fix this