In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler ## Feature Scaling
from sklearn.preprocessing import OneHotEncoder ## categorical to numerical
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
df=pd.read_csv('data/LoanApprovalPrediction.csv')

In [9]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [4]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [5]:
cat_columns=X.columns[X.dtypes=='object']
num_columns=X.columns[X.dtypes!='object']

In [8]:
cat_columns

Index(['Loan_ID', 'Gender', 'Married', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=69)

In [11]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((478, 12), (478,), (120, 12), (120,))

In [12]:
## feature Engineering Automation
## Numerical Pipelines
num_pipeline=Pipeline(
    steps=[('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler()) ],verbose=3

)

#categorical Pipeline
cat_pipeline=Pipeline(
                steps=[('imputer',SimpleImputer(strategy='most_frequent')),
                ('onehotencoder',OneHotEncoder(handle_unknown='ignore')),
                       ('scaler',StandardScaler(with_mean=False))
                ],verbose=3

            )


In [13]:

preprocessor= ColumnTransformer([
    ('numpipeline',num_pipeline,num_columns),
    ('catpipeline',cat_pipeline,cat_columns)]

)
preprocessor

In [14]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ..... (step 2 of 3) Processing onehotencoder, total=   0.0s
[Pipeline] ............ (step 3 of 3) Processing scaler, total=   0.1s


In [20]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(random_state=69)
X_train_resampled,y_train_resampled=smote.fit_resample(X_train,y_train)

In [21]:
X_train_resampled.shape,y_train_resampled.shape

((656, 495), (656,))

In [22]:
models={
    "Logistic Regression":LogisticRegression(),
    "Decision Tree Classifier":DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Support Vector Classifier": SVC(),
    "Ada Boost Classifier":AdaBoostClassifier(),
    "Gradient Boost Classifier":GradientBoostingClassifier()
}

In [23]:
from sklearn.model_selection import GridSearchCV, cross_val_score

def evaluate_model(X_train, X_test, y_train, y_test, models, param_grids, cv=5):
    train_scores = {}
    test_scores = {}

    for name, clf in models.items():
        # Define parameter grid for GridSearchCV
        param_grid = param_grids.get(name, {})  # Get parameter grid for the current model

        # Perform GridSearchCV with cross-validation
        grid_search = GridSearchCV(clf, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Best model from GridSearchCV
        best_clf = grid_search.best_estimator_

        # Training accuracy using cross-validation
        train_cv_scores = cross_val_score(best_clf, X_train, y_train, cv=cv, scoring='accuracy')
        train_accuracy = train_cv_scores.mean()

        # Testing accuracy
        test_accuracy = accuracy_score(y_test, best_clf.predict(X_test))

        # Store accuracies
        train_scores[name] = train_accuracy
        test_scores[name] = test_accuracy

        # Print results
        print(confusion_matrix(y_test,best_clf.predict(X_test)))
        print(classification_report(y_test,best_clf.predict(X_test)))
        print(f"{name} Cross-Validation Training Accuracy: {train_accuracy:.4f}")
        print(f"{name} Testing Accuracy: {test_accuracy:.4f}")
        print(f"{name} Best Parameters: {grid_search.best_params_}")
        print("-" * 40)




In [24]:
# Define parameter grids for each model
param_grids = {
    "Logistic Regression": {
        "C": [0.1, 1.0, 10.0],
        "penalty": ["l1", "l2"]
    },
    "Decision Tree Classifier": {
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10]
    },
    "Random Forest Classifier": {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10]
    },
    "Support Vector Classifier": {
        "C": [0.1, 1.0, 10.0],
        "kernel": ["linear", "rbf"]
    },
    "Ada Boost Classifier": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 1.0]
    },
    "Gradient Boost Classifier": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 1.0],
        "max_depth": [3, 5, 7]
    }

}


In [25]:
evaluate_model(X_train_resampled,X_test,y_train_resampled,y_test,models, param_grids, cv=5)

[[20 17]
 [ 0 83]]
              precision    recall  f1-score   support

           N       1.00      0.54      0.70        37
           Y       0.83      1.00      0.91        83

    accuracy                           0.86       120
   macro avg       0.92      0.77      0.80       120
weighted avg       0.88      0.86      0.84       120

Logistic Regression Cross-Validation Training Accuracy: 0.9589
Logistic Regression Testing Accuracy: 0.8583
Logistic Regression Best Parameters: {'C': 10.0, 'penalty': 'l2'}
----------------------------------------
[[22 15]
 [ 9 74]]
              precision    recall  f1-score   support

           N       0.71      0.59      0.65        37
           Y       0.83      0.89      0.86        83

    accuracy                           0.80       120
   macro avg       0.77      0.74      0.75       120
weighted avg       0.79      0.80      0.79       120

Decision Tree Classifier Cross-Validation Training Accuracy: 0.8187
Decision Tree Classifier 

In [26]:
#printing list of models used.
list(models)

['Logistic Regression',
 'Decision Tree Classifier',
 'Random Forest Classifier',
 'Support Vector Classifier',
 'Ada Boost Classifier',
 'Gradient Boost Classifier']