<a href="https://www.kaggle.com/code/shresthajeevan/loan-approval-prediction-classification?scriptVersionId=211279989" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import pylab

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler 
from sklearn.exceptions import ConvergenceWarning
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve

import lightgbm as lgb 
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Load the competition train and test tables; the 'id' column becomes the row index of each frame.
df_train = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv',index_col='id' )
df_test = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv',index_col='id' )

In [None]:
# Separate the predictors from the target column.
X = df_train.drop(columns=['loan_status'])
y = df_train['loan_status']

In [None]:
# Display the feature matrix (training data without the target column).
X

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
df_train.info()

In [None]:
# (rows, columns) of the training frame, target included.
df_train.shape

In [None]:
# Column labels of the training frame.
df_train.columns

In [None]:
# Summary statistics of the training frame's numeric columns.
df_train.describe()

In [None]:
# First rows of the training frame.
df_train.head()

In [None]:
# Numeric feature names: every numeric column of the training frame except the target.
numeric_column_index = df_train.select_dtypes(include=['number']).columns
numerical_columns = numeric_column_index.drop(y.name).tolist()

# Categorical feature names: every object-dtype column.
object_column_index = df_train.select_dtypes(include=['object']).columns
categorical_columns = object_column_index.tolist()

In [None]:
# Number of missing values per column of the training frame.
df_train.isna().sum()

### Exploratory Data Analysis


In [None]:
 
# Count plot of the target classes (one bar per value of y).
# `color` expects a single matplotlib colour, so the target Series must not be
# passed to it; the bar colours come from the palette instead.
sns.countplot(x=y, palette='Set2')

plt.show()

### Numerical Variables Distributions and Correlations Pairplots

In [None]:
# Pairwise correlations between the numerical feature columns
correlation_matrix = df_train.loc[:, numerical_columns].corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap on an explicit Axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    cbar=True,
    ax=heatmap_ax,
)

# Title the plot and render it
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Scatter-plot matrix of the numerical features; the diagonal shows a KDE per feature
pairplot_data = df_train[numerical_columns]
sns.pairplot(pairplot_data, diag_kind='kde')

# Render the figure
plt.show()

In [None]:
 # Set up the matplotlib figure with subplots
hist_fig = plt.figure(figsize=(20, 30))

# One histogram (15 bins, KDE overlay) per numerical column, placed on a 10x4 grid
for position, col in enumerate(numerical_columns, start=1):
    hist_ax = hist_fig.add_subplot(10, 4, position)
    sns.histplot(df_train[col], kde=True, bins=15, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel('Frequency')

# Tighten the spacing so titles and labels do not overlap, then render
plt.tight_layout()
plt.show()

In [None]:
df_train[numerical_columns].describe()

In [None]:
# Figure that hosts one box plot per numerical column on a 10x4 grid
box_fig = plt.figure(figsize=(20, 30))

for position, col in enumerate(numerical_columns, start=1):
    box_ax = box_fig.add_subplot(10, 4, position)
    sns.boxplot(data=df_train, x=col, ax=box_ax)
    box_ax.set_title(f'Box Plot of {col}')
    box_ax.set_xlabel(col)

# Tighten the spacing so titles and labels do not overlap, then render
plt.tight_layout()
plt.show()


### Categorical Variables Distribution

In [None]:
# For every categorical column, plot the number of rows per category, split by the target (hue=y).
plt.figure(figsize=(16, 12))

# Two plots per row. The row count is derived from the number of categorical
# columns so that more than four columns no longer overflow a fixed 2x2 grid
# (four columns still yield the same 2x2 layout).
n_plot_cols = 2
n_plot_rows = max(1, -(-len(categorical_columns) // n_plot_cols))  # ceiling division

for i, col in enumerate(categorical_columns):
    plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    sns.countplot(x=df_train[col], palette="Set2", hue=y)
    plt.xticks(rotation=45)  # Rotate category labels so they stay readable
    plt.title(f'Distribution   {col}')

plt.tight_layout()
plt.show()

In [None]:
class HandleOutliers(BaseEstimator, TransformerMixin):
    """Clip every column to percentile bounds and fill missing values with the median.

    The clipping bounds and the fill medians are learned in ``fit`` and reused
    in ``transform``, so validation/test data is processed with the statistics
    of the data the transformer was fitted on rather than with its own.

    Parameters
    ----------
    p_min : float, default=0.01
        Lower quantile (0..1); values below it are raised to it.
    p_max : float, default=0.99
        Upper quantile (0..1); values above it are lowered to it.
    """

    def __init__(self, p_min=0.01, p_max=0.99):
        self.p_min = p_min
        self.p_max = p_max

    def _column_statistics(self, frame):
        """Return per-column (lower bounds, upper bounds, medians) in column order."""
        lower, upper, medians = [], [], []
        for name in frame.columns:
            column = frame[name]
            low = column.quantile(self.p_min)
            high = column.quantile(self.p_max)
            lower.append(low)
            upper.append(high)
            # The median is taken after clipping, matching the order of
            # operations applied in transform().
            medians.append(column.clip(lower=low, upper=high).median())
        return lower, upper, medians

    def fit(self, X, y=None):
        """Learn the clipping bounds and fill medians from ``X``; returns ``self``."""
        frame = pd.DataFrame(X)
        self.lower_, self.upper_, self.median_ = self._column_statistics(frame)
        return self

    def transform(self, X):
        """Clip and impute ``X`` column by column and return a NumPy array.

        Raises
        ------
        ValueError
            If the transformer was fitted and ``X`` has a different number of
            columns than the data seen in ``fit``.
        """
        # Work on a copy so the caller's data is never modified.
        frame = pd.DataFrame(X).copy()

        if hasattr(self, 'lower_'):
            lower, upper, medians = self.lower_, self.upper_, self.median_
            if len(lower) != frame.shape[1]:
                raise ValueError(
                    f"X has {frame.shape[1]} columns, but HandleOutliers was "
                    f"fitted on {len(lower)} columns."
                )
        else:
            # Backward compatibility: when transform() is called without a
            # prior fit(), fall back to statistics of the data being transformed.
            lower, upper, medians = self._column_statistics(frame)

        # Statistics are stored in column order, so pair them by position.
        for name, low, high, median in zip(frame.columns, lower, upper, medians):
            frame[name] = frame[name].clip(lower=low, upper=high).fillna(median)

        return frame.values  # NumPy array for compatibility with sklearn

    def get_feature_names_out(self, input_features=None):
        """Return ``input_features`` unchanged: columns are neither added, removed nor renamed."""
        return input_features

In [None]:
class AgeToRangeTransformer(BaseEstimator, TransformerMixin):
    """Replace a numeric age column with a categorical ``age_range`` column.

    Parameters
    ----------
    age_column : str
        Name of the column in the input DataFrame that holds the age.
    """

    def __init__(self, age_column):
        self.age_column = age_column  # Allow passing the column name dynamically

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the age column replaced by ``age_range``.

        The input DataFrame is left unmodified.

        Raises
        ------
        ValueError
            If ``age_column`` is not a column of ``X``.
        """
        if self.age_column not in X.columns:
            raise ValueError(f"Column '{self.age_column}' not found in the input data.")

        # pd.cut uses right-closed bins by default, so e.g. an age of exactly 20
        # falls into 'Below 20'; ages outside (0, inf) become NaN.
        age_range = pd.cut(X[self.age_column], bins=[0, 20, 25, 30, 35, 40, 50, float('inf')],
                           labels=['Below 20', '20-25', '25-30', '30-35', '35-40', '40-50', '50+'])

        # assign() builds a new frame instead of writing a column into the
        # caller's DataFrame; the original age column is then dropped.
        return X.assign(age_range=age_range).drop(columns=[self.age_column])

    def get_feature_names_out(self, input_features=None):
        """Return the output column names.

        Without ``input_features`` this is ``['age_range']``. Otherwise the
        names mirror ``transform``: the input names without the age column,
        with ``age_range`` appended unless a column of that name already exists.
        """
        if input_features is None:
            return ['age_range']

        input_features = list(input_features)
        names = [name for name in input_features if name != self.age_column]
        if 'age_range' not in input_features:
            names.append('age_range')
        return names

In [None]:

# Age branch: bucket 'person_age' into ranges, then one-hot encode the buckets
age_pipeline = Pipeline(steps=[
    ('age_transformation', AgeToRangeTransformer(age_column='person_age')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Numerical branch: clip extreme values / fill gaps, then standardise
numerical_pipeline = Pipeline(steps=[
    ('outlier_handling', HandleOutliers(p_min=0.0025, p_max=0.9975)),
    ('scaling', StandardScaler()),
])

# Every numerical column except 'person_age', which goes through the age branch
non_age_numerical_columns = [col for col in numerical_columns if col != 'person_age']

# Route each group of columns to its branch; columns not listed are passed through
preprocessor = ColumnTransformer(
    transformers=[
        ("age_transformation", age_pipeline, ['person_age']),
        ("num_transformation", numerical_pipeline, non_age_numerical_columns),
        ("cat_transformation", OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_columns),
    ],
    remainder='passthrough',
)

# Top-level pipeline; it currently contains only the preprocessing step
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [None]:
# Fit the preprocessing pipeline on the full training feature matrix.
# NOTE(review): this happens before the train/test split performed further down,
# so the fitted preprocessing statistics also include the rows later used as the hold-out set.
pipeline.fit(X)

In [None]:
# Apply the fitted preprocessing to the training features.
transformed_X = pipeline.transform(X)

In [None]:
 # Get the feature names after transformation
# Output column names produced by the fitted preprocessor; reused below to label the transformed arrays.
transformed_feature_names = preprocessor.get_feature_names_out()
transformed_feature_names

In [None]:
# Wrap the transformed training array in a DataFrame labelled with the preprocessor's output names.
# No index is passed, so the frame gets a fresh default index rather than X's 'id' index.
df_transformed_X =  pd.DataFrame(transformed_X, columns=transformed_feature_names)

In [None]:
# Display the preprocessed training features.
df_transformed_X

In [None]:
 
# Hold out 20% of the preprocessed rows for evaluation; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(df_transformed_X, y, test_size=0.2, random_state=42)


In [None]:
def print_correlation_heatmap(confusion_matrix):
    """Render a confusion matrix as an annotated heatmap and show it.

    Despite the function's name, the figure is titled "Confusion Matrix";
    the x axis is labelled "Predicted" and the y axis "Actual".

    Parameters
    ----------
    confusion_matrix : 2-D array of integer counts (cells are formatted with "d").
    """
    _, axis = plt.subplots(figsize=(8,6))
    sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues", ax=axis)
    axis.set_title("Confusion Matrix")
    axis.set_xlabel("Predicted")
    axis.set_ylabel("Actual")
    plt.show()

### Apply Machine Learning Algorithms

#### XGBoost

In [None]:
# Base XGBoost classifier; its hyperparameters are selected by the grid search below
xgb_model = XGBClassifier()

# Hyperparameter grid explored by the search
param_grid = {
    'n_estimators': [ 100, 150],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# 5-fold cross-validated grid search, selecting the parameter set with the best accuracy
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search_xgb.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the best parameter set,
# not the accuracy on the training data, so it is labelled accordingly.
print("Best Parameters:", grid_search_xgb.best_params_)
print("Best CV Accuracy:", grid_search_xgb.best_score_)


# Best model found by the search (refitted on all of X_train by GridSearchCV)
best_xg_model = grid_search_xgb.best_estimator_

# Predict on the held-out split
y_pred_xgb = best_xg_model.predict(X_test)

# Hold-out accuracy of the selected model
test_accuracy_xgb= accuracy_score(y_test, y_pred_xgb)

print(f"Test Accuracy XG : {test_accuracy_xgb}")

# Per-class precision / recall / F1 on the hold-out split
print(classification_report(y_test, y_pred_xgb))

# Confusion matrix of the hold-out predictions, drawn as a heatmap
xgboost_confusion_matrix = confusion_matrix(y_test, y_pred_xgb)
print_correlation_heatmap(xgboost_confusion_matrix)

#### LightGBM Model

In [None]:
# Suppress LightGBM warnings
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')


# Base LightGBM classifier.
# LightGBM only applies row subsampling (`subsample`) when `subsample_freq` is
# positive; with the default of 0 the `subsample` values in the grid below would
# have no effect, so bagging is enabled at every iteration here.
lgbm_model = LGBMClassifier(verbose = -1, subsample_freq=1)

# Hyperparameter grid explored by the search
param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'num_leaves': [31, 50],   # Specific to LightGBM
}

# 5-fold cross-validated grid search, selecting the parameter set with the best accuracy
grid_search_lgbm = GridSearchCV(estimator=lgbm_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search_lgbm.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the best parameter set,
# not the accuracy on the training data, so it is labelled accordingly.
print("Best Parameters:", grid_search_lgbm.best_params_)
print("Best CV Accuracy:", grid_search_lgbm.best_score_)

# Best model found by the search (refitted on all of X_train by GridSearchCV)
best_lgbm_model = grid_search_lgbm.best_estimator_

# Predict on the held-out split
y_pred_lgbm = best_lgbm_model.predict(X_test)

# Hold-out accuracy of the selected model
test_accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
print(f"Test Accuracy LightGBM: {test_accuracy_lgbm}")

# Per-class precision / recall / F1 on the hold-out split
print(classification_report(y_test, y_pred_lgbm))

# Confusion matrix of the hold-out predictions
lgbm_confusion_matrix = confusion_matrix(y_test, y_pred_lgbm)

# Draw the confusion matrix as a heatmap
print_correlation_heatmap(lgbm_confusion_matrix)

In [None]:
# Apply the already-fitted preprocessing pipeline to the competition test features.
transformed_test = pipeline.transform(df_test)

In [None]:
# Label the transformed test array with the same feature names used for the training frame.
df_transformed_test  =  pd.DataFrame(transformed_test, columns=transformed_feature_names)

In [None]:
# Predict the competition test rows with both tuned models.
# NOTE(review): predict() yields hard class labels. If the competition is scored on
# ROC AUC (TODO confirm against the competition rules), predict_proba(...)[:, 1]
# would be the appropriate value to submit.
Y_pred_xgb = best_xg_model.predict(df_transformed_test)
Y_pred_lgbm = best_lgbm_model.predict(df_transformed_test)

In [None]:
# Pair every test id with its predicted label; reset_index() turns the test index into a regular column.
xgb_predictions = pd.Series(Y_pred_xgb, index=df_test.index, name='loan_status')
output_xgb = xgb_predictions.reset_index()

lgbm_predictions = pd.Series(Y_pred_lgbm, index=df_test.index, name='loan_status')
output_lgbm = lgbm_predictions.reset_index()

In [None]:
# Write one submission file per model; index=False omits the frame's row index from the CSV.
output_xgb.to_csv('/kaggle/working/submission_xgb.csv', index=False)
output_lgbm.to_csv('/kaggle/working/submission_lgbm.csv', index=False)


In [None]:
# Write the LightGBM predictions a second time under the name submission.csv.
output_lgbm.to_csv('/kaggle/working/submission.csv', index=False)