<a href="https://colab.research.google.com/github/joeyeuron/Case-Study-CE880/blob/main/Final_Draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
from pandas import read_csv
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score

# Data Loading and Exploration

In [None]:
# Open Colab's file-upload prompt so the dataset can be brought into the runtime.
# NOTE(review): the loading cell reads 'loan_data.xlsx' from the working
# directory, so presumably that is the file expected here — confirm.
from google.colab import files
uploaded = files.upload()

### Load dataset using pandas

In [None]:
# Read the first worksheet of the workbook into a DataFrame.
# The context manager closes the underlying file handle once parsing is done,
# instead of leaving the workbook open for the rest of the session.
with pd.ExcelFile('loan_data.xlsx') as excel_file:
    df = excel_file.parse(excel_file.sheet_names[0])

###Perform Exploratory Data Analysis (EDA)

In [None]:
# Preview the first rows to sanity-check how the sheet was parsed
first_rows = df.head()
print(first_rows)

In [None]:
# Basic information about the dataset (column names, non-null counts, dtypes).
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Summary statistics for the numerical columns
summary_stats = df.describe()
print(summary_stats)

In [None]:
# How many rows fall into each class of the target column 'Status'
status_counts = df['Status'].value_counts()
print(status_counts)

## Data Preprocessing

In [None]:
# Number of missing values in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

###Handled missing values

In [None]:
# Impute missing values: the most frequent value (mode) for the categorical /
# flag columns, and the mean for the numeric 'Term' column.
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# an intermediate Series, is deprecated in pandas 2.x, and no longer updates
# `df` once copy-on-write is enabled.
mode_imputed_columns = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
for column in mode_imputed_columns:
    df[column] = df[column].fillna(df[column].mode()[0])

df['Term'] = df['Term'].fillna(df['Term'].mean())

# NOTE(review): columns not listed here are left untouched — confirm with the
# missing-value counts that none of them still contain NaN.


###Encoded categorical variables

In [None]:
# One-hot encode the categorical feature columns into `df_encoded`.
from sklearn.preprocessing import OneHotEncoder

# Cast 'Dependents' to str so the encoder sees a single dtype in this column
# (presumably it mixes numbers with text values — confirm against the data).
df['Dependents'] = df['Dependents'].astype(str)

# Categorical columns for one-hot encoding (the target 'Status' is excluded)
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Area']

onehot_encoder = OneHotEncoder()
encoded_features = onehot_encoder.fit_transform(df[categorical_columns])
feature_names = onehot_encoder.get_feature_names_out(input_features=categorical_columns)

# Give the encoded frame the same index as `df`: concat(axis=1) aligns on the
# index, so a default 0..n-1 index would misalign rows (and introduce NaNs)
# whenever `df` does not carry a default RangeIndex.
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=feature_names, index=df.index)
df_encoded = pd.concat([df.drop(categorical_columns, axis=1), encoded_df], axis=1)



### Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Histogram for Applicant_Income: 20 bins, drawn through the Axes API
income_fig, income_ax = plt.subplots(figsize=(8, 6))
income_ax.hist(df_encoded['Applicant_Income'], bins=20, color='blue', edgecolor='black')
income_ax.set_xlabel('Applicant Income')
income_ax.set_ylabel('Count')
income_ax.set_title('Histogram of Applicant Income')
plt.show()

In [None]:
# 2. Bar Chart for Credit_History: number of rows per credit-history value
plt.figure(figsize=(6, 4))
credit_ax = sns.countplot(x='Credit_History', data=df_encoded, palette='pastel')
credit_ax.set_xlabel('Credit History')
credit_ax.set_ylabel('Count')
credit_ax.set_title('Count of Credit History')
plt.show()

In [None]:
# 3. Correlation Heatmap of the numeric columns
plt.figure(figsize=(10, 8))
# `df_encoded` still contains text columns (at least the 'Y'/'N' target
# 'Status'). Since pandas 2.0, DataFrame.corr() raises on non-numeric data
# unless it is told to skip it, so restrict the computation explicitly.
corr_matrix = df_encoded.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# 4. Scatter Plot for Applicant_Income vs. Loan_Amount, coloured by loan status.
# Each status is drawn as its own series so the legend gets one correctly
# coloured entry per class; a single scatter call yields only one legend
# handle, so the second label ('Not Approved') was never shown.
plt.figure(figsize=(8, 6))
status_styles = {'Y': ('blue', 'Approved'), 'N': ('red', 'Not Approved')}
for status_value, (point_color, legend_label) in status_styles.items():
    status_rows = df_encoded[df_encoded['Status'] == status_value]
    plt.scatter(
        status_rows['Applicant_Income'],
        status_rows['Loan_Amount'],
        c=point_color,
        alpha=0.6,
        label=legend_label,
    )
plt.xlabel('Applicant Income')
plt.ylabel('Loan Amount')
plt.title('Scatter Plot: Applicant Income vs. Loan Amount')
plt.legend()
plt.show()

In [None]:
# 5. Box Plot for Loan_Amount by Status: one box per target class
plt.figure(figsize=(6, 4))
loan_box_ax = sns.boxplot(x='Status', y='Loan_Amount', data=df_encoded, palette='pastel')
loan_box_ax.set_xlabel('Status')
loan_box_ax.set_ylabel('Loan Amount')
loan_box_ax.set_title('Box Plot: Loan Amount by Status')
plt.show()

# Data Splitting

In [None]:
# Separate the target column from the feature matrix
target_column = 'Status'
y = df_encoded[target_column]
X = df_encoded.drop(target_column, axis=1)

# Two-stage split: hold out 20% of the rows first, then cut that hold-out in
# half, giving 80% training / 10% validation / 10% test. Both stages use the
# same fixed seed.
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed
)



# Model Selection and Training

### Scaling feature Data for Model

In [None]:

# Learn the scaling statistics from the training split only, then apply the
# fitted scaler to every split so validation and test data never influence it.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(split) for split in (X_val, X_test))


###Initialized classification models

In [None]:
# Candidate classifiers to compare.
# The estimators with internal randomness (SVC's probability calibration,
# random forest, gradient boosting) are seeded with the same value used for
# the train/validation/test split, so repeated runs give the same results.
RANDOM_STATE = 42

svm_model = SVC(probability=True, random_state=RANDOM_STATE)
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
knn_model = KNeighborsClassifier()
gb_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
nb_model = GaussianNB()

###Defined parameter grids for grid search to tune hyperparameters

In [None]:
# Hyperparameter search spaces, one per candidate model
svm_param_grid = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 1, 10],
    gamma=[0.1, 1, 10],
)

rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

knn_param_grid = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

gb_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.1, 0.01, 0.001],
    max_depth=[3, 5, 7],
)

nb_param_grid = dict(var_smoothing=[1e-9, 1e-8, 1e-7])

# Keep each display name, estimator and grid together in one row, then derive
# the three parallel lists used by the cells that follow.
search_space = [
    ('SVM', svm_model, svm_param_grid),
    ('Random Forest', rf_model, rf_param_grid),
    ('KNN', knn_model, knn_param_grid),
    ('Gradient Boosting', gb_model, gb_param_grid),
    ('Naive Bayes', nb_model, nb_param_grid),
]
models = [estimator for _, estimator, _ in search_space]
param_grids = [grid for _, _, grid in search_space]
model_names = [name for name, _, _ in search_space]


###Selection of best model

In [None]:
# Tune every candidate with a 5-fold grid search and keep the refitted best
# estimator of each, in the same order as `model_names`.
from sklearn.metrics import make_scorer

# The target is labelled 'Y'/'N'. The string alias scoring='f1' uses the
# default pos_label=1, which does not exist in these labels, so it cannot
# score any candidate. Name the positive class explicitly — the same
# pos_label the validation metrics use.
f1_approved_scorer = make_scorer(f1_score, pos_label='Y')

best_models = []  # To store the best models after grid search

# Loop over each model and its corresponding parameter grid
for model, param_grid, model_name in zip(models, param_grids, model_names):
    # Selection is based on the F1 score of the 'Y' class
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=f1_approved_scorer)

    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train_scaled, y_train)

    # Keep the best estimator found for this model
    best_models.append(grid_search.best_estimator_)

    # Print the best hyperparameters for the current model
    print(f"Best Hyperparameters for {model_name}:")
    print(grid_search.best_params_)


# Model Evaluation on Validation Set

###Evaluated each best model on the validation set

In [None]:
# Lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Iteration over each best model
for best_model, model_name in zip(best_models, model_names):
    # Predict on the validation set
    y_pred_val = best_model.predict(X_val_scaled)

    # Calculating and storing evaluation metrics
    accuracy_scores.append(np.round(best_model.score(X_val_scaled, y_val), 3))
    precision_scores.append(np.round(precision_score(y_val, y_pred_val, pos_label='Y'), 3))
    recall_scores.append(np.round(recall_score(y_val, y_pred_val, pos_label='Y'), 3))
    f1_scores.append(np.round(f1_score(y_val, y_pred_val, pos_label='Y'), 3))

    # Print classification report
    print(f"Classification Report - {model_name} Model:")
    print(classification_report(y_val, y_pred_val, target_names=['N', 'Y']))

    # Confusion matrix heatmap
    cm = confusion_matrix(y_val, y_pred_val)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name} Model')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart of the validation metrics for every model.

# List of model names
model_names = ['SVM', 'Random Forest', 'KNN', 'Gradient Boosting', 'Naive Bayes']

# Plot the metric lists filled in by the validation-evaluation cell. They were
# previously overwritten here with `[...]` placeholders, which discarded the
# computed scores and cannot be drawn as bar heights.
metric_series = [
    ('Accuracy', accuracy_scores),
    ('Precision', precision_scores),
    ('Recall', recall_scores),
    ('F1-Score', f1_scores),
]

# Set positions for the bars: spread the four bars symmetrically around each
# tick so every group is centred on its model name.
x = np.arange(len(model_names))
width = 0.2
offsets = (np.arange(len(metric_series)) - (len(metric_series) - 1) / 2) * width

# Create bar plots for each metric
for offset, (metric_label, metric_values) in zip(offsets, metric_series):
    plt.bar(x + offset, metric_values, width, label=metric_label)

# Set labels and title
plt.xlabel('Models')
plt.ylabel('Scores')
plt.title('Model Evaluation Metrics')
plt.xticks(x, model_names)
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()


###Cross-validation

In [None]:
# Initialize empty lists to store mean scores for each model
mean_accuracy_scores = []
mean_precision_scores = []
mean_recall_scores = []
mean_f1_scores = []

# Perform cross-validation for each model
for model in models:
    accuracy_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
    mean_accuracy_scores.append(np.mean(accuracy_scores))

    try:
        precision_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='precision')
        mean_precision_scores.append(np.mean(precision_scores))
    except:
        mean_precision_scores.append(0)  # Replace with 0 or another suitable value

    try:
        recall_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='recall')
        mean_recall_scores.append(np.mean(recall_scores))
    except:
        mean_recall_scores.append(0)  # Replace with 0 or another suitable value

    try:
        f1_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='f1')
        mean_f1_scores.append(np.mean(f1_scores))
    except:
        mean_f1_scores.append(0)  # Replace with 0 or another suitable value


# Display the mean cross-validation scores for each model
for model_name, accuracy, precision, recall, f1 in zip(model_names, mean_accuracy_scores, mean_precision_scores, mean_recall_scores, mean_f1_scores):
    print(f"Model: {model_name}")
    print(f"Mean Accuracy: {accuracy}")
    print(f"Mean Precision: {precision}")
    print(f"Mean Recall: {recall}")
    print(f"Mean F1-Score: {f1}")
    print("=============================")


# Model Comparison using ROC AUC

In [None]:
# Initializing empty lists to store mean ROC AUC scores for each model
mean_roc_auc_scores = []

# Performing cross-validation for each model
for model in models:
    roc_auc_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='roc_auc')
    mean_roc_auc_scores.append(np.mean(roc_auc_scores))

# Displaying the mean ROC AUC scores for each model
for model_name, roc_auc in zip(model_names, mean_roc_auc_scores):
    print(f"Model: {model_name}")
    print(f"Mean ROC AUC Score: {roc_auc}")
    print("=============================")


In [None]:
# Bar plot of the mean ROC AUC scores from cross-validation.
# `mean_roc_auc_scores` is taken from the cross-validation cell above; it was
# previously replaced here by numbers hard-coded from an earlier run, so the
# chart could silently disagree with the scores just printed.
model_names = ['SVM', 'Random Forest', 'KNN', 'Gradient Boosting', 'Naive Bayes']

# Bar plot
plt.figure(figsize=(10, 6))
plt.bar(model_names, mean_roc_auc_scores, color=['skyblue', 'orange', 'green', 'red', 'purple'])
plt.xlabel('Models')
plt.ylabel('Mean ROC AUC Score')
plt.title('Mean ROC AUC Scores for Different Models')
plt.ylim([0, 1])  # Set the y-axis range from 0 to 1
plt.xticks(rotation=45)  # Rotated x-axis labels for better visibility
plt.show()


# Best Model Evaluation on Test Set

In [None]:
# Look up the tuned Random Forest by its position in the model-name list
rf_model_index = model_names.index('Random Forest')
best_rf_model = best_models[rf_model_index]

# Predict the held-out test split with that model
y_pred_test_rf = best_rf_model.predict(X_test_scaled)

# Per-class precision / recall / F1 on the test set
test_report = classification_report(y_test, y_pred_test_rf, target_names=['N', 'Y'])
print("Classification Report - Best Random Forest Model (Test Set):")
print(test_report)

# Confusion matrix for the test set as an annotated heatmap
cm_test_rf = confusion_matrix(y_test, y_pred_test_rf)
test_cm_ax = sns.heatmap(cm_test_rf, annot=True, fmt='d', cmap='Blues')
test_cm_ax.set_title('Confusion Matrix - Best Random Forest Model (Test Set)')
test_cm_ax.set_xlabel('Predicted')
test_cm_ax.set_ylabel('True')
plt.show()



###Analyzing Feature Importance

In [None]:
# Importance score of every feature according to the tuned Random Forest,
# ordered from most to least important
feature_importances = best_rf_model.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]

# Pair each feature name with its score and print them in ranked order
ranked_features = zip(X.columns[sorted_indices], feature_importances[sorted_indices])
for feature_name, importance in ranked_features:
    print(f"{feature_name}: {importance}")


###Results

1.   Precision: The precision for class 'N' (No) is 1.00, meaning that when the model predicts a loan as 'N', it is correct 100% of the time. However, the precision for class 'Y' (Yes) is 0.72, indicating that when the model predicts a loan as 'Y', it is correct 72% of the time.
2.   Recall: The recall for class 'N' is 0.38, indicating that the model correctly identifies only 38% of the actual 'N' loans. However, the recall for class 'Y' is 1.00, meaning that the model correctly identifies all the actual 'Y' loans.
3. F1-Score: The F1-score is a balanced metric that considers both precision and recall. The F1-score for class 'N' is 0.55, and for class 'Y' is 0.84.
4. Support: The support is the number of samples in each class in the test set. There are 24 samples for class 'N' and 38 samples for class 'Y'.
5. Accuracy: The overall accuracy of the model on the test set is 0.76, meaning that the model correctly predicts the loan status for 76% of the samples in the test set.
6. Macro Avg: The macro-average takes the average of precision, recall, and F1-score for both classes, treating each class equally. The macro-average precision is 0.86, recall is 0.69, and F1-score is 0.69.
7. Weighted Avg: The weighted average considers the support (number of samples) for each class and calculates a weighted average of precision, recall, and F1-score. The weighted average precision is 0.83, recall is 0.76, and F1-score is 0.72.

The Random Forest model has perfect recall (1.00) but only moderate precision (0.72) for the 'Y' class (loan approval): it identifies every applicant who was actually approved, but about 28% of the applicants it predicts as approved were in fact not approved.
The precision for class 'N' (1.00) is higher than for class 'Y' (0.72), indicating that a predicted rejection is more reliable than a predicted approval; however, the low recall for class 'N' (0.38) means that most actual rejections are misclassified as approvals.
The recall for class 'Y' is high, meaning that the model captures all of the actual loan approvals in the test set.
The F1-score balances both precision and recall, providing a holistic view of the model's effectiveness: 0.84 for class 'Y' versus 0.55 for class 'N' confirms that the model is considerably better at recognising approvals than rejections.