In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Read the loan-default CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/Loan_default.csv")

In [None]:
# LoanID is a row identifier and is not used as a feature, so remove it.
df = df.drop(columns=["LoanID"])


In [None]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)

# Feature Engineering

### Ordinal Encoding



In [None]:
# Categorical columns whose relationship with the 'Default' target is summarised.
categorical_columns = [
    'Education', 'EmploymentType', 'MaritalStatus',
    'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner'
]

# For every categorical column, print one table with a row per category:
# the count of each 'Default' value, the category total, and the share of
# defaults (target value 1) and non-defaults (target value 0) in percent.
for col in categorical_columns:
    print(f"\nAnalysis for {col}:\n")

    # Rows: categories of `col`; columns: the distinct 'Default' values.
    counts = df.groupby([col, 'Default']).size().unstack(fill_value=0)

    # Number of rows in each category, across all target values.
    total_counts = counts.sum(axis=1)

    # Row-wise percentages, rounded to two decimals.
    percentages = (counts.div(total_counts, axis=0) * 100).round(2)

    summary = counts.copy()
    summary['Total'] = total_counts
    # `.get` falls back to 0 when a target value never occurs in the data,
    # so the table always carries both percentage columns.
    summary['Default %'] = percentages.get(1, 0)
    summary['Non-Default %'] = percentages.get(0, 0)

    print(summary)


Why Ordinal Encoding Instead of One-Hot Encoding


---

Based on statistical analysis, the categories in certain variables exhibit a clear order in their relationship with the target variable (Default). For example, in the "Education" variable, the default rate decreases as the education level increases (High School: 12.88%, Bachelor's: 12.10%, Master's: 10.87%, PhD: 10.59%). This trend is statistically significant, demonstrating an inherent order among the categories. By applying Ordinal Encoding, we can capture this order and utilize the progression between categories effectively, which enhances the model's ability to predict loan defaults.

In [None]:


# Categorical columns to replace with integer ranks.
categorical_columns = [
    'Education', 'EmploymentType', 'MaritalStatus',
    'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner'
]

# Not fitted or used by the manual ranking below; kept only so the name stays
# defined for any other cell that may reference it.
ordinal_encoder = OrdinalEncoder()

# The ranks are derived from the target, so they must be learned from the
# training rows only: ranking on the whole frame lets test-set labels leak
# into the features. The arguments below mirror the train/test split applied
# to X and y later in the notebook (test_size=0.2, random_state=42, no
# stratification); with the same row count and seed the shuffle picks the
# same row positions. Keep the two calls in sync.
fit_positions, _ = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)
fit_rows = df.iloc[fit_positions]

# column name -> {category: rank}; rank 1 is the lowest training default rate.
ordinal_mappings = {}

for col in categorical_columns:
    # Mean of 'Default' per category on the training rows, ascending.
    default_rates = fit_rows.groupby(col)['Default'].mean().sort_values()

    mapping = {category: rank for rank, category in enumerate(default_rates.index, start=1)}

    # A category absent from the training rows has no rank; `.map` would turn
    # it into NaN silently, so fail loudly instead.
    unseen = set(df[col].dropna().unique()) - set(mapping)
    if unseen:
        raise ValueError(
            f"Column '{col}' has categories missing from the training rows: {sorted(map(str, unseen))}"
        )

    ordinal_mappings[col] = mapping

    # Replace the categories with their ranks in the working frame.
    df[col] = df[col].map(mapping)

# Display the first few rows of the encoded DataFrame.
df.head()


### Train Test Split

In [None]:

from sklearn.model_selection import train_test_split

# Separate the 'Default' target from the remaining predictor columns.
y = df['Default']
X = df.drop(columns=['Default'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of every partition.
for label, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{label} shape:", part.shape)

### Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature minimum and maximum from the training split only,
# then apply that same rescaling to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scaling must not change the number of rows or columns.
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)


In [None]:
def find_outliers_iqr(data, k=1.5):
    """Return the elements of ``data`` that fall outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series, numpy.ndarray or other array-like of numbers
        Values to inspect. A Series or ndarray is used as given; any other
        array-like (e.g. a list) is converted to a float ndarray.
    k : float, default 1.5
        Fence multiplier. The default reproduces the previous hard-coded value.

    Returns
    -------
    Same container type as the (converted) input, holding only the values
    strictly below the lower fence or strictly above the upper fence. For a
    Series the original index labels are kept. Empty input is returned as is.

    Notes
    -----
    Missing values are ignored when the percentiles are computed and are never
    reported as outliers. With plain ``np.percentile`` a single NaN would turn
    both fences into NaN and hide every outlier.
    """
    if isinstance(data, (pd.Series, np.ndarray)):
        values = data
    else:
        # Lists and tuples do not support element-wise comparison.
        values = np.asarray(data, dtype=float)

    # Nothing to compute percentiles from.
    if len(values) == 0:
        return values

    q1, q3 = np.nanpercentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    # Comparisons against NaN are False, so missing values are never selected.
    return values[(values < lower_bound) | (values > upper_bound)]

# Numeric columns to scan for IQR outliers.
numerical_columns = ['LoanAmount', 'LoanTerm', 'CreditScore']

# Report the outliers of each listed column, warning about any column that is
# not present in the DataFrame instead of failing.
for col in numerical_columns:
    if col in df.columns:
        outliers = find_outliers_iqr(df[col])
        print(f"Outliers in {col}: {outliers}")
        print(f"Number of outliers in {col}: {len(outliers)}")
    else:
        print(f"Warning: Column '{col}' not found in DataFrame. Skipping...")


In [None]:

# Logistic regression baseline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Fit on the scaled training features; `fit` returns the estimator itself.
logreg = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Predict the class of every row in the scaled test split.
y_pred = logreg.predict(X_test_scaled)

# Overall accuracy, followed by the per-class precision / recall / F1 report.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Logistic Regression: {accuracy}")
print(classification_report(y_test, y_pred))


In [None]:
# Feature importance for logistic regression

# The fitted coefficients (first row of `coef_`) serve as the importance scores.
feature_importances = logreg.coef_[0]

# One row per feature, ordered from the largest coefficient to the smallest.
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Print one line per feature with the score rounded to four decimals.
for feature, importance in zip(
    feature_importance_df['Feature'], feature_importance_df['Importance']
):
    print(f"{feature} feature importance: {importance:.4f}")


In [None]:
# Decision tree classifier

from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# Fit a tree with default hyperparameters; the seed keeps the result repeatable.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict the class of every row in the scaled test split.
y_pred_dt = dt_classifier.predict(X_test_scaled)

# Overall accuracy, followed by the per-class report.
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy of Decision Tree Classifier: {accuracy_dt}")
print(classification_report(y_test, y_pred_dt))


In [None]:
# Gaussian naive Bayes classifier

from sklearn.naive_bayes import GaussianNB

# Fit on the scaled training features; `fit` returns the estimator itself.
gnb = GaussianNB().fit(X_train_scaled, y_train)

# Predict the class of every row in the scaled test split.
y_pred_gnb = gnb.predict(X_test_scaled)

# Overall accuracy, followed by the per-class report.
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
print(f"Accuracy of Gaussian Naive Bayes: {accuracy_gnb}")
print(classification_report(y_test, y_pred_gnb))


In [None]:
# Random forest classifier

from sklearn.ensemble import RandomForestClassifier

# Fit a forest with default hyperparameters; the seed keeps the result repeatable.
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict the class of every row in the scaled test split.
y_pred_rf = rf_classifier.predict(X_test_scaled)

# Overall accuracy, followed by the per-class report.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of Random Forest Classifier: {accuracy_rf}")
print(classification_report(y_test, y_pred_rf))


In [None]:
# Feature importance for the random forest

# Column names come from the unscaled training frame; the scores come from
# the fitted forest and are listed in the same column order.
feature_names = X_train.columns
feature_importances = rf_classifier.feature_importances_

# One row per feature, ordered from the highest score to the lowest.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Print one line per feature with the score rounded to four decimals.
for feature, importance in zip(
    feature_importance_df['Feature'], feature_importance_df['Importance']
):
    print(f"{feature} feature importance: {importance:.4f}")
