In [None]:
import re
import math
import warnings
warnings.filterwarnings('ignore')

# Data handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
import kaleido
import ipywidgets as widgets
from plotly.subplots import make_subplots
from IPython.display import display, Image

# Statistical analysis
from scipy.stats import boxcox, skew
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ML libraries and utilities
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Hyperparameter tuning
import xgboost as xgb
import optuna
import logging
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
def read_excel_data(file_path, sheet_names):
    """Load several worksheets of one Excel workbook as DataFrames.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.xlsx`` workbook.
    sheet_names : iterable of str
        Names of the worksheets to load.

    Returns
    -------
    list of pandas.DataFrame
        One frame per requested sheet, in the same order as ``sheet_names``.
    """
    sheet_names = list(sheet_names)
    if not sheet_names:
        # Nothing requested: do not touch the file at all.
        return []

    # Open the workbook once and parse every sheet from the same handle
    # instead of re-opening and re-reading the file for each sheet.
    with pd.ExcelFile(file_path, engine='openpyxl') as workbook:
        return [workbook.parse(sheet_name=name) for name in sheet_names]

In [None]:
# Load the four worksheets and bind each one to a descriptive name
sheet_names = ['loan_information', 'Employment', 'Personal_information', 'Other_information']
dfs = read_excel_data("Credit_Risk_Dataset.xlsx", sheet_names)
loan_information, employment, personal_information, other_information = dfs

# These two sheets spell the key column 'User id'; rename it to 'User_id' in place
for sheet_frame in (employment, personal_information):
    sheet_frame.rename(columns={'User id': 'User_id'}, inplace=True)

# Join the sheets one after another on 'User_id' (default inner join):
# loan_information -> employment -> personal_information -> other_information
merged_df = loan_information
for right_frame in (employment, personal_information, other_information):
    merged_df = merged_df.merge(right_frame, on='User_id')

# Keep the combined table as `df` and persist it for the following cells
df = merged_df
df.to_csv("merged_data.csv", index=False)

## Data Overview

In [None]:
# Reload the merged table from the CSV written by the merge cell and preview the first rows
df = pd.read_csv("merged_data.csv")
df.head()

In [None]:
# Header corrections: fix misspellings / trailing space and shorten unit-suffixed names
column_renames = {
    'Employmet type': 'Employment type',
    'Total Payement ': 'Total Payment',
    'Total Income(PA)': 'Total Income',
    'Tenure(years)': 'Loan Tenure',
}

# Remove the columns that are not used further, then apply the header corrections
df = (
    df
    .drop(columns=['Pincode', 'Role', 'Industry', 'User_id'])
    .rename(columns=column_renames)
)

# Normalise the misspelt category label
employment_type = df['Employment type']
df['Employment type'] = employment_type.str.replace('Self - Employeed', 'Self Employed')

df.head()

In [None]:
# Summary statistics of the cleaned-up frame
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics again (same call as two cells above; the frame has not changed in between)
df.describe()

## Missing Data Analysis

In [None]:
# Null count per column, restricted to the columns that actually have gaps
null_counts = df.isnull().sum()
missing_data = null_counts.loc[null_counts > 0]
missing_data

In [None]:
# Share of nulls per column in percent: keep affected columns only, largest first
missing_data_proportion = (
    df.isnull()
    .mean()
    .mul(100)
    .loc[lambda share: share > 0]
    .sort_values(ascending=False)
)

# Human-readable version with two decimals and a percent sign
missing_data_proportion_formatted = missing_data_proportion.map(lambda x: f"{x:.2f}%")
missing_data_proportion_formatted

In [None]:
# Columns handled by each imputation strategy
numerical_cols = ['Amount']
categorical_cols = ['Employment type', 'Tier of Employment', 'Married', 'Social Profile', 'Is_verified']

# Median for the numerical column, most frequent value for the categorical ones
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its column group and write the filled values back
for imputer, target_cols in ((num_imputer, numerical_cols), (cat_imputer, categorical_cols)):
    df[target_cols] = imputer.fit_transform(df[target_cols])

In [None]:
# Re-check which columns still contain nulls after imputation
missing_data = df.isnull().sum().loc[lambda counts: counts > 0]
missing_data

In [None]:
# Look at the distinct 'Work Experience' labels and at the rows where it is missing
work_exp = df['Work Experience']
print(work_exp.unique())
missing_work_exp = df[work_exp.isnull()]
print(missing_work_exp)

In [None]:
# Treat '0', the malformed 'nan 0' and real NaNs as "less than one year".
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df['Work Experience'] is chained in-place mutation, which leaves `df`
# unchanged under pandas Copy-on-Write.
df['Work Experience'] = df['Work Experience'].replace(['0', 'nan 0', np.nan], '<1')

# Safety net: drop any row whose 'Work Experience' is still null
df = df.dropna(subset=['Work Experience'])

# Columns that still contain nulls
missing_data = df.isnull().sum()
missing_data = missing_data[missing_data > 0]
missing_data

In [None]:
# Summary statistics after the missing-value handling
df.describe()

## Feature Distribution and Univariate Analysis

### Categorical and numerical feature analysis

In [None]:
# Split the columns by dtype: object/category vs. numeric
categorical_features = list(df.select_dtypes(include=['object', 'category']).columns)
numerical_features = list(df.select_dtypes(include=['number']).columns)

print("Categorical Features:")
print(categorical_features)

# Cardinality and the actual labels of every categorical column
print("\nDistinct values for each categorical feature:")
for feature in categorical_features:
    column = df[feature]
    print(f"{feature}: {column.nunique()} unique values")
    print(f"Values: {column.unique()}")

print("\nNumerical Features:")
print(numerical_features)

The Dependents feature, which represents the number of dependents a person has, can be treated either as numerical discrete or categorical. 

If there are many distinct values (e.g., 0 to 10+ dependents), it can be treated as numerical discrete.
If there are few distinct values (e.g., 0 to 4+ dependents), it can be treated as categorical.

The Defaulter variable is a categorical feature since it has two distinct values, 0 and 1, representing whether a user is a defaulter or not. It should be treated as a binary categorical variable, not as a numerical one.

In [None]:
# 'Dependents' and 'Defaulter' are discrete labels, so they are re-typed as categorical.
unique_dependents = df['Dependents'].unique()
print("Unique values in Dependents:", unique_dependents)
df['Dependents'] = df['Dependents'].astype('category')

unique_defaulter = df['Defaulter'].unique()
print("Unique values in Defaulter:", unique_defaulter)
df['Defaulter'] = df['Defaulter'].astype('category')

# 'Number of loans' and 'Loan Tenure' are only inspected here and deliberately
# stay numeric: later cells compute .skew() on them and apply np.log1p /
# np.sqrt / boxcox, all of which fail on a categorical dtype.
unique_num_loans = df['Number of loans'].unique()
print("Unique values in Number of loans:", unique_num_loans)

unique_loan_tenure = df['Loan Tenure'].unique()
print("Unique values in Loan Tenure:", unique_loan_tenure)

# Refresh the categorical column list after the dtype changes
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(categorical_features)

In [None]:
# Columns to plot, laid out on a grid with three panels per row
numerical_features = ['Amount', 'Interest Rate', 'Loan Tenure', 'Total Income', 'Delinq_2yrs', 'Total Payment', 'Received Principal', 'Interest Received']

num_features = len(numerical_features)
cols = 3
rows = math.ceil(num_features / cols)

fig, axs = plt.subplots(rows, cols, figsize=(18, 6 * rows))
axs = axs.flatten()

# One histogram (with KDE overlay) per feature
for ax, feature in zip(axs, numerical_features):
    label = feature.capitalize()
    sns.histplot(df[feature], bins=20, kde=True, ax=ax)
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')

# Remove the grid panels that were not used
for unused_ax in axs[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## Shape Analysis (Skewness)

In [None]:
# Feature list for the skewness check (the histogram list plus 'Number of loans')
numerical_features = ['Amount', 'Interest Rate', 'Loan Tenure', 'Total Income', 'Delinq_2yrs', 'Total Payment', 'Received Principal', 'Interest Received', 'Number of loans']

# Sample skewness of each listed column
skewness = df[numerical_features].skew()
print("Skewness of features:\n", skewness)

Highly skewed features: Amount, Total Income, Delinq_2yrs, Interest Received, and Number of loans.

Moderately skewed features: Loan Tenure, Total Payment, and Received Principal.

In [None]:
# Apply log transformations
df['Amount'] = np.log1p(df['Amount'])  # First log1p transformation for Amount
df['Total Income'] = np.log1p(df['Total Income'])
df['Interest Received'] = np.log1p(df['Interest Received'])
df['Number of loans'] = np.log1p(df['Number of loans'])
df['Delinq_2yrs'] = np.log1p(df['Delinq_2yrs'])

# Apply square root transformations for moderately skewed features
df['Loan Tenure'] = np.sqrt(df['Loan Tenure'])
df['Total Payment'] = np.sqrt(df['Total Payment'])
df['Received Principal'] = np.sqrt(df['Received Principal'])

# Apply cube root transformations where needed
df['Number of loans'] = np.cbrt(df['Number of loans'])

# Apply reflection transformation (log with reflection) for negative skewness
df['Interest Received'] = np.log(np.max(df['Interest Received']) - df['Interest Received'] + 1)
df['Amount'] = np.log(np.max(df['Amount']) - df['Amount'] + 1)

# Apply Box-Cox transformation for highly skewed features
df['Number of loans'], _ = boxcox(df['Number of loans'] + 1)  # Adding 1 to ensure values are > 0
df['Delinq_2yrs'], _ = boxcox(df['Delinq_2yrs'] + 1)
df['Loan Tenure'], _ = boxcox(df['Loan Tenure'] + 1)

In [None]:
# Recompute the skewness of the same feature list to compare with the values printed before the transformations
skewness = df[numerical_features].skew()
print("Skewness of features:\n", skewness)

In [None]:
# Start from the dtypes: object columns vs. int64/float64 columns
object_columns = df.select_dtypes(include=['object']).columns.tolist()
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numeric columns with exactly two distinct values are treated as categorical
binary_categorical_columns = [name for name in numerical_columns if df[name].nunique() == 2]

# Categorical list = object columns + binary numeric columns + 'Dependents'
categorical_columns = object_columns + binary_categorical_columns + ['Dependents']

# Everything moved to the categorical side (and 'Work Experience Encoded', if present)
# is taken out of the numerical list
excluded_from_numerical = binary_categorical_columns + ['Work Experience Encoded', 'Dependents']
numerical_columns = [name for name in numerical_columns if name not in excluded_from_numerical]

# Distinct values of every categorical column
unique_values_categorical = {name: df[name].unique() for name in categorical_columns}

print("Categorical Features and Unique Values:")
for col, unique_values in unique_values_categorical.items():
    print(f"{col}: {unique_values}")

print("\nNumerical Features:")
print(numerical_columns)

In [None]:
# Null count per column; print only the columns that have any
null_values = df.isnull().sum()
print(null_values[null_values > 0])

In [None]:
# Flag rows whose 'Amount' is null, then fill those gaps with the column median.
# NOTE(review): 'Amount' was already median-imputed in the earlier imputation
# cell, so this indicator is expected to be all zeros here - confirm whether it
# should be computed before that imputation instead.
df['Amount Missing'] = df['Amount'].isnull().astype(int)

# Assign the filled column back: df['Amount'].fillna(..., inplace=True) is
# chained in-place mutation, which leaves `df` unchanged under pandas Copy-on-Write.
df['Amount'] = df['Amount'].fillna(df['Amount'].median())

In [None]:
# Rows without a 'Work Experience' label cannot be encoded; .copy() gives an
# independent frame so the column assignment below does not write into a view.
df = df.dropna(subset=['Work Experience']).copy()

# Mean target encoding done with pandas. The original cell called
# `ce.TargetEncoder`, but `ce` (category_encoders) is never imported in this
# notebook, so it raised NameError on a fresh kernel.
# Each 'Work Experience' label is replaced by the mean 'Defaulter' value of its
# group. This is the plain group mean, without the smoothing towards the global
# mean that category_encoders applies, so values can differ slightly for small groups.
# 'Defaulter' may have been cast to a categorical dtype earlier, hence astype(int).
# NOTE(review): the encoding uses the target of every row, including rows that
# later end up in the test split - consider fitting it on training rows only.
defaulter_numeric = df['Defaulter'].astype(int)
df['Work Experience Encoded'] = defaulter_numeric.groupby(df['Work Experience']).transform('mean')

In [None]:
# Placeholder label for missing values in the nominal columns
replace_with = 'Unknown'
columns_to_replace = ['Social Profile', 'Is_verified', 'Married', 'Employment type']
df[columns_to_replace] = df[columns_to_replace].fillna(replace_with)

# 'Tier of Employment' gets its own placeholder tier 'X'.
# Assigned back instead of fillna(..., inplace=True) on the selected column,
# which is chained in-place mutation and leaves `df` unchanged under pandas Copy-on-Write.
df['Tier of Employment'] = df['Tier of Employment'].fillna('X')

In [None]:
# Confirm which columns (if any) still contain nulls after the fills above
null_values = df.isnull().sum()
print(null_values[null_values > 0])

In [None]:
# Start from the dtypes: object columns vs. int64/float64 columns
object_columns = df.select_dtypes(include=['object']).columns.tolist()
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numeric columns with exactly two distinct values are treated as categorical
binary_categorical_columns = [name for name in numerical_columns if df[name].nunique() == 2]

# These columns are moved from the numerical to the categorical side
moved_to_categorical = binary_categorical_columns + ['Work Experience Encoded', 'Dependents']
categorical_columns = object_columns + moved_to_categorical
numerical_columns = [name for name in numerical_columns if name not in moved_to_categorical]

# Distinct values of every categorical column
unique_values_categorical = {name: df[name].unique() for name in categorical_columns}

print("Categorical Features and Unique Values:")
for col, unique_values in unique_values_categorical.items():
    print(f"{col}: {unique_values}")

print("\nNumerical Features:")
print(numerical_columns)

In [None]:
# Preview the frame, now including the columns added in the preceding cells
df.head()

In [None]:
# Rebuild the categorical / numerical column lists from the current dtypes
categorical_columns = list(df.select_dtypes(include=['object']).columns)
numerical_columns = list(df.select_dtypes(include=['int64', 'float64']).columns)

# Two-valued numeric columns, the encoded work experience and 'Dependents'
# count as categorical and are left out of the skewness analysis
binary_categorical_columns = [name for name in numerical_columns if df[name].nunique() == 2]
moved_to_categorical = binary_categorical_columns + ['Work Experience Encoded', 'Dependents']
categorical_columns.extend(moved_to_categorical)
numerical_columns = [name for name in numerical_columns if name not in moved_to_categorical]

# Skewness of every remaining numerical column
skewness_metrics = df[numerical_columns].skew()

print("Skewness Metrics for Numerical Columns:")
print(skewness_metrics)

# Per feature: histogram with KDE overlay on the left, standalone KDE on the right
for feature in numerical_columns:
    print(f"\n{feature} (skewness: {skewness_metrics[feature]:.2f})")

    fig, axes = plt.subplots(1, 2, figsize=(18, 6))
    hist_ax, kde_ax = axes

    sns.histplot(df[feature], kde=True, color='blue', ax=hist_ax)
    hist_ax.set_title(f'{feature} - Histogram')

    sns.kdeplot(df[feature], color='blue', ax=kde_ax)
    kde_ax.set_title(f'{feature} - KDE')

    plt.tight_layout()
    plt.show()

In [None]:
# Apply log(1 + x) and square-root transformations, then report the new skewness.
# NOTE(review): these columns were already transformed by the earlier
# "Apply log transformations" cell, so this cell transforms them a second time -
# confirm that stacking the transformations is intended.
log_columns = ['Amount', 'Total Income', 'Delinq_2yrs', 'Interest Received', 'Number of loans']
sqrt_columns = ['Total Payment', 'Received Principal']

# log1p handles zeros (log(0 + 1) = 0)
for column in log_columns:
    df[column] = np.log1p(df[column])

for column in sqrt_columns:
    df[column] = np.sqrt(df[column])

# Recalculate skewness metrics for the transformed columns
transformed_columns = log_columns + sqrt_columns
transformed_skewness_metrics = df[transformed_columns].skew()

print("Skewness Metrics for Transformed Columns:")
print(transformed_skewness_metrics)

for feature in transformed_columns:
    # Report the skewness AFTER the transformation. The original looked the
    # value up in `skewness_metrics`, i.e. the stale pre-transformation series
    # from the previous cell, which also need not contain every column listed here.
    print(f"\n{feature} (skewness: {transformed_skewness_metrics[feature]:.2f})")

    # Histogram with KDE overlay on the left, standalone KDE on the right
    fig, axes = plt.subplots(1, 2, figsize=(18, 6))

    sns.histplot(df[feature], kde=True, color='blue', ax=axes[0])
    axes[0].set_title(f'{feature} - Histogram')

    sns.kdeplot(df[feature], color='blue', ax=axes[1])
    axes[1].set_title(f'{feature} - KDE')

    plt.tight_layout()
    plt.show()

In [None]:
def detect_outliers_iqr(df, column):
    """Report, plot and return the rows of ``df`` that are IQR outliers in ``column``.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. The outlying values are printed and a boxplot of the
    whole column is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column : str
        Name of the numeric column to test.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all columns) whose ``column`` value is outside the fences.
    """
    values = df[column]

    # Quartiles and the 1.5 * IQR fences
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    # Rows strictly outside either fence
    is_outlier = (values < low_fence) | (values > high_fence)
    outliers = df[is_outlier]

    print(f"Outliers in '{column}':")
    print(outliers[[column]])

    # Boxplot of the full column for visual confirmation
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=values)
    plt.title(f"Outliers in {column}")
    plt.show()

    return outliers

# Function to cap values at an upper quantile (95th percentile by default)
def cap_values(df, column, quantile=0.95):
    """Add a ``'<column>_capped'`` column whose values are clipped at an upper quantile.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    column : str
        Name of the numeric column to cap.
    quantile : float, default 0.95
        Quantile of ``column`` used as the cap. Values above it are replaced
        by the cap; all other values (including NaN) are kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new ``'<column>_capped'`` column.
    """
    cap_value = df[column].quantile(quantile)
    df[f'{column}_capped'] = np.where(df[column] > cap_value, cap_value, df[column])
    return df

# Function to print and return the skewness of selected columns
def check_skewness(df, columns):
    """Print and return the sample skewness of ``columns`` in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    columns : list of str
        Numeric columns to evaluate.

    Returns
    -------
    pandas.Series
        Skewness per column, indexed by column name.
    """
    result = df[columns].skew()
    print("Skewness Metrics:", result, sep="\n")
    return result

# Function to apply log and Box-Cox transformations and visualize the results
def apply_log_and_boxcox(df, column):
    """Add log- and Box-Cox-transformed copies of ``column`` and plot both.

    Two columns are added to ``df`` in place: ``'<column>_log'`` holding
    ``log(1 + x)`` and ``'<column>_boxcox'`` holding the Box-Cox transform of
    ``x + 1e-6``. The skewness of each new column is printed and a histogram
    with KDE overlay is shown for each.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    column : str
        Name of the non-negative numeric column to transform.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two new columns.
    """
    log_column = f'{column}_log'
    boxcox_column = f'{column}_boxcox'

    # Log transformation: log(x + 1) so that zeros are handled
    df[log_column] = np.log1p(df[column])
    log_skewness = df[log_column].skew()
    print(f"Skewness after log transformation for {column}: {log_skewness}")

    # Visualize log-transformed data
    plt.figure(figsize=(10, 5))
    sns.histplot(df[log_column], kde=True, color='blue')
    plt.title(f'{column}_log - Histogram and KDE after Log Transformation')
    plt.show()

    # Box-Cox transformation. Box-Cox needs strictly positive input, hence the
    # small constant. `boxcox` is the function imported from scipy.stats at the
    # top of the notebook; the original referenced `stats.boxcox`, but `stats`
    # is only imported in a later cell. The fitted lambda is not used.
    df[boxcox_column], _fitted_lambda = boxcox(df[column] + 1e-6)
    boxcox_skewness = pd.Series(df[boxcox_column]).skew()
    print(f"Skewness after Box-Cox transformation for {column}: {boxcox_skewness}")

    # Visualize Box-Cox transformed data
    plt.figure(figsize=(10, 5))
    sns.histplot(df[boxcox_column], kde=True, color='blue')
    plt.title(f'{column}_boxcox - Histogram and KDE after Box-Cox Transformation')
    plt.show()

    return df

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Columns that go through outlier detection and capping
columns_to_cap = ['Number of loans', 'Delinq_2yrs']

# Step 1: IQR outlier report (printout + boxplot) for each column
outlier_frames = {name: detect_outliers_iqr(df, name) for name in columns_to_cap}
outliers_number_of_loans = outlier_frames['Number of loans']
outliers_delinq_2yrs = outlier_frames['Delinq_2yrs']

# Step 2: add a '<column>_capped' copy clipped at the 95th percentile
for name in columns_to_cap:
    df = cap_values(df, name)

# Step 3: skewness of the capped copies
capped_columns = ['Number of loans_capped', 'Delinq_2yrs_capped']
check_skewness(df, capped_columns)

# Distribution of each capped feature
for feature in capped_columns:
    plt.figure(figsize=(10, 5))
    sns.histplot(df[feature], kde=True, color='blue')
    plt.title(f'{feature} - Histogram and KDE after Capping')
    plt.show()

# Step 4: log and Box-Cox variants of the capped delinquency count
df = apply_log_and_boxcox(df, 'Delinq_2yrs_capped')

In [None]:
# Correlations are computed over the float64/int64 columns only
numerical_df = df.select_dtypes(include=['float64', 'int64'])

# Linear (Pearson) and rank-based (Spearman) correlation matrices
pearson_corr_matrix = numerical_df.corr(method='pearson')
spearman_corr_matrix = numerical_df.corr(method='spearman')

print("Pearson Correlation Matrix:", pearson_corr_matrix, sep="\n")
print("\nSpearman Correlation Matrix:", spearman_corr_matrix, sep="\n")

# Side-by-side annotated heatmaps
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
panels = (
    (axes[0], pearson_corr_matrix, 'Pearson Correlation Matrix'),
    (axes[1], spearman_corr_matrix, 'Spearman Correlation Matrix'),
)
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
# 'Defaulter' is used as the target variable from here on
target = df['Defaulter']

# Bar chart of the number of rows per class
plt.figure(figsize=(8, 5))
sns.countplot(x=target)
plt.title('Class Distribution of Defaulter')
plt.show()

# The same counts as numbers
class_distribution = target.value_counts()
print(class_distribution)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Work on a copy so that `df` itself keeps its un-encoded columns
df_encoded = df.copy()

# Features encoded as integer codes vs. features expanded into dummy columns
ordinal_features = ['Tier of Employment', 'Work Experience Encoded', 'Dependents']
nominal_features = ['Loan Category', 'Employment type', 'Gender', 'Married', 'Home', 'Social Profile', 'Is_verified']

# Integer codes via LabelEncoder; values are cast to str first so that NaNs
# become an ordinary label. One fitted encoder is kept per feature.
label_encoders = {feature: LabelEncoder() for feature in ordinal_features}
for feature, le in label_encoders.items():
    df_encoded[feature] = le.fit_transform(df_encoded[feature].astype(str))

# The raw 'Work Experience' labels get their own encoder
le_work_exp = LabelEncoder()
df_encoded['Work Experience'] = le_work_exp.fit_transform(df_encoded['Work Experience'].astype(str))

# Dummy (one-hot) columns via pandas.get_dummies, dropping the first level of each feature
df_encoded = pd.get_dummies(df_encoded, columns=nominal_features, drop_first=True)

print("Encoded Dataframe Head:")
print(df_encoded.head())

In [None]:
from imblearn.combine import SMOTETomek

# Target column and feature matrix (everything except the target)
y = df_encoded['Defaulter']
X = df_encoded.drop(columns=['Defaulter'])

# Combined over-sampling (SMOTE) and cleaning (Tomek links) with a fixed seed.
# NOTE(review): resampling data that is split into train/test afterwards lets
# synthetic points derived from would-be test rows reach the training data;
# for evaluation, resample the training portion only.
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

# Class counts after resampling
print("Class distribution after SMOTE-Tomek:")
print(pd.Series(y_resampled).value_counts())

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Split the ORIGINAL (un-resampled) data first, stratified on the target, so the
# test set contains only real rows with the real class balance. The original
# cell split the already-resampled data, which put synthetic rows into the test
# set and let information from test rows shape the synthetic training rows.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes on the training portion only
X_train, y_train = smote_tomek.fit_resample(X_train_raw, y_train_raw)


def fit_and_report(model, model_name):
    """Fit ``model`` on the training split and print train/test metrics.

    Prints the training accuracy, the test classification report and the test
    accuracy. Returns the tuple ``(train_predictions, test_predictions)``.
    """
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    print(f"\n{model_name} (Train) Report:")
    print(f"Train Accuracy: {accuracy_score(y_train, train_pred):.4f}")

    print(f"{model_name} (Test) Report:")
    print(classification_report(y_test, test_pred))
    print(f"Test Accuracy: {accuracy_score(y_test, test_pred):.4f}")
    return train_pred, test_pred


# 1. Regularized Decision Tree Classifier
dt_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,                 # Limiting the maximum depth
    min_samples_split=10,         # Minimum samples required to split an internal node
    min_samples_leaf=5            # Minimum samples required to be at a leaf node
)
y_train_pred_dt, y_test_pred_dt = fit_and_report(dt_model, "Decision Tree Classifier")

# 2. Regularized Random Forest Classifier
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,             # Number of trees
    max_depth=10,                 # Limiting the maximum depth of the trees
    min_samples_split=10,         # Minimum samples required to split an internal node
    min_samples_leaf=5            # Minimum samples required to be at a leaf node
)
y_train_pred_rf, y_test_pred_rf = fit_and_report(rf_model, "Random Forest Classifier")

# 3. Regularized XGBoost Classifier
xgb_model = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
    max_depth=10,                 # Limiting the maximum depth of the trees
    learning_rate=0.1,            # Step size shrinkage used to prevent overfitting
    colsample_bytree=0.8          # Subsample ratio of columns when constructing each tree
)
y_train_pred_xgb, y_test_pred_xgb = fit_and_report(xgb_model, "XGBoost Classifier")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Fresh, unfitted estimators for cross-validation
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_split=10)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10, min_samples_split=10)
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss', max_depth=10, learning_rate=0.1)

candidates = [
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]

# 5-fold cross-validation on the training split, one model after another
cv_scores = {}
for label, estimator in candidates:
    print(f"Cross-validation for {label} Classifier:")
    fold_scores = cross_val_score(estimator, X_train, y_train, cv=5)
    cv_scores[label] = fold_scores
    print(f"Cross-validation accuracy for {label}: {fold_scores}")
    print(f"Mean CV accuracy for {label}: {fold_scores.mean():.4f}\n")

dt_cv_scores = cv_scores["Decision Tree"]
rf_cv_scores = cv_scores["Random Forest"]
xgb_cv_scores = cv_scores["XGBoost"]

# Refit every model on the full training split before scoring the test set
for label, estimator in candidates:
    estimator.fit(X_train, y_train)

# Test-set accuracy per model
y_test_pred_dt = dt_model.predict(X_test)
print(f"\nDecision Tree Test Accuracy: {accuracy_score(y_test, y_test_pred_dt):.4f}")

y_test_pred_rf = rf_model.predict(X_test)
print(f"Random Forest Test Accuracy: {accuracy_score(y_test, y_test_pred_rf):.4f}")

y_test_pred_xgb = xgb_model.predict(X_test)
print(f"XGBoost Test Accuracy: {accuracy_score(y_test, y_test_pred_xgb):.4f}")

In [None]:
def plot_feature_importance(model, feature_names, model_name):
    """Draw a bar chart of a fitted model's feature importances, largest first.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str
        Feature names, indexable by position and aligned with the importances.
    model_name : str
        Label appended to the chart title.
    """
    scores = model.feature_importances_

    # Feature positions ordered from most to least important
    ranking = np.argsort(scores)[::-1]
    positions = range(len(ranking))
    ordered_labels = [feature_names[idx] for idx in ranking]

    plt.figure(figsize=(10, 6))
    plt.title(f"Feature Importance - {model_name}")
    plt.bar(positions, scores[ranking], align="center")
    plt.xticks(positions, ordered_labels, rotation=90)
    plt.tight_layout()
    plt.show()

# Feature-importance chart for each fitted tree ensemble
for label, fitted_model in (("Random Forest", rf_model), ("XGBoost", xgb_model)):
    print(f"{label} Feature Importance:")
    plot_feature_importance(fitted_model, X_train.columns, label)