### Import the data

In [1]:
#Import libraries
import pandas as pd 

# Read the project dataset from the Excel workbook (the path is relative to the working directory)
data = pd.read_excel("../Data/dataproject2024.xlsx")
# Show (rows, columns) as a quick sanity check on the load
data.shape

(7430, 15)

In [2]:
# Preview the first rows to check the column names and typical values
data.head()

Unnamed: 0,ID,Job tenure,Age,Car price,Funding amount,Down payment,Loan duration,Monthly payment,Credit event,Married,Homeowner,Default (y),Pred_default (y_hat),PD,Group
0,1,34,55,4875,3087,0,36,0.047895,0,1,1,0,0,0.02448,1
1,2,5,29,13000,13000,0,60,0.091667,0,0,0,1,0,0.331661,0
2,3,14,38,17190,14190,0,60,0.088235,0,0,0,0,0,0.187505,0
3,4,16,37,22773,23568,0,48,0.110084,0,1,1,0,0,0.035441,1
4,5,1,61,7700,8526,0,48,0.123404,0,1,0,1,0,0.340883,0


### Split the dataset into features/target variables

In [3]:
# Columns that must not be used as surrogate inputs: the row identifier, the observed and
# predicted default labels, the Group column, and PD itself (PD is the surrogate's target).
non_feature_columns = ['ID', 'Default (y)', 'Pred_default (y_hat)', 'Group', 'PD']

# Everything else is kept as a feature; `data` itself is left untouched.
data_features = data.drop(columns=non_feature_columns)

data_features.head()

Unnamed: 0,Job tenure,Age,Car price,Funding amount,Down payment,Loan duration,Monthly payment,Credit event,Married,Homeowner
0,34,55,4875,3087,0,36,0.047895,0,1,1
1,5,29,13000,13000,0,60,0.091667,0,0,0
2,14,38,17190,14190,0,60,0.088235,0,0,0
3,16,37,22773,23568,0,48,0.110084,0,1,1
4,1,61,7700,8526,0,48,0.123404,0,1,0


In [4]:
# Target for the first surrogate model: the PD column of the dataset
data_target = data['PD']

# Display the (truncated) Series to confirm its length and dtype
data_target

0       0.024480
1       0.331661
2       0.187505
3       0.035441
4       0.340883
          ...   
7425    0.217708
7426    0.063937
7427    0.630863
7428    0.067551
7429    0.150553
Name: PD, Length: 7430, dtype: float64

### Train the surrogate model on PD

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Fit an ordinary least-squares surrogate mapping the raw features to the PD column.
# (`fit` returns the estimator itself, so construction and training can be chained.)
model = LinearRegression().fit(data_features, data_target)

# Predict on the same rows the surrogate was fitted on. No hold-out split is made in this
# notebook, so the metrics below describe in-sample fidelity to PD, not test-set accuracy.
y_pred = model.predict(data_features)

# Fidelity of the surrogate with respect to PD
mse, r2 = mean_squared_error(data_target, y_pred), r2_score(data_target, y_pred)

# Report both metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared: {r2}")

Mean Squared Error (MSE): 0.025167974305639117
R-squared: 0.43464725609093446


In [6]:
# Build a per-feature table of the surrogate's coefficients.
coefficients = pd.DataFrame({'Feature': data_features.columns, 'Coefficient': model.coef_})
coefficients['Abs_Coefficient'] = coefficients['Coefficient'].abs()  # magnitude of the raw coefficient

# A raw coefficient is expressed per unit of its feature, so magnitudes are not comparable
# between columns on different scales (in the rows displayed earlier, Car price and Funding
# amount are in the thousands while Married and Homeowner are 0/1).
# Multiplying by the feature's standard deviation gives the change in the predicted target for
# a one-standard-deviation move in that feature, which can be compared across features.
# The rows are still in data_features column order at this point, so the positional
# multiplication lines each coefficient up with its own feature.
coefficients['Scaled_Coefficient'] = coefficients['Coefficient'] * data_features.std().to_numpy()

# Keep the table ordered by raw absolute coefficient, as before.
coefficients = coefficients.sort_values(by='Abs_Coefficient', ascending=False)

# Print the raw coefficients (same table as before)
print(coefficients[['Feature', 'Coefficient']])

# Print the scale-adjusted ranking, ordered by |coefficient * std(feature)|
scaled_order = coefficients['Scaled_Coefficient'].abs().sort_values(ascending=False).index
print(coefficients.loc[scaled_order, ['Feature', 'Scaled_Coefficient']])

           Feature  Coefficient
6  Monthly payment     0.301764
7     Credit event     0.300786
4     Down payment     0.077610
8          Married    -0.072643
9        Homeowner    -0.065734
0       Job tenure    -0.003022
1              Age    -0.001167
5    Loan duration     0.001120
3   Funding amount     0.000025
2        Car price    -0.000021


Most Important Features:

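- Monthly payment and Credit event have the largest raw coefficients, and both are positive: higher values raise the surrogate's predicted probability of default. Note that these coefficients are unstandardized, so their magnitudes are not directly comparable across features measured on different scales (e.g. Funding amount is in the thousands, while Married is 0/1).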

- Down payment also contributes positively but to a lesser extent.

- Being married and being a homeowner both have negative coefficients, i.e. they are associated with a lower predicted probability of default.


### Now train a new surrogate model on the PD output of our own CatBoost model

In [7]:
# Import the results from our own blackbox model
data_blackbox = pd.read_csv('../Data/catboost_output')

# The surrogate is fitted by pairing the rows of data_features with the rows of this file by
# position, so the two tables must at least have the same number of rows. This does not prove
# the row ORDER matches; it only catches a truncated or differently filtered export.
assert len(data_blackbox) == len(data_features), (
    f"catboost_output has {len(data_blackbox)} rows but data_features has {len(data_features)}"
)

# Only keep the predicted default probabilities
# NOTE(review): the values displayed for this column are all well above 0.5, and the surrogate
# fitted on it gets coefficients of the opposite sign to the PD surrogate for every feature.
# Confirm that 'pred_default' really is P(default) and not the probability of the other class.
output_blackbox = data_blackbox['pred_default']
output_blackbox

0       0.854820
1       0.690417
2       0.803889
3       0.842806
4       0.655357
          ...   
7425    0.639712
7426    0.668494
7427    0.624802
7428    0.738861
7429    0.724935
Name: pred_default, Length: 7430, dtype: float64

In [8]:
# Fit a second ordinary least-squares surrogate, this time against the CatBoost output.
# (`fit` returns the estimator itself, so construction and training can be chained.)
model_own = LinearRegression().fit(data_features, output_blackbox)

# Predict on the same rows the surrogate was fitted on. No hold-out split is made in this
# notebook, so the metrics below describe in-sample fidelity to the blackbox output.
y_pred_own = model_own.predict(data_features)

# Fidelity of the surrogate with respect to the blackbox output
mse_own, r2_own = (
    mean_squared_error(output_blackbox, y_pred_own),
    r2_score(output_blackbox, y_pred_own),
)

# Report both metrics
print(f"Mean Squared Error (MSE): {mse_own}")
print(f"R-squared: {r2_own}")

Mean Squared Error (MSE): 0.0031279016417306584
R-squared: 0.6969325267192541


In [9]:
# Build a per-feature table of the second surrogate's coefficients.
coefficients_own = pd.DataFrame({'Feature': data_features.columns, 'Coefficient': model_own.coef_})
coefficients_own['Abs_Coefficient'] = coefficients_own['Coefficient'].abs()  # magnitude of the raw coefficient

# A raw coefficient is expressed per unit of its feature, so magnitudes are not comparable
# between columns on different scales (in the rows displayed earlier, Car price and Funding
# amount are in the thousands while Married and Homeowner are 0/1).
# Multiplying by the feature's standard deviation gives the change in the predicted target for
# a one-standard-deviation move in that feature, which can be compared across features.
# The rows are still in data_features column order at this point, so the positional
# multiplication lines each coefficient up with its own feature.
coefficients_own['Scaled_Coefficient'] = coefficients_own['Coefficient'] * data_features.std().to_numpy()

# Keep the table ordered by raw absolute coefficient, as before.
coefficients_own = coefficients_own.sort_values(by='Abs_Coefficient', ascending=False)

# Print the raw coefficients (same table as before)
print(coefficients_own[['Feature', 'Coefficient']])

# Print the scale-adjusted ranking, ordered by |coefficient * std(feature)|
scaled_order_own = coefficients_own['Scaled_Coefficient'].abs().sort_values(ascending=False).index
print(coefficients_own.loc[scaled_order_own, ['Feature', 'Scaled_Coefficient']])

           Feature  Coefficient
6  Monthly payment    -0.304869
7     Credit event    -0.117320
8          Married     0.059190
9        Homeowner     0.056110
4     Down payment    -0.029828
0       Job tenure     0.002220
1              Age     0.000999
5    Loan duration    -0.000822
3   Funding amount    -0.000007
2        Car price     0.000004
