In [4]:
# Reset environment
%reset -f

# Load useful libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [5]:
# Load past loans data
df0 = pd.read_csv("PastLoans.csv")
df0

Unnamed: 0,sex,employment,married,income,digital1,digital2,digital3,default
0,M,employed,1,33610,0.705613,0.068107,0.167038,0
1,M,employed,1,20650,0.000000,0.091152,0.000000,0
2,M,employed,1,8504,0.637625,0.000000,0.066207,0
3,M,employed,1,18460,0.276120,0.548380,0.625448,0
4,F,employed,0,8811,0.001974,0.500731,0.000000,0
...,...,...,...,...,...,...,...,...
99995,F,retired,1,24785,0.000000,0.206613,0.000000,0
99996,M,employed,1,23353,0.000000,0.499002,0.367185,0
99997,M,employed,1,6770,0.000000,0.125227,0.000000,0
99998,M,student,0,61856,0.807840,0.827529,0.772643,0


In [6]:
# Load loan applications
df1 = pd.read_csv("NewApplications_Lender1.csv")
df1

Unnamed: 0,id,sex,employment,married,income,digital1
0,0,M,unemployed,1,16514,0.758500
1,1,F,employed,1,8842,0.593872
2,2,F,employed,0,19424,0.626852
3,3,M,unemployed,1,7418,0.000000
4,4,M,retired,0,843,0.219186
...,...,...,...,...,...,...
99995,99995,M,student,0,0,0.034615
99996,99996,M,employed,1,18066,0.000000
99997,99997,M,employed,0,7273,0.441220
99998,99998,M,employed,1,8238,0.169401


In [7]:
# Summary statistics
df0.describe()

Unnamed: 0,married,income,digital1,digital2,digital3,default
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.60983,18534.4494,0.416057,0.415864,0.415967,0.0951
std,0.487791,47862.045438,0.325249,0.324763,0.325209,0.293354
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,5369.75,0.061816,0.062371,0.06104,0.0
50%,1.0,9397.5,0.445255,0.444705,0.445357,0.0
75%,1.0,17492.0,0.69441,0.695056,0.694997,0.0
max,1.0,1000000.0,0.999989,0.999995,0.999998,1.0


In [8]:
# One-hot encode categorical variables
df0 = pd.get_dummies(df0, columns=['sex','employment'], dtype='int')
df0.describe()

Unnamed: 0,married,income,digital1,digital2,digital3,default,sex_F,sex_M,employment_employed,employment_retired,employment_student,employment_unemployed
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.60983,18534.4494,0.416057,0.415864,0.415967,0.0951,0.42669,0.57331,0.58035,0.11554,0.16259,0.14152
std,0.487791,47862.045438,0.325249,0.324763,0.325209,0.293354,0.494599,0.494599,0.493504,0.319674,0.368993,0.348559
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,5369.75,0.061816,0.062371,0.06104,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,9397.5,0.445255,0.444705,0.445357,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,1.0,17492.0,0.69441,0.695056,0.694997,0.0,1.0,1.0,1.0,0.0,0.0,0.0
max,1.0,1000000.0,0.999989,0.999995,0.999998,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
df0['logincome'] = np.log(df0['income']+1)
df0['logincome'].describe()

count    100000.000000
mean          7.859965
std           3.653455
min           0.000000
25%           8.588723
50%           9.148305
75%           9.769556
max          13.815512
Name: logincome, dtype: float64

In [10]:
# Example of a scoring model: linear regression model (OLS)

## Set the dependent variable
y = df0['default']

## Set predictors: log(income), dummy income==0, employment dummies
df0['income_log'] = np.log(df0['income']+1) # add 1 to handle with log(0)
df0['income_zero'] = np.where(df0['income']==0, 1, 0)
X = df0[['income_log', 'income_zero', 'employment_student', 'employment_retired', 'employment_unemployed']]
## always include a constant in OLS
X = sm.add_constant(X)

## Estimate the model
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                default   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.045
Method:                 Least Squares   F-statistic:                     942.9
Date:                Thu, 11 Jul 2024   Prob (F-statistic):               0.00
Time:                        12:46:38   Log-Likelihood:                -16953.
No. Observations:              100000   AIC:                         3.392e+04
Df Residuals:                   99994   BIC:                         3.397e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     0.60

In [11]:
# Use the estimated model to predict the default probability of new loan applications

## Loan new loan applications
df1 = pd.read_csv("NewApplications_Lender1.csv")

## Define the predictors in the scoring model
df1 = pd.get_dummies(df1, columns=['sex','employment'], dtype='int') #one-hot encoding
df1['income_log'] = np.log(df1['income']+1) #log income
df1['income_zero'] = np.where(df1['income']==0, 1, 0) #zero income dummy
X = df1[['income_log', 'income_zero', 'employment_student', 'employment_retired', 'employment_unemployed']]
X = sm.add_constant(X) #constant

## Predict the default probability
df1['default_proba'] = model.predict(X)

In [12]:
# Summary statistics of predicted default probability
df1['default_proba'].describe()

count    100000.000000
mean          0.095359
std           0.062134
min          -0.215311
25%           0.060222
50%           0.095558
75%           0.125127
max           0.516491
Name: default_proba, dtype: float64

In [13]:
# Set the interest rate
df1['rate'] = 0.10 #Uniform interest rate of 10%, which is obviously not optimal! The interest rate should depend on the default probability

In [14]:
# Save the data with the application id and the interest
df1[['id','rate']].to_csv('team10.csv', index=False) #replace 10 with your team number