# Credit risk for German banks

#### EL = PD * LGD * ED
- EL: Expected Loss
- PD: Probability of Default
- LGD: Loss given default
- ED: Exposure at Default

In [45]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_recall_fscore_support
import xgboost as xgb
import multiprocessing

In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

## 1. Data loading and encoding

In [4]:
df = pd.read_csv('german.data', header=None, delim_whitespace=True)
print(df.shape)

(1000, 21)


In [5]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,2


In [8]:
df.columns=['account-balance', 'duration', 'credit-history', 'purpose', 'credit-amount', 'saving', 'employed-time', 'installment-rate', 'status', 'guarantors', 'residence-time', 'value-assets', 'age', 'installment-plans', 'housing', 'current-credits', 'job', 'dependents', 'telephone', 'foreign-worker', 'creditability']

Encode the categorical features

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   account-balance    1000 non-null   object
 1   duration           1000 non-null   int64 
 2   credit-history     1000 non-null   object
 3   purpose            1000 non-null   object
 4   credit-amount      1000 non-null   int64 
 5   saving             1000 non-null   object
 6   employed-time      1000 non-null   object
 7   installment-rate   1000 non-null   int64 
 8   status             1000 non-null   object
 9   guarantors         1000 non-null   object
 10  residence-time     1000 non-null   int64 
 11  value-assets       1000 non-null   object
 12  age                1000 non-null   int64 
 13  installment-plans  1000 non-null   object
 14  housing            1000 non-null   object
 15  current-credits    1000 non-null   int64 
 16  job                1000 non-null   object
 

In [11]:
categorical_columns=['account-balance', 'credit-history', 'purpose', 'saving', 'employed-time', 'status', 'guarantors', 'value-assets', 'installment-plans', 'housing', 'job', 'telephone', 'foreign-worker']

In [17]:
le = LabelEncoder()
for column in categorical_columns:
    df[column] = le.fit_transform(df[column])

In [18]:
df.head()

Unnamed: 0,account-balance,duration,credit-history,purpose,credit-amount,saving,employed-time,installment-rate,status,guarantors,residence-time,value-assets,age,installment-plans,housing,current-credits,job,dependents,telephone,foreign-worker,creditability
0,0,6,4,4,1169,4,4,4,2,0,4,0,67,2,1,2,2,1,1,0,1
1,1,48,2,4,5951,0,2,2,1,0,2,0,22,2,1,1,2,1,0,0,2
2,3,12,4,7,2096,0,3,2,2,0,3,0,49,2,1,1,1,2,0,0,1
3,0,42,2,3,7882,0,3,2,2,2,4,1,45,2,2,1,2,2,0,0,1
4,0,24,3,0,4870,0,2,3,2,0,4,3,53,2,2,2,2,2,0,0,2


## 2. Exploratory Data Analysis

In [20]:
num_records = len(df)
num_records

1000

In [22]:
df.describe()

Unnamed: 0,account-balance,duration,credit-history,purpose,credit-amount,saving,employed-time,installment-rate,status,guarantors,residence-time,value-assets,age,installment-plans,housing,current-credits,job,dependents,telephone,foreign-worker,creditability
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.577,20.903,2.545,3.277,3271.258,1.105,2.384,2.973,1.682,0.145,2.845,1.358,35.546,1.675,0.929,1.407,1.904,1.155,0.404,0.037,1.3
std,1.257638,12.058814,1.08312,2.739302,2822.736876,1.580023,1.208306,1.118715,0.70808,0.477706,1.103718,1.050209,11.375469,0.705601,0.531264,0.577654,0.653614,0.362086,0.490943,0.188856,0.458487
min,0.0,4.0,0.0,0.0,250.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,19.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
25%,0.0,12.0,2.0,1.0,1365.5,0.0,2.0,2.0,1.0,0.0,2.0,0.0,27.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0
50%,1.0,18.0,2.0,3.0,2319.5,0.0,2.0,3.0,2.0,0.0,3.0,1.0,33.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0
75%,3.0,24.0,4.0,4.0,3972.25,2.0,4.0,4.0,2.0,0.0,4.0,2.0,42.0,2.0,1.0,2.0,2.0,1.0,1.0,0.0,2.0
max,3.0,72.0,4.0,9.0,18424.0,4.0,4.0,4.0,3.0,2.0,4.0,3.0,75.0,2.0,2.0,4.0,3.0,2.0,1.0,1.0,2.0


Check for data class imbalance

In [24]:
df['creditability'].value_counts(normalize=True)

1    0.7
2    0.3
Name: creditability, dtype: float64

For class data imbalance, I will use two different approach:
- Upsampling for minor class + Logistics regression
- Tree-based statiscal learning algorithm

Check feature values imbalance in columns with categorical values.

In [25]:
numerical_cols = ['duration', 'credit-amount', 'age']

In [26]:
(pd.DataFrame(
    df.loc[:, ~df.columns.isin(numerical_cols)]
    .melt(var_name='column', value_name='value')
    .groupby(by=['column'])['value'].apply(pd.Series.value_counts, normalize=True))
.sort_values(by=['column', 'value']))

Unnamed: 0_level_0,Unnamed: 1_level_0,value
column,Unnamed: 1_level_1,Unnamed: 2_level_1
account-balance,2,0.063
account-balance,1,0.269
account-balance,0,0.274
account-balance,3,0.394
credit-history,0,0.04
credit-history,1,0.049
credit-history,3,0.088
credit-history,4,0.293
credit-history,2,0.53
creditability,2,0.3


There are a huge data imbalance in the "Guarantors" and "Foreign Worker" features => so I will drop these features from the data.

## 2. Data preprocess

Omit features that aren't related to creditability intuitively

In [30]:
used_features = ['account-balance', 'credit-history', 'saving', 'employed-time', 'status', 'current-credits', 'guarantors', 'installment-plans', 'purpose']

In [47]:
x = df[used_features]
y = df['creditability'] - 1

In [48]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## 3. Modelling

### Logistic Regression

In [49]:
lreg = LogisticRegression()
lreg.fit(x_train, y_train)
y_pred = lreg.predict(x_test)

In [50]:
print(precision_recall_fscore_support(y_test, y_pred, average='micro'))

(0.735, 0.735, 0.735, None)


### Gradient Boosted Trees

In [51]:
xgb_model = xgb.XGBClassifier(n_jobs=multiprocessing.cpu_count() // 2)
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                   'n_estimators': [50, 100, 200]}, verbose=1,
                       n_jobs=2)
clf.fit(x_train, y_train)
y_pred_2 = clf.predict(x_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  45 out of  45 | elapsed:    9.4s finished


In [52]:
print(precision_recall_fscore_support(y_test, y_pred_2, average='micro'))

(0.715, 0.715, 0.715, None)
