# 4. STRESS TESTING

In this part we perform the Stress Testing, variable *test* includes the Static Base Case, *test2* is a Static Adverse Case and *testD* is the dataset with Dynamic Adverse Case. 

In [1]:
import pandas as pd
import numpy  as np
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
from utils import get_train_test, load_statistical_clusters, run_model, stress_test

## From now on we use the dataset with imputed missing values

In [4]:
df = pd.read_csv('imputed_MICE_forrest.csv').iloc[:, 1:]

## Merge the imputed dataset with info columns (first 8 columns from the orig dataset)
* note: the number of observations in both df is the same and at the exact same order, so we can concat 1:1 withou using any merging key

In [5]:
data = pd.read_pickle('Erasmus_data_stresstesting_2024.pickle').iloc[:, 2:20]

countries_of_interest = ['FI', 'NL', 'AT', 'BE', 'SE', 'DE', 'NO', 'DK', 'IS', 'IT', 'ES', 'PT', 'LV', 'RO', 'HR', 'LT', 'BG', 'SK', 'CZ', 'SI', 'HU', 'PL']

subset_df = data[data['country_code'].isin(countries_of_interest)]

In [6]:
info = subset_df.iloc[:, :7]
info = info.reset_index().iloc[:, 1:]

In [7]:
df = pd.concat([info, df], axis = 1)

## Now we obtain the macro and micro varibales and merge it with our latest df
* note: this time macros are merged based on key:*country code* and WoE_country and WoE_industry on keys:*country_code* and *industry_code*

In [8]:
df2 = pd.read_csv('full_data_woe_rid_-2.csv').iloc[:, 2:]
macro = df2.iloc[:, 61:]

In [9]:
macro = pd.concat([df2[['country_code', 'status_year']], macro], axis = 1)

In [10]:
woe = df2[['country_code', 'industry_code', 'WoE_country', 'WoE_industry']]

In [11]:
macro_uni = macro.drop_duplicates()

In [12]:
woe_uni = woe.drop_duplicates()

In [13]:
df_new = pd.merge(df, woe_uni, on = ['country_code', 'industry_code'])
df_new = pd.merge(df_new, macro_uni, on = ['country_code', 'status_year'])

## Variable *df_new* is final dataframe with info columns, firm characteristics, WoE columns and macro variables

In [14]:
df_new

Unnamed: 0,country_code,industry_code,size_class,status_year,status_date_latest,status_latest,default_indicator,intangible_fixed_assets_0,intangible_fixed_assets_1,tangible_fixed_assets_0,...,EURxTRY,hh_debt,corp_debt,govt_debt,3m_yield,10y_yield,oil,gas,gold,copper
0,DE,G,SME,2020,16991231,non_default,0,2905.0,9624.000000,65007.0,...,9.07982,57.055937,119.453344,67.986011,-0.425150,-0.511024,32.84,3.4902,1898.36,6352.5562
1,DE,G,SME,2020,16991231,non_default,0,1.0,1.000000,266426.0,...,9.07982,57.055937,119.453344,67.986011,-0.425150,-0.511024,32.84,3.4902,1898.36,6352.5562
2,DE,G,SME,2020,16991231,non_default,0,310.0,0.000000,596041.0,...,9.07982,57.055937,119.453344,67.986011,-0.425150,-0.511024,32.84,3.4902,1898.36,6352.5562
3,DE,G,SME,2020,16991231,non_default,0,3269.0,3714.000000,167798.0,...,9.07982,57.055937,119.453344,67.986011,-0.425150,-0.511024,32.84,3.4902,1898.36,6352.5562
4,DE,G,SME,2020,16991231,non_default,0,68.0,262.000000,1053093.0,...,9.07982,57.055937,119.453344,67.986011,-0.425150,-0.511024,32.84,3.4902,1898.36,6352.5562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075921,LV,L,SME,2004,16991231,non_default,0,66871.0,90739.000000,1936418.0,...,1.82091,19.996895,88.332385,13.744445,4.231875,3.875000,234.51,692.6744,438.45,2323.6943
1075922,LV,J,SME,2004,16991231,non_default,0,19037.0,47858.105616,203540.0,...,1.82091,19.996895,88.332385,13.744445,4.231875,3.875000,234.51,692.6744,438.45,2323.6943
1075923,LV,R,SME,2004,16991231,non_default,0,5228.0,47858.105616,2498256.0,...,1.82091,19.996895,88.332385,13.744445,4.231875,3.875000,234.51,692.6744,438.45,2323.6943
1075924,LV,A,SME,2001,16991231,non_default,0,0.0,47858.105616,2271159.0,...,1.29230,7.683075,83.443015,17.459588,6.861667,5.408333,114.44,990.0938,278.95,1678.1713


## In this step we assable the clusters based on statistical anlysis done in FCS_code2.5_Clustering.ipynb

In [15]:
clusters = load_statistical_clusters(df_new)

# Load the Stress Test Scenarios (used as test sets for our PD models)

* *test* : Static Base Case
* *test2* : Static Adverse Case
* *testD* : Dynamic Adverse Case

In [16]:
test = pd.read_csv('full_stress_data_base.csv').iloc[:, 1:]
test2 = pd.read_csv('full_stress_data-2.csv').iloc[:, 1:]
testD = pd.read_csv('DynamicLogitStress.csv').iloc[:, 2:].drop(['size_class'], axis = 1)

# STRESS TEST FULL DATA

In [17]:
y = df_new['default_indicator']
X = df_new.iloc[:, 7:]

In [18]:
X

Unnamed: 0,intangible_fixed_assets_0,intangible_fixed_assets_1,tangible_fixed_assets_0,tangible_fixed_assets_1,other_fixed_assets_0,other_fixed_assets_1,stock_0,stock_1,debtors_0,debtors_1,...,EURxTRY,hh_debt,corp_debt,govt_debt,3m_yield,10y_yield,oil,gas,gold,copper
0,2905.0,9624.000000,65007.0,8.164000e+04,5000.0,5.000000e+03,891934.0,9.047550e+05,0.0,0.000000e+00,...,9.07982,57.055937,119.453344,67.986011,-0.425150,-0.511024,32.84,3.4902,1898.36,6352.5562
1,1.0,1.000000,266426.0,2.806030e+05,152155.0,0.000000e+00,266255.0,1.777810e+05,0.0,0.000000e+00,...,9.07982,57.055937,119.453344,67.986011,-0.425150,-0.511024,32.84,3.4902,1898.36,6352.5562
2,310.0,0.000000,596041.0,6.018440e+05,0.0,0.000000e+00,490006.0,4.701660e+05,1532.0,2.605000e+03,...,9.07982,57.055937,119.453344,67.986011,-0.425150,-0.511024,32.84,3.4902,1898.36,6352.5562
3,3269.0,3714.000000,167798.0,8.515900e+04,250.0,2.500000e+02,2387122.0,2.094560e+06,0.0,0.000000e+00,...,9.07982,57.055937,119.453344,67.986011,-0.425150,-0.511024,32.84,3.4902,1898.36,6352.5562
4,68.0,262.000000,1053093.0,9.262990e+05,153.0,1.530000e+02,807575.0,1.633121e+06,0.0,0.000000e+00,...,9.07982,57.055937,119.453344,67.986011,-0.425150,-0.511024,32.84,3.4902,1898.36,6352.5562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075921,66871.0,90739.000000,1936418.0,1.934787e+06,1423.0,6.000500e+04,1631941.0,1.565978e+06,1809790.0,1.208876e+06,...,1.82091,19.996895,88.332385,13.744445,4.231875,3.875000,234.51,692.6744,438.45,2323.6943
1075922,19037.0,47858.105616,203540.0,3.214865e+05,295240.0,1.007275e+05,19873.0,2.859826e+05,891122.0,1.032384e+06,...,1.82091,19.996895,88.332385,13.744445,4.231875,3.875000,234.51,692.6744,438.45,2323.6943
1075923,5228.0,47858.105616,2498256.0,1.759926e+06,278401.0,1.007275e+05,107248.0,2.859826e+05,737776.0,7.252454e+05,...,1.82091,19.996895,88.332385,13.744445,4.231875,3.875000,234.51,692.6744,438.45,2323.6943
1075924,0.0,47858.105616,2271159.0,1.759926e+06,103154.0,1.007275e+05,327246.0,2.859826e+05,631371.0,7.252454e+05,...,1.29230,7.683075,83.443015,17.459588,6.861667,5.408333,114.44,990.0938,278.95,1678.1713


In [23]:
data = get_train_test(X, y, use_SMOTE=True)

In [30]:
m = run_model(data, type = 'RF', n_estimators=50, max_depth=3, verbose = False)
print('FULL DATA')
stress_test(m, test2, X, y)


FULL DATA
Total default rate over 3 years: 1.322
Default Rate in the train data: 0.006265300773473269
Stress Test increased PD by multiple of 2.109279102905004
Number of defaults:
Year 2023 : 768
Year 2024 : 20
Year 2025 : 0


# CLUSTERS

## Base Static Case

In [57]:
#Note, 2 strange cases of Gaussian CLUSTERING are Belgium in cluster 2 and Slovania in cluster 3

country1 = test[test['country_code'].isin(['IT', 'PT', 'ES'])]
country2 = test[test['country_code'].isin(['PL', 'CZ', 'RO','HU','BG','HR','LT','LV','BE','SK','IS'])]
country3 = test[test['country_code'].isin(['NL', 'SE', 'NO','DK','AT','DE','FI','SI'])]

micro1 = country1.iloc[:, 9:60]
micro2 = country2.iloc[:, 9:60]
micro3 = country3.iloc[:, 9:60]

T_Cluster1 = country1[country1['industry_code'].isin(['G', 'C', 'M','J','GX','R','S','K'])]
T_Cluster2 = country1[country1['industry_code'].isin(['I', 'Q', 'A','E','D','P','B'])]
T_Cluster3 = country1[country1['industry_code'].isin(['F', 'H', 'N','L'])]

T_Cluster4 = country2[country2['industry_code'].isin(['F', 'H', 'N', 'L'])]
T_Cluster5 = country2[country2['industry_code'].isin(['M', 'J', 'E','D','S','K'])]
T_Cluster6 = country2[country2['industry_code'].isin(['A', 'Q', 'I','B','P'])]
T_Cluster7 = country2[country2['industry_code'].isin(['G', 'C', 'GX','R'])]

T_Cluster8 = country3[country3['industry_code'].isin(['G', 'C', 'M','J','GX','R','S','K'])]
T_Cluster9 = country3[country3['industry_code'].isin(['I', 'Q', 'A','E','D','P','B'])]
T_Cluster10 = country3[country3['industry_code'].isin(['F', 'H', 'N','L'])]

T_clusters = [T_Cluster1, T_Cluster2, T_Cluster3, T_Cluster4, T_Cluster5, T_Cluster6, T_Cluster7, T_Cluster8, T_Cluster9, T_Cluster10]

In [58]:
for cluster in range(len(clusters)):
    y = clusters[cluster]['default_indicator']
    X = clusters[cluster].iloc[:, 7:].drop(['WoE_country', 'WoE_industry'], axis=1)
    data = get_train_test(X, y, use_SMOTE=True)
    m = run_model(data, type = 'RF', n_estimators=50, max_depth=3, verbose = False)
    print(f'Cluster {cluster + 1}')
    stress_test(m, T_clusters[cluster], X, y)

Cluster 1
Total default rate over 3 years: 2.026
Default Rate in the train data: 0.0072451239492512
Stress Test increased PD by multiple of 2.7966995866507616
Number of defaults:
Year 2023 : 332
Year 2024 : 0
Year 2025 : 0
Cluster 2
Total default rate over 3 years: 0.306
Default Rate in the train data: 0.004523410825739947
Stress Test increased PD by multiple of 0.6764062683874005
Number of defaults:
Year 2023 : 6
Year 2024 : 0
Year 2025 : 0
Cluster 3
Total default rate over 3 years: 0.365
Default Rate in the train data: 0.010856248298393684
Stress Test increased PD by multiple of 0.3362464708907691
Number of defaults:
Year 2023 : 18
Year 2024 : 0
Year 2025 : 0
Cluster 4
Total default rate over 3 years: 0.0
Default Rate in the train data: 0.006169077314147255
Stress Test increased PD by multiple of 0.0
Number of defaults:
Year 2023 : 0
Year 2024 : 0
Year 2025 : 0
Cluster 5
Total default rate over 3 years: 0.0
Default Rate in the train data: 0.0038654214855587854
Stress Test increased P

## Static Adverse Case

In [82]:
#Note, 2 strange cases of Gaussian CLUSTERING are Belgium in cluster 2 and Slovania in cluster 3

country1 = test2[test2['country_code'].isin(['IT', 'PT', 'ES'])]
country2 = test2[test2['country_code'].isin(['PL', 'CZ', 'RO','HU','BG','HR','LT','LV','BE','SK','IS'])]
country3 = test2[test2['country_code'].isin(['NL', 'SE', 'NO','DK','AT','DE','FI','SI'])]


T_Cluster1 = country1[country1['industry_code'].isin(['G', 'C', 'M','J','GX','R','S','K'])]
T_Cluster2 = country1[country1['industry_code'].isin(['I', 'Q', 'A','E','D','P','B'])]
T_Cluster3 = country1[country1['industry_code'].isin(['F', 'H', 'N','L'])]

T_Cluster4 = country2[country2['industry_code'].isin(['F', 'H', 'N', 'L'])]
T_Cluster5 = country2[country2['industry_code'].isin(['M', 'J', 'E','D','S','K'])]
T_Cluster6 = country2[country2['industry_code'].isin(['A', 'Q', 'I','B','P'])]
T_Cluster7 = country2[country2['industry_code'].isin(['G', 'C', 'GX','R'])]

T_Cluster8 = country3[country3['industry_code'].isin(['G', 'C', 'M','J','GX','R','S','K'])]
T_Cluster9 = country3[country3['industry_code'].isin(['I', 'Q', 'A','E','D','P','B'])]
T_Cluster10 = country3[country3['industry_code'].isin(['F', 'H', 'N','L'])]

T_clusters = [T_Cluster1, T_Cluster2, T_Cluster3, T_Cluster4, T_Cluster5, T_Cluster6, T_Cluster7, T_Cluster8, T_Cluster9, T_Cluster10]

In [61]:
for cluster in range(len(clusters)):
    y = clusters[cluster]['default_indicator']
    X = clusters[cluster].iloc[:, 7:].drop(['WoE_country', 'WoE_industry'], axis=1)
    data = get_train_test(X, y, use_SMOTE=True)
    m = run_model(data, type = 'RF', n_estimators=50, max_depth=3)
    print(f'Cluster {cluster + 1}')
    stress_test(m, T_clusters[cluster].drop(['WoE_country', 'WoE_industry'], axis=1), X, y)

Cluster 1
Total default rate over 3 years: 2.923
Default Rate in the train data: 0.0072451239492512
Stress Test increased PD by multiple of 4.034997295197936
Number of defaults:
Year 2023 : 479
Year 2024 : 0
Year 2025 : 0
Cluster 2
Total default rate over 3 years: 1.632
Default Rate in the train data: 0.004523410825739947
Stress Test increased PD by multiple of 3.607500098066136
Number of defaults:
Year 2023 : 32
Year 2024 : 0
Year 2025 : 0
Cluster 3
Total default rate over 3 years: 1.237
Default Rate in the train data: 0.010856248298393684
Stress Test increased PD by multiple of 1.1395019291298285
Number of defaults:
Year 2023 : 61
Year 2024 : 0
Year 2025 : 0
Cluster 4
Total default rate over 3 years: 0.0
Default Rate in the train data: 0.006169077314147255
Stress Test increased PD by multiple of 0.0
Number of defaults:
Year 2023 : 0
Year 2024 : 0
Year 2025 : 0
Cluster 5
Total default rate over 3 years: 0.0
Default Rate in the train data: 0.0038654214855587854
Stress Test increased PD

## Dynamic Adverse Case

In [83]:
#Note, 2 strange cases of Gaussian CLUSTERING are Belgium in cluster 2 and Slovania in cluster 3

country1 = testD[testD['country_code'].isin(['IT', 'PT', 'ES'])]
country2 = testD[testD['country_code'].isin(['PL', 'CZ', 'RO','HU','BG','HR','LT','LV','BE','SK','IS'])]
country3 = testD[testD['country_code'].isin(['NL', 'SE', 'NO','DK','AT','DE','FI','SI'])]

micro1 = country1.iloc[:, 9:60]
micro2 = country2.iloc[:, 9:60]
micro3 = country3.iloc[:, 9:60]

T_Cluster1 = country1[country1['industry_code'].isin(['G', 'C', 'M','J','GX','R','S','K'])]
T_Cluster2 = country1[country1['industry_code'].isin(['I', 'Q', 'A','E','D','P','B'])]
T_Cluster3 = country1[country1['industry_code'].isin(['F', 'H', 'N','L'])]

T_Cluster4 = country2[country2['industry_code'].isin(['F', 'H', 'N', 'L'])]
T_Cluster5 = country2[country2['industry_code'].isin(['M', 'J', 'E','D','S','K'])]
T_Cluster6 = country2[country2['industry_code'].isin(['A', 'Q', 'I','B','P'])]
T_Cluster7 = country2[country2['industry_code'].isin(['G', 'C', 'GX','R'])]

T_Cluster8 = country3[country3['industry_code'].isin(['G', 'C', 'M','J','GX','R','S','K'])]
T_Cluster9 = country3[country3['industry_code'].isin(['I', 'Q', 'A','E','D','P','B'])]
T_Cluster10 = country3[country3['industry_code'].isin(['F', 'H', 'N','L'])]

In [109]:
for cluster in range(len(clusters)):
    y = clusters[cluster]['default_indicator']
    X = clusters[cluster].iloc[:, 7:].drop(['WoE_country', 'WoE_industry'], axis=1)
    data = get_train_test(X, y, use_SMOTE=True)
    m = run_model(data, type = 'RF', n_estimators=35, max_depth=3)
    print(f'Cluster {cluster + 1}')
    stress_test(m, T_clusters[cluster].drop(['WoE_country', 'WoE_industry'], axis=1), X, y)

Cluster 1
Total default rate over 3 years: 3.778
Default Rate in the train data: 0.0072451239492512
Stress Test increased PD by multiple of 5.214328446195245
Number of defaults:
Year 2023 : 619
Year 2024 : 0
Year 2025 : 0
Cluster 2
Total default rate over 3 years: 1.836
Default Rate in the train data: 0.004523410825739947
Stress Test increased PD by multiple of 4.058437610324403
Number of defaults:
Year 2023 : 36
Year 2024 : 0
Year 2025 : 0
Cluster 3
Total default rate over 3 years: 2.028
Default Rate in the train data: 0.010856248298393684
Stress Test increased PD by multiple of 1.8680359493931618
Number of defaults:
Year 2023 : 100
Year 2024 : 0
Year 2025 : 0
Cluster 4
Total default rate over 3 years: 0.0
Default Rate in the train data: 0.006169077314147255
Stress Test increased PD by multiple of 0.0
Number of defaults:
Year 2023 : 0
Year 2024 : 0
Year 2025 : 0
Cluster 5
Total default rate over 3 years: 0.0
Default Rate in the train data: 0.0038654214855587854
Stress Test increased P

In [None]:
well, did it work...? :D

In [None]:
very nice dirtbag hahahhah i can see how happy and peaceful you look studying art without any vectorautoregressions or garchs