# 6. Prediction Of Characteristics based on Macro

In this notebook we show the results of the predictions of characteristics based on macros, such that we can dynamically stress the characteristics in stress testing. In the first part of this notebook, we investigate the predictive powers while in the second we create the dataset for dynamic stress test

# First Part

# Data

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import sklearn.pipeline
from sklearn.cluster import KMeans
import numpy as np
import os

In [2]:
df = pd.read_csv('full_data_woe_rid_-2.csv').iloc[:, 2:]
#df = pd.read_csv('full_data_woe_rid_.csv').iloc[:, 2:]
micro = df.iloc[:, 8:61]
macro = df.iloc[:,61:]

In [3]:
present= ['intangible_fixed_assets_0', 
       'tangible_fixed_assets_0',
       'other_fixed_assets_0',  'stock_0', 
       'debtors_0',  'other_current_assets_0',
       'cash_cash_equivalent_0',
       'capital_0', 
       'other_shareholders_funds_0', 
       'long_term_debt_0', 
       'other_noncurrent_liabilities_0', 
        'loans_0',  'creditors_0', 
       'other_current_liabilities_0', 
       'number_of_employees_0', 'sales_0',
       'operating_pl_ebit_0', 'financial_revenue_0',
       'financial_expenses_0', 
       'taxation_0',  'material_costs_0', 
       'costs_of_employees_0', 
       'depreciation_amortization_0', 
        'cash_flow_0']


lag= ['intangible_fixed_assets_1', 
       'tangible_fixed_assets_1',
       'other_fixed_assets_1',  'stock_1', 
       'debtors_1',  'other_current_assets_1',
       'cash_cash_equivalent_1',
       'capital_1', 
       'other_shareholders_funds_1', 
       'long_term_debt_1', 
       'other_noncurrent_liabilities_1', 
        'loans_1',  'creditors_1', 
       'other_current_liabilities_1', 
       'number_of_employees_1', 'sales_1',
       'operating_pl_ebit_1', 'financial_revenue_1',
       'financial_expenses_1', 
       'taxation_1',  'material_costs_1', 
       'costs_of_employees_1', 
       'depreciation_amortization_1', 
        'cash_flow_1']

# Function for all variables

In [142]:
for i in range(len(present)):
    Y = micro[present[i]]
    Y = list(Y)
    X1= pd.concat([micro[lag[i]],macro], axis =1)

    X_train, X_test, Y_train, Y_test = train_test_split(X1, Y, test_size=0.2,  random_state=42)
    
    scaler = StandardScaler()

    X_train_std = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)
    
    X_train= pd.DataFrame(X_train_std, columns= X_train.columns)
    X_test= pd.DataFrame(X_test_std, columns= X_test.columns)
    
    model = RandomForestRegressor(n_estimators=50, max_depth =3, random_state=42)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    
    rmse1 = mean_squared_error(Y_test, y_pred, squared=False)

    
    X2= micro[[lag[i]]]

    X_train, X_test, Y_train, Y_test = train_test_split(X2, Y, test_size=0.2,  random_state=42)
    
    X_train_std = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)
    
    X_train= pd.DataFrame(X_train_std, columns= X_train.columns)
    X_test= pd.DataFrame(X_test_std, columns= X_test.columns)
    
    y_pred = X_test
    
    rmse2 = mean_squared_error(Y_test, y_pred, squared=False)
    
    print("Results for:", present[i])
    print("RMSE RF (in millions):", round(rmse1/1000000,2))
    print("RMSE BENCHMARK(in millions):", round(rmse2/1000000,2))
    print(40*'=')

Results for: intangible_fixed_assets_0
RMSE RF (in millions): 2.89
RMSE BENCHMARK(in millions): 2.79
Results for: tangible_fixed_assets_0
RMSE RF (in millions): 92.48
RMSE BENCHMARK(in millions): 100.69
Results for: other_fixed_assets_0
RMSE RF (in millions): 24.61
RMSE BENCHMARK(in millions): 51.49
Results for: stock_0
RMSE RF (in millions): 4.7
RMSE BENCHMARK(in millions): 5.84
Results for: debtors_0
RMSE RF (in millions): 5.4
RMSE BENCHMARK(in millions): 10.65
Results for: other_current_assets_0
RMSE RF (in millions): 778.27
RMSE BENCHMARK(in millions): 85.45
Results for: cash_cash_equivalent_0
RMSE RF (in millions): 5.2
RMSE BENCHMARK(in millions): 7.25
Results for: capital_0
RMSE RF (in millions): 7.07
RMSE BENCHMARK(in millions): 12.59
Results for: other_shareholders_funds_0
RMSE RF (in millions): 15.87
RMSE BENCHMARK(in millions): 44.01
Results for: long_term_debt_0
RMSE RF (in millions): 66.84
RMSE BENCHMARK(in millions): 91.9
Results for: other_noncurrent_liabilities_0
RMSE RF

In [145]:
from sklearn import linear_model

In [148]:
for i in range(len(present)):
    Y = micro[present[i]]
    Y = list(Y)
    X1= pd.concat([micro[lag[i]],macro], axis =1)

    X_train, X_test, Y_train, Y_test = train_test_split(X1, Y, test_size=0.2,  random_state=42)
    
    scaler = StandardScaler()

    X_train_std = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)
    
    X_train= pd.DataFrame(X_train_std, columns= X_train.columns)
    X_test= pd.DataFrame(X_test_std, columns= X_test.columns)
    
    model = xgb.XGBRegressor(objective='reg:squarederror',n_estimators= 50, max_depth=3, random_state=42)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    
    rmse3 = mean_squared_error(Y_test, y_pred, squared=False)

    
    model2= linear_model.Lasso(alpha= 0.1)
    model2.fit(X_train, Y_train)
    
    y_pred = model2.predict(X_test)
    
    rmse4 = mean_squared_error(Y_test, y_pred, squared=False)
    
    print("Results for:", present[i])
    print("RMSE XGB (in millions):", round(rmse3/1000000,2))
    print("RMSE Logit with Lasso (in millions):", round(rmse4/1000000,2))
    print(40*'=')

Results for: intangible_fixed_assets_0
RMSE XGB (in millions): 2.38
RMSE Logit with Lasso (in millions): 3.11
Results for: tangible_fixed_assets_0
RMSE XGB (in millions): 100.14
RMSE Logit with Lasso (in millions): 61.21
Results for: other_fixed_assets_0
RMSE XGB (in millions): 43.38
RMSE Logit with Lasso (in millions): 25.54
Results for: stock_0
RMSE XGB (in millions): 6.45
RMSE Logit with Lasso (in millions): 5.95
Results for: debtors_0
RMSE XGB (in millions): 10.34
RMSE Logit with Lasso (in millions): 3.32
Results for: other_current_assets_0
RMSE XGB (in millions): 152.57
RMSE Logit with Lasso (in millions): 316.0
Results for: cash_cash_equivalent_0
RMSE XGB (in millions): 6.79
RMSE Logit with Lasso (in millions): 4.69
Results for: capital_0
RMSE XGB (in millions): 10.72
RMSE Logit with Lasso (in millions): 8.28
Results for: other_shareholders_funds_0
RMSE XGB (in millions): 42.02
RMSE Logit with Lasso (in millions): 17.54
Results for: long_term_debt_0
RMSE XGB (in millions): 81.67


# On clusters (all variables)

In [188]:
for i in range(len(present)):
    
    all_y_pred = []
    all_Y_test = []
    
    for cluster in range(len(clusters)):
        Y = clusters[cluster][present[i]]
        Xa = clusters[cluster][lag[i]]
        Xb = clusters[cluster].iloc[:, 61:]
        X = pd.concat([Xa, Xb], axis=1)
        
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

        scaler = StandardScaler()

        X_train_std = scaler.fit_transform(X_train)
        X_test_std = scaler.transform(X_test)

        X_train = pd.DataFrame(X_train_std, columns=X_train.columns)
        X_test = pd.DataFrame(X_test_std, columns=X_test.columns)

        model = RandomForestRegressor(n_estimators=50, max_depth=3, random_state=42)
        model.fit(X_train, Y_train)
        y_pred = model.predict(X_test)
        
        all_y_pred.extend(y_pred)
        all_Y_test.extend(Y_test)
    
    print("Results for: ", present[i])
    print("RMSE RF CLUSTERS (in millions):",round(mean_squared_error(all_Y_test, all_y_pred, squared=False)/1000000,2) )
    print(40*'=')

Results for:  intangible_fixed_assets_0
RMSE RF CLUSTERS (in millions): 7.5
Results for:  tangible_fixed_assets_0
RMSE RF CLUSTERS (in millions): 8.93
Results for:  other_fixed_assets_0
RMSE RF CLUSTERS (in millions): 23.86
Results for:  stock_0
RMSE RF CLUSTERS (in millions): 8.93
Results for:  debtors_0
RMSE RF CLUSTERS (in millions): 20.91
Results for:  other_current_assets_0
RMSE RF CLUSTERS (in millions): 166.9
Results for:  cash_cash_equivalent_0
RMSE RF CLUSTERS (in millions): 3.35
Results for:  capital_0
RMSE RF CLUSTERS (in millions): 8.34
Results for:  other_shareholders_funds_0
RMSE RF CLUSTERS (in millions): 22.48
Results for:  long_term_debt_0
RMSE RF CLUSTERS (in millions): 23.13
Results for:  other_noncurrent_liabilities_0
RMSE RF CLUSTERS (in millions): 12.13
Results for:  loans_0
RMSE RF CLUSTERS (in millions): 3.13
Results for:  creditors_0
RMSE RF CLUSTERS (in millions): 2.42
Results for:  other_current_liabilities_0
RMSE RF CLUSTERS (in millions): 15.57
Results for:

In [208]:
for i in range(len(present)):
    
    all_y_pred = []
    all_Y_test = []
    
    for cluster in range(len(clusters)):
        Y = clusters[cluster][present[i]]
        Xa = clusters[cluster][lag[i]]
        Xb = clusters[cluster].iloc[:, 61:]
        X = pd.concat([Xa, Xb], axis=1)
        
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

        scaler = StandardScaler()

        X_train_std = scaler.fit_transform(X_train)
        X_test_std = scaler.transform(X_test)

        X_train = pd.DataFrame(X_train_std, columns=X_train.columns)
        X_test = pd.DataFrame(X_test_std, columns=X_test.columns)

        model = linear_model.Lasso(alpha= 0.1, max_iter= 10000)
        model.fit(X_train, Y_train)
        y_pred = model.predict(X_test)
        
        all_y_pred.extend(y_pred)
        all_Y_test.extend(Y_test)
    
    print("Results for: ", present[i])
    print("RMSE RF CLUSTERS (in millions):",round(mean_squared_error(all_Y_test, all_y_pred, squared=False)/1000000,2) )
    print(40*'=')

Results for:  intangible_fixed_assets_0
RMSE RF CLUSTERS (in millions): 7.21
Results for:  tangible_fixed_assets_0
RMSE RF CLUSTERS (in millions): 23.01
Results for:  other_fixed_assets_0
RMSE RF CLUSTERS (in millions): 25.16
Results for:  stock_0
RMSE RF CLUSTERS (in millions): 6.96
Results for:  debtors_0
RMSE RF CLUSTERS (in millions): 4.91
Results for:  other_current_assets_0
RMSE RF CLUSTERS (in millions): 232.69
Results for:  cash_cash_equivalent_0
RMSE RF CLUSTERS (in millions): 3.88
Results for:  capital_0
RMSE RF CLUSTERS (in millions): 7.63
Results for:  other_shareholders_funds_0
RMSE RF CLUSTERS (in millions): 19.67
Results for:  long_term_debt_0
RMSE RF CLUSTERS (in millions): 28.07
Results for:  other_noncurrent_liabilities_0
RMSE RF CLUSTERS (in millions): 16.76
Results for:  loans_0
RMSE RF CLUSTERS (in millions): 3.43
Results for:  creditors_0
RMSE RF CLUSTERS (in millions): 2.73
Results for:  other_current_liabilities_0
RMSE RF CLUSTERS (in millions): 14.33
Results fo

# Second Part

In [None]:
dat= pd.read_csv("full_stress_data-2.csv")

In [None]:
#2022 data
dftemp= df[df['status_year'] == 2022].reset_index()
data2022 = dftemp[dftemp['default_indicator']==0].reset_index()
data2022= data2022.iloc[:,2:]

In [None]:
#2023,2024 and 2025 stress test data
data2023 = dat[dat['status_year']==2023].reset_index()
data2023 = data2023.iloc[:,2:]
data2024 = dat[dat['status_year']==2024].reset_index()
data2024 = data2024.iloc[:,2:]
data2025 = dat[dat['status_year']==2025].reset_index()
data2025 = data2025.iloc[:,2:]

In [None]:
#Micro Macro splits
micro2022 = data2022.iloc[:, 6:59]
micro2023 = data2023.iloc[:, 6:59]
micro2024 = data2024.iloc[:, 6:59]
micro2025 = data2025.iloc[:, 6:59]

macro2022 = data2022.iloc[:, 59:]
macro2023 = data2023.iloc[:, 59:]
macro2024 = data2024.iloc[:, 59:]
macro2025 = data2025.iloc[:, 59:]

# Create the dynamic micros

In [None]:
import os
import pandas as pd
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from itertools import compress
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
X_test_2023 = pd.concat([micro2022[present[i]],macro2023], axis =1)
X_test_2023.shape

(59628, 21)

In [None]:
micro2023[present[i]]

0        215943.345063
1        216045.625537
2        215944.328224
3        215943.349287
4        215965.654697
             ...      
59623    177206.217419
59624    177205.879490
59625    177206.385327
59626    177583.989033
59627    177205.879490
Name: intangible_fixed_assets_0, Length: 59628, dtype: float64

In [None]:
macro2024

Unnamed: 0,gdp_growth,inflation_growth,unemployment,EURxUSD,EURxJPY,EURxCNY,EURxINR,EURxGBP,EURxNOK,EURxCHF,EURxTRY,hh_debt,corp_debt,govt_debt,3m_yield,10y_yield,oil,gas,gold,copper
0,-4.3,6.6,6.3,1.05,137.56,7.05,92.98,0.85,10.07,1.01,21.8,72.779991,136.050240,82.507802,0.377,3.77,189.325378,1348.172487,1648.067592,7248.052673
1,-4.3,6.6,6.3,1.05,137.56,7.05,92.98,0.85,10.07,1.01,21.8,72.779991,136.050240,82.507802,0.377,3.77,189.325378,1348.172487,1648.067592,7248.052673
2,-4.3,6.6,6.3,1.05,137.56,7.05,92.98,0.85,10.07,1.01,21.8,72.779991,136.050240,82.507802,0.377,3.77,189.325378,1348.172487,1648.067592,7248.052673
3,-4.3,6.6,6.3,1.05,137.56,7.05,92.98,0.85,10.07,1.01,21.8,72.779991,136.050240,82.507802,0.377,3.77,189.325378,1348.172487,1648.067592,7248.052673
4,-4.3,6.6,6.3,1.05,137.56,7.05,92.98,0.85,10.07,1.01,21.8,72.779991,136.050240,82.507802,0.377,3.77,189.325378,1348.172487,1648.067592,7248.052673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59623,-3.1,5.5,12.1,1.05,137.56,7.05,92.98,0.85,10.07,1.01,21.8,34.396619,120.645893,34.457310,0.591,5.91,189.325378,1348.172487,1648.067592,7248.052673
59624,-3.1,5.5,12.1,1.05,137.56,7.05,92.98,0.85,10.07,1.01,21.8,34.396619,120.645893,34.457310,0.591,5.91,189.325378,1348.172487,1648.067592,7248.052673
59625,-3.1,5.5,12.1,1.05,137.56,7.05,92.98,0.85,10.07,1.01,21.8,34.396619,120.645893,34.457310,0.591,5.91,189.325378,1348.172487,1648.067592,7248.052673
59626,-3.1,5.5,12.1,1.05,137.56,7.05,92.98,0.85,10.07,1.01,21.8,34.396619,120.645893,34.457310,0.591,5.91,189.325378,1348.172487,1648.067592,7248.052673


In [None]:
for i in range(len(present)):
    
    Y = micro[present[i]]
    X= pd.concat([macro, micro[lag[i]]], axis = 1)
    
    X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.2,  random_state=42)
    scaler = StandardScaler()

    X_train_std = scaler.fit_transform(X_train)

    X_train= pd.DataFrame(X_train_std, columns= X_train.columns)
    
    #Train model on full data up to 2022
    #Y_train = micro[present[i]]
    #Y_train = list(Y)
    #X_train= pd.concat([micro[lag[i]],macro], axis =1)
    #X_train, _, Y_train, ___ = train_test_split(X1, Y, test_size=0.2,  random_state=42)
    scaler = StandardScaler()
    #X_train_std = scaler.fit_transform(X_train)
    #X_train= pd.DataFrame(X_train_std, columns= X_train.columns)

    model = linear_model.Lasso(alpha= 0.1)
    model.fit(X_train, Y_train)
    
    #2023 prediction
    X_test_2023 = pd.concat([micro2022[present[i]],macro2023], axis =1)
    X_test_std_2023 = scaler.fit_transform(X_test_2023)
    X_test_2023= pd.DataFrame(X_test_std_2023, columns= X_train.columns)
    y_pred_2023 = model.predict(X_test_2023)
    micro2023[present[i]] = y_pred_2023
    
    #2024 prediction
    X_test_2024 = pd.concat([micro2023[present[i]],macro2024], axis =1)
    X_test_std_2024 = scaler.fit_transform(X_test_2024)
    X_test_2024= pd.DataFrame(X_test_std_2024, columns= X_train.columns)
    y_pred_2024 = model.predict(X_test_2024)
    micro2024[present[i]] = y_pred_2024
    
    #2025 prediction
    X_test_2025 = pd.concat([micro2024[present[i]],macro2025], axis =1)
    X_test_std_2025 = scaler.fit_transform(X_test_2025)
    X_test_2025= pd.DataFrame(X_test_std_2025, columns= X_train.columns)
    y_pred_2025 = model.predict(X_test_2025)
    micro2025[present[i]] = y_pred_2025
    print(i)
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


In [None]:
micro2022['sales_0'].tail().round(0)

59623    5338271.0
59624    5824362.0
59625    5209463.0
59626    7735311.0
59627    5146425.0
Name: sales_0, dtype: float64

In [None]:
micro2023['sales_0'].tail().round(0)

59623    4578480.0
59624    4995386.0
59625    4468005.0
59626    6634352.0
59627    4413939.0
Name: sales_0, dtype: float64

In [None]:
micro2024['sales_0'].tail().round(0)

59623    3926829.0
59624    4284398.0
59625    3832078.0
59626    5690091.0
59627    3785708.0
Name: sales_0, dtype: float64

In [None]:
micro2023['operating_pl_ebit_0'].tail().round(0)

59623    597640.0
59624    429359.0
59625     94288.0
59626      7465.0
59627     26154.0
Name: operating_pl_ebit_0, dtype: float64

In [None]:
micro2025['operating_pl_ebit_0'].tail().round(0)

59623    372957.0
59624    267942.0
59625     58841.0
59626      4659.0
59627     16322.0
Name: operating_pl_ebit_0, dtype: float64

In [None]:
temp2023 = pd.concat([micro2023,macro2023],axis=1)
temp2024 = pd.concat([micro2024,macro2024],axis=1)
temp2025 = pd.concat([micro2025,macro2025],axis=1)
stress2023= pd.concat([data2023.iloc[:,:6],temp2023],axis=1)
stress2024= pd.concat([data2024.iloc[:,:6],temp2024],axis=1)
stress2025= pd.concat([data2025.iloc[:,:6],temp2025],axis=1)

In [None]:
dynamiclogit= pd.concat([stress2023,stress2024,stress2025],axis=0)
dynamiclogit.to_csv("DynamicLogitStress_vTS.csv")