#### imports

In [3]:
# Import the project code: `experiments_lib` is imported before the working
# directory changes, `library` after moving one directory up.
import os
from experiments_lib import *

# A bare relative chdir is not idempotent: every re-run of this cell would climb
# one level higher and break the relative data paths used further down. Remember
# the directory the kernel started in and always move relative to that instead.
try:
    _NOTEBOOK_DIR
except NameError:
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, os.pardir))
from library import *

# filter the warnings for clarity
# NOTE(review): this silences every warning, including convergence warnings
# from the estimators - consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

#### business failure prediction task

We use the ECL benchmark dataset to predict next-year business failure from the multi-modal data contained in corporate 10K records. To this end, we use the following variables:

- ```qualified```: "Yes" if the 10K record qualifies for inclusion in the LoPucki BRD, "No" if the 10K record does not qualify for inclusion in the LoPucki BRD, and "out-of-period" if the 10K record was filed before 1993 or after 2021.
- ```can_label```: "True" if we have all the necessary information to assign a label to the 10K record (```filing date``` and ```total asset value```), "False" otherwise.
- ```label```: "True" if the company filed for bankruptcy in the year following the filing date of the 10K, "False" otherwise.

#### prepare data

In [5]:
# Read the ECL records, hand them to compustat_local together with the CompuStat
# file, and let compute_features return the final frame plus the list of
# predictor columns used for modelling.
dataset, predictors = compute_features(
    compustat_local(
        './data/CompuStat/data.csv',
        pd.read_csv('ECL.csv', index_col=0),
        update=False,
    )
)

Dropped 115373 rows from CompuStat based on screening variables
0 records in the dataset do not have an accompanying CompuStat record.


In [6]:
# keep only the records that can be labelled and that qualify, then renumber the rows
subset = dataset[dataset['can_label'].eq(True) & dataset['qualified'].eq('Yes')].reset_index(drop=True)

# partition on the split assignment stored in the data
train = subset[subset['bankruptcy_prediction_split'].eq('train')]
test = subset[subset['bankruptcy_prediction_split'].eq('test')]

In [7]:
# feature matrix and target for the training split ...
X, y = train[predictors], train['label']

# ... and the same pair for the held-out test split
test_X, test_y = test[predictors], test['label']

In [8]:
# Training sets keyed by class distribution: 'real' holds the untouched data,
# the numeric keys are the sampling_strategy values handed to RandomOverSampler
training_data = {'real': (X, y)}

# oversample once per requested sampling strategy (fixed seed for repeatability)
for i in (1, 0.5, 0.25):
    ros = RandomOverSampler(sampling_strategy=i, random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X, y)
    training_data[i] = (X_resampled, y_resampled)

#### logistic regression

In [8]:
# key into training_data selecting which (resampled) training set to fit on
distribution = 1

# mean-impute missing values, standardise, then fit an L2-penalised logistic regression
LR = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(C=0.01, penalty='l2')),
])

# fit on the chosen (features, labels) pair
LR.fit(*training_data[distribution])

# score the test set with the predicted probability of the second class (column 1)
preds = LR.predict_proba(test_X)[:, 1]
evaluate(predictions=preds, labels=test_y)

-- RESULTS --
AUC: 0.9148
AP: 0.115
recall@100: 0.1475
CAP: 0.8297


#### MLP

In [9]:
# key into training_data selecting which (resampled) training set to fit on
distribution = 0.5

# MLP training is stochastic (initial weights, mini-batch shuffling). The seed is
# pinned so the metrics are reproducible on a fresh run; 0 matches the seed used
# for the RandomOverSampler. Metrics saved from earlier, unseeded runs may differ
# slightly from what this cell now produces.
# NOTE: learning_rate='invscaling' only takes effect with solver='sgd'; with the
# default 'adam' solver used here scikit-learn ignores it.
MLP = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
    ('clf', MLPClassifier(learning_rate='invscaling', alpha=1, learning_rate_init=0.001,
                          hidden_layer_sizes=(100, 100), random_state=0)),
])

# train model
MLP.fit(X=training_data[distribution][0], y=training_data[distribution][1])

# evaluate the model on the predicted probability of the second class (column 1)
preds = MLP.predict_proba(test_X)[:, 1]
evaluate(labels=test_y, predictions=preds)

-- RESULTS --
AUC: 0.9282
AP: 0.1801
recall@100: 0.2213
CAP: 0.8564


#### XGBoost

In [10]:
# key into training_data selecting which (resampled) training set to fit on
distribution = 1

# standardise the features, then fit 1000 depth-1 boosted trees; unlike the other
# pipelines, no imputation step is included here
XGB = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', xgb.XGBClassifier(
        objective='binary:logistic',
        n_estimators=1000,
        max_depth=1,
        eta=0.1,
        subsample=0.5,
    )),
])

# fit on the chosen (features, labels) pair
XGB.fit(*training_data[distribution])

# score the test set with the predicted probability of the second class (column 1)
preds = XGB.predict_proba(test_X)[:, 1]
evaluate(predictions=preds, labels=test_y)

-- RESULTS --
AUC: 0.9364
AP: 0.1562
recall@100: 0.1885
CAP: 0.8727
