In [1]:
# Import from main and experiments library.
# NOTE(review): later cells use names such as pd, Pipeline, xgb, StandardScaler,
# RandomOverSampler, clean_corpus and evaluate without importing them explicitly;
# presumably they all arrive through the two star imports below — confirm.
import os
from experiments_lib import *
# Move the working directory one level up before importing `library`
# (presumably because `library` lives in the parent directory — TODO confirm).
# Caution: this is not idempotent. Re-running this cell without restarting the
# kernel moves the working directory up one more level each time.
os.chdir("../")
from library import *

# filter the warnings for clarity
# NOTE: "ignore" silences every warning category, including pandas and
# scikit-learn warnings that may point at real problems in later cells.
import warnings
warnings.filterwarnings("ignore")

#### business failure prediction task

We use the ECL benchmark dataset to predict next-year business failure from the multi-modal data contained in corporate 10K records. To this end, we use the following variables:

- ```qualified```: "Yes" if the 10K record qualifies for inclusion in the LoPucki BRD, "No" if it does not qualify, and "out-of-period" if the 10K record was filed before 1993 or after 2021.
- ```can_label```: "True" if we have all the necessary information to assign a label to the 10K record (```filing date``` and ```total asset value```), "False" otherwise.
- ```label```: "True" if the company filed for bankruptcy in the year following the filing date of the 10K, "False" otherwise.

#### prepare data

In [2]:
# Locations of the two raw inputs (adjust both to where the files live locally)
path_ECL = '../bankruptcy research data/ECL.csv'
path_CS = '../bankruptcy research data/Compustat/data.csv'

# Load the ECL records, pass them through compustat_local together with the
# CompuStat file, then let compute_features return the final frame and the
# list of predictor columns used by the models below.
dataset, predictors = compute_features(
    compustat_local(
        path_CS,
        pd.read_csv(path_ECL, index_col=0),
        update=False,
    )
)

Dropped 115373 rows from CompuStat based on screening variables
0 records in the dataset do not have an accompanying CompuStat record.


In [3]:
# Split into train / validation / test sets.

# Keep only the 10K records that can be labelled and that qualify for inclusion.
subset = dataset.loc[(dataset['can_label'] == True) & (dataset['qualified'] == 'Yes')].reset_index(drop=True)
# Calendar year of the filing date; used below to carve out the validation set.
subset['fyear'] = pd.to_datetime(subset['filing_date']).dt.year

# Use the split column stored with the data. Explicit copies are taken because
# later cells add prediction columns to `val` and `test`; without .copy() those
# writes target a selection of `subset` and only work via pandas'
# SettingWithCopy fallback (whose warning is silenced in the setup cell).
train = subset.loc[subset['bankruptcy_prediction_split'] == 'train'].copy()
test = subset.loc[subset['bankruptcy_prediction_split'] == 'test'].copy()

# Validation set = training filings made after 2011;
# train_small = the remaining (earlier) training filings.
val = train.loc[train['fyear'] > 2011].copy()
train_small = train.drop(val.index)

# Sanity check: train_small and val must partition train
# (holds because the index was reset to unique values above).
assert len(train_small) + len(val) == len(train)

#### XGBoost baseline

In [4]:
# Feature matrix and target for every split (financial predictors only)
small_X, small_y = train_small[predictors], train_small['label']
val_X, val_y = val[predictors], val['label']
train_X, train_y = train[predictors], train['label']
test_X, test_y = test[predictors], test['label']

# Randomly oversample the two training sets only; the validation and test
# sets are left untouched so evaluation reflects the original class balance.
ros = RandomOverSampler(random_state=0, sampling_strategy=1)
small_X, small_y = ros.fit_resample(small_X, small_y)
train_X, train_y = ros.fit_resample(train_X, train_y)

In [7]:
def _make_xgb_pipeline():
    """Return an unfitted scaler + XGBoost pipeline with the baseline settings."""
    booster = xgb.XGBClassifier(
        objective='binary:logistic',
        subsample=0.5,
        eta=0.1,
        max_depth=1,
        n_estimators=1000,
    )
    return Pipeline([('scaler', StandardScaler()), ('clf', booster)])


# Two identically configured pipelines: one fitted on the reduced training set
# (scored on validation), one fitted on the full training set (scored on test).
XGB_small = _make_xgb_pipeline()
XGB_train = _make_xgb_pipeline()

# train model
XGB_small.fit(X=small_X, y=small_y)
XGB_train.fit(X=train_X, y=train_y)

# Keep the second predict_proba column as the score and store it next to the
# records so the stacking step can pick it up later.
val_preds = XGB_small.predict_proba(val_X)[:, 1]
val["preds_XGB"] = val_preds

test_preds = XGB_train.predict_proba(test_X)[:, 1]
test["preds_XGB"] = test_preds

#### TF-IDF baseline

In [8]:
# Inputs for the text model: `clean_corpus` combined with each record's
# filename (presumably a directory prefix + file name giving the path of the
# cleaned document — TODO confirm where clean_corpus is defined), plus labels.
small_X, small_y = clean_corpus + train_small['filename'], train_small['label']
val_X, val_y = clean_corpus + val['filename'], val['label']
train_X, train_y = clean_corpus + train['filename'], train['label']
test_X, test_y = clean_corpus + test['filename'], test['label']

In [9]:
def _make_tfidf_pipeline():
    """Return an unfitted TF-IDF + L1 logistic-regression pipeline.

    The vectorizer reads each document from the file path it is given
    (input='filename') and builds uni- and bi-gram features.
    """
    vectorizer = TfidfVectorizer(input='filename', lowercase=True,
                                 strip_accents='ascii', stop_words='english',
                                 min_df=2, ngram_range=(1, 2))
    # random_state is fixed because the liblinear solver shuffles the data;
    # without a seed the fitted coefficients can differ between runs.
    classifier = LogisticRegression(penalty='l1', C=1, class_weight='balanced',
                                    solver='liblinear', random_state=0)
    return Pipeline([('vect', vectorizer), ('clf', classifier)])


# Two identically configured pipelines: one fitted on the reduced training set
# (scored on validation), one fitted on the full training set (scored on test).
TF_IDF_small = _make_tfidf_pipeline()
TF_IDF_train = _make_tfidf_pipeline()

# train model
TF_IDF_small.fit(X=small_X, y=small_y)
TF_IDF_train.fit(X=train_X, y=train_y)

# Keep the second predict_proba column as the score and store it next to the
# records so the stacking step can pick it up later.
val_preds = TF_IDF_small.predict_proba(val_X)[:, 1]
test_preds = TF_IDF_train.predict_proba(test_X)[:, 1]
val["preds_TFIDF"] = val_preds
test["preds_TFIDF"] = test_preds

#### stacking classifier

In [12]:
# The stacker's inputs are the two base-model scores.
stack_features = ["preds_XGB", "preds_TFIDF"]

# Note the naming: the stacker is *trained* on the validation-set scores
# and evaluated on the test-set scores.
train_X, train_y = val[stack_features], val['label']
test_X, test_y = test[stack_features], test['label']

In [13]:
# Meta-model: standardise the two base scores, then fit a class-weighted
# L2 logistic regression on top of them.
meta_steps = [
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(penalty='l2', C=1, class_weight='balanced')),
]
stacking_clf = Pipeline(meta_steps)

# train
stacking_clf.fit(X=train_X, y=train_y)

# Score the test set with the second predict_proba column and report metrics
preds = stacking_clf.predict_proba(test_X)[:, 1]
evaluate(labels=test_y, predictions=preds)

-- RESULTS --
AUC: 0.9479
AP: 0.2639
recall@100: 0.2869
CAP: 0.8958
