In [1]:
# Import from main and experiments library
import os
# NOTE(review): star imports - the names used further down (pd, json, torch, Pipeline, ...)
# are presumably provided by these modules; confirm against their sources
from experiments_lib import *
from roberta_lib import *
# NOTE(review): moves the working directory one level up before `library` is imported.
# The path is relative, so re-running this cell moves up one more level each time;
# run it only once per kernel.
os.chdir("../")
from library import *

# filter the warnings for clarity
import warnings
# silences every warning for the rest of the session
warnings.filterwarnings("ignore")

#### business failure prediction task

We use the ECL benchmark dataset to predict next-year business failure from the multi-modal data contained in corporate 10K records. To this end, we use the following variables:

- ```qualified```: "Yes" if the 10K record qualifies for inclusion in the LoPucki BRD, "No" if the 10K record does not qualify for inclusion in the LoPucki BRD and "out-of-period" if the 10K record was filed before 1993 or after 2021.
- ```can_label```: "True" if we have all the necessary information to assign a label to the 10K record (```filing date``` and ```total asset value```), "False" otherwise
- ```label```: "True" if the company filed for bankruptcy in the year following the filing date of the 10K, "False" otherwise.

#### prepare data and pre-process text documents

In [2]:
# location of the ECL csv file
path_ECL = '../bankruptcy research data/ECL.csv' # change path to correct location

# load the data (first csv column becomes the index) and keep only the records
# that can be labelled and that are flagged as qualified
dataset = pd.read_csv(path_ECL, index_col=0)
subset = dataset[dataset['can_label'].eq(True) & dataset['qualified'].eq('Yes')].reset_index(drop=True)

# show five random rows as a sanity check
subset.sample(5)

Unnamed: 0,cik,company,period_of_report,gvkey,datadate,filename,can_label,qualified,label,bankruptcy_prediction_split,bankruptcy_date_1,bankruptcy_date_2,bankruptcy_date_3,filing_date
27792,1308161.0,NEWS CORP,2010-06-30,12886.0,30/06/2010,/2010/1308161_10K_2010_0001193125-10-181329.json,True,Yes,False,train,,,,2010-08-06
914,4828.0,AMERICAN CRYSTAL SUGAR CO /MN/,2000-08-31,1429.0,31/08/2000,/2000/4828_10K405_2000_0000912057-00-051146.json,True,Yes,False,train,,,,2000-11-22
40245,863436.0,BENCHMARK ELECTRONICS INC,2012-12-31,23084.0,31/12/2012,/2012/863436_10K_2012_0001144204-13-011827.json,True,Yes,False,train,,,,2013-02-28
77007,1304280.0,NOVELIS INC.,2008-03-31,162701.0,31/03/2008,/2008/1304280_10K_2008_0000950144-08-004924.json,True,Yes,False,train,,,,2008-06-19
55953,874766.0,ITT HARTFORD GROUP INC /DE,1996-12-31,61739.0,31/12/1996,/1996/874766_10K_1996_0000948572-97-000016.json,True,Yes,False,train,,,,1997-03-28


In [3]:
# paths to the corpora
original_corpus = '../bankruptcy research data/original_corpus' # change path to correct location
clean_corpus = '../bankruptcy research data/clean_corpus'
raw_corpus = '../bankruptcy research data/raw_corpus'

# Indicates whether the cleaning operations still need to be performed: only
# when the derived corpora are not both on disk yet. The directories are
# checked explicitly instead of wrapping os.mkdir in a bare `except:`, so that
# unrelated failures (missing parent folder, permission error, ...) raise
# instead of being reported as "already exist".
clean = not (os.path.isdir(clean_corpus) and os.path.isdir(raw_corpus))

if clean:
    # one sub-directory per year for each corpus; exist_ok lets the cell
    # recover from a partially created directory tree
    for i in range(1993, 2024):
        os.makedirs(clean_corpus + '/' + str(i), exist_ok=True)
        os.makedirs(raw_corpus + '/' + str(i), exist_ok=True)
else:
    print('Corpera already exist')

Corpera already exist


In [4]:
# clean documents if indicated
if clean:

    # Loop over the filings in `subset`, the frame of labelled and qualified
    # records built when the data was read. This cell must run while the
    # 'filename' column still holds the original .json names.
    for filename in subset['filename']:

        # read file (filenames start with '/<year>/', so plain concatenation
        # yields a valid path below the corpus root)
        file_path = original_corpus + filename
        with open(file_path, "r", encoding="utf-8") as f:
            file_data = json.load(f)

        # extract the 'item_7' section (empty string when absent) and clean it
        document = file_data.get('item_7', '')
        tokens = tokenize_lemmatize(document)
        clean_tokens = remove_stop_punct_num(tokens)
        clean_document = ' '.join(clean_tokens)

        # create file paths: same relative name, .txt instead of .json
        file_name_without_extension = os.path.splitext(filename)[0]
        preprocessed_filepath = clean_corpus + file_name_without_extension + '.txt'
        raw_filepath = raw_corpus + file_name_without_extension + '.txt'

        # store the cleaned text and the untouched text side by side
        with open(preprocessed_filepath, "w", encoding="utf-8") as preprocessed_file:
            preprocessed_file.write(clean_document)

        with open(raw_filepath, "w", encoding="utf-8") as raw_file:
            raw_file.write(document)

print("Done")

Done


In [5]:
# adjust file extension; regex=False makes '.json' a literal match on every
# pandas version (with regex matching the '.' would stand for any character)
subset['filename'] = subset['filename'].str.replace('.json', '.txt', regex=False)

In [6]:
# partition the records using the split assignment stored in the dataset
train = subset[subset['bankruptcy_prediction_split'].eq('train')]
test = subset[subset['bankruptcy_prediction_split'].eq('test')]

## TF-IDF classifier

In [7]:
# predictors: path of each cleaned document (corpus root prepended to the filename)
train_X, test_X = (clean_corpus + frame['filename'] for frame in (train, test))

# targets: the label column
train_y, test_y = train['label'], test['label']

In [8]:
# pipeline: TF-IDF features over uni- and bi-grams read from the document files,
# followed by an L1-penalised, class-balanced logistic regression
TF_IDF = Pipeline(steps=[
    ('vect', TfidfVectorizer(
        input='filename',
        lowercase=True,
        strip_accents='ascii',
        stop_words='english',
        min_df=2,
        ngram_range=(1, 2),
    )),
    ('clf', LogisticRegression(
        penalty='l1',
        C=1,
        class_weight='balanced',
        solver='liblinear',
    )),
])

# fit on the training documents
TF_IDF.fit(train_X, train_y)

# score the test documents with the probability of the second class and evaluate
preds = TF_IDF.predict_proba(test_X)[:, 1]
evaluate(labels=test_y, predictions=preds)

-- RESULTS --
AUC: 0.8855
AP: 0.2387
recall@100: 0.2869
CAP: 0.7711


## RoBERTa classifier

#### Initialize model and set parameters

In [None]:
%%capture
# pretrained roberta-large tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

# pretrained roberta-large with a two-label classification output, moved to `device`
model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=2)
model = model.to(device)

In [None]:
# hyper-parameters - gradient accumulation is used to simulate larger batches
batch_size, learning_rate, num_epochs, accumulation_steps = 16, 2e-3, 2, 20

# optimiser over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# weighted loss: each class gets total / (2 * class count), ordered [False, True]
total_samples = len(train)
class_counts = train['label'].value_counts().to_dict()
class_weights = [total_samples / (2 * class_counts[flag]) for flag in (False, True)]
loss_fn = nn.CrossEntropyLoss(weight=torch.Tensor(class_weights).to(device))

In [12]:
# create datasets that read the documents from the raw corpus
train_dataset = CustomDataset(train, tokenizer, raw_corpus)
test_dataset = CustomDataset(test, tokenizer, raw_corpus)

# training batches are shuffled every epoch
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# evaluation batches keep the dataset order: shuffling adds nothing at test
# time and would make the order of the collected predictions non-deterministic
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [14]:
# training mode; the parameters under `model.roberta` stay frozen during the first epoch
model.train()
model.roberta.requires_grad_(False)

In [None]:
# training
# make sure dropout etc. are in training mode even if an evaluation cell ran before
model.train()

for epoch in range(num_epochs):

    # unfreeze `model.roberta` from the second epoch on
    if epoch == 1:
        for param in model.roberta.parameters():
            param.requires_grad = True
        # Build the low-learning-rate optimiser once, after every parameter has
        # been unfrozen (it must not be re-created for each parameter).
        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    # loop over batches
    for idx, batch in enumerate(train_loader):

        # get inputs
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # forward pass - labels are not passed to the model because the loss
        # is computed below with the class-weighted loss function
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # backward pass - scale the loss so the accumulated gradient matches
        # an average over `accumulation_steps` batches
        loss = loss_fn(logits, labels)
        loss = loss / accumulation_steps
        loss.backward()

        # weight update every `accumulation_steps` batches and on the last
        # batch of the epoch, so no gradient carries over between epochs
        if ((idx + 1) % accumulation_steps == 0) or (idx + 1 == len(train_loader)):
            optimizer.step()
            optimizer.zero_grad()

In [16]:
# eval
model.eval()
test_labels, test_preds = [], []

# gradients are not needed for prediction
with torch.no_grad():
    for batch in test_loader:

        # move the inputs to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # class probabilities from the logits
        logits = model(input_ids, attention_mask=attention_mask).logits
        probabilities = logits.softmax(dim=1)

        # collect labels and per-class probabilities as numpy values
        test_labels.extend(batch['labels'].cpu().numpy())
        test_preds.extend(probabilities.cpu().numpy())

In [20]:
# keep the probability stored at index 1 of every prediction and evaluate
preds = [class_probs[1] for class_probs in test_preds]
evaluate(predictions=preds, labels=test_labels)

Results on a small subsample of the data:

-- RESULTS --
AUC: 0.64
AP: 0.6978
recall@100: 1.0
CAP: 0.28
