#### imports

In [1]:
# import
import os
from model_utils import *
os.chdir("../")
from utils import *

# filter the warnings for clarity
import warnings
warnings.filterwarnings("ignore")

In [2]:
%%capture
# specific imports 
from torch.utils.data import WeightedRandomSampler
import random
import json
import nltk
nltk.download('punkt_tab')
import os

#### sentence-attention model

In this notebook, we show the basic functionalities of the sentence-attention model on the ECL benchmark dataset. Note that we have stored the sentence embeddings and masks on disk for these experiments (see the ```embedding_demo.ipynb``` notebook for more information). Below, we show how to train and evaluate the model and how to display the sentences with the highest attention weights for a particular instance (we do this for a randomly sampled test instance from the top 50 instances with the highest probability of business failure assigned by the model from the test set).

#### prepare data


In [3]:
# specify path
path_ECL = '../bankruptcy research data/ECL.csv' # change path to correct location
path_CS = '../bankruptcy research data/Compustat/data.csv' # change path to correct location

# read data and add financial features
dataset = pd.read_csv(path_ECL, index_col=0)
dataset = compustat_local(path_CS, dataset, update=False)
dataset, predictors = compute_features(dataset)

In [4]:
# split in train test set
subset = dataset.loc[(dataset['can_label'] == True) & (dataset['qualified'] == 'Yes')].reset_index(drop=True)
train = subset.loc[subset['bankruptcy_prediction_split'] == 'train']
test = subset.loc[subset['bankruptcy_prediction_split'] == 'test']

In [5]:
# Mean impute and normalize
train_mean = train.loc[:, predictors].mean()
train_std = train.loc[:, predictors].std()

# Impute missing values with the mean
train.loc[:, predictors] = train.loc[:, predictors].fillna(train_mean)
test.loc[:, predictors] = test.loc[:, predictors].fillna(train_mean)

# Normalize the data
train.loc[:, predictors] = (train.loc[:, predictors] - train_mean) / train_std
test.loc[:, predictors] = (test.loc[:, predictors] - train_mean) / train_std

#### set seed

In [6]:
# set the seeds
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed) 

seed = 1
set_seed(seed)

In [7]:
# set parameters
lr = 1e-3
weight_decay = 1e-3
negatives_batch = 4 
batch_size = 320
hidden_dim = 32
training_time = 4 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### init datasets, model, optimiser and loss

In [8]:
# create custom datasets
train_dataset = SentenceDataset(dataframe=train)
test_dataset = SentenceDataset(dataframe=test)

In [9]:
# compute current class distribution
class_counts = train['label'].value_counts().to_dict()
num_negatives = class_counts[False]
num_positives = class_counts[True]

# compute target class distribution
target_frac_negatives = negatives_batch/(negatives_batch+1)
target_frac_positives = 1 - target_frac_negatives

# compute class weights for sampler
class_weights = {False: (target_frac_negatives/num_negatives), True: (target_frac_positives/num_positives) }

# compute the weight for each sample (as required for the WeightedRandomSampler)
sample_weights = train['label'].map(class_weights).to_numpy()
sample_weights = torch.Tensor(sample_weights).to(device)

# init sampler
set_seed(seed)
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(train), replacement=True)

In [10]:
# create dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [11]:
# init network, optimizer and loss
network = SentenceAttentionNetwork(embedding_dim=384, feature_dim=28, hidden_dim=hidden_dim).to(device)
optimizer = torch.optim.AdamW(network.parameters(), lr=lr, weight_decay=weight_decay)
loss_fn = nn.CrossEntropyLoss(reduction="mean")

#### train

In [None]:
# set model to train mode
network.train()

In [None]:
# loop over epochs
for epoch in range(training_time):

    # loop over batches
    for idx, batch in enumerate(train_loader):

        ################# forward pass #################

        # get input
        embeddings = batch['sentence_embeddings'].to(device)
        masks = batch['sentence_masks'].to(device)
        features = batch['structured_features'].to(device)
        labels = batch['labels'].to(device)

        # get logits 
        logits, _ = network(embeddings, masks, features)

        ################# backward pass #################

        # compute loss
        loss = loss_fn(input=logits, target=labels)

        # compute grads, update weights, reset grads
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

#### eval

In [13]:
# set model to eval mode
network.eval()

SentenceAttentionNetwork(
  (linear_map): Linear(in_features=28, out_features=32, bias=True)
  (key_layer): Linear(in_features=384, out_features=32, bias=True)
  (value_layer): Linear(in_features=384, out_features=32, bias=True)
  (classification): Linear(in_features=64, out_features=1, bias=True)
)

In [None]:
# inititalise list
predictions = []
attn_weights = []
indices = []

# loop over batches
for idx, batch in enumerate(test_loader):

    ################# get predictions #################

    # get input
    embeddings = batch['sentence_embeddings'].to(device)
    masks = batch['sentence_masks'].to(device)
    features = batch['structured_features'].to(device)
    labels = batch['labels'].to(device)
    indx = batch['idx'].to(device)
    
    # get logits 
    with torch.no_grad():
        logits, attn = network(embeddings, masks, features)
    
    # get predicted probabilities
    preds = torch.sigmoid(logits)
    preds = preds[:, 1]

    ################# store predictions #################

    # store
    predictions.extend(preds.cpu().tolist())
    indices.extend(indx.cpu().tolist())
    attn_weights.append(attn.cpu().numpy())

# stack attention weights
attn_weights = np.vstack(attn_weights)

In [None]:
%%capture
# set predictions
test['predictions'] = None
test.loc[indices, 'predictions'] = predictions

In [15]:
# evaluate
AUC, AP, recall, CAP = evaluate(test['label'], test['predictions'])

#### sentence analysis

In [None]:
# get top 50 instances with higest probability of business failure
top_pred = test.sort_values('predictions', ascending=False).head(50)

In [16]:
# select a random instance
row_id = np.random.randint(49)
row = top_pred.iloc[row_id]
idx = top_pred.index[row_id]

# read text and tokenize
with open(row['filename'], 'r') as f:
    text = f.read()
sentences = nltk.sent_tokenize(text)

# store attn_weights
attn_weight = attn_weights[indices.index(idx)]

# isolate attention on trainable vector
attn_trainable = attn_weight[0]
attn_sentences = attn_weight[1:]

In [17]:
# get top 10 sentences (highest attention weight)
top_indices = np.argsort(attn_sentences)[-10:]
top_indices_descending = top_indices[::-1]

# print
printed_sentences = set()
for i, idx in enumerate(top_indices_descending):
    sentence = sentences[idx]
    if sentence not in printed_sentences:
        print('-'*10)
        print(f'Sentence {i+1}:')
        print(sentence)
        printed_sentences.add(sentence)

----------
Sentence 1:
Item 7.
----------
Sentence 2:
North Dakota drilling activity declined approximately 65% during the year with the majority of activity returning to the core areas located in McKenzie, Mountrail, Williams and Dunn counties.
----------
Sentence 3:
Operationally, 2015 proved to be a challenging year due to the downturn in commodity prices.
----------
Sentence 4:
Historically, commodity prices have been volatile and we expect the volatility to continue in the future.
----------
Sentence 5:
While rail transportation has historically been more expensive than pipeline transportation, Williston Basin prices have justified shipment by rail to markets such as St. James, Louisiana, which offers prices benchmarked to Brent/LLS.
----------
Sentence 6:
North Dakota's average rig count has dropped from 171 on January 4, 2015 to 59 on December 31, 2015.
----------
Sentence 7:
Additionally, our oil price differential to the NYMEX WTI benchmark price dropped from $13.67 per barrel