#### imports

In [1]:
# Import the project helper modules. The order is significant: sentence_lib is
# imported BEFORE the working directory changes and library AFTER it.
# NOTE(review): presumably sentence_lib sits next to this notebook and library
# one directory up — confirm.
# NOTE(review): names used further down without an explicit import (pd, np, torch)
# presumably arrive through these star imports — confirm.
import os
from sentence_lib import *
# Not idempotent: every re-run of this cell moves the working directory up one
# more level, and all relative paths used below resolve against the new directory.
os.chdir("../")
from library import *

# silence every warning (no category filter is given) to keep cell outputs readable
import warnings
warnings.filterwarnings("ignore")

In [2]:
# specific imports: sentence encoder model class and NLTK sentence splitter
from sentence_transformers import SentenceTransformer

import nltk
# Fetch the 'punkt_tab' resource (presumably the tokenizer data that sent_tokenize
# relies on — confirm against the installed NLTK version). When the package is
# already present the call only logs an "already up-to-date" message.
nltk.download('punkt_tab')
from nltk import sent_tokenize

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


#### prepare data

In [3]:
# specify path (relative, so it resolves against the working directory set by
# the os.chdir call in the imports cell)
path_ECL = '../bankruptcy research data/ECL.csv' # change path to correct location

# read the csv, using its first column as the index; no features are added here
# NOTE(review): pd is not imported explicitly in this notebook — presumably it
# comes from one of the star imports; confirm.
dataset = pd.read_csv(path_ECL, index_col=0)

#### functions to embed documents

In [4]:
# function to encode sentences

def doc_encode(text_path, sentence_encoder):
    """
    Read a text document, split it into sentences and encode every sentence.

    Args:
        text_path (str): path to the text document to encode (.txt), read as UTF-8
        sentence_encoder (sentence_transformer model): object whose `encode` method turns a list of
            sentences into dense vectors
    Returns:
        The result of `sentence_encoder.encode` for the document's sentences; expected to be an array
        of shape (n_sentences, embedding_dim). A document that is empty after stripping whitespace is
        encoded as the single placeholder sentence "not included".
    """

    # load the raw document and drop leading/trailing whitespace
    with open(text_path, 'r', encoding="utf8") as handle:
        raw_text = handle.read().strip()

    # fall back to a placeholder so an empty document still yields one sentence
    document = raw_text if raw_text else "not included"

    # split into sentences, then embed them in batches of 32
    sentence_list = sent_tokenize(document)
    return sentence_encoder.encode(sentence_list, batch_size=32)

In [5]:
# function to pad embeddings and create the padding mask

def pad_embeddings(embeddings, max_sentences):
    """
    Bring a document's sentence embeddings to exactly `max_sentences` rows by truncating or zero-padding.

    Args:
        embeddings (np Array): Array with dense vector representations of the sentences in the document
            with shape (n_sentences, embedding_dim).
        max_sentences (int): Number of rows in the result. If the document contains more sentences than
            `max_sentences`, only the first `ceil(max_sentences/2)` and the last `floor(max_sentences/2)`
            sentence embeddings are kept (for even values this is `max_sentences/2` each).
            If fewer sentences are present, the embeddings are padded with zero rows to reach `max_sentences` rows.

    Returns:
        tuple:
            - padded_embeddings (np Array): Array of shape (max_sentences, embedding_dim). For long documents it
              holds the kept head rows followed by the kept tail rows; otherwise it holds all sentence embeddings
              followed by zero rows.
            - padding_mask (np Array): Array of shape (max_sentences,) where 0 indicates a row that contains an
              original sentence embedding and 1 indicates a row that was added as padding.

    Raises:
        ValueError: If `max_sentences` is negative.
    """
    if max_sentences < 0:
        raise ValueError("max_sentences must be non-negative")

    # get dimensions
    n_sentences, embedding_dim = embeddings.shape

    # long documents
    if n_sentences > max_sentences:

        # split the row budget so that head + tail is exactly max_sentences,
        # also when max_sentences is odd (the head gets the extra row)
        n_tail = max_sentences // 2
        n_head = max_sentences - n_tail

        # padded embeddings
        first_part = embeddings[:n_head]
        # slice from an explicit start index: embeddings[-0:] would select every row
        last_part = embeddings[n_sentences - n_tail:]
        padded_embeddings = np.vstack((first_part, last_part))

        # padding mask: every row is a real sentence
        padding_mask = np.zeros(max_sentences)

    # short documents
    else:

        # padded embeddings
        # NOTE: np.zeros defaults to float64, so this branch may return a wider
        # dtype than the input; kept as-is for compatibility with stored data.
        padded_embeddings = np.zeros((max_sentences, embedding_dim))
        padded_embeddings[:n_sentences] = embeddings

        # padding mask: rows after the last real sentence are padding
        padding_mask = np.zeros(max_sentences)
        padding_mask[n_sentences:] = 1

    return padded_embeddings, padding_mask

#### show functionality

This notebook shows how to encode a single document. In the experiments, every document was encoded this way and its embeddings and padding mask were stored on disk in the '../bankruptcy research data/embeddings/' and '../bankruptcy research data/masks/' folders. If you store them in a different location, adjust the `SentenceDataset` class in `sentence_lib` accordingly.

In [6]:
%%capture
# init encoder
# run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# load the pretrained 'all-MiniLM-L6-v2' sentence encoder onto that device
# (the shape printed further down shows 384-dimensional embeddings)
sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [7]:
# try code on one example document (row 100 of the dataset)
# The path is built by plain concatenation and the '.json' suffix is swapped for '.txt'.
# NOTE(review): no separator is inserted after 'raw_corpus', so the 'filename'
# column presumably starts with one — confirm.
text_path = '../bankruptcy research data/raw_corpus' + dataset.iloc[100]['filename'].replace('.json', '.txt')
# one embedding row per sentence of the document
embeddings = doc_encode(text_path, sentence_encoder)
# truncate or zero-pad to exactly 500 rows and get the matching padding mask
padded, mask = pad_embeddings(embeddings, max_sentences=500)

In [8]:
# shape of the padded embeddings: (max_sentences, embedding_dim)
print(padded.shape)

(500, 384)


In [9]:
# shape of the padding mask: (max_sentences,)
print(mask.shape)

(500,)


In [10]:
# the trailing all-zero rows are the padding added because the document is shorter than max_sentences
print(padded)

[[-0.0787237   0.0460993   0.0035307  ... -0.01979921 -0.10768701
   0.01217997]
 [-0.0165556   0.03531491 -0.04758731 ... -0.06410385 -0.02580261
  -0.00225563]
 [-0.04243159  0.01894433  0.03861635 ...  0.07538494  0.04574082
   0.01861062]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [11]:
# 0 marks a row holding a real sentence embedding, 1 marks a padding row
print(mask)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.