Extracts covenant concerns from earnings call transcripts

In [None]:
import pandas as pd
import boto3
import numpy as np
from nltk.stem import PorterStemmer
import re
import spacy
from spacy.language import Language
from tqdm import tqdm
import warnings
import numpy as np
import io

# Silence library warnings and register tqdm's `progress_apply` on pandas
# objects (used further down when every transcript row is parsed).
warnings.filterwarnings("ignore")
tqdm.pandas()

# Shared Porter stemmer instance.
ps = PorterStemmer()
# S3 bucket that holds both the input parquet and the outputs.
# NOTE(review): '[bucket_name]' looks like a redacted placeholder - set it before running.
bucket = '[bucket_name]'

In [None]:
# Expectation ("forward-looking") keywords.
# parse_text joins these entries into one regex alternation that is anchored
# only by a leading word boundary, so every entry matches as a word prefix
# (e.g. 'expect' also hits 'expected') and earlier entries win over later ones.
# NOTE(review): the list mixes truncated stems (e.g. 'becom', 'potenti') with
# full word forms and contains repeated entries (e.g. 'drive', 'expect');
# presumably two keyword lists were merged. Repeats do not change what the
# alternation matches.

expect_keywords_all = [
    'may becom','hope','outlook','go to','tailwind','work toward','trend','is like to','may depend','may not','forse','would','seek to','ought','potenti','could depend','unknown','remain confid','shortterm','depend','endeavor','intend','abl to remain','feel','may result','project','expect to','possibl','like will result','goal','may affect','go forward','belief','consid','estim will','contempl','suggest','pursu','call for','appear','well posit to','think','with a view to','appear to','up to','short term','prioriti','hypothes','can have','indic','may impact','schedul','envis','believ','could','look forward','pro forma','drive','uncertain','explor','could be','look forward to','see','prospect','upsid','may','should','is like','risk','improv','longterm','like','uncertainti','tent','forese','predict','would be','headwind','view','move toward','aim','estim','on target','pend','probabl','could potenti','might','may be','are like','pipelin','do not expect','may continu','seek','will','shall','not expect','will like result','futur','unanticip','guidanc','look ahead','likelihood','like to','full year guidanc','anticip','confid','opportun','propos','on pace','plan','schedul to','preliminari','will like','will like be','do not anticip','expect','presum','express confid','can be','opportunity','plans','believes','could potentially','is likely to','drive','predicting','may affect','may continue','uncertain','expect','headwind','would be','shall','depend','expressed confidence','projects','aims','looking forward','scheduled to','think','hopefully','on target','presume','seek to','view','looks forward','expects','belief','pending','may not','suggests','moving toward','depends','believe','goals','trend','do not expect','appear to'
]

In [None]:
# basic data cleaning; returns the cleaned sentence as a single lowercase string

def clean_sentence(sentence):
    """Normalise one raw text passage before it is handed to the parser.

    The substitutions run in a fixed order because several of them rely on
    the case or punctuation state left by the previous step:

    1. Formatting residue (line breaks, numeric character references).
    2. Case-sensitive removals that must happen before lower-casing:
       known false-positive covenant phrases, conversational
       "modal + verb" fillers, and the capitalised word "May" (so the month
       is not later mistaken for the modal "may").
    3. Everything except letters, digits, ``$``, ``.``, ``,`` and whitespace
       becomes a blank; the text is then collapsed, stripped and lower-cased.
    4. Filing boilerplate (Roman numerals, "mda", "form x", "item 1a",
       "page 12", ...) and HTML-entity residue (``rsquo``, ``amp``, ...).
    5. Stand-alone full stops, then a final whitespace collapse and strip.

    Args:
        sentence: Raw text as a ``str``.

    Returns:
        The cleaned, lower-cased string.

    NOTE(review): the Roman-numeral pattern also deletes stand-alone letters
    that happen to be valid numerals (``i``, ``v``, ``x``, ``c``, ``d``,
    ``m``, ``l``), which includes the pronoun "I" once the text is
    lower-cased. Left unchanged here; confirm whether that is intended.
    """
    # remove formatting
    sentence = re.sub('\n', ' ', sentence)  # line break markers
    sentence = re.sub(r'&#[0-9]+;', ' ', sentence)  # numeric character references

    # remove phrases that would otherwise be false positives downstream
    sentence = re.sub('covenant skills', '', sentence)
    sentence = re.sub('customer covenant', '', sentence)
    # conversational "modal + verb" fillers such as "we'll see" / "would like to"
    sentence = re.sub(r"\b(?:'ll|we'll|will|may|should|shouldn't|can|can't|would|wouldn't|can also|may also|will also|should also) \b(?:increase|decrease|step down|step up|see|say|mention|recall|note|add|talk|like to)", '', sentence)
    # Only the whole capitalised word: without the boundaries "Maybe" became "be".
    sentence = re.sub(r'\bMay\b', '', sentence)

    # remove capitalization, punctuations (dont remove numbers, dollar signs, full stops, commas)
    sentence = re.sub(r"[^A-Za-z0-9$.,\s]", ' ', sentence)
    sentence = re.sub(' +', ' ', sentence)
    sentence = sentence.strip()
    sentence = sentence.lower()

    # additional cleaning
    # Roman numerals (pattern kept exactly as before; see NOTE in the docstring)
    sentence = re.sub(r"\b(?=[mdclxvii])m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})([ii]x|[ii]v|v?[ii]{0,3})\b\.?", '', sentence)
    sentence = re.sub(r'(mda|md a)', '', sentence)  # short form
    # form number; the leading \b keeps words ending in "form" intact
    # ("perform well" used to collapse into "perell").
    sentence = re.sub(r'\bform\s\w{0,1}', '', sentence)
    sentence = re.sub('table of contents', '', sentence)  # table of contents
    sentence = re.sub(r'(item|i tem)\s{0,1}[0-9]*[a-z]{0,1}', '', sentence)  # header
    sentence = re.sub('(year|years) ended', '', sentence)
    sentence = re.sub(r'page\s{0,1}[0-9]*', '', sentence)
    # HTML-entity residue left behind once '&' and ';' were blanked out above
    sentence = re.sub('rsquo', '', sentence)
    # Whole token only: a bare 'amp' also ate the middle of "example", "sample", ...
    sentence = re.sub(r'\bamp\b', '', sentence)
    sentence = re.sub('rdquo', '', sentence)
    sentence = re.sub('ldquo', '', sentence)

    # remove hanging characters
    sentence = re.sub(r'(?<!\w)\.(?!\w)', ' ', sentence)  # remove hanging .
    sentence = re.sub(' +', ' ', sentence)

    # The removals above can leave a blank at either end of the string.
    return sentence.strip()

In [None]:
# Module to extract covenant / forward-looking ("covfuture") indicators.
# input: text as string
# output: dictionary of relevant indicators

# spaCy pipeline; parse_text uses it for sentence segmentation, the
# dependency labels and the part-of-speech tags of each sentence root.
nlp = spacy.load("en_core_web_sm")

# ADD A NEW RULE TO THE PIPELINE
# Token texts after which a new (sub-)sentence is forced to start:
# punctuation marks plus clause-introducing conjunctions.
# A comma was missing after "whereas", so Python silently concatenated it with
# the next literal into "whereasso that" and "whereas" never matched.
# NOTE(review): entries are compared against single token texts, so the
# two-word entry "so that" presumably never matches on its own; the split
# already happens at "so". Kept for backward compatibility.
subsentence_id = [",", ".", "!", "?", ";",
                  "or", "after", "because", "but",
                  "so", "when", "where", "while",
                  "although", "however", "though", "whereas",
                  "so that", "despite"]

@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token after every ``subsentence_id`` entry as a sentence start.

    Runs as a spaCy pipeline component: it receives the ``doc``, flags
    ``is_sent_start`` on the relevant tokens in place, and returns the same
    ``doc``.
    """
    # Walk adjacent token pairs; the last token has no successor to flag.
    for current, following in zip(doc, doc[1:]):
        if current.text in subsentence_id:
            following.is_sent_start = True
    return doc


# Insert the component ahead of the parser in the pipeline.
nlp.add_pipe("set_custom_boundaries", before="parser")

def parse_text(text):
    """Summarise the covenant mentions in ``text`` as one dictionary of totals.

    The text is cleaned with ``clean_sentence``, split into (sub-)sentences by
    the spaCy pipeline, and every sentence is classified from three signals:
    whether it contains a covenant keyword, whether it contains an
    expectation keyword, and the tense of its root verb. The tense is used
    only for the classification and is not part of the output.

    Args:
        text: Raw passage as a string.

    Returns:
        dict of per-sentence values summed over all sentences:

        - ``query_covenant``: sentences containing a covenant keyword.
        - ``query_cov_past`` / ``query_cov_act`` / ``query_cov_fut``:
          covenant sentences classified as past / current / forward-looking.
        - ``query_cov_fut_tense``: forward-looking sentences identified by a
          modal auxiliary on the root verb.
        - ``expect_keywords``: matched expectation keywords (the per-sentence
          lists are added together, i.e. concatenated).
        - ``covmentions_fut`` / ``covmentions_past`` / ``covmentions_act``:
          the sentence texts in each class (concatenated lists).
        - ``word_count`` / ``expect_count``: totals over all sentences.
        - ``fut_sentences_word_count`` / ``fut_sentences_expect_count``:
          the same totals restricted to forward-looking covenant sentences.

        When the cleaned text yields no sentence at all, only the two
        ``fut_sentences_*`` keys are present (both 0).
    """
    doc_parse = nlp(clean_sentence(text))

    # Both patterns are loop-invariant: build them once per call instead of
    # once per sentence.
    cov_pattern = re.compile(r"\b(?:covenant|convenant)")
    expect_pattern = re.compile(r"\b(" + "|".join(expect_keywords_all) + r")")

    # Penn Treebank tags used for the tense heuristics.
    past_tags = {"VBD", "VBN"}
    present_tags = {"VB", "VBG", "VBP", "VBZ"}
    nonpresent_tags = {"VBD", "VBN", "MD"}

    out = []
    for sent in doc_parse.sents:
        # 1) Words in the sentence, ignoring punctuation and whitespace tokens
        word_count = sum(
            1 for token in sent if not token.is_punct and not token.is_space
        )

        # 2) Tense of the root verb, judged from its own tag and the tags of
        #    its auxiliaries (collected in a single pass over the children)
        root_tag = sent.root.tag_
        aux_tags = {
            child.tag_
            for child in sent.root.children
            if child.dep_ in ("aux", "auxpass")
        }
        root_is_present = root_tag in present_tags
        past_flag = root_tag in past_tags or bool(aux_tags & past_tags)
        present_flag = root_is_present and not (aux_tags & nonpresent_tags)
        future_flag = root_is_present and "MD" in aux_tags

        # 3) Covenant keywords (including the common "convenant" misspelling)
        covenant_flag = int(bool(cov_pattern.findall(sent.text)))

        # 4) Expectation keywords
        expect = expect_pattern.findall(sent.text)
        expect_count = len(expect)
        expect_flag = int(expect_count > 0)

        # 5) Classify the covenant mention. The branches are tried in order,
        #    so a past-tense sentence that also carries an expectation keyword
        #    ends up in no class at all (unchanged from the original logic).
        query_cov_fut = 0
        query_cov_act = 0
        query_cov_past = 0
        query_cov_fut_tense = 0
        if covenant_flag:
            if past_flag:
                if not expect_flag:
                    query_cov_past = 1
            elif present_flag:
                if expect_flag:
                    query_cov_fut = 1
                else:
                    query_cov_act = 1
            elif future_flag:
                query_cov_fut = 1
                query_cov_fut_tense = 1
            elif expect_flag:
                query_cov_fut = 1

        # 6) Sentence-level record
        out.append({
            "query_covenant": covenant_flag,
            "query_cov_past": query_cov_past,
            "query_cov_act": query_cov_act,
            "query_cov_fut": query_cov_fut,
            "query_cov_fut_tense": query_cov_fut_tense,
            "expect_keywords": expect,  # list of matched expectation words
            "covmentions_fut": [sent.text] if query_cov_fut else [],
            "covmentions_past": [sent.text] if query_cov_past else [],
            "covmentions_act": [sent.text] if query_cov_act else [],
            "word_count": word_count,
            "expect_count": expect_count,  # number of matched expectation words
        })

    # 7) Sum every column over the sentences (list columns concatenate)
    out_df = pd.DataFrame(out)
    out1 = out_df.sum().to_dict()

    # 8) Totals restricted to the forward-looking covenant sentences
    if out_df.empty:
        # No sentences at all: the frame has no columns to filter on.
        out1["fut_sentences_word_count"] = 0
        out1["fut_sentences_expect_count"] = 0
    else:
        fut_df = out_df[out_df["query_cov_fut"] == 1]
        out1["fut_sentences_word_count"] = int(fut_df["word_count"].sum())
        out1["fut_sentences_expect_count"] = int(fut_df["expect_count"].sum())

    return out1



In [None]:
# main program starts here

# read file
s3_client = boto3.client('s3')
file = "factset_calls_covmentions/covmentions_all.gzip"
obj = s3_client.get_object(Bucket=bucket, Key=file)
df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
df.rename({'covmentions_raw': 'covmentions_fullsent'}, axis=1, inplace=True)

# search for keywords
print('finding forward-looking sentences...')
query = df['covmentions_fullsent'].progress_apply(parse_text)
# One row of indicators per input row, in input order (fresh 0..n-1 index).
query = pd.DataFrame(list(query))
# DataFrame.join aligns on index labels. Join positionally and then restore
# the original index, so a parquet file that carries a non-default index
# cannot pair a transcript with another row's indicators.
df1 = df.reset_index(drop=True).join(query)
df1.index = df.index


In [None]:
# save intermediate output

output_prefix = f's3://{bucket}/output/'
output_key = 'output/factset_calls_covenant_mentions_4_3.gzip'
# Full S3 URI of the object written below (kept for reference/logging).
savepath = output_prefix + 'factset_calls_covenant_mentions_4_3.gzip'

# save dataframe
# `wr` was never imported in this notebook, so the previous call raised a
# NameError. Serialise in memory and upload with the boto3 client instead,
# using only modules the notebook already imports.
parquet_buffer = io.BytesIO()
df1.to_parquet(parquet_buffer, index=False)  # keep the DataFrame index out of the file
boto3.client('s3').put_object(
    Bucket=bucket,
    Key=output_key,
    Body=parquet_buffer.getvalue(),
)

In [None]:
# sample sentences
# Rows with at least one forward-looking covenant sentence.
sample_pool = df1[df1.query_cov_fut > 0]
# sample_pool = df1[df1.query_cov_act > 0]
# Cap the sample size: asking for 5 rows from a smaller pool raises ValueError.
test = sample_pool.sample(n=min(5, len(sample_pool))).reset_index(drop=True)
for full_sentence, fut_word_count, fut_expect_count in zip(
    test['covmentions_fullsent'],
    test['fut_sentences_word_count'],
    test['fut_sentences_expect_count'],
):
    print(full_sentence)
    print(fut_word_count)
    print(fut_expect_count)
    print('\n')

In [None]:
# save output (note: word count is number of words in sentences with covmentions)
output_columns = [
    'date', 'repid',
    'query_covenant', 'query_cov_fut', 'query_cov_act', 'query_cov_past',
    'query_cov_fut_tense',
    'fut_sentences_word_count', 'fut_sentences_expect_count', 'word_count',
]
# Rows whose text produced no sentences have missing indicators; report them as 0.
df_out = df1[output_columns].fillna(0)

df_out = df_out.rename(
    columns={
        'fut_sentences_word_count': 'query_cov_fut_wc',
        'fut_sentences_expect_count': 'query_expect_wc',
        'word_count': 'query_covenant_wc',
    }
)

output_prefix = f's3://{bucket}/output/'
output_key = 'output/factset_calls_covenant_mentions_4_3.txt'
# Full S3 URI of the object written below (kept for reference/logging).
savepath = output_prefix + 'factset_calls_covenant_mentions_4_3.txt'

# `wr` was never imported in this notebook, so the previous call raised a
# NameError. Build the pipe-delimited text in memory and upload it with the
# boto3 client instead.
csv_body = df_out.to_csv(sep='|')
boto3.client('s3').put_object(
    Bucket=bucket,
    Key=output_key,
    Body=csv_body.encode('utf-8'),
)