Extract query_cov_fut from SEC MD&A filings

In [None]:
import pandas as pd
import multiprocessing
import boto3
import awswrangler as wr
from nltk.stem import PorterStemmer
import re
import spacy
from spacy.language import Language
from tqdm import tqdm
import warnings
from joblib import Parallel, delayed
import multiprocessing
import numpy as np
import io

# Porter stemmer instance. NOTE(review): not referenced by any of the visible
# cells -- confirm it is still needed before removing.
ps = PorterStemmer()
# S3 bucket used both for reading the yearly input files and for writing the
# final output file.
bucket = 'sagemaker-us-east-2-269018301143'
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")
# Register tqdm's progress-aware helpers (e.g. progress_apply) on pandas objects.
tqdm.pandas()

In [None]:
# incorporate expect keywords
#
# Expectation / forward-looking keyword list. It mixes stemmed fragments
# (e.g. 'potenti', 'anticip') with full word forms (e.g. 'believes') and
# contains repeated entries. parse_text joins the list into a single regex
# alternation that is anchored only by a leading word boundary, so every entry
# matches as a word *prefix* and earlier entries take precedence over later
# ones at the same position.

expect_keywords_all = [
    'may becom','hope','outlook','go to','tailwind','work toward','trend','is like to','may depend','may not','forse','would','seek to','ought','potenti','could depend','unknown','remain confid','shortterm','depend','endeavor','intend','abl to remain','feel','may result','project','expect to','possibl','like will result','goal','may affect','go forward','belief','consid','estim will','contempl','suggest','pursu','call for','appear','well posit to','think','with a view to','appear to','up to','short term','prioriti','hypothes','can have','indic','may impact','schedul','envis','believ','could','look forward','pro forma','drive','uncertain','explor','could be','look forward to','see','prospect','upsid','may','should','is like','risk','improv','longterm','like','uncertainti','tent','forese','predict','would be','headwind','view','move toward','aim','estim','on target','pend','probabl','could potenti','might','may be','are like','pipelin','do not expect','may continu','seek','will','shall','not expect','will like result','futur','unanticip','guidanc','look ahead','likelihood','like to','full year guidanc','anticip','confid','opportun','propos','on pace','plan','schedul to','preliminari','will like','will like be','do not anticip','expect','presum','express confid','can be','opportunity','plans','believes','could potentially','is likely to','drive','predicting','may affect','may continue','uncertain','expect','headwind','would be','shall','depend','expressed confidence','projects','aims','looking forward','scheduled to','think','hopefully','on target','presume','seek to','view','looks forward','expects','belief','pending','may not','suggests','moving toward','depends','believe','goals','trend','do not expect','appear to'
]

In [None]:
# basic data cleaning. returns the cleaned, lower-cased sentence as a string

def clean_sentence(sentence):
    """Normalise one raw filing sentence before keyword matching.

    The steps run in a fixed order because later patterns rely on earlier
    ones (e.g. HTML entity names only become standalone words once the
    surrounding ``&`` and ``;`` have been turned into spaces):

    1. Drop line breaks and numeric character references (``&#123;``).
    2. Drop a few phrases that would otherwise produce false keyword hits
       ("covenant skills", "customer covenant", modal + reporting verb such
       as "may increase", and the month name "May").
    3. Replace every character except letters, digits, ``$``, ``.``, ``,``
       and whitespace with a space, collapse spaces, strip and lower-case.
    4. Drop roman numerals, filing boilerplate (MD&A abbreviations, form and
       item headers, "table of contents", "year(s) ended", page markers) and
       left-over HTML entity names.
    5. Drop full stops that do not touch a word character and collapse
       spaces once more.

    :param sentence: Raw sentence text (must be a ``str``).
    :return: The cleaned sentence as a lower-cased string.

    NOTE(review): the "form", "item", "page" and "mda" patterns are not
    anchored on word boundaries, so they can also remove those letters from
    inside longer words. They are left unchanged here -- confirm whether
    that is intended.
    """
    # remove formatting
    sentence = re.sub(r'\n', ' ', sentence)  # line break markers
    sentence = re.sub(r'&#[0-9]+;', ' ', sentence)  # numeric character references

    # remove phrases that would trigger false covenant / expectation hits
    sentence = re.sub('covenant skills', '', sentence)
    sentence = re.sub('customer covenant', '', sentence)
    sentence = re.sub(r"\b(?:'ll|we'll|will|may|should|shouldn't|can|can't|would|wouldn't|can also|may also|will also|should also) \b(?:increase|decrease|step down|step up|see|say|mention|recall|note|add|talk|like to)", '', sentence)
    # Month name only: anchored on word boundaries so that words which merely
    # start with "May" (e.g. "Mayor") are left intact. This must run before
    # lower-casing, otherwise the month would be confused with the modal "may".
    sentence = re.sub(r'\bMay\b', '', sentence)

    # remove punctuation (keep numbers, dollar signs, full stops, commas), then lower-case
    sentence = re.sub(r"[^A-Za-z0-9$.,\s]", ' ', sentence)
    sentence = re.sub(' +', ' ', sentence)
    sentence = sentence.strip()
    sentence = sentence.lower()

    # additional cleaning
    # roman numerals (optionally followed by a full stop)
    sentence = re.sub(r"\b(?=[mdclxvii])m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})([ii]x|[ii]v|v?[ii]{0,3})\b\.?", '', sentence)
    sentence = re.sub(r'(mda|md a)', '', sentence)  # short form
    sentence = re.sub(r'form\s\w{0,1}', '', sentence)  # form number
    sentence = re.sub('table of contents', '', sentence)  # table of contents
    sentence = re.sub(r'(item|i tem)\s{0,1}[0-9]*[a-z]{0,1}', '', sentence)  # header
    sentence = re.sub('(year|years) ended', '', sentence)
    sentence = re.sub(r'page\s{0,1}[0-9]*', '', sentence)
    # Left-over HTML entity names (&rsquo; &amp; &rdquo; &ldquo;). By this
    # point the "&" and ";" are spaces, so the names are standalone words;
    # matching on word boundaries keeps words such as "example" intact.
    sentence = re.sub(r'\b(?:rsquo|rdquo|ldquo|amp)\b', '', sentence)

    # remove hanging full stops (not adjacent to any word character)
    sentence = re.sub(r'(?<!\w)\.(?!\w)', ' ', sentence)
    sentence = re.sub(' +', ' ', sentence)

    return sentence

In [None]:
# module to get covfuture indicators
# input: text as string
# output: dictionary of relevant indicators

nlp = spacy.load("en_core_web_sm")

# ADD A NEW RULE TO THE PIPELINE
# Tokens that close a sub-sentence: the token that follows any of them is
# forced to start a new sentence.
# NOTE: every entry needs its own trailing comma. Without the comma after
# "whereas", Python concatenates the adjacent literals into "whereasso that"
# and "whereas" silently drops out of the list.
# NOTE(review): "so that" spans two tokens, so it can never equal a single
# token's text; "so" on its own already triggers a boundary.
subsentence_id = [",", ".", "!", "?", ";",
                  "or", "after", "because", "but",
                  "so", "when", "where", "while",
                  "although", "however", "though", "whereas",
                  "so that", "despite"]

# Set view of the list above for constant-time membership tests per token.
_SUBSENTENCE_TOKENS = frozenset(subsentence_id)


@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token after each sub-sentence delimiter as a sentence start.

    The match is an exact, case-sensitive comparison of ``token.text``
    against ``subsentence_id``. The last token is skipped because there is
    no following token to mark.

    :param doc: The spaCy ``Doc`` being processed.
    :return: The same ``Doc`` with ``is_sent_start`` set on the affected tokens.
    """
    for token in doc[:-1]:
        if token.text in _SUBSENTENCE_TOKENS:
            doc[token.i + 1].is_sent_start = True
    return doc


# Run before the parser so the parser respects the boundaries set above.
nlp.add_pipe("set_custom_boundaries", before="parser")

# Dependency labels and POS tags used for the tense heuristics below.
_AUX_DEPS = ("aux", "auxpass")
_PAST_TAGS = ("VBD", "VBN")
_PRESENT_TAGS = ("VB", "VBG", "VBP", "VBZ")
_NONPRESENT_TAGS = ("VBD", "VBN", "MD")
_MODAL_TAGS = ("MD",)

# Covenant mention (including the common "convenant" misspelling); case-sensitive.
_COVENANT_RE = re.compile(r'\b(?:covenant|convenant)')


def _zero_indicators():
    """Return a fresh all-zero indicator dictionary (nothing to score)."""
    return {
        "query_covenant": 0,
        "query_cov_past": 0,
        "query_cov_act": 0,
        "query_cov_fut": 0,
        "query_cov_fut_tense": 0,
        "word_count": 0,
        "expect_count": 0,
        "fut_sentences_word_count": 0,
        "fut_sentences_expect_count": 0,
    }


def _has_aux_with_tag(root, tags):
    """Tell whether the sentence root has an auxiliary child tagged with one of *tags*."""
    return any(
        child.dep_ in _AUX_DEPS and child.tag_ in tags
        for child in root.children
    )


def parse_text(text):
    """Score a text for covenant mentions by tense and expectation wording.

    The text is split into (sub-)sentences by the module-level ``nlp``
    pipeline. Each sentence gets 0/1 flags which are then summed over the
    whole text.

    Sentence-level rules (only sentences that mention a covenant get a label):

    - past tense and no expectation keyword      -> ``query_cov_past``
    - past tense with an expectation keyword      -> no label
    - present tense with an expectation keyword   -> ``query_cov_fut``
    - present tense without one                   -> ``query_cov_act``
    - modal auxiliary on a present-form root      -> ``query_cov_fut`` and
      ``query_cov_fut_tense``
    - no tense detected but an expectation keyword -> ``query_cov_fut``

    :param text: Text to parse. Anything that is not a non-blank ``str``
                 (e.g. ``None`` or NaN) yields the all-zero result.
    :return: A dictionary with the summed indicators ``query_covenant``,
             ``query_cov_past``, ``query_cov_act``, ``query_cov_fut``,
             ``query_cov_fut_tense``, ``word_count``, ``expect_count`` plus
             ``fut_sentences_word_count`` and ``fut_sentences_expect_count``
             (word / keyword counts restricted to sentences labelled
             ``query_cov_fut``). When at least one sentence was parsed the
             dictionary additionally carries the concatenated lists
             ``expect_keywords``, ``covmentions_fut``, ``covmentions_past``
             and ``covmentions_act``.
    """
    # Guard clause: nothing to parse for non-strings and blank strings.
    if not isinstance(text, str) or not text.strip():
        return _zero_indicators()

    # Build the expectation pattern once per call rather than once per sentence.
    expect_pattern = re.compile(r'\b(' + '|'.join(expect_keywords_all) + r')')

    doc_parse = nlp(text.strip())

    rows = []  # one dictionary per sentence
    for sent in doc_parse.sents:
        root = sent.root
        sentence_text = sent.text

        # Word count ignores punctuation and whitespace tokens.
        word_count = sum(
            1 for token in sent if not token.is_punct and not token.is_space
        )

        # Tense flags, derived from the root verb and its auxiliaries.
        root_is_present_form = root.tag_ in _PRESENT_TAGS
        past_flag = int(
            root.tag_ in _PAST_TAGS or _has_aux_with_tag(root, _PAST_TAGS)
        )
        present_flag = int(
            root_is_present_form
            and not _has_aux_with_tag(root, _NONPRESENT_TAGS)
        )
        future_flag = int(
            root_is_present_form and _has_aux_with_tag(root, _MODAL_TAGS)
        )

        # Keyword flags.
        covenant_flag = int(_COVENANT_RE.search(sentence_text) is not None)
        expect = expect_pattern.findall(sentence_text)
        expect_count = len(expect)
        expect_flag = int(expect_count > 0)

        # covmention labels; the branch order (past, present, future, other)
        # decides which label wins when several tense flags are set.
        query_cov_fut = 0
        query_cov_act = 0
        query_cov_past = 0
        query_cov_fut_tense = 0

        if covenant_flag:
            if past_flag:
                if not expect_flag:
                    query_cov_past = 1
            elif present_flag:
                if expect_flag:
                    query_cov_fut = 1
                else:
                    query_cov_act = 1
            elif future_flag:
                query_cov_fut = 1
                query_cov_fut_tense = 1
            elif expect_flag:
                query_cov_fut = 1

        rows.append({
            "query_covenant": covenant_flag,
            "query_cov_past": query_cov_past,
            "query_cov_act": query_cov_act,
            "query_cov_fut": query_cov_fut,
            "query_cov_fut_tense": query_cov_fut_tense,
            # Matched keywords / sentence text are kept only for labelled sentences.
            "expect_keywords": list(expect) if query_cov_fut == 1 else [],
            "covmentions_fut": [sentence_text] if query_cov_fut == 1 else [],
            "covmentions_past": [sentence_text] if query_cov_past == 1 else [],
            "covmentions_act": [sentence_text] if query_cov_act == 1 else [],
            "word_count": word_count,
            "expect_count": expect_count,
        })

    out_df = pd.DataFrame(rows)
    if out_df.empty:
        return _zero_indicators()

    # Column-wise sum: numeric flags add up, list columns are concatenated.
    out1 = out_df.sum().to_dict()

    # Word / keyword counts restricted to the forward-looking sentences.
    future_df = out_df[out_df["query_cov_fut"] == 1]
    out1["fut_sentences_word_count"] = future_df["word_count"].sum() if not future_df.empty else 0
    out1["fut_sentences_expect_count"] = future_df["expect_count"].sum() if not future_df.empty else 0

    return out1


In [None]:
# main program starts here
#
# For every filing year: download that year's covenant-mention file from S3,
# score each text with parse_text in parallel, attach the scores to the
# original rows, and finally stack all years into df_all.

yrlist = range(2001,2022)
# yrlist = [2020]

# A single client serves every year; there is no need to rebuild it per iteration.
s3_client = boto3.client('s3')

# Per-year results are collected here and concatenated once at the end.
# (DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call, which is quadratic inside a loop.)
yearly_frames = []
for yr in yrlist:

    print(f'parsing text for year {yr}  \n')

    # read file
    file = f"edgar_mda_new_2_covmentions_june2024/{yr}_covmentions.gzip"
    obj = s3_client.get_object(Bucket=bucket,Key=file)
    df = pd.read_parquet(io.BytesIO(obj['Body'].read()))

    # search for keywords
    print('finding forward-looking sentences...')

    query = Parallel(n_jobs=multiprocessing.cpu_count(), batch_size=32)(
        delayed(parse_text)(text) for text in tqdm(df['cov_full_text'])
    )
    query = pd.DataFrame(query)

    # join
    # `query` is indexed 0..n-1 in row order, and join aligns on index. Reset
    # the input index first so rows line up positionally even if the parquet
    # file carries a non-default index (the index was discarded on stacking
    # anyway).
    df1 = df.reset_index(drop=True).join(query)
    yearly_frames.append(df1)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_all = pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()


In [None]:
# save output
# Keep the identifier columns plus the aggregated indicators, replace missing
# values with 0, rename the indicators to their "_sec" output names and write
# the result to S3 as a pipe-delimited text file.
output_columns = [
    'master_idx', 'cik', 'company_name', 'filing_type', 'filing_date', 'report_date',
    'query_covenant', 'query_cov_fut', 'query_cov_act', 'query_cov_past',
    'query_cov_fut_tense', 'fut_sentences_word_count', 'fut_sentences_expect_count',
    'word_count',
]

# Source column name -> name used in the output file.
output_names = {
    'query_covenant': 'query_cov_sec',
    'query_cov_fut': 'query_covfut_sec',
    'query_cov_act': 'query_cov_act_sec',
    'query_cov_past': 'query_cov_past_sec',
    'query_cov_fut_tense': 'query_cov_fut_tense_sec',
    'fut_sentences_word_count': 'query_cov_fut_wc',
    'fut_sentences_expect_count': 'query_expect_wc',
    'word_count': 'query_covenant_wc',
}

df_out = df_all[output_columns].fillna(0).rename(columns=output_names)

output_prefix = f's3://{bucket}/output/'
savepath = output_prefix + 'sec_api_mda_covenant_mentions_june2024_update.txt'
wr.s3.to_csv(df=df_out, path=savepath, sep='|')