This notebook extracts sentences that mention covenants from earnings-call transcripts.

In [None]:
import pandas as pd
import boto3
import awswrangler as wr
from nltk.stem import PorterStemmer
from tqdm import tqdm
import warnings
import numpy as np
import io
from nltk import sent_tokenize, word_tokenize

# Porter stemmer instance; not referenced in the cells shown here
ps = PorterStemmer()
# S3 bucket used for both the input and the output parquet files (placeholder value)
bucket = '[bucket_name]'  
# silence every warning for the rest of the session
warnings.filterwarnings("ignore")
# register tqdm with pandas so that .progress_apply() is available
tqdm.pandas()

In [None]:
# function to extract covenant mentions
# input: document as a string
# output: sentences with covenant mentions, joined into a single string

def get_covmentions(doc_in, n_boilerplate=10):
    """Return the sentences of a call transcript that mention covenants.

    The document is split into sentences with ``sent_tokenize``.  When it
    has more than ``n_boilerplate`` sentences, the first ``n_boilerplate``
    are dropped as boilerplate.  Sentences containing ``covenant`` (or the
    variant spelling ``convenant``) are kept.  Matching is case-insensitive
    so that capitalised mentions, e.g. a sentence starting with
    "Covenant", are not missed.

    Args:
        doc_in: content of the call as a single string.
        n_boilerplate: non-negative number of leading sentences to skip.
            Documents with at most this many sentences are searched in
            full.

    Returns:
        The matching sentences joined by single spaces, or ``''`` when no
        sentence matches.
    """
    # tokenize document
    sentences = sent_tokenize(doc_in)

    # remove boilerplate (short documents are kept whole)
    if len(sentences) > n_boilerplate:
        sentences = sentences[n_boilerplate:]

    # nothing to search; also avoids using .str on an empty Series
    if not sentences:
        return ''

    regex = r'covenant|convenant'
    sentence_series = pd.Series(sentences)
    mask = sentence_series.str.contains(regex, case=False, regex=True)
    return ' '.join(sentence_series[mask])

In [None]:
# Build df_all: one row per call, with the covenant-mention sentences of
# every year in the requested range.
#
# Frames are collected in a list and concatenated once after the loop.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside the loop re-copies all previously loaded rows each time.
df_all = pd.DataFrame()

yearstart = int(input('start year: '))
yearend = int(input('end year: '))
yearlist = range(yearstart, yearend + 1)

# one S3 client is reused for every download
s3_client = boto3.client('s3')
yearly_frames = []

# years are processed from most recent to oldest
for yr in reversed(yearlist):

    # read in earnings calls data
    print(f'loading data from year {yr}')

    # read file (loaded with read_parquet despite the .gzip key suffix)
    file = f"factset_calls/raw_{yr}.gzip"
    obj = s3_client.get_object(Bucket=bucket, Key=file)
    df = pd.read_parquet(io.BytesIO(obj['Body'].read()))

    # combine mda and qa section (column-wise string concatenation)
    print('combining mda and qa...')
    raw_text = df['mda'] + df['qa']
    df.drop(['mda', 'qa'], axis=1, inplace=True)
    df['raw_text'] = raw_text

    # extract covmentions
    print('extracting covenent mentions...')
    df['covmentions_raw'] = df.raw_text.progress_apply(get_covmentions)
    # the full transcript text is not kept in the output
    df.drop(['raw_text'], axis=1, inplace=True)

    # keep this year's frame for the final concatenation
    yearly_frames.append(df)

# an empty year range leaves df_all as the empty frame created above
if yearly_frames:
    df_all = pd.concat(yearly_frames, ignore_index=True)
    

In [None]:
# write the combined covenant-mention frame to S3 as parquet

output_prefix = f's3://{bucket}/factset_calls_covmentions/'
savepath = f'{output_prefix}covmentions_all_v6.gzip'
wr.s3.to_parquet(df=df_all, path=savepath)