Identify mentions of covenants in the MD&A sections extracted via the SEC API

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
from nltk import sent_tokenize, word_tokenize
import warnings
import re
from joblib import Parallel, delayed
import multiprocessing
import boto3
import awswrangler as wr
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import io 
import random

# silence every warning for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')
# register tqdm's progress-bar helpers on pandas objects
tqdm.pandas()
# Porter stemmer instance; not referenced anywhere in the visible part of this file
ps = PorterStemmer()

#setup

# S3 resource handle; the visible code below creates its own boto3 clients instead
s3_resource = boto3.resource('s3')

# directories
# S3 bucket used for both the input text files and the outputs written below
bucket = 'sagemaker-us-east-2-269018301143'

In [None]:
def simple_clean(sentence):
    """Return `sentence` with line breaks and numeric character ids blanked out.

    Three regex substitutions are applied in order:
      1. every newline becomes a single space,
      2. every numeric character id of the form ``&#123;`` becomes a single space,
      3. every run of spaces is collapsed to one space.

    Only the space character is collapsed; tabs and other whitespace are left
    untouched, and leading/trailing spaces are not stripped.
    """
    # (pattern, replacement) pairs; order matters because the last step
    # collapses the spaces introduced by the first two
    substitutions = (
        ('\n', ' '),
        ('&#[0-9]+;', ' '),
        (' +', ' '),
    )

    cleaned = sentence
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

In [None]:
# get sentences around covenant mentions / loan agreement amendments etc
def parse_text(sentences, filing_date, window=5):
    """Collect the sentences surrounding every covenant mention.

    A sentence counts as a mention when it contains the lowercase string
    "covenant" starting at a word boundary (so "covenants" matches as well).
    For each mention, the sentence itself plus up to `window` sentences before
    and `window` sentences after it are kept. Overlapping windows are merged,
    so each sentence appears at most once, in document order.

    Parameters
    ----------
    sentences : sequence of str
        Sentences of one document, in document order.
    filing_date : any
        Accepted for interface compatibility; not used by this function.
    window : int, default 5
        Number of sentences kept on each side of a mention (non-negative).

    Returns
    -------
    dict
        ``{'cov_full_text': <kept sentences joined by single spaces>}``;
        the value is an empty string when nothing matches.
    """
    # NOTE(review): the match is case-sensitive, so a capitalized "Covenant"
    # (e.g. at the start of a sentence or in a heading) is not detected --
    # confirm this is intended before changing it.
    covenant_pattern = re.compile(r'\b(?:covenant)')

    sentences = list(sentences)
    n_sentences = len(sentences)

    # positions of all sentences to keep; a set merges overlapping windows
    keep = set()
    for position, sentence in enumerate(sentences):
        if covenant_pattern.search(sentence):
            first = max(0, position - window)
            last = min(n_sentences, position + window + 1)  # exclusive bound
            keep.update(range(first, last))

    cov_full_text = ' '.join(sentences[i] for i in sorted(keep))

    return {'cov_full_text': cov_full_text}

In [None]:
# get raw text from input filepath

def read_text(file, header=8):
    """Read one text file from S3 and return its filing info and covenant text.

    Parameters
    ----------
    file : str
        Object key inside the module-level `bucket`.
    header : int, default 8
        Number of leading lines to drop before the body text starts.

    Returns
    -------
    dict
        Keys ``master_idx``, ``cik``, ``company_name``, ``filing_type``,
        ``filing_date``, ``report_date``, ``filing_index`` (taken from the
        tagged lines at the top of the file), ``valid_text`` (1/0 flag) and
        ``cov_full_text`` (output of `parse_text`).

    Raises
    ------
    AttributeError
        If one of the first eight lines does not contain its expected tag pair.
    IndexError
        If the file has fewer than eight lines.
    """
    # download the object and split it into lines
    client = boto3.client('s3')
    response = client.get_object(Bucket=bucket, Key=file)
    lines = response['Body'].read().decode('utf-8').splitlines()

    # one tagged field per line, in this fixed order starting at line 0
    header_patterns = (
        ('master_idx', r'(?<=<master idx>)(.*?)(?=</master idx>)'),
        ('cik', r'(?<=<cik>)(.*?)(?=</cik>)'),
        ('company_name', r'(?<=<company name>)(.*?)(?=</company name>)'),
        ('filing_type', r'(?<=<filing type>)(.*?)(?=</filing type>)'),
        ('filing_date', r'(?<=<filing date>)(.*?)(?=</filing date>)'),
        ('report_date', r'(?<=<report date>)(.*?)(?=</report date>)'),
        ('filing_index', r'(?<=<filing index>)(.*?)(?=</filing index>)'),
        ('filing_url', r'(?<=<filing url>)(.*?)(?=</filing url>)'),
    )
    info = {}
    for line_no, (field, pattern) in enumerate(header_patterns):
        info[field] = re.search(pattern, lines[line_no]).group(0)

    # everything after the header lines is the body
    body_lines = lines[header:]

    # a file is flagged invalid when line index 9 starts with a known stray
    # passage (old text mixed in) or when the body has fewer than 5 lines
    has_stale_text = (
        len(lines) >= 10
        and re.search(r'(^Our operating results may fluctuate significantly)', lines[9]) is not None
    )
    valid_flag = 0 if (has_stale_text or len(body_lines) < 5) else 1

    # single string -> sentences
    text = ' '.join(body_lines)
    text = re.sub(r'(?<=No)\.(?!\w)', '', text)  # dont tokenize "No. 1"
    cleaned_sentences = [simple_clean(piece) for piece in sent_tokenize(text)]

    # keep only the text around covenant mentions
    parsed = parse_text(cleaned_sentences, filing_date=info['filing_date'])

    # the filing url is parsed above but is not part of the returned record
    record = {field: value for field, value in info.items() if field != 'filing_url'}
    record['valid_text'] = valid_flag
    record['cov_full_text'] = parsed['cov_full_text']
    return record

In [None]:
## read file paths, parse every file, and write one parquet file per year

yearstart = int(input('start year: '))
yearend = int(input('end year: '))
yrlist = range(yearstart,yearend+1)
qtrlist = range(1,5)
# qtrlist = range(1,2)

# one client/paginator is enough for all listings (previously rebuilt every quarter)
s3_client = boto3.client('s3')
paginator = s3_client.get_paginator('list_objects_v2')
output_prefix = f's3://{bucket}/edgar_mda_new_2_covmentions_june2024/'

for yr in reversed(yrlist):

    # per-quarter results, combined once per year
    quarter_frames = []

    for qtr in qtrlist:

        print(f'parsing text for year {yr} qtr {qtr} \n')

        prefix = f"edgar_mda_new_2/{yr}/QTR{qtr}/"
        pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
        # a listing page has no 'Contents' key when nothing matches the prefix,
        # so fall back to an empty list instead of raising KeyError
        filepath = [
            obj['Key']
            for page in pages
            for obj in page.get('Contents', [])
            if '.txt' in obj['Key']
        ]

        # one dict per file (filing info + covenant text), parsed in parallel
        df = Parallel(n_jobs=multiprocessing.cpu_count(), batch_size=32) \
                (delayed(read_text)(file, header=8) for file in tqdm(filepath))
        df = pd.DataFrame(df)

        quarter_frames.append(df)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    df_all = pd.concat(quarter_frames, ignore_index=True)

    savepath = output_prefix + f'{yr}_covmentions.gzip'
    wr.s3.to_parquet(
        df=df_all,
        path=savepath,
    )

## Combine files and extract random sample for hand-labeling
Note: this portion should be commented out when running the code above 

In [None]:

bucket = 'sagemaker-us-east-2-269018301143'

# years 2000 through 2021 (the range end is exclusive)
yearlist = range(2000,2022)

# one client for all downloads
s3_client = boto3.client('s3')

# collect the yearly frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies all previously read rows on every iteration
yearly_frames = []

for yr in reversed(yearlist):

    print(f'reading file from year {yr}')

    # read file
    file = f"edgar_mda_new_2_covmentions_june2024/{yr}_covmentions.gzip"
    obj = s3_client.get_object(Bucket=bucket,Key=file)
    df = pd.read_parquet(io.BytesIO(obj['Body'].read()))

    yearly_frames.append(df)

df_all = pd.concat(yearly_frames, ignore_index=True)


In [None]:
# attach the parsed covenant text to the full master index list

# load the master index (pipe-delimited csv)
loadpath = f's3://{bucket}/misc/edgar_masterhtml1_combined.csv'
indexdf = wr.s3.read_csv(path=loadpath, sep='|', lineterminator='\n')

# the unnamed first csv column is the master index id
indexdf = indexdf.rename(columns={'Unnamed: 0': 'master_idx'})

# yq = first day of the calendar quarter of the filing date
filing_dates = pd.to_datetime(indexdf.filing_date, format='%Y-%m-%d')
indexdf['yq'] = pd.PeriodIndex(filing_dates, freq='Q').to_timestamp()

# drop the columns that df_all also carries, so the merge does not duplicate them
indexdf = indexdf.drop(
    columns=['cik', 'company_name', 'filing_type', 'filing_date', 'report_date']
)

# mark every row coming from the parsed files and make the merge key an integer
df_all = df_all.assign(
    parsed=1,
    master_idx=df_all['master_idx'].astype(int),
)

# left merge keeps every index row; rows without parsed text get parsed = 0
dfcombine = pd.merge(indexdf, df_all, how='left', on='master_idx')
dfcombine['parsed'] = dfcombine['parsed'].fillna(0)
dfcombine.head()

In [None]:
# attach the covenant violation indicator to the combined frame

# download the pipe-delimited indicator file as text
file = "output/edgar_mda_new_violations_2_postsubmit.txt"
s3_client = boto3.client('s3')
payload = s3_client.get_object(Bucket=bucket, Key=file)['Body'].read().decode('utf-8')

# re-join the lines with '\n' so the parser sees uniform line endings
normalized_text = '\n'.join(payload.splitlines())
cov_viol_ind = pd.read_csv(io.StringIO(normalized_text), sep='|')
cov_viol_ind = cov_viol_ind.drop(columns=['Unnamed: 0'])

# validate='1:1' makes the merge fail if master_idx is duplicated on either side
indicator_columns = cov_viol_ind[['master_idx', 'cov_viol_ind']]
dfcombine = dfcombine.merge(
    indicator_columns,
    on=['master_idx'],
    how='left',
    validate='1:1',
)

In [None]:
# share of master-index rows with parsed text, per filing quarter
parsed_share_by_quarter = dfcombine.groupby(['yq'])['parsed'].mean()
parsed_share_by_quarter.plot()

In [None]:
# write the merged full sample to S3 as a parquet file
# (the object name ends in ".gzip", but no compression argument is passed here)

output_prefix = f's3://{bucket}/edgar_mda_new_2_covmentions_june2024/'
savepath = f'{output_prefix}covmentions_fullsample.gzip'
wr.s3.to_parquet(df=dfcombine, path=savepath)

In [None]:
# extract a random sample of firms (CIKs) for hand-labeling

# eligibility: at least `min_obs` rows in dfcombine AND at least one row
# flagged with a covenant violation
min_obs = 20
n_firms = 20

cik_with_violation = dfcombine['cik'][dfcombine['cov_viol_ind']==1].unique()

cik_counts = dfcombine['cik'].value_counts()
cik_list = cik_counts[cik_counts>=min_obs].index
cik_list = set(cik_list).intersection(set(cik_with_violation))

# Sort before sampling: the iteration order of a set of strings can differ
# between interpreter sessions (hash randomization), so sampling from
# list(set) is not reproducible even with a fixed seed.
# NOTE(review): this makes the draw stable from now on, but it may differ from
# samples produced by earlier runs.
cik_pool = sorted(cik_list)

random.seed(42)
# cap the sample size so a small eligible pool does not raise ValueError
cik_sample = random.sample(cik_pool, min(n_firms, len(cik_pool)))

# filter for sample
df = dfcombine[dfcombine['cik'].isin(cik_sample)]
df = df.sort_values(by=['cik','filing_date']).reset_index(drop=True)

# save as excel (the object name says "sample100"; the sample holds up to n_firms firms)
output_prefix = f's3://{bucket}/edgar_mda_new_2_covmentions_june2024/'
savepath = output_prefix + f'covmentions_sample100.xlsx'
wr.s3.to_excel(
    df=df,
    path=savepath,
    index=False
)
