Identify covenant violations in the MD&A sections extracted from the SEC API

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
from nltk import sent_tokenize, word_tokenize
import warnings
import re
from joblib import Parallel, delayed
import multiprocessing
import boto3
import awswrangler as wr
from nltk.stem import PorterStemmer
from datetime import datetime
from dateutil.relativedelta import relativedelta
import numpy as np
import io

# Session-wide setup shared by the cells below.
warnings.filterwarnings('ignore')  # suppress every warning category for the session
tqdm.pandas()  # hook tqdm into pandas
ps = PorterStemmer()  # single stemmer instance, used by stem_words

#setup

# NOTE(review): s3_resource is not referenced by the code visible in this file;
# the functions below create their own boto3 clients.
s3_resource = boto3.resource('s3')

# directories
# S3 bucket that holds both the input MD&A text files and the outputs.
bucket = '[your-bucket-name]'  # Replace with your bucket name

In [None]:
# incorporate expect keywords

def stem_words(wordlist):
    """Stem every whitespace-separated token of each phrase in ``wordlist``.

    Each entry is stripped and split on whitespace, every token is run
    through the module-level Porter stemmer ``ps``, and the stemmed tokens
    are joined back together with single spaces.  An entry without any
    token produces ``''``.

    Returns a list of stemmed phrases in the same order as ``wordlist``.
    """
    return [
        ' '.join(ps.stem(token) for token in phrase.strip().split())
        for phrase in wordlist
    ]

In [None]:
# Expectation / forward-looking terms.  parse_text joins them into a regex
# alternation behind a leading \b, so each entry matches as a word prefix.
# Sentences containing any of them have their covenant and violation flags
# reset to 0.  The list mixes stemmed forms ('anticip', 'estim') with
# unstemmed ones ('believes', 'scheduled to') and contains repeated entries.
expect_keywords = [
    'may becom','hope','outlook','go to','tailwind','work toward','trend','is like to','may depend','may not','forse','would','seek to','ought','potenti','could depend','unknown','remain confid','shortterm','depend','endeavor','intend','abl to remain','feel','may result','project','expect to','possibl','like will result','goal','may affect','go forward','belief','consid','estim will','contempl','suggest','pursu','call for','appear','well posit to','think','with a view to','appear to','up to','short term','prioriti','hypothes','can have','indic','may impact','schedul','envis','believ','could','look forward','pro forma','drive','uncertain','explor','could be','look forward to','see','prospect','upsid','may','should','is like','risk','improv','longterm','like','uncertainti','tent','forese','predict','would be','headwind','view','move toward','aim','estim','on target','pend','probabl','could potenti','might','may be','are like','pipelin','do not expect','may continu','seek','will','shall','not expect','will like result','futur','unanticip','guidanc','look ahead','likelihood','like to','full year guidanc','anticip','confid','opportun','propos','on pace','plan','schedul to','preliminari','will like','will like be','do not anticip','expect','presum','express confid','can be','opportunity','plans','believes','could potentially','is likely to','drive','predicting','may affect','may continue','uncertain','expect','headwind','would be','shall','depend','expressed confidence','projects','aims','looking forward','scheduled to','think','hopefully','on target','presume','seek to','view','looks forward','expects','belief','pending','may not','suggests','moving toward','depends','believe','goals','trend','do not expect','appear to'
]

# Terms signalling a covenant violation, waiver or amendment.  parse_text
# joins them into a regex alternation behind a leading \b, so stems such as
# 'waiv' also match 'waiver' / 'waived'.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python, which previously fused
# 'unable to comply' and 'failed to comply' into one unmatched keyword.
violate_keywords = \
[
    'waiv','viol','in default','modif','not in compliance','forbear',
    'out of compliance','did not comply','unable to comply','failed to comply',
    'did not meet', 'unable to meet', 'failed to meet',
    'did not satisfy', 'unable to satisfy', 'failed to satisfy',
]

# misc
# Rate / margin phrases kept as a plain list of lowercase strings.
exclude_keywords = [
    'adjustable rate',
    'gross margin',
    'borrowing condition',
    'floating rate',
    'adjusted eurodollar rate',
    'adjusted libor rate',
    'exchange rate',
    'adjustable margin',
    'adjusted libor',
]

# Negation words; parse_text uses them to drop negated violation terms.
negate_keywords = [
    'no',
    'not',
    'don',
    'won',
    'none',
    'wouldn',
    'without',
    'didn',
]

In [None]:
# basic data cleaning. return tokens

def clean_sentence(sentence):
    """Normalise one raw sentence and return it as a cleaned, lowercase string.

    Steps, in order:
      1. lowercase the text (done first so every literal pattern below is
         effectively case-insensitive);
      2. drop line breaks and numeric HTML character references;
      3. drop the phrases 'covenant skills' and 'customer covenant';
      4. replace every character other than letters, digits, ``$``, ``.``,
         ``,`` and whitespace with a space;
      5. strip roman numerals, 'mda', 'form x', 'table of contents',
         'item nn', 'year(s) ended', 'page nn' and leftover HTML entity
         names;
      6. remove stray single letters (except 'a') and stray full stops, then
         collapse runs of spaces.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    str
        The cleaned sentence (may be empty or a single space).
    """

    # lowercase up front: previously this happened after the phrase filters,
    # so capitalised variants such as "Customer Covenant" slipped through
    sentence = sentence.lower()

    # remove formatting
    sentence = re.sub('\n', ' ', sentence)  # remove line break markers
    sentence = re.sub(r'&#[0-9]+;', ' ', sentence)  # remove character ids

    # remove phrases that contain "covenant" but are unrelated to debt covenants
    sentence = re.sub('covenant skills', '', sentence)
    sentence = re.sub('customer covenant', '', sentence)

    # remove punctuation (keep numbers, dollar signs, full stops, commas)
    sentence = re.sub(r"[^A-Za-z0-9$.,\s]", ' ', sentence)
    sentence = re.sub(' +', ' ', sentence)
    sentence = sentence.strip()

    # additional cleaning
    # roman numerals, optionally followed by a full stop
    sentence = re.sub(r"\b(?=[mdclxvii])m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})([ii]x|[ii]v|v?[ii]{0,3})\b\.?", '', sentence)
    sentence = re.sub(r'(mda|md a)', '', sentence)  # short form
    sentence = re.sub(r'form\s\w{0,1}', '', sentence)  # form number
    sentence = re.sub('table of contents', '', sentence)  # table of contents
    sentence = re.sub(r'(item|i tem)\s{0,1}[0-9]*[a-z]{0,1}', '', sentence)  # header
    sentence = re.sub('(year|years) ended', '', sentence)
    sentence = re.sub(r'page\s{0,1}[0-9]*', '', sentence)
    sentence = re.sub('rsquo', '', sentence)
    # leftover of "&amp;": only remove it as a standalone token, otherwise
    # words such as "example" or "sample" are mangled
    sentence = re.sub(r'\bamp\b', '', sentence)
    sentence = re.sub('rdquo', '', sentence)
    sentence = re.sub('ldquo', '', sentence)

    # remove hanging characters
    sentence = re.sub(r'\b[b-z]\b', ' ', sentence)  # single letters other than "a"
    sentence = re.sub(r'(?<!\w)\.(?!\w)', ' ', sentence)  # remove hanging .
    sentence = re.sub(' +', ' ', sentence)

    return sentence

In [None]:
# identify dates from sentence and indicate whether it is one year before filing date

def catch(func, handle=lambda e : e, *args, **kwargs):
    """Call ``func(*args, **kwargs)``; return its result, or ``None`` if it raises.

    Parameters
    ----------
    func : callable
        The function to invoke.
    handle : callable, optional
        Accepted for backward compatibility only; it is never called and the
        failure value is always ``None``.
    *args, **kwargs
        Forwarded to ``func``.

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and a long-running job can be stopped.
    """
    try:
        return func(*args, **kwargs)
    except Exception:
        return None

def find_olddate(sent,filing_date):
    """Return 1 when at least half of the dates found in ``sent`` are old, else 0.

    ``filing_date`` is a ``'YYYY-MM-DD'`` string.  A date counts as old when
    it lies strictly before the filing date minus three months.  Three
    formats are searched in turn: "month day, year", "month year" and a bare
    four-digit year (treated as six months into that year).  Matches of the
    first two formats are deleted from the sentence before the next search so
    that a date is not parsed twice.  Strings that fail to parse are ignored.
    A sentence with no parseable date returns 0.
    """
    cutoff = datetime.strptime(filing_date,"%Y-%m-%d") - relativedelta(months=3)

    found = []

    # pass 1: full dates (e.g. january 31, 2021)
    full_hits = re.findall(
        r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s(?:\d{1,2},)\s(?:19[7-9]\d|2\d{3})(?=\D|$)',
        sent,
    )
    found += [catch(lambda : datetime.strptime(hit, '%B %d, %Y')) for hit in full_hits]
    # strip what was just matched so the looser passes do not re-read it
    sent = re.sub(re.compile('|'.join(full_hits)), '', sent)

    # pass 2: month and year (e.g. january 2021)
    month_hits = re.findall(
        r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s(?:19[7-9]\d|2\d{3})(?=\D|$)',
        sent,
    )
    found += [catch(lambda : datetime.strptime(hit, '%B %Y')) for hit in month_hits]
    sent = re.sub(re.compile('|'.join(month_hits)), '', sent)

    # pass 3: bare year (e.g. 2021), shifted by six months
    year_hits = re.findall(r'(?:(?<=\D))(?:19[7-9]\d|2[0-1]\d{2})(?=\D|$)', sent)
    found += [catch(lambda : datetime.strptime(hit, '%Y')+relativedelta(months=6)) for hit in year_hits]

    # keep only the strings that parsed
    dated = [d for d in found if d is not None]
    if len(dated) == 0:
        return 0

    # share of mentioned dates that precede the cutoff
    old_share = np.mean([1 if d < cutoff else 0 for d in dated])
    return 1 if old_share >= 0.5 else 0


In [None]:
# get sentences around covenant mentions / loan agreement amendments etc
def parse_text(sentences, filing_date):
    """Locate covenant mentions in cleaned sentences and flag violation language.

    Parameters
    ----------
    sentences : list of str
        Cleaned, lowercase sentences of one document, in document order.
    filing_date : str
        Filing date as ``'YYYY-MM-DD'``; passed to ``find_olddate``.

    Processing
    ----------
    1. Negated violation terms (a negation word followed within ten
       characters by a violation keyword) are deleted from each sentence.
    2. Each sentence is flagged for covenant mentions, violation keywords,
       expectation keywords and old dates.
    3. Sentences with old dates or expectation keywords have their covenant
       and violation flags reset to 0.
    4. The covenant window is every sentence that mentions a covenant plus
       the three sentences that follow it; a violation is reported when any
       sentence in the window still carries a violation flag.

    Returns
    -------
    dict with keys
        ``cov_full_text``      window sentences joined by spaces, or None
        ``cov_viol_text``      violating window sentences joined by spaces, or None
        ``cov_viol_keywords``  list of matched violation keywords, or None
        ``cov_viol_ind``       1 if a violation was found, else 0
    """

    # use dataframe structure
    df = pd.DataFrame(columns = ['row','rawtext'])
    df['rawtext'] = sentences
    df['row'] = df.index

    negate_alt = '|'.join(negate_keywords)
    violate_alt = '|'.join(violate_keywords)
    expect_alt = '|'.join(expect_keywords)

    # delete negated action terms.  The \b after the negation group keeps
    # 'no' from matching the start of words such as "notes" / "noteholders",
    # which previously erased genuine mentions like "noteholders waived".
    # [A-Za-z] replaces [A-z], which also spans the punctuation between
    # 'Z' and 'a'.
    negated_re = re.compile(
        r'\b(?:' + negate_alt + r')\b[A-Za-z0-9\s]{0,10}\b(?:' + violate_alt + r')'
    )
    df['text'] = df['rawtext'].apply(lambda x: negated_re.sub('', x))

    # query covenant keywords
    covenant_re = re.compile(r'\b(?:covenant)')
    df['query_cov'] = df['text'].apply(lambda x: int(covenant_re.search(x) is not None))

    # query violation keywords
    violate_re = re.compile(r'\b(?:' + violate_alt + r')')
    df['word_viol'] = df['text'].apply(violate_re.findall)
    df['query_viol'] = df['word_viol'].apply(lambda x: int(len(x)>0))

    # query expect keywords
    expect_re = re.compile(r'\b(?:' + expect_alt + r')')
    df['word_expect'] = df['text'].apply(expect_re.findall)
    df['query_expect'] = df['word_expect'].apply(lambda x: int(len(x)>0))

    # search for dates in the sentence
    df['query_olddate'] = df['text'].apply(lambda x: find_olddate(x,filing_date))

    # discard mentions tied to old dates or to expectation language
    mask = (df.query_olddate>0)|(df.query_expect>0)
    df.loc[mask,'query_cov'] = 0
    df.loc[mask,'query_viol'] = 0

    # get relevant text (up to 3 sentences following valid covenant mentions)
    df['l1'] = df.query_cov.shift(1)
    df['l2'] = df.query_cov.shift(2)
    df['l3'] = df.query_cov.shift(3)
    df['_all'] = df[['query_cov','l1','l2','l3']].sum(axis=1)

    cov_full_text = None
    cov_viol_text = None
    cov_viol_keywords = None
    cov_viol_ind = 0

    if df._all.sum()>0:

        df = df[df._all>0]
        cov_full_text = ' '.join(list(df['text'].values))

        if df.query_viol.sum()>0:
            violating = df[df.query_viol>0]
            # join with a space, consistent with cov_full_text (a plain
            # Series.sum() glued the sentences together without a separator)
            cov_viol_text = ' '.join(violating['text'])
            cov_viol_keywords = [word for words in violating['word_viol'] for word in words]
            cov_viol_ind = 1

    return {
        'cov_full_text': cov_full_text,
        'cov_viol_text': cov_viol_text,
        'cov_viol_keywords': cov_viol_keywords,
        'cov_viol_ind': cov_viol_ind,
    }

In [None]:
# get raw text from input filepath

def read_text(file, header=7):
    """Download one MD&A text file from S3 and run the covenant parser on it.

    Parameters
    ----------
    file : str
        Object key inside the module-level ``bucket``.
    header : int
        Number of leading lines dropped before the body is built.

    The first eight lines of the file are expected to hold one tagged
    metadata field each (master idx, cik, company name, filing type, filing
    date, report date, filing index, filing url); a missing tag raises
    ``AttributeError``.  The text is marked invalid (``valid_text`` = 0) when
    line index 9 starts with a known stale paragraph or when fewer than five
    body lines remain.

    NOTE(review): with ``header=7`` the line at index 7 (the filing-url tag)
    stays in the body, and the parsed filing url is not returned - confirm
    both are intended.

    Returns a dict with the filing metadata, ``valid_text`` and the four
    ``cov_*`` fields produced by ``parse_text``.
    """

    # get text
    response = boto3.client('s3').get_object(Bucket=bucket,Key=file)
    raw_text = response['Body'].read().decode('utf-8').splitlines()

    # get file info: one (output field, pattern) pair per header line, in line order
    header_patterns = (
        ('master_idx', r'(?<=<master idx>)(.*?)(?=</master idx>)'),
        ('cik', r'(?<=<cik>)(.*?)(?=</cik>)'),
        ('company_name', r'(?<=<company name>)(.*?)(?=</company name>)'),
        ('filing_type', r'(?<=<filing type>)(.*?)(?=</filing type>)'),
        ('filing_date', r'(?<=<filing date>)(.*?)(?=</filing date>)'),
        ('report_date', r'(?<=<report date>)(.*?)(?=</report date>)'),
        ('filing_index', r'(?<=<filing index>)(.*?)(?=</filing index>)'),
        ('filing_url', r'(?<=<filing url>)(.*?)(?=</filing url>)'),
    )
    info = {}
    for line_no, (field, pattern) in enumerate(header_patterns):
        info[field] = re.search(pattern, raw_text[line_no]).group(0)

    # body of text (header lines removed)
    body_lines = raw_text[header:]

    # filter for incorrect text (old text mixed in) and for too-short bodies
    has_old_text = (
        len(raw_text)>=10
        and re.search(r'(^Our operating results may fluctuate significantly)',raw_text[9]) is not None
    )
    valid_flag = 0 if (has_old_text or len(body_lines)<5) else 1

    # join to single string and split into sentences
    body = ' '.join(body_lines)
    body = re.sub(r'(?<=No)\.(?!\w)', '',body) # dont tokenize "No. 1"
    sentences_clean = [clean_sentence(s) for s in sent_tokenize(body)]

    # parse text
    query = parse_text(sentences_clean, filing_date = info['filing_date'])

    # assemble the output record (key order defines the column order downstream)
    record = {
        field: info[field]
        for field in ('master_idx','cik','company_name','filing_type',
                      'filing_date','report_date','filing_index')
    }
    record['valid_text'] = valid_flag
    for key in ('cov_full_text','cov_viol_text','cov_viol_keywords','cov_viol_ind'):
        record[key] = query[key]

    return record

In [None]:
## read file paths
# For every year (latest first) and every quarter: list the MD&A text files
# under the quarter's S3 prefix, parse them in parallel with read_text, and
# write one parquet file of results per year.

yearstart = int(input('start year: '))
yearend = int(input('end year: '))
yrlist = range(yearstart,yearend+1)
qtrlist = range(1,5)
# qtrlist = range(1,2)

# a single client / paginator serves every listing call
s3_client = boto3.client('s3')
paginator = s3_client.get_paginator('list_objects_v2')

for yr in reversed(yrlist):

    # collect the quarterly frames and concatenate once per year
    # (DataFrame.append was removed in pandas 2.0)
    quarter_frames = []

    for qtr in qtrlist:

        print(f'parsing text for year {yr} qtr {qtr} \n')

        prefix = f"edgar_mda_new_2/{yr}/QTR{qtr}/"
        pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
        # a result page carries no 'Contents' key when nothing matches the
        # prefix, so fall back to an empty list instead of raising KeyError
        filepath = [
            obj['Key']
            for page in pages
            for obj in page.get('Contents', [])
            if '.txt' in obj['Key']
        ]

        # one result dict per file (filing info plus covenant indicators)
        df = Parallel(n_jobs=multiprocessing.cpu_count(), batch_size=32) \
                (delayed(read_text)(file, header=7) for file in tqdm(filepath))
        df = pd.DataFrame(df)

        quarter_frames.append(df)

    df_all = pd.concat(quarter_frames, ignore_index=True)

    output_prefix = f's3://{bucket}/edgar_mda_new_2_violations/'
    savepath = output_prefix + f'{yr}_violations.gzip'
    wr.s3.to_parquet(
        df=df_all,
        path=savepath,
    )

## extract numerical indicators as stata input
Note: the code above is run as a Python script (it takes about 4 hours to run).
    
    

In [None]:
# Read each yearly violations file back from S3, keep the valid rows and the
# identifying columns plus the violation indicator, and stack all years.
yearly_frames = []

# yearstart = int(input('start year: '))
# yearend = int(input('end year: '))
# yearlist = range(yearstart,yearend+1)
yearlist = range(2000,2022)

# a single client serves every download
s3_client = boto3.client('s3')

for yr in reversed(yearlist):

    print(f'reading file from year {yr} \n')

    # read file
    file = f"edgar_mda_new_2_violations/{yr}_violations.gzip"
    obj = s3_client.get_object(Bucket=bucket,Key=file)
    df = pd.read_parquet(io.BytesIO(obj['Body'].read()))

    # extract indicators
    df = df[df.valid_text==1].reset_index(drop=True)
    df_out = df[['master_idx','cik','company_name','filing_type','filing_date','report_date',
                 'cov_viol_ind']]
    df_out = df_out.fillna(0)

    yearly_frames.append(df_out)

# concatenate once (DataFrame.append was removed in pandas 2.0)
df_all = pd.concat(yearly_frames, ignore_index=True)


In [None]:
# write the stacked indicator table to S3 as a pipe-delimited text file
output_prefix = f's3://{bucket}/output/'
savepath = f'{output_prefix}edgar_mda_new_violations_2_postsubmit.txt'
wr.s3.to_csv(df=df_all, path=savepath, sep='|')

### Quality check

In [None]:
# check how many successfully parsed
# Left-join the parsed results onto the master index so that index rows
# without a parsed counterpart end up with parsed == 0.

import matplotlib.pyplot as plt

# import index file
loadpath = f's3://{bucket}/misc/edgar_masterhtml1_combined.csv'
indexdf = wr.s3.read_csv(path=loadpath, sep='|', lineterminator='\n')
indexdf = indexdf.rename(columns={'Unnamed: 0':'master_idx'})

# quarter of each filing (note: derived from filing_date despite the variable name)
report_date = pd.to_datetime(indexdf.filing_date,format='%Y-%m-%d')
indexdf['yq'] = pd.PeriodIndex(report_date, freq='Q').to_timestamp()

# drop the columns that df_all supplies after the merge
indexdf = indexdf.drop(columns=['cik','company_name','filing_type','filing_date','report_date'])

# mark parsed rows and align the key dtype
df_all['parsed'] = 1
df_all['master_idx'] = df_all['master_idx'].astype(int)

# combine
dfcombine = pd.merge(indexdf, df_all, on='master_idx', how='left')
dfcombine['parsed'] = dfcombine['parsed'].fillna(0)

In [None]:
# plot
fig, axs = plt.subplots(2)

# top panel: mean of the parsed flag per quarter
plots = dfcombine.groupby('yq')['parsed'].mean()
axs[0].plot(plots)

# bottom panel: mean violation indicator per quarter, 10-K and 10-Q filings only
subsample = dfcombine[dfcombine.filing_type.isin(['10-K','10-Q'])]
plots = subsample.groupby('yq')['cov_viol_ind'].mean()
axs[1].plot(plots)