Extract header section containing safe harbor statements from filings

In [None]:
import pandas as pd
import re
from tqdm.notebook import tqdm
import warnings
import glob
from joblib import Parallel, delayed
import multiprocessing
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import collections


# silence every Python warning for the rest of the session
warnings.filterwarnings('ignore')
# register tqdm's pandas integration (progress-bar variants of apply)
tqdm.pandas()
# shared Porter stemmer instance, used further down when stemming the keyword list
ps = PorterStemmer()

In [None]:
# locate the forward-looking (safe harbor) statement of a filing and extract the
# keywords it lists; the line search follows the algo by Nini Smith Sufi (2012)

# line that mentions "forward-looking" within its first ~50 characters; the \s*
# between letters tolerates spaced-out headings such as "F O R W A R D - L O O K I N G"
_FORWARD_LOOKING_RE = re.compile(
    r'^(?:.{0,50})(?:f\s*o\s*r\s*w\s*a\s*r\s*d\s*-{0,1}\s*l\s*o\s*o\s*k\s*i\s*n\s*g)',
    re.IGNORECASE,
)
# text between "words" and the first terminating conjunction / phrase
_KEYWORD_SPAN_RE = re.compile(
    r'(?<=words)(.*?)(?=\b(?:or|and|the negative of|not limited to|the negatives)\b)'
)
# anything that is neither an ASCII letter nor whitespace
_NON_LETTER_RE = re.compile(r'[^A-Za-z\s]')
# tokens of the lead-in "such as", which is not a keyword itself
_FILLER_TOKENS = frozenset({'such', 'as'})


def get_cautionary_text(raw_text):
    """Extract the keyword list of the first forward-looking statement.

    The first line that mentions "forward-looking" near its start is joined
    with the three lines that follow it. Inside that window, the text between
    "words" and the first of "or" / "and" / "the negative of" /
    "not limited to" / "the negatives" is split on commas and stripped of
    non-letter characters. The result is only accepted when one of the
    entries contains "expect".

    NOTE(review): '.' does not match a newline, so if the lines still carry
    their trailing newline (as lines from ``readlines()`` do) the keyword list
    has to sit on a single physical line -- confirm this is intended.

    Parameters
    ----------
    raw_text : iterable of str
        The lines of one filing.

    Returns
    -------
    out_list : list of str
        Cleaned keywords (surrounding whitespace is kept); empty if nothing
        was found.
    out_sent : str or list
        The raw matched span the keywords came from, or an empty list if
        nothing was found.
    """
    # collapse runs of spaces so the 50-character window is not used up by padding
    lines = [re.sub(' +', ' ', line) for line in raw_text]

    start_row = next(
        (idx for idx, line in enumerate(lines) if _FORWARD_LOOKING_RE.search(line)),
        None,
    )
    if start_row is None:
        return [], []

    # matching line plus the three lines after it
    out_text = ' '.join(lines[start_row:start_row + 4])

    # get words in double quotation marks
    find_str = _KEYWORD_SPAN_RE.findall(out_text)
    if not find_str:
        return [], []

    first_match = find_str[0]
    # overly long comma-separated pieces are running text, not keywords
    word_list = [
        _NON_LETTER_RE.sub('', piece)
        for piece in first_match.split(',')
        if len(piece) < 25
    ]

    # sanity check: a genuine keyword list virtually always contains "expect"
    if not any('expect' in word for word in word_list):
        return [], []

    # drop entries carrying the lead-in "such as"; whole tokens are compared so
    # that keywords which merely contain "as" (e.g. "forecast") are kept
    out_list = [
        word
        for word in word_list
        if len(word) > 1 and _FILLER_TOKENS.isdisjoint(word.split())
    ]
    return out_list, first_match

In [None]:
# wrapper to read files in parallel
def parse_wrapper(file):
    """Read one filing and extract its cautionary keyword list.

    The filing type is taken from the file name, which is expected to be
    underscore separated with the type as second field
    (``<filing date>_<filing type>_...``). Only filings whose type contains
    '10-K' or '10-Q' are parsed; everything else yields empty results.

    Parameters
    ----------
    file : str
        Path to the filing text file.

    Returns
    -------
    dict
        ``'cautionary_words'``: list of extracted keywords (possibly empty);
        ``'sentence_example'``: the raw matched span, or an empty list.
    """
    # local imports keep the file's import cell untouched
    import logging
    import os

    # read file
    with open(file) as f:
        raw_text = f.readlines()

    # get header info from the file name; basename copes with any path separator
    filename = os.path.basename(file)
    name_parts = filename.split('_')
    # the type sits between the first and the second underscore; names without
    # that structure are treated as "not a 10-K/10-Q" instead of raising
    filing_type = name_parts[1] if len(name_parts) > 2 else ''

    # only parse text of 10K or 10q (or related variants)
    word_list = []
    sentence = []
    if any(s in filing_type for s in ['10-K', '10-Q']):
        try:
            # get cautionary (safe harbor) keywords
            word_list, sentence = get_cautionary_text(raw_text)
        except Exception:
            # best effort: one unparsable filing must not abort the whole batch,
            # but KeyboardInterrupt / SystemExit are no longer swallowed
            logging.getLogger(__name__).debug(
                'could not parse %s', file, exc_info=True)

    return {
          'cautionary_words': word_list,
          'sentence_example': sentence
        }



In [None]:
# load file from folder
# walks every year/quarter folder (most recent year first), parses all filings of
# a quarter in parallel and accumulates keywords and example sentences

yrlist = range(2000, 2022)
# yrlist= [2004]
qtrlist = range(1, 5)
# qtrlist = [1]

all_words = []
all_sentences = []

for yr in reversed(yrlist):

    for qtr in qtrlist:

        print(f'year: {yr} quarter: {qtr}')

        # read in folder
        folder = f'../../rawdata/edgar_mcdonald/{yr}/QTR{qtr}/*.txt'
        filepath = glob.glob(folder)

        out_df = Parallel(n_jobs=multiprocessing.cpu_count(), batch_size=32) \
                (delayed(parse_wrapper)(file) for file in tqdm(filepath))
        # name the columns explicitly: a quarter without any file would otherwise
        # give a column-less frame and the column access below would raise
        out_df = pd.DataFrame(out_df, columns=['cautionary_words', 'sentence_example'])

        # flatten the per-filing keyword lists
        cautionary_words = [word for words in out_df['cautionary_words'] for word in words]
        all_words.extend(cautionary_words)

        # keep only filings for which a sentence was found (empty list -> False)
        has_sentence = out_df['sentence_example'].astype(bool)
        sentences = out_df.loc[has_sentence, 'sentence_example'].tolist()
        all_sentences.extend(sentences)


In [None]:
# get most frequent vocab
# (number of collected keywords is printed first, before any cleaning)

print(len(all_words))

# raw version: trim surrounding whitespace, then count identical entries
all_words = [token.strip() for token in all_words]
frequency = collections.Counter()
frequency.update(all_words)
# keep the 1500 most frequent entries as a (word, count) table
most_common = pd.DataFrame(
    frequency.most_common(1500),
    columns=['word','count'],
)

In [None]:
# test if word comes from the correct section
# prints every collected sentence that contains the probe string

for sentence in all_sentences:
    if 'world' not in sentence:
        continue
    print(sentence)

In [None]:
# check for duplicates in cleaned list and then stem
# note that raw list is filtered for repeated keywords

# read in list
with open('../../temp/safe_harbor_most_common_keywords_raw.txt') as f:
    keywords = f.read().splitlines()

# filter for duplicates; dict.fromkeys keeps the first occurrence of every keyword
# in file order, so the written file is identical from run to run (the iteration
# order of a set of strings changes between interpreter sessions)
keywords = list(dict.fromkeys(keywords))

with open("../../temp/safe_harbor_most_common_keywords_clean.txt", 'w') as output:
    # one keyword per line, each line newline-terminated
    output.writelines(word + '\n' for word in keywords)


In [None]:
# stem words
# every keyword is split on whitespace, each token is stemmed and the stems are
# joined back with single spaces
stem_words = []
for x in tqdm(keywords):
    xstem = [ps.stem(t) for t in x.strip().split()]
    # blank keywords produce no tokens and are skipped
    if xstem:
        stem_words.append(' '.join(xstem))

# drop duplicate stems, keeping first-seen order so that reruns write the same file
stem_words = list(dict.fromkeys(stem_words))

with open("../../temp/safe_harbor_most_common_keywords_clean_stem.txt", 'w') as output:
    # newline separated, without a trailing newline after the last entry
    output.write('\n'.join(stem_words))


In [None]:
# sizes of the frequency table, the de-duplicated keyword list and the stemmed list
print(len(most_common), len(keywords), len(stem_words), sep='\n')