In [27]:
import nest_asyncio
import unicodedata
nest_asyncio.apply()

from secedgar.filings import Filing, FilingType
import datetime as dt
import bs4 
import re
import glob
import pandas as pd

In [1]:
my_filings = Filing(cik_lookup = 'aapl', filing_type = FilingType.FILING_10K, 
                    start_date = dt.datetime(2010,1,1), end_date = dt.datetime(2020,12,31))

my_filings.save('Corpus_10k')

20it [00:05,  3.63it/s]                        


In [2]:
my_filings = Filing(cik_lookup = 'tsla', filing_type = FilingType.FILING_10K, 
                    start_date = dt.datetime(2010,1,1), end_date = dt.datetime(2020,12,31))

my_filings.save('Corpus_10k')

20it [00:06,  3.17it/s]                        


In [3]:
my_filings = Filing(cik_lookup = 'googl', filing_type = FilingType.FILING_10K, 
                    start_date = dt.datetime(2010,1,1), end_date = dt.datetime(2020,12,31))

my_filings.save('Corpus_10k')

10it [00:02,  3.50it/s]


In [4]:
my_filings = Filing(cik_lookup = 'fb', filing_type = FilingType.FILING_10K, 
                    start_date = dt.datetime(2010,1,1), end_date = dt.datetime(2020,12,31))

my_filings.save('Corpus_10k')

100%|██████████| 10/10 [00:04<00:00,  2.44it/s]


In [5]:
my_filings = Filing(cik_lookup = 'gm', filing_type = FilingType.FILING_10K, 
                    start_date = dt.datetime(2010,1,1), end_date = dt.datetime(2020,12,31))

my_filings.save('Corpus_10k')

20it [00:12,  1.55it/s]                        


In [6]:
my_filings = Filing(cik_lookup = 'msft', filing_type = FilingType.FILING_10K, 
                    start_date = dt.datetime(2010,1,1), end_date = dt.datetime(2020,12,31))

my_filings.save('Corpus_10k')

20it [00:07,  2.70it/s]                        


In [9]:
def normalize_text(text):
    """Normalize Text
    """
    text = unicodedata.normalize("NFKD", text)  # Normalize
    text = '\n'.join(text.splitlines())  # Unicode break lines

    # Convert to upper
    text = text.upper()  # Convert to upper

    # Take care of breaklines & whitespaces combinations due to beautifulsoup parsing
    text = re.sub(r'[ ]+\n', '\n', text)
    text = re.sub(r'\n[ ]+', '\n', text)
    text = re.sub(r'\n+', '\n', text)

    # To find MDA section, reformat item headers
    text = text.replace('\n.\n', '.\n')  # Move Period to beginning

    text = text.replace('\nI\nTEM', '\nITEM')
    text = text.replace('\nITEM\n', '\nITEM ')
    text = text.replace('\nITEM  ', '\nITEM ')

    text = text.replace(':\n', '.\n')

    # Math symbols for clearer looks
    text = text.replace('$\n', '$')
    text = text.replace('\n%', '%')

    # Reformat
    text = text.replace('\n', '\n\n')  # Reformat by additional breakline

    return text

In [10]:
def find_mda_from_text(text, start=0):
    """Find MDA (Management Discussion and Analysis) section from normalized text
    Args:
        text (str)s
    """
    debug = False

    mda = ""
    end = 0

    # Define start & end signal for parsing
    item7_begins = [
        '\nITEM 7.', '\nITEM 7 –', '\nITEM 7:', '\nITEM 7 ', '\nITEM 7\n'
    ]
    item7_ends = ['\nITEM 7A']
    if start != 0:
        item7_ends.append('\nITEM 7')  # Case: ITEM 7A does not exist
    item8_begins = ['\nITEM 8']
    """
    Parsing code section
    """
    text = text[start:]

    # Get begin
    for item7 in item7_begins:
        begin = text.find(item7)
        if debug:
            print(item7, begin)
        if begin != -1:
            break

    if begin != -1:  # Begin found
        for item7A in item7_ends:
            end = text.find(item7A, begin + 1)
            if debug:
                print(item7A, end)
            if end != -1:
                break

        if end == -1:  # ITEM 7A does not exist
            for item8 in item8_begins:
                end = text.find(item8, begin + 1)
                if debug:
                    print(item8, end)
                if end != -1:
                    break

        # Get MDA
        if end > begin:
            mda = text[begin:end].strip()
        else:
            end = 0

    return mda, end

In [48]:
companies = ['aapl', 'googl', 'fb', 'msft', 'tsla', 'gm']

for j in companies[2:]:
    company = j

    files = glob.glob('Corpus_10k/'+company+'/10-k/*')
    files.sort()

    df = pd.DataFrame()
    for i in files:
        print(i)
        with open(i) as f:
            content = f.read()
        try:
            soup = bs4.BeautifulSoup(content, "html.parser")
        except TypeError:
            continue
        text = soup.get_text("\n")
        text = normalize_text(text)
        mda, end = find_mda_from_text(text)
        if mda and len(mda.encode('utf-8')) < 1000:
            mda, _ = find_mda_from_text(text, start=end)
        if len(mda.encode('utf-8')) < 1000:
            continue
        df = df.append(pd.DataFrame({'company': [company], 'filename': [i], 'mda': [mda]}))

    df.to_pickle('Corpus_mda/'+company+'_mda.pkl')

Corpus_10k/fb/10-k/0001326801-13-000003.txt
Corpus_10k/fb/10-k/0001326801-14-000007.txt
Corpus_10k/fb/10-k/0001326801-15-000006.txt
Corpus_10k/fb/10-k/0001326801-15-000010.txt
Corpus_10k/fb/10-k/0001326801-16-000043.txt
Corpus_10k/fb/10-k/0001326801-16-000063.txt
Corpus_10k/fb/10-k/0001326801-17-000007.txt
Corpus_10k/fb/10-k/0001326801-18-000009.txt
Corpus_10k/fb/10-k/0001326801-19-000009.txt
Corpus_10k/fb/10-k/0001326801-20-000013.txt
Corpus_10k/msft/10-k/0001193125-10-171791.txt
Corpus_10k/msft/10-k/0001193125-11-200680.txt
Corpus_10k/msft/10-k/0001193125-12-316848.txt
Corpus_10k/msft/10-k/0001193125-13-310206.txt
Corpus_10k/msft/10-k/0001193125-14-289961.txt
Corpus_10k/msft/10-k/0001193125-15-272806.txt
Corpus_10k/msft/10-k/0001193125-16-662209.txt
Corpus_10k/msft/10-k/0001564590-17-014900.txt
Corpus_10k/msft/10-k/0001564590-18-019062.txt
Corpus_10k/msft/10-k/0001564590-19-027952.txt
Corpus_10k/msft/10-k/0001564590-20-034944.txt
Corpus_10k/tsla/10-k/0001193125-11-054847.txt
Corpus_1

In [55]:
df['mda'].apply(lambda x: x.replace('\n\n', ' '))

0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
0    ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
Name: mda, dtype: object