In [23]:
import bs4 # for typing purposes
from bs4 import BeautifulSoup

In [24]:
file = ""
with open("../data/sec-edgar-filings/AAPL/10-K/0000320193-17-000070/primary-document.html") as fp:
    fileHTML = fp.read()
with open("../data/sec-edgar-filings/META/10-K/0001326801-20-000013/primary-document.html") as fp:
    fileXML = fp.read()

In [25]:
def standardize_ascii(file: str, replace_dict: dict[str, str]=None) -> str:
    '''
    Standardizes file to clean special ascii characters like non-breaking space and special quote characters
    
    ### Arguments:
        file : str
            string representation of file
        replace_dict : dict[str, str], optional
            str-to-str dictionary of with target, replacement as key-value pair respectively
            default to pre-given special_ascii
        
    ### Returns:
        standardized_text : str
            text after being replaced
    '''
    if replace_dict is None:
        replace_dict = {
            "&#160;": " ",
            "&#8217;": "'"
        }
    
    standardized_text = file
    for c in replace_dict:
        standardized_text = standardized_text.replace(c, replace_dict[c])
    
    return standardized_text

fileHTML = standardize_ascii(fileHTML)
fileXML = standardize_ascii(fileXML)

In [26]:
# soupify html document
soupHTML = BeautifulSoup(fileHTML)
soupXML = BeautifulSoup(fileXML)

  soupXML = BeautifulSoup(fileXML)


In [27]:
# soup.find_all('font', string=(lambda x: x and x.lower() == (id_to_label['i5']).lower()))
# # id_to_label['i3']
# soup.find_all('font', string=id_to_label['i5'])

In [28]:
# dict mapping of item id to section label of 10-k form
id_to_label = {
    'p1': "Part I",
    'i1': "Business",
    'i1a': "Risk Factors",
    'i1b': "Unresolved Staff Comments",
    'i2': "Properties",
    'i3': "Legal Proceedings",
    'i4': "Mine Safety Disclosures",
    'p2': "Part II",
    'i5': "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    'i6': "Selected Financial Data",
    'i7': "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    'i7a': "Quantitative and Qualitative Disclosures About Market Risk",
    'i8': "Financial Statements and Supplementary Data",
    'i9': "Changes in and Disagreements with Accountants on Accounting and Financial Disclosure",
    'i9a': "Controls and Procedures",
    'i9b': "Other Information",
    'p3': "Part III",
    'i10': "Directors, Executive Officers and Corporate Governance",
    'i11': "Executive Compensation",
    'i12': "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    'i13': "Certain Relationships and Related Transactions, and Director Independence",
    'i14': "Principal Accounting Fees and Services",
    'p4': "Part IV",
    'i15': "Changes in and Disagreements with Accountants on Accounting and Financial Disclosure",
}

id_order = ['p1', 'i1', 'i1a', 'i1b', 'i2', 'i3', 'i4', 'p2', 'i5', 'i6', 'i7', 'i7a', 'i8', 'i9', 'i9a', 'i9b', 'p3', 'i10', 'i11', 'i12', 'i13', 'i14', 'p4', 'i15']

In [29]:
def get_label_tag(soup: bs4.BeautifulSoup, label_id: str) -> bs4.element.Tag:
    '''
    Returns the html element of where label_id section starts.
    Section starts at the destination of where the label_id link from table of contents links to.
    
    ### Arguments:
        soup : bs4.BeautifulSoup
            soup object of html
        label_id : str
            string index of the item in document ['p1', 'i3', 'i5a', ...] 
        
    ### Returns:
        src_dest : bs4.element.Tag
            bs4.tag of which the label_id link in table of content is pointed to
    '''
    # tag of table of content
    toc_tag = soup.find_all(string=(lambda x: x and x.lower() == id_to_label[label_id].lower()))
    
    # anchor tag of tag
    anchor_tag = [t.parent.parent for t in toc_tag if t.parent.parent.name == 'a'][0]
    
    # anchor id of the label (ex. sD0F7ECFEC8965FDE8B546D17642E8FED)
    anchor_id = anchor_tag['href'][1:]
    
    # destination anchor of label (where link points to in table of contents)
    src_dest = soup.find('a', attrs={'name': anchor_id})
    if src_dest == None:
        src_dest = soup.find('a', id=anchor_id)
    
    return src_dest
    
get_label_tag(BeautifulSoup(fileXML), 'p1')

  get_label_tag(BeautifulSoup(fileXML), 'p1')


<a id="s8075F4C5914E511E83CEF869DF53BC1C"></a>

In [51]:
def get_section_index(html: str, label_list: list[str]=id_to_label.keys()) -> dict[str, int]:
    '''
    Creates and return lookup dict between label_id and integer index of the html string
    
    ### Arguments:
        html : str
            html string
        label_list: list[str], optional
            list of label_id to generate index from, default to section labels of 10-K form
        
    ### Returns:
        id_to_index : dict[str, int]
            dict mapping string label_id to integer index of html
    '''
    soup = BeautifulSoup(html)
    
    # create id-to-tag lookup dict
    id_to_tag = {label: get_label_tag(soup, label) for label in label_list}
    
    # create id-to-index lookup dict
    id_to_index = {id: html.find(str(tag)) for id, tag in id_to_tag.items()}
    
    return id_to_index

In [52]:
# dict mapping of item id to bs4.element.Tag of start of section
id_index = {'HTML': {}, 'XML':{}}

id_index['HTML'] = get_section_index(fileHTML)
id_index['XML'] = get_section_index(fileXML)

id_index['HTML']

  soup = BeautifulSoup(html)


{'p1': 59752,
 'i1': 62166,
 'i1a': 122163,
 'i1b': 194217,
 'i2': 195354,
 'i3': 197698,
 'i4': 200114,
 'p2': 201258,
 'i5': 201506,
 'i6': 280902,
 'i7': 360588,
 'i7a': 785225,
 'i8': 796886,
 'i9': 2435672,
 'i9a': 2436548,
 'i9b': 2445213,
 'p3': 2446352,
 'i10': 2446601,
 'i11': 2449148,
 'i12': 2450983,
 'i13': 2452534,
 'i14': 2454109,
 'p4': 2456053,
 'i15': 2435672}

In [57]:
def find_all_between(file: str, start_id: str=None, end_id: str=None, id_index: dict[str, int]=id_index) -> str:
    '''
    Returns a list of all tags between tags. 
    Optional start and end tags cannot both be None
    
    ### Arguments:
        file : str
            html text
        start_id : str, optional, default=None
            starting label_id for selection, inclusive, default to start if None
        end_id : str, optional, default=None
            ending label_id for selection non-inclusive, default to end if None
        id_index: dict[str, int], optional, default=id_index
            maps str label_id with int index in string
        
    ### Returns:
        sub_html : str
            substring of html representing the html between the two tags
            
    ### Exceptions:
        Exception
            start and end cannot both be None
    '''
    
    if start_id is None and end_id is None:
        raise Exception("start and end cannot both be None")
    
    # find index location of start of start tag
    start_index = id_index[start_id] if start_id is not None else 0
    
    # find index location of start of end tag
    end_index = id_index[end_id] if end_id is not None else -1
    
    
    # get substring before start and end index
    sub_html = file[start_index:end_index]

    return sub_html

# find_all_between(fileHTML, 'i1', 'i2', id_index=id_index['HTML']), find_all_between(fileXML, 'i1', 'i2', id_index['XML'])

In [167]:
def get_raw_text(html: str, soup: bs4.BeautifulSoup=None) -> str:
        '''
        Returns the raw text from html string
        
        ### Arguments:
            html : str
                html string
            soup : bs4.BeautifulSoup, optional (soupifies html otherwise)
                soup object of html string,
            
        ### Returns:
            text : str
                string of only the text in html
        '''
        formatted_text = ""
        
        if soup is None:
            soup = BeautifulSoup(html, 'html.parser')
        
        tag_stack = [soup]
        
        # manually iterates through the tags
        #       this was done over .descendents because of parents with hidden attributes
        #       descendents would require tracking parent and lineage which is much more complex
        #       than stack-iterating with children
        while len(tag_stack) != 0:
            tag = tag_stack.pop(0) # reverse stack
             
            if type(tag) == bs4.element.NavigableString:
                formatted_text += tag
                continue
            
            # ignores all non-text childless bs4 html objects
            if not hasattr(tag, 'children'): continue
            
            # ignores all children of certain tags 
            if tag.name in ['title', 'ix:header']:
                continue
            
            if tag.name == 'tr':
                formatted_text += '\n'
            
            if tag.name == 'div':
                if tag.parent.name != 'td':
                    formatted_text += '\n'
                    
                else:
                    formatted_text += ' '
                    
            tag_stack = [t for t in tag.children] + tag_stack
            
        return formatted_text.strip()

# print(get_raw_text(find_all_between(fileHTML, 'i8', 'i9', id_index=id_index['HTML'])))
# print(get_raw_text(find_all_between(fileXML, 'i6', 'i7', id_index=id_index['XML'])))
print(get_raw_text(find_all_between(fileHTML, 'i1a', 'i1b', id_index=id_index['HTML'])))

Item 1A.  Risk Factors
The following discussion of risk factors contains forward-looking statements. These risk factors may be important to understanding other statements in this Form 10-K. The following information should be read in conjunction with Part II, Item 7, “Management's Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and related notes in Part II, Item 8, “Financial Statements and Supplementary Data” of this Form 10-K.
The business, financial condition and operating results of the Company can be affected by a number of factors, whether currently known or unknown, including but not limited to those described below, any one or more of which could, directly or indirectly, cause the Company's actual financial condition and operating results to vary materially from past, or from anticipated future, financial condition and operating results. Any of these factors, in whole or in part, could materially and adversely 

In [118]:
def get_section_text(html: str, label_id: str, id_index: dict[str, int]=id_index) -> str:
    '''
    Return section text from label_id
    
    ### Arguments:
        html : str
            html string
        label_id : str
            label id
        id_index: dict[str, int], optional, default=id_index
            maps str label_id with int index in string
        
    ### Returns:
        text : str
            section text
    '''
    # set start label
    start = label_id
    
    # set end label one after start
    end = id_order[id_order.index(start) + 1]
    
    # return text content
    return get_raw_text(find_all_between(html, start, end, id_index))

In [None]:
print(get_section_text(fileHTML, 'i1a', id_index['HTML']))

In [160]:
get_raw_text(fileXML).strip()

'UNITED STATES \nSECURITIES AND EXCHANGE COMMISSION \nWashington, D.C. 20549 \n__________________________\nFORM 10-K \n__________________________\n (Mark One) \n☒ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 \nFor the fiscal year ended  \nor \n☐TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 \nFor the transition period from            to             \nCommission File Number: 001-35551 \n__________________________\nFacebook, Inc. \n(Exact name of registrant as specified in its charter) \n__________________________\n\n\n\n  \n (State or other jurisdiction of incorporation or organization) (I.R.S. Employer Identification Number)\n1601 Willow Road, Menlo Park, California 94025 \n(Address of principal executive offices and Zip Code) \n(650) 543-4800 \n(Registrant\'s telephone number, including area code) \n__________________________\nSecurities registered pursuant to Section 12(b) of the Act\n\n\n\n\n Title of e

In [169]:
print(get_raw_text(fileXML))

Document












  

UNITED STATES 
SECURITIES AND EXCHANGE COMMISSION 
Washington, D.C. 20549 
__________________________
FORM 10-K 
__________________________
 (Mark One) 
☒ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 
For the fiscal year ended December 31, 2019 
or 
☐TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 
For the transition period from            to             
Commission File Number: 001-35551 
__________________________
Facebook, Inc. 
(Exact name of registrant as specified in its charter) 
__________________________



 Delaware 20-1665019
 (State or other jurisdiction of incorporation or organization) (I.R.S. Employer Identification Number)
1601 Willow Road, Menlo Park, California 94025 
(Address of principal executive offices and Zip Code) 
(650) 543-4800 
(Registrant's telephone number, including area code) 
__________________________
Securities registered pursuant to Section 12(b) 

In [143]:
p = BeautifulSoup('''<p class="title"><b>The Dormouse's story</b></p><p>helloworld</p>''')
for child in p.children:
      print(child)

print("======")
for child in p.descendants:
      print(child)
      
a = ["hi"]
a.extend(p.children)
a.extend(a[1].children)
a

<html><body><p class="title"><b>The Dormouse's story</b></p><p>helloworld</p></body></html>
<html><body><p class="title"><b>The Dormouse's story</b></p><p>helloworld</p></body></html>
<body><p class="title"><b>The Dormouse's story</b></p><p>helloworld</p></body>
<p class="title"><b>The Dormouse's story</b></p>
<b>The Dormouse's story</b>
The Dormouse's story
<p>helloworld</p>
helloworld


['hi',
 <html><body><p class="title"><b>The Dormouse's story</b></p><p>helloworld</p></body></html>,
 <body><p class="title"><b>The Dormouse's story</b></p><p>helloworld</p></body>]