## In this notebook, we remove all the tables from the 10-K reports in HTML format. Below is the general process:
- Load the HTML as a BeautifulSoup object
- Keep only Item 7 of each report
- Traverse each level of the HTML tags to ensure we find all the content between `<table>` and `</table>` tags
- Replace each original table with a placeholder carrying an integer table ID (effectively removing the table)

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data paths used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#pip install openai

In [None]:
# import packages
import pandas as pd
import numpy as np
import json
import glob
import os.path
import re
from bs4 import BeautifulSoup, NavigableString, Tag
#import openai
from tqdm import tqdm

In [None]:
# load raw data in HTML format
file_path = "/content/drive/MyDrive/w210_capstone_project/data/SEC_Edgar_Annual_Financial_Filings_2021/raw"
result_path = "/content/drive/MyDrive/w210_capstone_project/data/SEC_Edgar_Annual_Financial_Filings_2021/working2/"

file_list = glob.glob(os.path.join(file_path,'*.htm'))

#### for debugging
# Restrict the run to a single known report. Pick it explicitly and fail
# loudly when it is missing, instead of silently falling back to whatever
# file the loop variable happened to hold last.
debug_target = '1800667_10K_2020_0001193125-21-041470.htm'
file = next((path for path in file_list if path.endswith(debug_target)), None)
if file is None:
    raise FileNotFoundError(f"{debug_target} not found under {file_path}")

# load HTML data into soup object
with open(file, 'r') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')
print(file)

file_list = [file]

/content/drive/MyDrive/w210_capstone_project/data/SEC_Edgar_Annual_Financial_Filings_2021/raw/1800667_10K_2020_0001193125-21-041470.htm


In [None]:
#%%writefile preprocessor.py
import re
from bs4 import BeautifulSoup

def to_text(soup):
  """Flatten a tag (or any object whose str() is HTML markup) to plain text.

  Every opening and closing tag is replaced by a space so that the contents
  of adjacent cells do not run together, HTML entities left in the serialized
  markup (e.g. ``&amp;``) are decoded, and runs of whitespace are collapsed
  to a single space.

  Args:
    soup: object whose ``str()`` is the HTML markup to flatten.

  Returns:
    The text content as a single whitespace-normalized string.
  """
  from html import unescape

  # Raw-string patterns: the previous non-raw literals relied on invalid
  # escape sequences ('\/', '\<', '\s'), which newer Python versions warn on.
  without_tags = re.sub(r'<[^<>]+>', ' ', str(soup))
  # Decode entities before collapsing whitespace so entity-encoded spaces
  # are normalized as well.
  return re.sub(r'\s+', ' ', unescape(without_tags)).strip()

# We may have items like this: Item \n7.
# Therefore, we cannot simply match on "Item 7.".
# Instead, we remove all special characters (including spaces)
# so that the underlying text becomes "Item7.",
# and then we check whether the text starts with "Item7.".

def match_section(keyword, text):
  """Tell whether `text` begins with `keyword` once formatting is ignored.

  Everything in `text` other than ASCII letters, digits and dots is dropped,
  and the comparison is case-insensitive, so 'Item \n7. Foo' matches the
  keyword 'Item7.'.
  """
  normalized = re.sub(r'[^0-9a-zA-Z\.]+', '', text).lower()
  prefix = keyword.lower()
  return normalized[:len(prefix)] == prefix

# Suppose data contains the following tags:
# <div><p>hello</p></div>
# <p>hello</p>
# Note that the second tag is nested in the first;
# this is what BeautifulSoup gives us.
# We only want to keep the first and ignore the second,
# so we check whether the markup of the tag at hand is
# contained in the markup of the tag we captured last.
# If it is, we ignore the tag at hand.
# Since the outermost tag is always considered first,
# we also end up with the most comprehensive tag, which
# includes all the nested ones.

def cleanse_section(data):
  """Filter a list of HTML tags down to the ones worth keeping.

  A tag is dropped when its text is blank, or when its markup is a substring
  of the markup of the most recently kept tag.

  Args:
    data: list of HTML tags.

  Returns:
    A new list with the surviving tags, in their original order.
  """
  kept = []
  for candidate in data:
    has_text = candidate.text.strip() != ''
    if not has_text:
      continue
    # only the last kept tag is consulted, never earlier ones
    if kept and str(candidate) in str(kept[-1]):
      continue
    kept.append(candidate)
  return kept

# extract all the tags of an item section
# given the beginning keyword of the item

def get_section(soup, keyword):
  """Collect the tags that make up one "Item" section of a report.

  Walks the direct children of ``soup.body`` in order. Collection starts
  after the first tag whose normalized text starts with ``keyword`` and stops
  at the first later tag whose normalized text starts with 'Item' but not
  with ``keyword``.

  NOTE(review): only the direct children of <body> are visited, so a heading
  wrapped inside another element (e.g. one big <div> around the whole report)
  will not be found here -- confirm against the reports being processed.

  Args:
    soup: parsed report (BeautifulSoup object).
    keyword: normalized heading prefix, e.g. 'Item7.' (see match_section).

  Returns:
    The section's tags after cleanse_section() has dropped blank and
    duplicated ones; an empty list when the document has no <body> or the
    heading is never found.
  """
  # a document without <body> has nothing to scan
  if soup.body is None:
    return []
  started = False
  data = []
  for p in soup.body:
    # bare strings between tags carry no structure we care about
    if isinstance(p, NavigableString):
      continue
    text = p.text.replace("&nbsp;", " ").strip()
    is_heading = match_section(keyword, text)
    if not started:
      # keep scanning until the section heading shows up
      started = is_heading
      continue
    # ignore repeated occurrences of the section heading itself
    if is_heading:
      continue
    # reaching the next item means this section is complete
    if match_section('Item', text):
      break
    data.append(p)
  # now we remove all the nested and empty tags in a particular item section
  return cleanse_section(data)

def parse_tables(data):
  """Swap every table in `data` for an `<openai>ID</openai>` placeholder.

  Relies on module-level state: appends one record per extracted table to
  `table_lines`, labels it with `file_name`, and numbers it with `table_id`,
  which is incremented for each extracted table.

  Tables whose markup contains 'PageNo' and tables with blank text are
  replaced by an empty string instead of a placeholder.

  Args:
    data: list of HTML tags belonging to one section.

  Returns:
    A list of BeautifulSoup objects, one per input tag, re-parsed from the
    markup after the tables were replaced.
  """
  def swap_for_placeholder(table):
    # Record the table and hand back its placeholder markup ('' when the
    # table is a page-number table or has no text).
    global table_id
    if 'PageNo' in str(table) or table.text.strip() == '':
      return ''
    # call API here using table.text.strip() or str(table) (this includes all HTML tags)
    table_lines.append({"file": file_name, "table_id": table_id, "text": to_text(table)})
    marker = f'<openai>{table_id}</openai>'
    table_id += 1
    return marker

  rewritten = []
  for element in data:
    if element.name == 'table':
      # the tag itself is a table: the placeholder replaces it entirely
      markup = swap_for_placeholder(element)
    else:
      # gather the non-blank tables inside the tag, skipping any whose
      # markup is already contained in the previously gathered table
      found = []
      for candidate in element.find_all('table'):
        if candidate.text.strip() == '':
          continue
        if found and str(candidate) in str(found[-1]):
          continue
        found.append(candidate)
      markers = [swap_for_placeholder(table) for table in found]
      # substitute each gathered table's markup within the tag's markup
      markup = str(element)
      for table, marker in zip(found, markers):
        markup = markup.replace(str(table), marker)
    rewritten.append(BeautifulSoup(markup, "html.parser"))
  return rewritten

def get_tables(data):
  """List the tables found in a section together with their tag index.

  Tables whose markup contains 'PageNo' and tables with blank text are
  skipped. A table found inside a non-table tag is also skipped when its
  markup is contained in the markup of the most recently recorded table.

  Args:
    data: list of HTML tags belonging to one section.

  Returns:
    A list of ``(index, table)`` tuples, where ``index`` is the position in
    ``data`` of the tag that is, or contains, the table.
  """
  tables = []
  # loop through all the tags in a particular section
  for i, d in enumerate(data):
    # we ask if the particular tag is a table
    if d.name == 'table':
      # we remove the page No. tables and empty tables
      if 'PageNo' not in str(d) and d.text.strip() != '':
        tables.append((i, d))
    else:
      # if it is not a table, we try to find all the tables inside the tag
      # b/c of the nested structure
      for t in d.find_all('table'):
        markup = str(t)
        if 'PageNo' in markup or t.text.strip() == '':
          continue
        # dedup against the markup of the last recorded table itself; the
        # stored entries are (index, table) tuples, so str() of the whole
        # entry would depend on the table's repr rather than its markup
        if tables and markup in str(tables[-1][1]):
          continue
        tables.append((i, t))
  # each entry carries the index of the containing tag and the table itself
  return tables

def parse_html(file, keyword):
  """Read one report and return a section of it with the tables replaced.

  All <a> elements are removed from the parsed document first; the section
  is then located with get_section() and passed through parse_tables().

  Args:
    file: path of the HTML report.
    keyword: normalized heading prefix of the section, e.g. 'Item7.'.

  Returns:
    The list produced by parse_tables() for the selected section.
  """
  with open(file, 'r') as handle:
    document = BeautifulSoup(handle, 'html.parser')
  # drop every anchor element before looking for the section
  for anchor in document.select('a'):
    anchor.extract()
  # tables = get_tables(data)
  return parse_tables(get_section(document, keyword))

In [None]:
#from preprocessor import parse_html
# Module-level state that parse_tables() reads and updates. A `global`
# statement has no effect at module scope, so plain assignments suffice.
table_lines = []  # one record per extracted table, accumulated across files
file_name = ''    # base name of the report currently being processed
table_id = 0      # running table counter, restarted for every report

for file in tqdm(file_list):
#file = file_list[0]
  table_id = 0
  file_name = os.path.basename(file)
  item7 = parse_html(file, 'Item7.')
  #item7A = parse_html(file, 'Item7A.')
  #item8 = parse_html(file, 'Item8.')
  # rebuild a minimal HTML document holding only the selected sections
  text = ['<html><body>']
  for item in [
    { 'title': 'Item 7', 'tags': item7 },
    #{ 'title': 'Item 7A', 'tags': item7A }, 
    #{ 'title': 'Item 8', 'tags': item8 }, 
  ]:
    title = item['title']
    text.append(f"<div id='{title}'>")
    for tag in item['tags']:
      text.append(str(tag))
    text.append("</div>")
  text.append('</body></html>')
  # NOTE(review): the write below is disabled, so output_file is computed
  # but nothing is saved -- re-enable it to persist the cleaned report.
  output_file = os.path.join(result_path, file_name)
  #with open(output_file, 'w') as f:
  #  f.writelines(text)

100%|██████████| 1/1 [00:02<00:00,  2.89s/it]

<p style="margin-top:1em; margin-bottom:0em; page-break-before:always">
<hr align="CENTER" size="3" style="COLOR:#999999" width="100%"/>
<h5 align="left"></h5>
<center><div align="left" style="width:8.5in">
<p align="center" style="margin-top:0pt; margin-bottom:0pt; font-size:10pt; font-family:Times New Roman"><b>TABLE OF CONTENTS </b></p>
<p style="font-size:12pt;margin-top:0pt;margin-bottom:0pt"> </p>
<table align="center" border="0" cellpadding="0" cellspacing="0" style="BORDER-COLLAPSE:COLLAPSE; font-family:Times New Roman; font-size:10pt" width="100%">
<tr>
<td width="9%"></td>
<td valign="bottom" width="1%"></td>
<td width="86%"></td>
<td valign="bottom" width="1%"></td>
<td></td>
<td></td>
<td></td></tr>
<tr style="page-break-inside:avoid ; font-family:Times New Roman; font-size:8pt">
<td valign="bottom"> </td>
<td valign="bottom">  </td>
<td valign="bottom"> </td>
<td valign="bottom">  </td>
<td align="center" colspan="2" style="border-bottom:1.00pt solid #000000" valign="botto




In [None]:
# Load table_lines.pkl from result_path into a DataFrame.
# NOTE(review): no visible cell writes this pickle -- confirm where it is
# produced. Unpickling can execute arbitrary code, so only load files that
# this project created itself.
df = pd.read_pickle(os.path.join(result_path, 'table_lines.pkl'))

In [None]:
# Preview the first rows of the loaded DataFrame.
df.head()

Unnamed: 0,file,table_id,text
0,1803696_10K_2020_0001564590-21-009444.htm,0,"• On November 9, 2020, we entered into a paten..."
1,1803696_10K_2020_0001564590-21-009444.htm,1,"• On June 1, 2020, we completed the Mergers wi..."
2,1803696_10K_2020_0001564590-21-009444.htm,2,"• On December 10, 2018, we entered into an agr..."
3,1803696_10K_2020_0001564590-21-009444.htm,3,"Years ended December 31, 2020 2019 2018 Revenu..."
4,1803696_10K_2020_0001564590-21-009444.htm,4,"Years Ended December 31, 2020 vs. 2019 2020 20..."
