In [3]:
pip install beautiful soup

Collecting beautiful
  Downloading beautiful-0.0.2-py3-none-any.whl (3.0 kB)
Collecting soup
  Downloading soup-0.1.0.tar.gz (528 bytes)
Building wheels for collected packages: soup
  Building wheel for soup (setup.py) ... [?25ldone
[?25h  Created wheel for soup: filename=soup-0.1.0-py3-none-any.whl size=1406 sha256=40b155bffc0a30629b36a087dcdab9906b143f043033aedc9ffdaa78f6d8a792
  Stored in directory: /Users/christine/Library/Caches/pip/wheels/ea/4e/9f/e0ae399e9c9422307dca388919588ca8889727a9eb29e07873
Successfully built soup
Installing collected packages: beautiful, soup
Successfully installed beautiful-0.0.2 soup-0.1.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install bs4

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1272 sha256=07493d0dcc84a540f9d646c3026eaef9ea2ba6584367bf67947c9d1ce4da4bc1
  Stored in directory: /Users/christine/Library/Caches/pip/wheels/19/f5/6d/a97dd4f22376d4472d5f4c76c7646876052ff3166b3cf71050
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

In [2]:
# Browser-style User-Agent string (Chrome 44 on 64-bit Linux) that get_soup
# puts on its session so requests look like they come from a regular browser.
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# Language preference used for the Accept-Language / Content-Language headers:
# US English first, then any English with quality weight 0.5.
LANGUAGE = "en-US,en;q=0.5"

def get_soup(url, timeout=30):
    """Download `url` and return its HTML parsed into a BeautifulSoup object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; handed
            to `requests` unchanged. Without it a stalled server would
            block the call indefinitely.

    Returns:
        A soup built from the response body using the "html.parser" parser.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: connection failure or timeout.
    """
    # the context manager releases the session's pooled connections on exit
    with requests.Session() as session:
        session.headers.update({
            # present ourselves as a regular browser
            'User-Agent': USER_AGENT,
            # ask for English content (optional)
            'Accept-Language': LANGUAGE,
            'Content-Language': LANGUAGE,
        })
        response = session.get(url, timeout=timeout)
        # fail loudly instead of parsing an error page that contains no tables
        response.raise_for_status()
        return bs(response.content, "html.parser")

In [3]:
def get_all_tables(soup):
    """Return every <table> element that `soup.find_all` locates.

    Args:
        soup: Parsed document (or any object exposing `find_all`).

    Returns:
        Whatever `soup.find_all("table")` yields, in the order it reports.
    """
    found_tables = soup.find_all("table")
    return found_tables

In [4]:
def get_table_headers(table):
    """Return the header texts taken from the first row of `table`.

    Args:
        table: A table element exposing `find` / `find_all`.

    Returns:
        List with the whitespace-stripped text of every <th> in the first
        <tr>. Empty when the table has no rows at all or when its first
        row contains no <th> cells.
    """
    first_row = table.find("tr")
    if first_row is None:
        # a table without any <tr> has no header row to read
        return []
    return [th.text.strip() for th in first_row.find_all("th")]

In [5]:
def get_table_rows(table):
    """Return the cell texts of every row of `table` except the first.

    The first <tr> is always skipped because it is treated as the header
    row (see `get_table_headers`).

    Args:
        table: A table element exposing `find_all`.

    Returns:
        One list per remaining <tr>, holding the whitespace-stripped text
        of its cells in document order. Both <td> and <th> cells are
        collected, so a row-header <th> that sits next to <td> cells is
        kept and the columns stay aligned with the header row. A row
        without any cells yields an empty list.
    """
    rows = []
    for tr in table.find_all("tr")[1:]:
        # passing a list of tag names matches either kind of cell
        cells = [cell.text.strip() for cell in tr.find_all(["td", "th"])]
        rows.append(cells)
    return rows

In [6]:
def save_as_csv(table_name, headers, rows):
    """Write `rows` to "<table_name>.csv" using `headers` as column names.

    The header count and the row width are allowed to differ (tables with
    no header row, or with cells spanning several columns): missing column
    names are generated as "column_<n>" (1-based position) and columns
    that have a header but no data are written empty.

    Args:
        table_name: Output path without the ".csv" extension.
        headers: List of column names; may be empty.
        rows: List of rows, each a list of cell values.
    """
    frame = pd.DataFrame(rows)
    # widen to whichever is larger so neither headers nor data are dropped
    width = max(len(headers), frame.shape[1])
    frame = frame.reindex(columns=range(width))
    generated = [f"column_{pos}" for pos in range(len(headers) + 1, width + 1)]
    frame.columns = list(headers) + generated
    frame.to_csv(f"{table_name}.csv")

In [7]:
def main(url):
    """Scrape `url` and dump each HTML table on the page to its own CSV file.

    Files are named table-1.csv, table-2.csv, ... following the order in
    which the tables are found; progress messages go to stdout.
    """
    # download the page and pull out its tables in one go
    page_tables = get_all_tables(get_soup(url))
    print(f"[+] Found a total of {len(page_tables)} tables.")

    for position, current in enumerate(page_tables, start=1):
        # column names come from the first row, data from the rest
        column_names = get_table_headers(current)
        body_rows = get_table_rows(current)

        # numbering is 1-based so the first file is table-1.csv
        csv_stem = f"table-{position}"
        print(f"[+] Saving {csv_stem}")
        save_as_csv(csv_stem, column_names, body_rows)

In [10]:
if __name__ == "__main__":
    import sys

    # page scraped when no URL is supplied
    DEFAULT_URL = 'https://www.the-numbers.com/movie/budgets/all'

    # Usage as a script: python html_table_extractor.py [URL]
    # The first argument is honoured only when it looks like a web address.
    # NOTE(review): when this cell runs inside a notebook, sys.argv[1] may be
    # a kernel launcher option rather than a URL — the default is used then.
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0].startswith(("http://", "https://")):
        url = cli_args[0]
    else:
        url = DEFAULT_URL
    main(url)

[+] Found a total of 1 tables.
[+] Saving table-1
