In [25]:
from selenium import webdriver
import json
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

In [21]:
# sources.json is read from the parent of the current working directory.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which differs on Windows).
with open(Path.cwd().parent/'sources.json', encoding='utf-8') as f:
    sources = json.load(f)

In [22]:
# Show the names of all sources defined in sources.json.
sources.keys()

dict_keys(['NYSE', 'NYSE Withdrawn', 'Nasdaq', 'Nasdaq Priced', 'Nasdaq Withdrawn', 'JPX', 'Shanghai', 'Euronext', 'AAStocks', 'LSE', 'CNInfo', 'Frankfurt', 'KRX', 'TWSE', 'BME', 'SGX', 'IDX', 'BM', 'BIT', 'IPOScoop', 'NasdaqNordic', 'East Money', 'NSE', 'AlphaVantage', 'SpotlightAPI', 'ASX', 'TokyoIPO', 'TMX', 'IPOHub'])

In [7]:
# Create the Selenium Firefox WebDriver used by the cells below.
# NOTE(review): no driver.quit() is visible in this section - confirm the
# browser session is closed once the notebook is done with it.
driver = webdriver.Firefox()

In [8]:
def load_url(driver, url):
    """
    Points the driver at the given url, skipping the request when the driver
    already reports that url as its current one.
    :param driver: object exposing ``current_url`` and ``get(url)``
    :param url: address to load
    :return: None
    """
    if driver.current_url == url:
        # Already on the requested page, nothing to do
        return
    driver.get(url)

In [9]:
def return_soup(driver):
    """
    Builds a BeautifulSoup tree from the driver's current page source.
    :param driver: object exposing ``page_source``
    :return: BeautifulSoup object parsed with the built-in 'html.parser'
    """
    html = driver.page_source
    return BeautifulSoup(html, 'html.parser')

In [17]:
def parse_table(soup, get_links: bool = False, **kwargs):
    """
    Parses the element identified by the keyword arguments and returns a pandas dataframe.

    The table element is located in one of three ways, checked in this order:
      1. ``table_title`` is given: find the text node equal to ``table_title`` and take the
         first ``table_elem`` inside that node's grandparent (``table_num`` and
         ``table_attrs`` are ignored in this case).
      2. ``table_attrs`` is not given: take the ``table_num``-th ``table_elem`` on the page.
      3. Otherwise: take the first ``table_elem`` matching ``table_attrs``.

    Recognised keyword arguments (any others are ignored):
      url (only used in the error message), table_elem, table_num (default 0), table_attrs,
      table_title, row_elem, cell_elem, link_elem, link_key, columns, column_names_as_row.

    :param soup: parsed page exposing ``find`` and ``find_all`` (e.g. a BeautifulSoup object)
    :param get_links: bool, if true the function will add links to data returned
        (requires both ``link_elem`` and ``link_key``)
    :return: pandas dataframe, or None when ``table_title`` is not present on the page or
        when no rows are kept
    :raises AssertionError: when the lookup yields no table element
    :raises IndexError: when ``table_num`` is out of range for the page
    """
    url = kwargs.get('url')
    table_elem = kwargs.get('table_elem')
    table_num = kwargs.get('table_num', 0)
    table_attrs = kwargs.get('table_attrs')
    table_title = kwargs.get('table_title')
    row_elem = kwargs.get('row_elem')
    cell_elem = kwargs.get('cell_elem')
    link_elem = kwargs.get('link_elem')
    link_key = kwargs.get('link_key')
    # Copy the column names: they are padded below and the caller's list (typically part of
    # a long-lived config dict) must not grow on every call. Also tolerates a missing list.
    cols = list(kwargs.get('columns') or [])
    column_names_as_row = kwargs.get('column_names_as_row')

    if table_title is not None:
        # Look the title up once and reuse the node
        title_node = soup.find(string=table_title)
        if title_node is None:
            return None
        table = title_node.parent.parent.find(table_elem)
    elif table_attrs is None:
        table = soup.find_all(table_elem)[table_num]
    else:
        table = soup.find(table_elem, attrs=table_attrs)
    assert table is not None, f'Unable to find {table_elem} with these attributes {table_attrs} on {url}'

    # Rows whose second cell repeats the second column name are header rows and are skipped.
    # With fewer than two column names there is nothing to compare against.
    header_marker = cols[1] if len(cols) > 1 else None
    want_links = get_links and link_elem is not None and link_key is not None

    table_data = []
    for row in table.find_all(row_elem):
        cells = [c.text.strip() for c in row.find_all(cell_elem)]
        if want_links:
            cells.extend(link[link_key] for link in row.find_all(link_elem))
        if len(cells) > 1 and cells[1] != header_marker:
            table_data.append(cells)

    df = pd.DataFrame(table_data)
    if len(df) == 0:
        # Nothing usable was parsed
        return None

    # adding columns for dataframe and making sure the column list is the correct length
    cols_in_row = df.shape[1]
    if len(cols) < cols_in_row:
        cols.extend(f"Unnamed_column_{c}" for c in range(cols_in_row - len(cols)))
    elif len(cols) > cols_in_row:
        cols = cols[:cols_in_row]
    df.columns = cols

    # Treat blank / whitespace-only cells as missing and discard rows that are entirely missing
    df = df.replace(r'^\s*$', np.nan, regex=True).dropna(how='all')

    # Some sources give the column headers as rows in the table
    if column_names_as_row:
        # Row 0 may already have been removed above if it was entirely blank
        df = df.drop(index=0, errors='ignore').reset_index(drop=True)
    return df

In [28]:
# Source names picked out for testing; not referenced by the other cells in this section.
sources_to_test = ['LSE', 'Nasdaq', 'Nasdaq Priced', 'NYSE']

In [29]:
# Key into `sources` selecting the configuration exercised by the cells below.
test_source = 'Nasdaq'

In [35]:
# Display the configuration entry for the selected source.
sources[test_source]

{'source_type': 'website',
 'exchange': 'NASDAQ',
 'rank': 2,
 'location': 'New York',
 'url': 'https://www.nasdaq.com/market-activity/ipos?tab=upcoming',
 'table_num': 2,
 'table_elem': 'tbody',
 'table_title': 'Upcoming',
 'row_elem': 'tr',
 'cell_elem': ['th', 'td'],
 'header_elem': 'th',
 'header_attrs': {'class': ['market-calendar-table__columnheader']},
 'columns': ['ticker',
  'company_name',
  'exchange',
  'price',
  'shares_offered',
  'ipo_date',
  'deal_size'],
 'column_names_as_row': False,
 'file': 'Nasdaq',
 'db_table_raw': 'source_nasdaq_raw',
 'db_table': 'source_nasdaq'}

In [30]:
# Navigate the browser to the selected source's url (load_url does nothing
# if the driver already reports that url as current).
load_url(driver, sources[test_source]['url'])

In [31]:
# Parse the driver's current page source into a BeautifulSoup tree.
soup = return_soup(driver)

In [32]:
# Print the index and attributes of every element matching the source's
# 'table_elem'; the index is what 'table_num' refers to in parse_table and
# the attributes are what 'table_attrs' would match against.
for i, tbl in enumerate(soup.find_all(sources[test_source]['table_elem'])):
    print(i, tbl.attrs)

0 {}
1 {}
2 {'class': ['market-calendar-table__body']}
3 {'class': ['market-calendar-table__body']}
4 {'class': ['market-calendar-table__body']}
5 {'class': ['market-calendar-table__body']}


In [33]:
# The whole source config is passed as keyword arguments.
# NOTE(review): when the config contains 'table_title', parse_table locates the
# table through the title text and ignores 'table_num' / 'table_attrs'; it
# returns None if that exact text is not found on the page or no rows are kept.
df = parse_table(soup, **sources[test_source])

In [34]:
# Raises AttributeError if parse_table returned None (title text not found,
# or no rows parsed) - check `df is not None` before relying on this cell.
df.head()

AttributeError: 'NoneType' object has no attribute 'head'