In [59]:
import time
import os
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from fake_useragent import UserAgent

## Configurations

In [49]:
# NASDAQ stock screener page that the browser is pointed at
NASDAQ_URL = 'https://www.nasdaq.com/market-activity/stocks/screener'
companies = {}

# Pick a random user-agent string for this browser session
ua = UserAgent()
user_agent = ua.random

# Chrome is told to save downloads into the current working directory
cwd = os.getcwd()
prefs = {"download.default_directory": cwd}

# Assemble the Chrome options: fake user agent, download path, headless mode
options = webdriver.ChromeOptions()
options.add_argument(f'--user-agent={user_agent}')
options.add_experimental_option("prefs", prefs)
options.add_argument("--headless=new")


## Screening threshold: minimum market cap

In [64]:
# Listings are kept only when their 'Market Cap' value is strictly greater than this
MARKET_CAP = 2_500_000

## Download CSV files for all NASDAQ & NYSE listings

In [50]:
# XPath locators for the screener page controls used below
APPLY_BUTTON_XPATH = '//*[@id="filterModal"]/div/div[3]/button[1]'
DOWNLOAD_BUTTON_XPATH = '/html/body/div[2]/div/main/div[2]/article/div[3]/div[1]/div/div/div[3]/div[2]/div[2]/div/button'
NASDAQ_RADIO_XPATH = '//*[@id="radioItemNASDAQ"]'
NYSE_RADIO_XPATH = '//*[@id="filterModal"]/div/div[2]/div[1]/div/div[2]/label/span[1]'


def apply_and_download():
    """Click the filter's apply button, then the download button.

    Operates on the module-level ``driver``. A fixed 5-second pause is taken
    before each click and after the download click; presumably these give the
    page time to update and the download time to finish (TODO confirm the
    pauses are long enough on slow connections).

    Raises:
        selenium.common.exceptions.NoSuchElementException: if either button
            cannot be found at its XPath.
    """
    time.sleep(5)
    apply_button = driver.find_element(By.XPATH, APPLY_BUTTON_XPATH)
    apply_button.click()
    time.sleep(5)
    download_button = driver.find_element(By.XPATH, DOWNLOAD_BUTTON_XPATH)
    download_button.click()
    time.sleep(5)


# Start driver
driver = webdriver.Chrome(options=options)
try:
    # Load NASDAQ website and wait (up to 60 s) for the apply button to exist
    driver.get(NASDAQ_URL)
    WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.XPATH, APPLY_BUTTON_XPATH))
    )

    # Apply changes and download NASDAQ csv
    nasdaq_button = driver.find_element(By.XPATH, NASDAQ_RADIO_XPATH)
    nasdaq_button.click()
    apply_and_download()

    # Apply changes and download NYSE csv
    nyse_button = driver.find_element(By.XPATH, NYSE_RADIO_XPATH)
    nyse_button.click()
    apply_and_download()
finally:
    # Always shut the browser down, even when a wait, lookup or click above
    # raised, so a failed run does not leave a headless Chrome process behind.
    driver.quit()

## Read in and sort CSVs

In [66]:
# Read CSVs into pandas dataframes
# NOTE: every .csv found under cwd (subdirectories included) is read and none
# are deleted afterwards, so files left over from earlier runs are read too.
companies_dfs = []
for root, dirs, files in os.walk(cwd):
    for file in files:
        if file.endswith(".csv"):
            # os.walk yields bare file names; join with the directory being
            # walked so the file is opened where it was actually found.
            df = pd.read_csv(os.path.join(root, file))
            companies_dfs.append(df)

# Keep unique listings with market cap > MARKET_CAP (first occurrence wins)
companies = {}
for listings in companies_dfs:
    # Only these two columns are needed, so iterate them directly instead of
    # materialising a full row object per listing.
    for symbol, market_cap in zip(listings['Symbol'], listings['Market Cap']):
        # A missing (NaN) market cap compares False and is therefore skipped
        if symbol not in companies and market_cap > MARKET_CAP:
            companies[symbol] = {'Market Cap': market_cap}

print(companies)
print(len(companies))

{'A': {'Market Cap': 40365434818.0}, 'AA': {'Market Cap': 6622135551.0}, 'AAN': {'Market Cap': 214655338.0}, 'AAP': {'Market Cap': 4474665296.0}, 'AAT': {'Market Cap': 1292817537.0}, 'AB': {'Market Cap': 3837119214.0}, 'ABBV': {'Market Cap': 281964477564.0}, 'ABEV': {'Market Cap': 36706431552.0}, 'ABG': {'Market Cap': 4529102738.0}, 'ABM': {'Market Cap': 2787356955.0}, 'ABR': {'Market Cap': 2416637484.0}, 'ABT': {'Market Cap': 186584366596.0}, 'AC': {'Market Cap': 694874380.0}, 'ACA': {'Market Cap': 3755806658.0}, 'ACCO': {'Market Cap': 458503786.0}, 'ACEL': {'Market Cap': 937761098.0}, 'ACHR': {'Market Cap': 1248510640.0}, 'ACI': {'Market Cap': 11623216256.0}, 'ACM': {'Market Cap': 12793010269.0}, 'ACN': {'Market Cap': 193751749421.0}, 'ACR': {'Market Cap': 106462920.0}, 'ACRE': {'Market Cap': 372794899.0}, 'ACV': {'Market Cap': 231819281.0}, 'ADC': {'Market Cap': 5813161016.0}, 'ADCT': {'Market Cap': 358178243.0}, 'ADM': {'Market Cap': 30641934095.0}, 'ADNT': {'Market Cap': 277569903