In [30]:
import yfinance as yf
import requests
import io
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [31]:
NASDAQLISTED_URL = "https://www.nasdaqtrader.com/dynamic/symdir/nasdaqlisted.txt"
OTHERLISTED_URL  = "https://www.nasdaqtrader.com/dynamic/symdir/otherlisted.txt"


In [79]:

class TickerInfo:
    def __init__(self, urls: list[str], params=None):
        self.urls = urls
        self.params = params or {
            "BATCH_SIZE": 200,
            "ALPACA_SNAPSHOT_BATCH": 200,
            "PRICE_MIN": 0,
            "PRICE_MAX": 20.0,
            "DVOL_MIN": 100_000.0,
            "DVOL_MAX": 50_000_000.0,
            "SLEEP_S": 1,
            "ALPACA_DATA_BASE_URL": "https://data.alpaca.markets",
            "ALPACA_TRADING_BASE_URL": "https://paper-api.alpaca.markets",
        }
        self.symbols = self.get_symbols()
        
    def get_symbols(self) -> list[str]:
        ls = []
        for url in self.urls:
            df = self._download_ticker_list(url)
            if 'ACT Symbol' in df.columns:
                ls.extend(df['ACT Symbol'].tolist())
            else:
                ls.extend(df['Symbol'].tolist())

        if len(ls) != len(set(ls)):
            print("Warning: Duplicate symbols found!")

        out = []
        for s in ls:
            sym = str(s).strip().upper()
            if sym.lower() == "nan" or not sym:
                continue
            if "$" in sym:
                continue
            if sym.endswith("ZZT"):
                continue
            if sym.endswith("W") and len(sym) > 1:
                continue
            if sym.endswith("U") and len(sym) > 1:
                continue
            if sym.endswith("R") and len(sym) > 1:
                continue
            if sym.endswith("-W"):
                continue
            out.append(sym)
        return out
    
    def _download_ticker_list(self, url: str) -> pd.DataFrame:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        lines = [ln for ln in r.text.splitlines() if ln.strip() and not ln.startswith("File Creation Time")]
        data = io.StringIO("\n".join(lines))
        df = pd.read_csv(data, sep="|")
        return df

    def _tradable_symbols_alpaca(self, auth: dict) -> set[str]:
        trading_base = auth.get("ALPACA_TRADING_BASE_URL", self.params["ALPACA_TRADING_BASE_URL"]).rstrip("/")
        key_id = (auth.get("ALPACA_KEY") or "").strip()
        secret = (auth.get("ALPACA_SECRET") or "").strip()
        headers = {"APCA-API-KEY-ID": key_id, "APCA-API-SECRET-KEY": secret}

        res = requests.get(
            f"{trading_base}/v2/assets",
            params={"status": "active", "asset_class": "us_equity"},
            headers=headers,
            timeout=30,
        )
        res.raise_for_status()
        assets = res.json()
        return {a.get("symbol") for a in assets if a.get("tradable") is True and a.get("symbol")}

    def filter_with_alpaca(self, auth: dict) -> list[str]:
        alpaca_base = auth.get("ALPACA_DATA_BASE_URL", self.params["ALPACA_DATA_BASE_URL"]).rstrip("/")
        key_id = (auth.get("ALPACA_KEY") or "").strip()
        secret = (auth.get("ALPACA_SECRET") or "").strip()
        headers = {
            "APCA-API-KEY-ID": key_id,
            "APCA-API-SECRET-KEY": secret,
        }

        try:
            tradable = self._tradable_symbols_alpaca(auth)
            symbols = [s for s in self.symbols if s in tradable]
        except Exception as e:
            print(f"Alpaca tradable filter error: {e}")
            symbols = self.symbols

        price_min = float(self.params["PRICE_MIN"])
        price_max = float(self.params["PRICE_MAX"])
        dvol_min = float(self.params["DVOL_MIN"])
        dvol_max = float(self.params["DVOL_MAX"])
        snap_batch = int(self.params.get("ALPACA_SNAPSHOT_BATCH", 100))
        sleep_s = float(self.params.get("SLEEP_S", 1))

        keep = []
        for i in range(0, len(symbols), snap_batch):
            batch = symbols[i:i + snap_batch]
            res = requests.get(
                f"{alpaca_base}/v2/stocks/snapshots",
                params={"symbols": ",".join(batch)},
                headers=headers,
                timeout=30,
            )
            res.raise_for_status()
            data = res.json()
            snaps = data.get("snapshots", data)

            for sym, snap in snaps.items():
                daily = (snap or {}).get("dailyBar") or {}
                px = daily.get("c")
                vol = daily.get("v")
                if px is None or vol is None:
                    continue
                dv = float(px) * float(vol)
                if price_min <= float(px) <= price_max and dvol_min <= dv <= dvol_max:
                    keep.append(sym)

            time.sleep(sleep_s)

        self.symbols = keep
        print(f"Alpaca filter kept {len(keep)} tickers")
        return keep
    
    def get_market_caps(self, auth: dict) -> dict[str, int]:
        symbols = self.symbols
        if (auth.get("ALPACA_KEY") or "").strip() and (auth.get("ALPACA_SECRET") or "").strip():
            symbols = self.filter_with_alpaca(auth)

        market_caps = {}

        print(f"Fetching market caps one-by-one from yfinance for {len(symbols)} tickers")
        for idx, sym in enumerate(symbols):
            try:
                info = yf.Ticker(sym.replace(".", "-")).get_info()
                mc = info.get("marketCap")
                if mc is not None:
                    market_caps[sym] = mc
            except Exception as e:
                print(f"yfinance error {sym}: {e}")
            time.sleep(0.1)

        print(f"yfinance returned market cap for {len(market_caps)} tickers")
        return market_caps



tickers = TickerInfo([NASDAQLISTED_URL, OTHERLISTED_URL])


In [80]:
def get_env():
    with open("../.env") as f:
        lines = f.readlines()
        env = {}
        for line in lines:
            if "=" in line:
                k, v = line.strip().replace('"', '').split("=", 1)
                env[k] = v
    return env
env = get_env()


In [81]:
caps = tickers.get_market_caps(env)

Alpaca filter kept 1195 tickers
Fetching market caps one-by-one from yfinance for 1195 tickers
yfinance error VYX: Failed to perform, curl: (28) Operation timed out after 346128 milliseconds with 0 bytes received. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.
yfinance returned market cap for 1071 tickers


In [82]:
with open("capdata.json", "w") as f:
    import json
    json.dump(caps, f, indent=4)

### Analysis

In [83]:
with open("capdata.json", "r") as f:
    caps = json.load(f)

In [92]:
filtered = {}
for key, value in caps.items():
    value = int(value)
    if 50_000_000 <= value <= 400_000_000:
        filtered[key] = value 

In [95]:
with open("filtered_lowcap_stocks.json", "w") as f:
    json.dump(filtered, f, indent=4)