In [30]:
import yfinance as yf
import requests
import io
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [31]:
NASDAQLISTED_URL = "https://www.nasdaqtrader.com/dynamic/symdir/nasdaqlisted.txt"
OTHERLISTED_URL  = "https://www.nasdaqtrader.com/dynamic/symdir/otherlisted.txt"


In [79]:

class TickerInfo:
    def __init__(self, urls: list[str], params=None):
        self.urls = urls
        self.params = params or {
            "BATCH_SIZE": 200,
            "ALPACA_SNAPSHOT_BATCH": 200,
            "PRICE_MIN": 0,
            "PRICE_MAX": 20.0,
            "DVOL_MIN": 100_000.0,
            "DVOL_MAX": 50_000_000.0,
            "SLEEP_S": 1,
            "ALPACA_DATA_BASE_URL": "https://data.alpaca.markets",
            "ALPACA_TRADING_BASE_URL": "https://paper-api.alpaca.markets",
        }
        self.symbols = self.get_symbols()
        
    def get_symbols(self) -> list[str]:
        ls = []
        for url in self.urls:
            df = self._download_ticker_list(url)
            if 'ACT Symbol' in df.columns:
                ls.extend(df['ACT Symbol'].tolist())
            else:
                ls.extend(df['Symbol'].tolist())

        if len(ls) != len(set(ls)):
            print("Warning: Duplicate symbols found!")

        out = []
        for s in ls:
            sym = str(s).strip().upper()
            if sym.lower() == "nan" or not sym:
                continue
            if "$" in sym:
                continue
            if sym.endswith("ZZT"):
                continue
            if sym.endswith("W") and len(sym) > 1:
                continue
            if sym.endswith("U") and len(sym) > 1:
                continue
            if sym.endswith("R") and len(sym) > 1:
                continue
            if sym.endswith("-W"):
                continue
            out.append(sym)
        return out
    
    def _download_ticker_list(self, url: str) -> pd.DataFrame:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        lines = [ln for ln in r.text.splitlines() if ln.strip() and not ln.startswith("File Creation Time")]
        data = io.StringIO("\n".join(lines))
        df = pd.read_csv(data, sep="|")
        return df

    def _tradable_symbols_alpaca(self, auth: dict) -> set[str]:
        trading_base = auth.get("ALPACA_TRADING_BASE_URL", self.params["ALPACA_TRADING_BASE_URL"]).rstrip("/")
        key_id = (auth.get("ALPACA_KEY") or "").strip()
        secret = (auth.get("ALPACA_SECRET") or "").strip()
        headers = {"APCA-API-KEY-ID": key_id, "APCA-API-SECRET-KEY": secret}

        res = requests.get(
            f"{trading_base}/v2/assets",
            params={"status": "active", "asset_class": "us_equity"},
            headers=headers,
            timeout=30,
        )
        res.raise_for_status()
        assets = res.json()
        return {a.get("symbol") for a in assets if a.get("tradable") is True and a.get("symbol")}

    def filter_with_alpaca(self, auth: dict) -> list[str]:
        alpaca_base = auth.get("ALPACA_DATA_BASE_URL", self.params["ALPACA_DATA_BASE_URL"]).rstrip("/")
        key_id = (auth.get("ALPACA_KEY") or "").strip()
        secret = (auth.get("ALPACA_SECRET") or "").strip()
        headers = {
            "APCA-API-KEY-ID": key_id,
            "APCA-API-SECRET-KEY": secret,
        }

        try:
            tradable = self._tradable_symbols_alpaca(auth)
            symbols = [s for s in self.symbols if s in tradable]
        except Exception as e:
            print(f"Alpaca tradable filter error: {e}")
            symbols = self.symbols

        price_min = float(self.params["PRICE_MIN"])
        price_max = float(self.params["PRICE_MAX"])
        dvol_min = float(self.params["DVOL_MIN"])
        dvol_max = float(self.params["DVOL_MAX"])
        snap_batch = int(self.params.get("ALPACA_SNAPSHOT_BATCH", 100))
        sleep_s = float(self.params.get("SLEEP_S", 1))

        keep = []
        for i in range(0, len(symbols), snap_batch):
            batch = symbols[i:i + snap_batch]
            res = requests.get(
                f"{alpaca_base}/v2/stocks/snapshots",
                params={"symbols": ",".join(batch)},
                headers=headers,
                timeout=30,
            )
            res.raise_for_status()
            data = res.json()
            snaps = data.get("snapshots", data)

            for sym, snap in snaps.items():
                daily = (snap or {}).get("dailyBar") or {}
                px = daily.get("c")
                vol = daily.get("v")
                if px is None or vol is None:
                    continue
                dv = float(px) * float(vol)
                if price_min <= float(px) <= price_max and dvol_min <= dv <= dvol_max:
                    keep.append(sym)

            time.sleep(sleep_s)

        self.symbols = keep
        print(f"Alpaca filter kept {len(keep)} tickers")
        return keep
    
    def get_market_caps(self, auth: dict) -> dict[str, int]:
        symbols = self.symbols
        if (auth.get("ALPACA_KEY") or "").strip() and (auth.get("ALPACA_SECRET") or "").strip():
            symbols = self.filter_with_alpaca(auth)

        market_caps = {}

        print(f"Fetching market caps one-by-one from yfinance for {len(symbols)} tickers")
        for idx, sym in enumerate(symbols):
            try:
                info = yf.Ticker(sym.replace(".", "-")).get_info()
                mc = info.get("marketCap")
                if mc is not None:
                    market_caps[sym] = mc
            except Exception as e:
                print(f"yfinance error {sym}: {e}")
            time.sleep(0.1)

        print(f"yfinance returned market cap for {len(market_caps)} tickers")
        return market_caps



tickers = TickerInfo([NASDAQLISTED_URL, OTHERLISTED_URL])


In [80]:
def get_env():
    with open("../.env") as f:
        lines = f.readlines()
        env = {}
        for line in lines:
            if "=" in line:
                k, v = line.strip().replace('"', '').split("=", 1)
                env[k] = v
    return env
env = get_env()


In [81]:
caps = tickers.get_market_caps(env)

Alpaca filter kept 1195 tickers
Fetching market caps one-by-one from yfinance for 1195 tickers
yfinance error VYX: Failed to perform, curl: (28) Operation timed out after 346128 milliseconds with 0 bytes received. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.
yfinance returned market cap for 1071 tickers


In [82]:
with open("capdata.json", "w") as f:
    import json
    json.dump(caps, f, indent=4)

### Analysis

In [31]:
import pandas as pd
df=pd.read_csv("data/fundamentals.csv")

In [32]:
(df['ebit'] >0).sum()

np.int64(37)

In [53]:
df

Unnamed: 0,ticker,CIK,record_date,assets,liabilities,cash,shares,long_term_debt,revenue,net_income,ebit,operating_cash_flow,assets_yoy,net_income_yoy,operating_cash_flow_yoy
0,AUTL,1730463,2025-11-11,6.619470e+08,3.964950e+08,86124000.0,266143286.0,,3.089100e+07,-442312000.0,-457373000.0,-646427000.0,-0.200054,0.036251,-0.284596
1,TOI,1799191,2025-11-06,1.636190e+08,1.758930e+08,200000.0,98381340.0,,,-101711000.0,-69563000.0,-43998000.0,-0.086861,-0.041367,0.094519
2,DBI,1319947,2025-11-01,2.052989e+09,1.750105e+09,51352000.0,,463089000.0,5.097926e+09,,150551000.0,130554000.0,-0.013348,,4.570322
3,VFF,1584549,2025-11-08,4.183840e+08,1.130190e+08,82561000.0,115517766.0,34594000.0,3.925390e+08,,-12805000.0,19560000.0,0.001283,,-126.520000
4,BYRN,1354866,2025-10-09,7.856200e+07,1.541900e+07,6495000.0,22725515.0,217171.0,1.856371e+07,-14423000.0,19096000.0,-12776000.0,0.331469,0.627503,-3.568207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,SUUN,2011053,2024-06-30,,,,27191075.0,,,,,,,,
187,STSS,1737995,2025-11-14,4.439600e+08,1.074589e+07,10521706.0,28226153.0,,,,-218351255.0,-25479566.0,38.449446,,-1.270538
188,INDO,1757840,2025-06-30,2.521735e+07,3.284705e+06,0.0,13910038.0,1000000.0,,-14742958.0,-12136262.0,-7001374.0,0.443217,-1.408201,-2.584315
189,LPCN,1535955,2025-11-05,1.607317e+07,1.931119e+06,3901040.0,5551931.0,833000.0,2.285423e+06,-16720558.0,-18115509.0,-13887195.0,-0.223455,-0.436275,-1.340349


In [55]:
import requests
from bs4 import BeautifulSoup
import time
HEADERS = {"User-Agent": "leo@gmail.com", "Accept-Encoding": "gzip, deflate"}

def fetch_form4(ciks, limit=5) -> pd.DataFrame:
    rows = []

    for cik in ciks:
        cik = str(cik).zfill(10)

        r = requests.get(
            f"https://data.sec.gov/submissions/CIK{cik}.json",
            headers=HEADERS
        )
        time.sleep(0.2)
        print(f"Fetching CIK {cik}, status {r.status_code}")

        if r.status_code != 200:
            continue

        filings = r.json()["filings"]["recent"]

        accessions = [
            acc.replace("-", "")
            for f, acc in zip(filings["form"], filings["accessionNumber"])
            if f == "4"
        ][:limit]
        print(f"  Found {len(accessions)} Form 4 filings")

        for acc in accessions:
            xml = requests.get(
                f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc}/xslF345X03/form4.xml",
                headers=HEADERS
            ).text
            time.sleep(0.2)

            soup = BeautifulSoup(xml, "xml")

            ticker_node = soup.find("issuerTradingSymbol")
            owner_node = soup.find("reportingOwnerName")

            if not ticker_node or not owner_node:
                continue

            ticker = ticker_node.text
            owner = owner_node.text

            for tx in soup.find_all("nonDerivativeTransaction"):
                if tx.transactionCoding.transactionCode.text != "P":
                    continue

                rows.append({
                    "ticker": ticker,
                    "cik": cik,
                    "owner": owner,
                    "date": tx.transactionDate.value.text,
                    "shares": float(tx.transactionShares.value.text),
                    "price": float(tx.transactionPricePerShare.value.text)
                             if tx.transactionPricePerShare else None,
                })

    return pd.DataFrame(rows)

df = fetch_form4(df['CIK'].tolist()[:10], limit = 3)

Fetching CIK 0001730463, status 200
  Found 0 Form 4 filings
Fetching CIK 0001799191, status 200
  Found 3 Form 4 filings
Fetching CIK 0001319947, status 200
  Found 3 Form 4 filings
Fetching CIK 0001584549, status 200
  Found 3 Form 4 filings
Fetching CIK 0001354866, status 200
  Found 3 Form 4 filings
Fetching CIK 0002018064, status 200
  Found 3 Form 4 filings
Fetching CIK 0001338940, status 200
  Found 3 Form 4 filings
Fetching CIK 0001737287, status 200
  Found 3 Form 4 filings
Fetching CIK 0001422930, status 200
  Found 3 Form 4 filings
Fetching CIK 0001681087, status 200
  Found 3 Form 4 filings


In [60]:
import asyncio, httpx, sqlite3, xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from aiolimiter import AsyncLimiter

# Config
USER_AGENT = "Name your@email.com"
limiter = AsyncLimiter(9, 1) # Stay under 10 req/sec

def setup_db():
    conn = sqlite3.connect("form4.db")
    conn.execute('''CREATE TABLE IF NOT EXISTS trades 
        (ticker TEXT, owner TEXT, date TEXT, type TEXT, shares REAL, price REAL)''')
    return conn

async def get_xml(client, url):
    async with limiter:
        try:
            resp = await client.get(url, headers={"User-Agent": USER_AGENT})
            return resp.text if resp.status_code == 200 else None
        except: return None

def parse_and_save(conn, xml_str):
    try:
        root = ET.fromstring(xml_str)
        ticker = root.findtext(".//issuerTradingSymbol")
        owner = root.findtext(".//rptOwnerName")
        
        for tx in root.findall(".//nonDerivativeTransaction"):
            vals = (
                ticker, owner,
                tx.findtext(".//transactionDate/value"),
                tx.findtext(".//transactionAcquiredDisposedCode/value"),
                float(tx.findtext(".//transactionShares/value") or 0),
                float(tx.findtext(".//transactionPricePerShare/value") or 0)
            )
            conn.execute("INSERT INTO trades VALUES (?,?,?,?,?,?)", vals)
        conn.commit()
    except: pass

async def process_days(n_days):
    conn = setup_db()
    async with httpx.AsyncClient(timeout=10) as client:
        for i in range(n_days):
            date_dt = datetime.now() - timedelta(days=i)
            date_str = date_dt.strftime("%Y%m%d")
            year, qtr = date_str[:4], f"QTR{(date_dt.month-1)//3 + 1}"
            
            print(f"Checking {date_str}...")
            idx_url = f"https://www.sec.gov/Archives/edgar/daily-index/{year}/{qtr}/form.{date_str}.idx"
            idx_data = await get_xml(client, idx_url)
            
            if not idx_data: continue # Skip weekends/holidays

            # Find all Form 4 paths in the index
            xml_urls = []
            for line in idx_data.splitlines():
                if line.startswith("4  ") or line.startswith("4/A"):
                    path = line.split()[-1].replace("-", "").replace(".txt", "/form4.xml")
                    xml_urls.append(f"https://www.sec.gov/Archives/{path}")

            # Download and parse
            print(f"Found {len(xml_urls)} filings. Downloading...")
            tasks = [get_xml(client, u) for u in xml_urls]
            results = await asyncio.gather(*tasks)
            
            for xml in results:
                if xml: parse_and_save(conn, xml)
                
    conn.close()
    print("Done.")

asyncio.run(process_days(3)) # Change 3 to however many days you want

RuntimeError: asyncio.run() cannot be called from a running event loop