In [2]:
from pathlib import Path
from dotenv import load_dotenv
import os
from alpaca.data import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.timeframe import TimeFrame, TimeFrameUnit
import pandas as pd
import time
import logging
import exchange_calendars as ec
from tqdm import tqdm
import requests
import pickle

In [2]:
# Load variables from a .env file (when one is found) into the process environment
# so the os.getenv() lookups in the cells below (ALPACA_KEY, ALPACA_SECRET, DATA_DIR) can see them.
load_dotenv()

True

In [None]:
# Build the ticker universe from the NASDAQ and NYSE symbol lists published in the
# rreichel3/US-Stock-Symbols GitHub repository.
# NOTE: both URLs follow the `main` branch, so the resulting list (and its length)
# can change from one run to the next.
TICKER_LIST_TIMEOUT_S = 30


def _fetch_ticker_list(url, timeout=TICKER_LIST_TIMEOUT_S):
    """Download one ticker list and return the decoded JSON payload.

    Raises ``requests.HTTPError`` on a non-2xx response instead of trying to
    JSON-decode an error page, and gives up after ``timeout`` seconds instead
    of letting the notebook hang on a stalled connection.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


nas_url = 'https://raw.githubusercontent.com/rreichel3/US-Stock-Symbols/refs/heads/main/nasdaq/nasdaq_tickers.json'
nas_tickers = _fetch_ticker_list(nas_url)

nyse_url = 'https://raw.githubusercontent.com/rreichel3/US-Stock-Symbols/refs/heads/main/nyse/nyse_tickers.json'
nyse_tickers = _fetch_ticker_list(nyse_url)

full_list_raw = nas_tickers + nyse_tickers
# A symbol can appear in both lists; keep each one once.
full_list_unique = list(set(full_list_raw))
# Remove tickers with special characters, usually tickers with different share classes
full_list_alpha = [t for t in full_list_unique if t.isalpha()]
# Sorted so the download loops always walk the universe in the same order.
tickers = sorted(full_list_alpha)
len(tickers)

6382

In [None]:
# Load secrets from environment variable
# Alpaca credentials come from the environment (populated by load_dotenv above),
# never from literals in the notebook.
ALPACA_KEY = os.environ.get("ALPACA_KEY")
ALPACA_SECRET = os.environ.get("ALPACA_SECRET")
# Historical market-data client shared by both download loops below.
client = StockHistoricalDataClient(api_key=ALPACA_KEY, secret_key=ALPACA_SECRET)

Configure the directories used to store data.  This repo does not ship with data, but it includes the code that downloads it.

In [3]:
# Root of all on-disk data, taken from the DATA_DIR environment variable.
_data_dir_env = os.getenv('DATA_DIR')
if _data_dir_env is None:
    # Path(None) would otherwise fail with an opaque
    # "expected str, bytes or os.PathLike object, not NoneType".
    raise RuntimeError(
        "Environment variable DATA_DIR is not set; define it in the shell or in .env "
        "before running this notebook."
    )
SYS_DATA_ROOT = Path(_data_dir_env)
PROJECT_DATA_ROOT = SYS_DATA_ROOT / 'stocks' / 'alpaca_2024_Q4_to_2025_Q3'

# One small feather file per ticker holding its total volume (written by the monthly download loop).
vol_temp = PROJECT_DATA_ROOT / 'month_interval' / 'temp'
# One feather file per ticker holding its cleaned 1-minute bars.
min_data_dir = PROJECT_DATA_ROOT / 'minute_interval'
# Scratch directory for screener results (lives directly under the system data root).
screen_data_temp = SYS_DATA_ROOT / 'screen_results_temp'

# Create every directory up front; exist_ok keeps the cell safe to re-run.
for _data_subdir in (vol_temp, min_data_dir, screen_data_temp):
    _data_subdir.mkdir(parents=True, exist_ok=True)

Download month-granularity bars from Alpaca to get each ticker's total traded volume.

In [None]:
# Route log records for the monthly-volume download to stocks_dl_volume.log (append mode).
# force=True replaces any handlers already attached to the root logger; without it,
# basicConfig() silently does nothing when logging was configured earlier in the same
# kernel (for example when cells are re-run), and records go to the wrong file.
logging.basicConfig(
    filename='stocks_dl_volume.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filemode='a',
    force=True,
)
logger = logging.getLogger(__name__)

# For every ticker, request monthly bars for 2024-10-01 .. 2025-10-01 and cache the
# summed volume as a one-row feather file in vol_temp.
for ticker in tqdm(tickers):
    file_path = vol_temp / f'{ticker}.feather'
    # Resume support: a ticker whose result file already exists is skipped.
    if file_path.exists():
        continue
    request_params = StockBarsRequest(
        symbol_or_symbols=ticker,
        timeframe=TimeFrame(1, TimeFrameUnit.Month),
        start='2024-10-01',
        end='2025-10-01',
        adjustment='split',
        sort='asc'
    )
    try:
        bars_response = client.get_stock_bars(request_params)
    except Exception:
        # `except Exception` rather than a bare `except:` so that Ctrl-C / a kernel
        # interrupt still stops this long loop; logger.exception records the cause.
        logger.exception(f"Could not request ticker {ticker}")
        continue
    try:
        df = bars_response.df
        ticker_vol = int(df['volume'].sum())
    except Exception:
        # Reached e.g. when the response frame has no 'volume' column.
        logger.exception(f'Could not read volume for ticker {ticker}')
        continue
    # One-row frame indexed by ticker; the per-ticker files are concatenated in the next cell.
    result = pd.DataFrame()
    result.loc[ticker, 'total_volume'] = ticker_vol
    result.to_feather(file_path)
    # Brief pause between successful requests.
    time.sleep(0.1)

100%|██████████| 6382/6382 [04:59<00:00, 21.30it/s]  


In [None]:
# Combine the per-ticker volume files into a single table ranked by total volume.
# All parts are read first and concatenated once: calling pd.concat inside the loop
# re-copies the accumulated frame on every iteration (quadratic) and starts from an
# empty DataFrame, which newer pandas versions warn about.
volume_parts = [pd.read_feather(fp) for fp in vol_temp.glob('*.feather')]
if not volume_parts:
    # Fail with an actionable message instead of a KeyError on 'total_volume'.
    raise FileNotFoundError(
        f"No per-ticker volume files found in {vol_temp}; run the monthly download cell first."
    )
results_df = pd.concat(volume_parts).sort_values(by='total_volume', ascending=False)
results_df.to_feather('stocks_by_volume_alpaca_2024_Q4_to_2025_Q3.feather')
results_df

Unnamed: 0,total_volume
NVDA,5.758458e+10
OPEN,3.116093e+10
TSLA,2.504293e+10
INTC,2.343397e+10
PLTR,2.108463e+10
...,...
BKHAU,3.089500e+04
SVAC,2.351000e+04
QUMS,1.662300e+04
WHLRL,1.198900e+04


In [None]:
# Reload the ranked volume table from disk, so this cell works without re-running the downloads.
results_df = pd.read_feather('stocks_by_volume_alpaca_2024_Q4_to_2025_Q3.feather')
# The saved table is sorted by total_volume descending, so the first 500 rows
# are the 500 highest-volume tickers.
top_volume = results_df.iloc[:500]
top_volume

Unnamed: 0,total_volume
NVDA,5.758458e+10
OPEN,3.116093e+10
TSLA,2.504293e+10
INTC,2.343397e+10
PLTR,2.108463e+10
...,...
PROK,9.891874e+08
WY,9.879066e+08
EQNR,9.874454e+08
BX,9.805559e+08


In [77]:
# The ticker symbols are the index of the volume table; the minute-download loop iterates this list.
top_volume_tickers = top_volume.index.tolist()

In [None]:
# Route log records for the minute-data download to min_data_dl.log (append mode).
# force=True is required here: basicConfig() is a no-op when the root logger already
# has handlers, which is the case once the volume-download cell has configured
# stocks_dl_volume.log in the same kernel. Without it these records would keep going
# to that file and min_data_dl.log would never be written.
logging.basicConfig(
    filename='min_data_dl.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filemode='a',
    force=True,
)
logger = logging.getLogger(__name__)

def clean(index, df, ticker=None):
    """Align ``df`` to ``index`` and forward-fill small gaps.

    Parameters
    ----------
    index : pandas.Index
        Target index. Rows of ``df`` whose label is not in ``index`` are
        dropped; labels missing from ``df`` become all-NaN rows.
    df : pandas.DataFrame
        Frame to align; it must have a ``DatetimeIndex``.
    ticker : str, optional
        Label used in the rejection log message. When omitted, the function
        falls back to a module-level ``ticker`` variable if one exists (the
        download loop's variable, which the original implementation read
        implicitly) and to ``'<unknown>'`` otherwise, so it no longer raises
        ``NameError`` when called outside that loop.

    Returns
    -------
    pandas.DataFrame or None
        The reindexed frame with NaNs forward-filled and any remaining NaN
        rows (leading gaps that have nothing to fill from) dropped, or
        ``None`` when more than 5% of the reindexed cells are NaN.

    Raises
    ------
    TypeError
        If ``df.index`` is not a ``DatetimeIndex``.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError("Passed DF doesn't have Datetime index")

    if ticker is None:
        # Backward compatibility with callers that rely on the global loop variable.
        ticker = globals().get('ticker', '<unknown>')

    df_ridx = df.reindex(index)
    num_na_ridx = int(df_ridx.isnull().sum().sum())
    # NaN cells + non-NaN cells == total number of cells.
    total_cells = df_ridx.size
    # An empty frame has no cells; treat it as 0% NaN rather than dividing 0 by 0.
    na_ridx_pct = num_na_ridx / total_cells if total_cells else 0.0
    if na_ridx_pct > .05:
        # Same logger the notebook binds to `logger`; looked up here so the
        # function does not depend on that global being defined.
        logging.getLogger(__name__).error(f"{ticker} DF contains > 5% nans.")
        return None

    return df_ridx.ffill().dropna()

Alpaca has data outside of regular trading hours.  We want to fit the data to the NYSE exchange calendar,
whose regular session runs from 9:30 am to 4:00 pm Eastern Time (ET).  Data outside of the regular window gets dropped.

In [3]:
# Regular-session schedule of the NYSE calendar for the study window.
xnys = ec.get_calendar("XNYS")
sched = xnys.schedule.loc['2024-10-01':'2025-09-30']

# Build one UTC minute grid covering every session, open through close inclusive.
# NOTE(review): the close minute itself (e.g. 20:00 UTC) is part of the grid; if Alpaca
# labels bars by their start time, that bar lies just after the regular session — confirm.
trading_index = pd.DatetimeIndex([], tz='UTC')
for session_open, session_close in zip(sched['open'], sched['close']):
    session_minutes = pd.date_range(session_open, session_close, freq="1min", tz='UTC')
    trading_index = trading_index.union(session_minutes)

trading_index

DatetimeIndex(['2024-10-01 13:30:00+00:00', '2024-10-01 13:31:00+00:00',
               '2024-10-01 13:32:00+00:00', '2024-10-01 13:33:00+00:00',
               '2024-10-01 13:34:00+00:00', '2024-10-01 13:35:00+00:00',
               '2024-10-01 13:36:00+00:00', '2024-10-01 13:37:00+00:00',
               '2024-10-01 13:38:00+00:00', '2024-10-01 13:39:00+00:00',
               ...
               '2025-09-30 19:51:00+00:00', '2025-09-30 19:52:00+00:00',
               '2025-09-30 19:53:00+00:00', '2025-09-30 19:54:00+00:00',
               '2025-09-30 19:55:00+00:00', '2025-09-30 19:56:00+00:00',
               '2025-09-30 19:57:00+00:00', '2025-09-30 19:58:00+00:00',
               '2025-09-30 19:59:00+00:00', '2025-09-30 20:00:00+00:00'],
              dtype='datetime64[ns, UTC]', length=97210, freq=None)

In [None]:
# Download 1-minute bars for each high-volume ticker, align them to trading_index with
# clean(), and store one feather file per ticker. Tickers that clean() rejects are
# remembered in rejected.pkl so they are not requested again on later runs.
rejected_fp = Path('./rejected.pkl')

# Load the persisted reject list once. The loop below is the only writer, and it keeps
# the in-memory list and the file in sync, so re-reading the file per ticker is unnecessary.
# NOTE: pickle.load can execute arbitrary code; this is acceptable only because
# rejected.pkl is produced by this notebook. Do not point it at an untrusted file.
if rejected_fp.exists():
    with open(rejected_fp, "rb") as f:
        rejected_tickers = pickle.load(f)
else:
    rejected_tickers = []

for ticker in tqdm(top_volume_tickers):
    file_path = min_data_dir / f'{ticker}.feather'
    # Resume support: skip tickers that were already downloaded or already rejected.
    if file_path.exists():
        logger.info(f'Feather file for ticker {ticker} already exists')
        continue
    if ticker in rejected_tickers:
        logger.info(f'Ticker {ticker} already in rejected_tickers list')
        continue

    logger.info(f'Requesting ticker {ticker}...')

    request_params_min = StockBarsRequest(
        symbol_or_symbols=ticker,
        timeframe=TimeFrame(1, TimeFrameUnit.Minute),
        start='2024-10-01',
        end='2025-10-01 00:00',  # up to and excluding
        adjustment='split',
        sort='asc'
    )
    try:
        bars_response = client.get_stock_bars(request_params_min)
    except Exception:
        # `except Exception` rather than a bare `except:` so a kernel interrupt
        # still stops the loop; logger.exception keeps the traceback.
        logger.exception(f"Could not request ticker {ticker}")
        continue

    df = bars_response.df
    try:
        # Presumably the frame carries a (symbol, timestamp) MultiIndex — confirm against
        # alpaca-py. Dropping level 0 must leave the DatetimeIndex that clean() requires.
        df.index = df.index.droplevel(0)
    except Exception:
        logger.exception(f"Could not drop level for ticker {ticker}")
        # Keep a plain-text record of these tickers for manual follow-up.
        with open('no_drop.txt', 'a') as file:
            file.write(ticker + '\n')
        continue

    df = df.drop(columns=['trade_count', 'vwap'])
    df_cleaned = clean(trading_index, df)
    if df_cleaned is None:
        logger.warning(f'Ticker {ticker} was rejected for too many nans')
        rejected_tickers.append(ticker)
        # Persist immediately so an interrupted run does not lose the reject list.
        with open(rejected_fp, "wb") as f:
            pickle.dump(rejected_tickers, f)
        continue

    df_cleaned.to_feather(file_path)
    logger.info(f"Wrote {ticker} to feather")
    # Brief pause between successful downloads.
    time.sleep(.1)

100%|██████████| 500/500 [00:00<00:00, 21024.08it/s]

Complete





Next, run `./screeners/screen_mr.py` before continuing; the following cell reads a results CSV from the `screeners/` directory.

In [79]:
# Load the screener output and key it by ticker symbol.
# NOTE(review): this is an absolute, machine-specific path; it must be edited to run elsewhere.
screen_results = pd.read_csv('/home/user/Documents/trading/strat_dev/mr_1/screeners/results_1min_2025-12-03-14:35.csv')
screen_results = screen_results.set_index('ticker')

# Every ticker is screened for sufficient volatility (via natr) and for a
# minimum share price above 1.0.
# A ticker passes the ADF part when its `below_10%` flag is set OR its ADF statistic is >= 0
# (presumably the flag means "ADF statistic below the 10% critical level" — confirm in screen_mr.py).
flag_below_10pct = screen_results['below_10%'].eq(True)
adf_non_negative = screen_results['adf'].ge(0)
price_floor_ok = screen_results['min_price'].ge(1.0)
natr_floor_ok = screen_results['natr_mean'].ge(.2)

passes_screen = (flag_below_10pct | adf_non_negative) & price_floor_ok & natr_floor_ok
filtered = screen_results.loc[passes_screen]

# Most negative ADF statistic first.
filtered = filtered.sort_values(by='adf')
filtered.to_feather('./bt_tickers.feather')

In [80]:
# Read the filtered ticker table back from disk and display it as a sanity check.
# NOTE: `df` is also the per-ticker frame name inside the minute-download loop; this cell rebinds it.
df = pd.read_feather('bt_tickers.feather')
df

Unnamed: 0_level_0,total_volume,adf,adf_10%_level,below_10%,adf_p,natr_min,natr_mean,min_price,mean_price,max_price
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AUR,3925395000.0,-4.088083,-2.566786,True,0.001013,0.119713,0.288609,4.785,6.379491,10.76
OSCR,2713092000.0,-3.473104,-2.566786,True,0.008705,0.078473,0.254265,11.2,15.59307,22.7623
MRNA,2411343000.0,-3.088582,-2.566786,True,0.02741,0.066299,0.226214,23.22,34.470613,66.93
LCID,2502446000.0,-3.078086,-2.566786,True,0.028218,0.103837,0.476985,15.28,24.850187,36.0
NVAX,1481190000.0,-3.067438,-2.566786,True,0.029059,0.085223,0.246876,5.065,8.098863,15.22
MSTR,4139685000.0,-2.982633,-2.566786,True,0.036548,0.09613,0.313662,157.2418,338.804098,535.23
NNE,966366500.0,-2.92794,-2.566786,True,0.042194,0.111786,0.443427,13.8,28.977992,48.0177
ENPH,1500498000.0,-2.913535,-2.566786,True,0.043797,0.076241,0.215112,29.9375,56.490207,113.62
MBLY,1358362000.0,-2.786736,-2.566786,True,0.060194,0.050418,0.215269,11.1697,15.557231,22.4674
RIVN,8736452000.0,-2.743353,-2.566786,True,0.066832,0.088328,0.232432,9.505,12.768785,17.135
