# Stock Filtering Development

This notebook develops the stock-filtering step for the 5-minute pullback strategy.

To run the scanner on a specific subset of stocks, the trader first needs to filter the market down to those stocks.
<br>
<br>
For now, I have found market cap and price to be the best global criteria for my stock selection. Local criteria such as volume are applied later, at the scanner stage.

## Imports

In [None]:
import yfinance as yf
import pandas as pd
import os
from datetime import datetime
import time
from glob import glob
import re
import shutil
%cd ..
%pwd

/Users/ivanosipchyk/dev/investing/5-min-pullback


'/Users/ivanosipchyk/dev/investing/5-min-pullback'

## Functions

In [None]:
def add_market_cap_label(data, small_cap_threshold, soft_margin_coef=1.2):
    """Add a ``marketCapLabel`` column derived from the ``marketCap`` column.

    Labels:
        * ``'small'``        -- market cap strictly below ``small_cap_threshold``
        * ``'almost small'`` -- from the threshold up to
          ``small_cap_threshold * soft_margin_coef`` (inclusive)
        * ``'mid/large'``    -- anything above the soft margin
        * ``'unknown'``      -- market cap is missing (``None`` / ``NaN``)

    Args:
        data: DataFrame with a ``marketCap`` column. It is modified in place.
        small_cap_threshold: Upper bound (exclusive) of the ``'small'`` bucket.
        soft_margin_coef: Multiplier applied to the threshold to obtain the
            upper bound of the ``'almost small'`` bucket.

    Returns:
        The same ``data`` object, with the ``marketCapLabel`` column added.
    """
    soft_limit = small_cap_threshold * soft_margin_coef

    def label_cap(x):
        # A missing value compares False against everything (NaN) or raises
        # TypeError (None), so it must be handled before the comparisons;
        # otherwise it would be silently reported as 'mid/large'.
        if pd.isna(x):
            return 'unknown'
        if x < small_cap_threshold:
            return 'small'
        if x <= soft_limit:
            return 'almost small'
        return 'mid/large'

    data['marketCapLabel'] = data['marketCap'].apply(label_cap)
    return data


def add_price_range(data, lower_price_bound, upper_price_bound, soft_margin_coef=1.2):
    """Add a ``priceRangeLabel`` column derived from the ``currentPrice`` column.

    Labels:
        * ``'in'``        -- ``lower_price_bound <= price <= upper_price_bound``
        * ``'almost in'`` -- above ``upper_price_bound`` but not above
          ``upper_price_bound * soft_margin_coef``
        * ``'out'``       -- below the lower bound or above the soft upper bound
        * ``'unknown'``   -- price is missing (``None`` / ``NaN``)

    The soft margin is applied to the upper bound only.

    Args:
        data: DataFrame with a ``currentPrice`` column. It is modified in place.
        lower_price_bound: Lowest price still considered in range.
        upper_price_bound: Highest price still considered in range.
        soft_margin_coef: Multiplier applied to ``upper_price_bound`` to obtain
            the upper limit of the ``'almost in'`` bucket.

    Returns:
        The same ``data`` object, with the ``priceRangeLabel`` column added.
    """
    soft_upper_bound = upper_price_bound * soft_margin_coef

    def label_price(x):
        # A missing price fails every comparison (NaN) or raises TypeError
        # (None); without this guard NaN would fall through to 'almost in'.
        if pd.isna(x):
            return 'unknown'
        if x < lower_price_bound or x > soft_upper_bound:
            return 'out'
        # At this point x >= lower_price_bound is already guaranteed.
        if x <= upper_price_bound:
            return 'in'
        return 'almost in'

    data['priceRangeLabel'] = data['currentPrice'].apply(label_price)
    return data


def _batch_run_stamp(path):
    """Return the run timestamp embedded in a ``batch_<NNN>_<run_time>.csv`` path."""
    # Work on the bare file name so underscores in the directory part of the
    # path cannot shift the split.
    stem = os.path.splitext(os.path.basename(path))[0]
    return stem.split('_', 2)[-1]


def _load_processed(temp_dir, output_dir, is_sunday):
    """Load rows that were already fetched, or return ``None`` if there are none.

    Leftover ``batch_*.csv`` files in ``temp_dir`` take precedence (most recent
    run only). Otherwise, on any day but Sunday, the newest
    ``market_update_*.csv`` in ``output_dir`` is used.
    """
    batch_files = glob(os.path.join(temp_dir, 'batch_*.csv'))
    if batch_files:
        # Pick the latest run by its timestamp; sorting whole file names would
        # order by batch number first and could select an older run.
        latest_run = max(_batch_run_stamp(f) for f in batch_files)
        run_files = sorted(f for f in batch_files if _batch_run_stamp(f) == latest_run)
        processed_df = pd.concat([pd.read_csv(f) for f in run_files], ignore_index=True)
        processed_count = len(set(processed_df['Ticker'].tolist()))
        print(f"Found {processed_count} previously processed symbols from temp folder ({latest_run})")
        return processed_df

    if is_sunday:
        return None

    # File names carry an ISO date, so a reverse lexical sort puts the newest first.
    output_files = sorted(glob(os.path.join(output_dir, 'market_update_*.csv')), reverse=True)
    if not output_files:
        return None

    last_file = output_files[0]
    try:
        processed_df = pd.read_csv(last_file)
        processed_count = len(set(processed_df['Ticker'].tolist()))
    except Exception as e:
        # Best effort: an unreadable previous update only means nothing is skipped.
        print(f"Failed to load {last_file}: {e}")
        return None
    print(f"Using {last_file} as previous market update, skipping {processed_count} already processed symbols.")
    return processed_df


def _fetch_batch(batch, features):
    """Fetch ``features`` for every symbol in ``batch`` and return them as a DataFrame."""
    ticker_data = {}
    tickers = yf.Tickers(" ".join(batch))

    for symbol, ticker in tickers.tickers.items():
        try:
            info = ticker.info
            ticker_data[symbol] = {feature: info.get(feature, None) for feature in features}
            time.sleep(0.01)
        except Exception as e:
            # One failing symbol must not abort the whole batch.
            print(f"Skipping {symbol}: {e}")
            continue

    # reindex guarantees the feature columns exist even when every symbol in
    # the batch failed, so the labeling step does not hit a KeyError.
    batch_df = pd.DataFrame.from_dict(ticker_data, orient='index').reindex(columns=features)
    batch_df.index.name = 'Ticker'
    return batch_df.reset_index()


def get_stock_info(symbols_list, features, small_cap_threshold, price_bounds,
                   sleep_secs=60, batch_size=1000, temp_dir='scanner/temp', output_dir='daily_output'):
    """Download, label and save stock info for ``symbols_list``.

    Symbols are fetched in batches; each labeled batch is written to
    ``temp_dir`` so an interrupted run can be resumed. On any day but Sunday,
    symbols found in leftover temp batches (or, failing that, in the latest
    ``market_update_*.csv``) are not fetched again and their stored rows are
    carried over into the result. On Sunday every symbol is fetched again and
    nothing is carried over.

    Args:
        symbols_list: Ticker symbols to process.
        features: Keys to read from each ticker's info; must include
            ``marketCap`` and ``currentPrice``, which the labeling step reads.
        small_cap_threshold: Threshold passed to ``add_market_cap_label``.
        price_bounds: ``(lower, upper)`` price bounds passed to ``add_price_range``.
        sleep_secs: Pause between consecutive batches, in seconds.
        batch_size: Number of symbols per batch.
        temp_dir: Folder for per-batch CSV files; removed after a successful save.
        output_dir: Folder receiving ``market_update_<YYYY-MM-DD>.csv``.

    Returns:
        DataFrame with the collected rows, minus those labeled both
        ``'mid/large'`` and ``'out'``. Empty (and nothing is written) when
        there is no data at all.
    """
    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    today = datetime.today()
    today_str = today.strftime('%Y-%m-%d')
    run_time = today.strftime('%Y-%m-%d_%H-%M-%S')
    is_sunday = today.weekday() == 6

    processed_df = _load_processed(temp_dir, output_dir, is_sunday)

    # Filter tickers to only those not yet processed
    if not is_sunday:
        processed_symbols = set() if processed_df is None else set(processed_df['Ticker'].tolist())
        remaining_tickers = [t for t in symbols_list if t not in processed_symbols]
    else:
        # Sunday is a full refresh: fetch everything, carry nothing over.
        remaining_tickers = list(symbols_list)
        processed_df = None
    print(f"Remaining tickers to process: {len(remaining_tickers)}")

    # Process in batches
    for batch_no, start in enumerate(range(0, len(remaining_tickers), batch_size), start=1):
        batch = remaining_tickers[start:start + batch_size]
        print(f"Processing batch {batch_no} with {len(batch)} tickers...")

        batch_df = _fetch_batch(batch, features)
        batch_df = add_market_cap_label(batch_df, small_cap_threshold)
        batch_df = add_price_range(batch_df, lower_price_bound=price_bounds[0], upper_price_bound=price_bounds[1])

        batch_filename = f"batch_{str(batch_no).zfill(3)}_{run_time}.csv"
        batch_path = os.path.join(temp_dir, batch_filename)
        batch_df.to_csv(batch_path, index=False)
        print(f"Saved batch to {batch_path}")

        # Throttle between batches only; there is nothing to wait for after the last one.
        if start + batch_size < len(remaining_tickers):
            print(f'Waiting {sleep_secs} seconds...')
            time.sleep(sleep_secs)

    # Combine carried-over rows with all batches from this run. Without the
    # carried-over rows, every symbol skipped above would vanish from the output.
    frames = [] if processed_df is None else [processed_df]
    run_batches = sorted(glob(os.path.join(temp_dir, f'*_{run_time}.csv')))
    frames.extend(pd.read_csv(f) for f in run_batches)

    if not frames:
        # pd.concat raises on an empty list; do not overwrite an existing update with nothing.
        print("No stock data collected; nothing to save.")
        return pd.DataFrame(columns=['Ticker', *features, 'marketCapLabel', 'priceRangeLabel'])

    final_df = pd.concat(frames, ignore_index=True)

    # Filter out mid/large-cap stocks that are out of price range
    final_df = final_df[~((final_df['marketCapLabel'] == 'mid/large') & (final_df['priceRangeLabel'] == 'out'))]

    # Save final results to output/ folder
    output_path = os.path.join(output_dir, f'market_update_{today_str}.csv')
    final_df.to_csv(output_path, index=False)
    print(f"Saved final market update to {output_path}")

    # Delete temp folder
    try:
        shutil.rmtree(temp_dir)
        print(f"Deleted temporary folder {temp_dir}")
    except Exception as e:
        print(f"Failed to delete temp folder: {e}")

    return final_df

## Download Labeled Data

In [None]:
# Load the ticker universe. The file may carry list-style punctuation
# (brackets, quotes, commas), which is stripped before splitting.
with open('data/symbol_lists/symbols_all.txt', 'r') as file:
    symbols = file.read()
    symbols = re.sub(r'[\[\]\'\,]', '', symbols)
    # Split on any whitespace so blank lines and stray spaces do not
    # produce empty or padded ticker symbols.
    symbols = symbols.split()

# Fields read from each ticker's info; 'marketCap' and 'currentPrice'
# are required by the labeling functions.
features = [
    'currentPrice', 'averageDailyVolume10Day', 'marketCap', 'sharesOutstanding'
]

small_cap_threshold = 2_000_000_000
price_bounds = (1, 100)

# The keyword must match the function's parameter name `symbols_list`.
test_df = get_stock_info(
    symbols_list=symbols,
    features=features,
    small_cap_threshold=small_cap_threshold,
    price_bounds=price_bounds
)

Remaining tickers to process: 5488
Processing batch 1 with 1000 tickers...


HTTP Error 404: 
HTTP Error 404: 
HTTP Error 404: 
HTTP Error 404: 
HTTP Error 404: 
HTTP Error 404: 
HTTP Error 404: 


Saved batch to scanner/temp/batch_001_2025-07-23_21-43-40.csv
Waiting 60...
Processing batch 2 with 1000 tickers...


HTTP Error 404: 
HTTP Error 404: 
HTTP Error 404: 
HTTP Error 404: 
HTTP Error 404: 
HTTP Error 404: 


Saved batch to scanner/temp/batch_002_2025-07-23_21-43-40.csv
Waiting 60...
Processing batch 3 with 1000 tickers...


HTTP Error 404: 
HTTP Error 404: 
HTTP Error 404: 


Saved batch to scanner/temp/batch_003_2025-07-23_21-43-40.csv
Waiting 60...
Processing batch 4 with 1000 tickers...


HTTP Error 404: 
HTTP Error 404: 


Saved batch to scanner/temp/batch_004_2025-07-23_21-43-40.csv
Waiting 60...
Processing batch 5 with 1000 tickers...
Saved batch to scanner/temp/batch_005_2025-07-23_21-43-40.csv
Waiting 60...
Processing batch 6 with 488 tickers...


HTTP Error 503: 


Skipping FTFT: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping RVYL: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping INTS: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping PRSO: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping CHNR: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping ALBT: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping SUNE: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping MBIO: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping OLB: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping OP: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping ULY: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping BGLC: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping TAOP: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping SOBR: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping IBG: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping SNTG: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping AEMD: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping IDAI: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping SMTK: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping SNES: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping MOVE: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping QNRX: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping BSLK: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping JAGX: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping BKYI: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping SGBX: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping DGLY: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping XELB: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping POLA: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping GLTO: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping CMND: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping UGRO: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping BOXL: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping LCFY: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping PCSA: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping JWEL: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping TTNP: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping PRFX: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping AMIX: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping ENSC: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping AGRI: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping PALI: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping XPON: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping ELAB: argument of type 'NoneType' is not iterable


HTTP Error 503: 


Skipping BBLG: argument of type 'NoneType' is not iterable
Saved batch to scanner/temp/batch_006_2025-07-23_21-43-40.csv
Waiting 60...
