# Finding Leading Indicator Stocks for Predicting VOO
This notebook includes reusable functions to:
- Preprocess stock data
- Calculate returns
- Identify the stocks whose returns correlate most strongly with VOO's next-day movement

In [None]:
# Imports
import pandas as pd
import numpy as np

In [None]:
# Function to preprocess and compute returns
def preprocess_data(df_nyse, df_voo):
    """Parse dates, compute daily returns and build VOO's next-day target.

    Both inputs are copied first, so the caller's frames are left untouched
    and the function can be re-run on the same raw data with the same result.

    Args:
        df_nyse: Per-stock price rows with 'Date' (strings such as
            '01 Jan 2024'), 'Symbol' and 'Close' columns.
        df_voo: VOO price rows with 'Date' (same format) and 'Close' columns.

    Returns:
        A ``(df_nyse, df_voo)`` tuple of new DataFrames:
        - df_nyse sorted by Symbol/Date with a 'Return' column; rows whose
          return is undefined (the first row of each symbol) are dropped.
        - df_voo sorted by Date with 'Return' and a boolean 'Target' that is
          True when the following row's return is positive. Rows containing
          any NaN, and rows with no known next-day return, are dropped.
    """
    df_nyse = df_nyse.copy()
    df_voo = df_voo.copy()

    df_nyse['Date'] = pd.to_datetime(df_nyse['Date'], format='%d %b %Y')
    df_voo['Date'] = pd.to_datetime(df_voo['Date'], format='%d %b %Y')

    # pct_change is order-dependent, so sort before computing returns.
    df_nyse = df_nyse.sort_values(by=['Symbol', 'Date'])
    df_voo = df_voo.sort_values(by='Date')

    df_nyse['Return'] = df_nyse.groupby('Symbol')['Close'].pct_change()
    df_voo['Return'] = df_voo['Close'].pct_change()

    # `NaN > 0` evaluates to False, so the last row (whose next-day return is
    # unknown) would otherwise be silently labelled as a "down" day. Keep the
    # shifted series around and use it to drop those unlabeled rows.
    next_return = df_voo['Return'].shift(-1)
    df_voo['Target'] = next_return > 0
    df_voo = df_voo[next_return.notna()]
    df_voo = df_voo.dropna()

    return df_nyse.dropna(subset=['Return']), df_voo

In [None]:
# Function to select top N most liquid stocks
def get_most_liquid_stocks(df_nyse, top_n=100):
    """Return the symbols with the highest mean traded volume.

    Args:
        df_nyse: DataFrame with 'Symbol' and 'Volume' columns.
        top_n: Number of symbols to keep.

    Returns:
        List of symbols ordered from highest to lowest mean 'Volume'.
    """
    mean_volume_by_symbol = df_nyse.groupby('Symbol')['Volume'].mean()
    ranked = mean_volume_by_symbol.sort_values(ascending=False)
    top_symbols = ranked.head(top_n).index
    return top_symbols.tolist()

In [None]:
# Function to compute correlations and identify top leading indicators
def find_top_correlated_stocks(df_nyse, df_voo, top_n=5, n_liquid=100):
    """Rank liquid stocks by correlation of their return with VOO's target.

    Args:
        df_nyse: Per-stock rows with 'Date', 'Symbol', 'Close' and 'Volume'
            columns. If a 'Return' column is already present it is used as-is;
            otherwise returns are computed from 'Close' per symbol.
        df_voo: DataFrame with 'Date' and 'Target' columns; 'Date' must be
            comparable with df_nyse['Date'] for the merge to match rows.
        top_n: Number of highest-correlation symbols to return.
        n_liquid: Size of the liquid-stock universe (by mean volume) that is
            considered before ranking. Defaults to 100.

    Returns:
        Series indexed by symbol holding the correlation between that
        symbol's same-day return and 'Target', sorted descending and
        truncated to ``top_n`` entries.
    """
    liquid_symbols = get_most_liquid_stocks(df_nyse, top_n=n_liquid)

    # Copy so the caller's frame is never written through a filtered view.
    df_filtered = df_nyse[df_nyse['Symbol'].isin(liquid_symbols)].copy()

    # Recomputing returns on already-preprocessed data would turn each
    # symbol's first remaining row into NaN and lose a day, so reuse an
    # existing column. When computing, sort first: pct_change is
    # order-dependent.
    if 'Return' not in df_filtered.columns:
        df_filtered = df_filtered.sort_values(by=['Symbol', 'Date'])
        df_filtered['Return'] = df_filtered.groupby('Symbol')['Close'].pct_change()

    df_pivot = df_filtered.pivot(index='Date', columns='Symbol', values='Return')
    df_model = pd.merge(df_voo[['Date', 'Target']], df_pivot, on='Date', how='inner')
    # Keep only dates on which every selected symbol has a return.
    df_model = df_model.dropna()

    # 'Date' is a merge key, not a feature: drop it so it is not correlated
    # against the target. Cast the boolean target to float for corrwith.
    target = df_model['Target'].astype(float)
    features = df_model.drop(columns=['Date', 'Target'])
    correlations = features.corrwith(target).sort_values(ascending=False)
    return correlations.head(top_n)