# SPY Index compared to individual stocks

In [21]:
import os
from dotenv import load_dotenv
from datetime import date, datetime

import pandas as pd
import matplotlib.pyplot as plt
import mplfinance as mpf
from scipy.stats import pearsonr, spearmanr

from alpaca.data import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.timeframe import TimeFrame

from scraper import get_spy_tickers

# Load variables from a local .env file into the process environment.
load_dotenv()

# Credentials come from the PAPER_API_KEY / PAPER_API_SECRET environment
# variables; os.getenv returns None for any variable that is not set.
api_key, api_secret = os.getenv('PAPER_API_KEY'), os.getenv('PAPER_API_SECRET')
client = StockHistoricalDataClient(api_key, api_secret)



#### Data getters

In [2]:
def get_historical_data(symbol, start_date, end_date, timeframe):
    """Fetch historical bars for a symbol and add an open-to-close percent change.

    Args:
        symbol (str): The stock symbol (e.g., "AAPL").
        start_date (datetime): Start of the requested window (sent as the request's ``start``).
        end_date (datetime): End of the requested window (sent as the request's ``end``).
        timeframe (TimeFrame): Bar size for the request (e.g., TimeFrame.Day, TimeFrame.Hour).

    Returns:
        pandas.DataFrame: The bars returned by the module-level ``client`` with an
        extra ``percent_change`` column equal to ``(close - open) / open * 100``
        for each bar. If the request yields no bars, the empty frame is returned
        with an empty ``percent_change`` column instead of raising ``KeyError``.
    """

    params = StockBarsRequest(
        symbol_or_symbols=symbol,
        timeframe=timeframe,
        start=start_date,
        end=end_date
    )

    df = client.get_stock_bars(params).df

    # A request with no bars can come back without 'open'/'close' columns;
    # return a frame with a consistent (empty) schema so callers that check
    # `df.empty` keep working.
    if df.empty:
        df['percent_change'] = pd.Series(index=df.index, dtype='float64')
        return df

    # Intrabar move: how far the close ended from the open, in percent.
    df['percent_change'] = (df['close'] - df['open']) / df['open'] * 100

    return df

#### Plotting

In [3]:
def plot_candlestick(df, title_suffix=""):
    """Draw a candlestick chart with a volume panel for the bars in ``df``.

    The symbol and the date range shown in the title are taken from the first
    and last entries of the frame's ``symbol`` / ``timestamp`` index levels.

    Args:
        df (pandas.DataFrame): Stock data indexed by ``symbol`` and ``timestamp``.
        title_suffix (str, optional): Text appended to the plot title (default: "").
    """

    symbol_level = df.index.get_level_values('symbol')
    timestamp_level = df.index.get_level_values('timestamp')

    symbol = symbol_level[0]
    start_date = timestamp_level[0].strftime('%Y-%m-%d')
    end_date = timestamp_level[-1].strftime('%Y-%m-%d')

    title = f"{symbol} Candlestick Chart ({start_date} - {end_date}) {title_suffix}"

    # mplfinance is given a frame indexed by timestamp only, so flatten the
    # index and re-key on the timestamp column (this builds a new frame; the
    # caller's DataFrame is left untouched).
    bars_by_time = df.reset_index().set_index('timestamp')

    mpf.plot(bars_by_time, type='candle', style='charles', title=title, ylabel='Price (USD)', volume=True)


def plot_percentage_change(df, title_suffix=""):
    """Draw a line chart of the ``percent_change`` column against time.

    Args:
        df (pandas.DataFrame): Stock data indexed by ``symbol`` and ``timestamp``
            with a ``percent_change`` column.
        title_suffix (str, optional): Text appended to the plot title (default: "").
    """

    plt.figure(figsize=(10, 6))

    # The timestamp level serves both as the x axis and as the source of the
    # date range shown in the title.
    timestamps = df.index.get_level_values('timestamp')
    plt.plot(timestamps, df['percent_change'], label='Open/Close % Change')
    plt.xlabel('Date')
    plt.ylabel('Percentage Change (%)')

    # Title pieces: first symbol in the index plus first/last timestamp.
    symbol = df.index.get_level_values('symbol')[0]
    start_date = timestamps[0].strftime('%Y-%m-%d')
    end_date = timestamps[-1].strftime('%Y-%m-%d')

    title = f"{symbol} Open/Close % Change ({start_date} - {end_date}) {title_suffix}"
    plt.title(title)
    plt.grid(True)
    plt.legend()
    plt.show()

# symbol = "SPY"
# start_date = datetime(2024, 6, 1)
# end_date = datetime(2024, 6, 7)
# timeframe = TimeFrame.Hour

# df = get_historical_data(symbol, start_date, end_date, timeframe)
# plot_candlestick(df)
# plot_percentage_change(df)

In [4]:
#### Correlation

In [5]:
def calculate_correlations(spy_df, stock_dfs):
    """Correlate each stock's percentage change with SPY's on shared timestamps.

    For every stock, the ``percent_change`` series is inner-joined with SPY's on
    ``timestamp``; rows where either side is missing a value are dropped, and
    both the Pearson and the Spearman correlation are computed on what remains.

    Args:
        spy_df (pandas.DataFrame): SPY data with a ``timestamp`` index level and a
            ``percent_change`` column.
        stock_dfs (dict): Maps stock symbol -> DataFrame in the same layout as ``spy_df``.

    Returns:
        dict: Maps stock symbol -> ``(pearson_corr, spearman_corr)``. Symbols whose
        frame is empty, or that share fewer than two usable timestamps with SPY,
        are omitted. An empty ``spy_df`` yields an empty dict.
    """
    correlations = {}

    # Without SPY bars there is nothing to align against (and an empty frame
    # may not even carry the columns selected below).
    if spy_df.empty:
        return correlations

    # Reduce SPY to a timestamp-indexed frame holding only 'percent_change'.
    spy_changes = spy_df.reset_index()[['timestamp', 'percent_change']].set_index('timestamp')

    for symbol, df in stock_dfs.items():
        if df.empty:
            continue

        stock_changes = df.reset_index()[['timestamp', 'percent_change']].set_index('timestamp')

        # Keep only timestamps present on both sides; a fixed right-hand suffix
        # keeps the column names independent of the symbol text.
        aligned_df = spy_changes.join(
            stock_changes, how='inner', lsuffix='_spy', rsuffix='_stock'
        ).dropna()

        # Both correlation routines need at least two observations.
        if len(aligned_df) > 1:
            spy_series = aligned_df['percent_change_spy']
            stock_series = aligned_df['percent_change_stock']
            pearson_corr, _ = pearsonr(spy_series, stock_series)
            spearman_corr, _ = spearmanr(spy_series, stock_series)
            correlations[symbol] = (pearson_corr, spearman_corr)

    return correlations

In [9]:
# Benchmark symbol, bar size and analysis window shared by every request below.
symbol_spy = "SPY"
timeframe = TimeFrame.Hour
start_date, end_date = datetime(2024, 6, 1), datetime(2024, 6, 6)

# Benchmark series
spy_df = get_historical_data(symbol_spy, start_date, end_date, timeframe)

# Tickers to compare against SPY (swap in the short list for a quick run)
constituents = get_spy_tickers()
# constituents = ["AAPL", "META", "AMZN"]

# One request per ticker, keyed by symbol
stock_dfs = {}
for symbol in constituents:
    stock_dfs[symbol] = get_historical_data(symbol, start_date, end_date, timeframe)
print("Retreived all data")

# symbol -> (pearson, spearman) against SPY
correlations = calculate_correlations(spy_df, stock_dfs)



Retreived all data
Finished corr. of  MMM
Finished corr. of  AOS
Finished corr. of  ABT
Finished corr. of  ABBV
Finished corr. of  ACN
Finished corr. of  ADBE
Finished corr. of  AMD
Finished corr. of  AES
Finished corr. of  AFL
Finished corr. of  A
Finished corr. of  APD
Finished corr. of  ABNB
Finished corr. of  AKAM
Finished corr. of  ALB
Finished corr. of  ARE
Finished corr. of  ALGN
Finished corr. of  ALLE
Finished corr. of  LNT
Finished corr. of  ALL
Finished corr. of  GOOGL
Finished corr. of  GOOG
Finished corr. of  MO
Finished corr. of  AMZN
Finished corr. of  AMCR
Finished corr. of  AEE
Finished corr. of  AAL
Finished corr. of  AEP
Finished corr. of  AXP
Finished corr. of  AIG
Finished corr. of  AMT
Finished corr. of  AWK
Finished corr. of  AMP
Finished corr. of  AME
Finished corr. of  AMGN
Finished corr. of  APH
Finished corr. of  ADI
Finished corr. of  ANSS
Finished corr. of  AON
Finished corr. of  APA
Finished corr. of  AAPL
Finished corr. of  AMAT
Finished corr. of  APTV
Fi

In [18]:
# Rank symbols by Pearson correlation with SPY, strongest first.
sorted_correlations = sorted(correlations.items(), key=lambda item: item[1][0], reverse=True)
for symbol, (pearson_corr, spearman_corr) in sorted_correlations:
    print(f"{symbol}: Pearson: {pearson_corr:.2f}, Spearman: {spearman_corr:.2f}")

# Totals are accumulated in ranked order, matching the listing above.
total_pearson = sum(pearson for _sym, (pearson, _spearman) in sorted_correlations)
total_spearman = sum(spearman for _sym, (_pearson, spearman) in sorted_correlations)

# With no correlations there is nothing to average; report NaN rather than
# failing with ZeroDivisionError.
n_symbols = len(sorted_correlations)
if n_symbols:
    avg_pearson = total_pearson / n_symbols
    avg_spearman = total_spearman / n_symbols
else:
    avg_pearson = avg_spearman = float('nan')


print("avg_pearson:", avg_pearson)
print("avg_spearman:", avg_spearman)

TMO: Pearson: 0.80, Spearman: 0.86
BKNG: Pearson: 0.79, Spearman: 0.79
OTIS: Pearson: 0.78, Spearman: 0.80
PNR: Pearson: 0.78, Spearman: 0.81
APH: Pearson: 0.76, Spearman: 0.76
PH: Pearson: 0.75, Spearman: 0.77
MCHP: Pearson: 0.74, Spearman: 0.59
MA: Pearson: 0.73, Spearman: 0.78
KLAC: Pearson: 0.73, Spearman: 0.66
AMZN: Pearson: 0.73, Spearman: 0.56
AME: Pearson: 0.73, Spearman: 0.76
PTC: Pearson: 0.72, Spearman: 0.64
MGM: Pearson: 0.71, Spearman: 0.61
NFLX: Pearson: 0.71, Spearman: 0.63
MPWR: Pearson: 0.70, Spearman: 0.54
HUBB: Pearson: 0.70, Spearman: 0.67
BRO: Pearson: 0.69, Spearman: 0.61
VMC: Pearson: 0.69, Spearman: 0.79
LRCX: Pearson: 0.69, Spearman: 0.61
DOV: Pearson: 0.69, Spearman: 0.50
AMP: Pearson: 0.68, Spearman: 0.70
ADI: Pearson: 0.68, Spearman: 0.56
SNPS: Pearson: 0.68, Spearman: 0.69
MSI: Pearson: 0.67, Spearman: 0.68
NDSN: Pearson: 0.67, Spearman: 0.66
COF: Pearson: 0.66, Spearman: 0.66
SPGI: Pearson: 0.66, Spearman: 0.67
ON: Pearson: 0.66, Spearman: 0.59
FTV: Pearso