In [2]:
import pandas_datareader.data as web
import pandas as pd
import pandas_ta
import numpy as np  
import yfinance as yf
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS
import matplotlib.pyplot as plt


# Disabled: build the ticker list for every S&P 500 constituent from Wikipedia (the download below uses SPY only)
# sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
# sp500['Symbol'] = sp500['Symbol'].str.replace('.','-')
# symbols_list = sp500['Symbol'].unique().tolist()
# to_remove = ['SOLV', 'VLTO', 'SW', 'GEV'] # Yfinance does not have legacy data for these tickers, consider reintegrating
# symbols_list = [symbol for symbol in symbols_list if symbol not in to_remove]

# Eight-year daily history window ending at end_date.
end_date = '2025-05-01'
start_date = pd.to_datetime(end_date) - pd.DateOffset(years=8)

df = yf.download(tickers='SPY',
                 start=start_date,
                 end=end_date,
                 timeout=5.0)

# Fail fast on an empty download (e.g. a timed-out request) so the problem is
# reported here instead of surfacing later inside the indicator calculations.
if df.empty:
    raise RuntimeError(
        f"No SPY price data returned for {start_date.date()} to {end_date}"
    )

# Keep only the first column level and lower-case it, so the fields are
# addressed as 'open', 'high', 'low', 'close', 'volume' below.
df.columns = df.columns.get_level_values(0)
df.columns = df.columns.str.lower()

# Garman-Klass volatility estimate from the daily bars:
#   0.5 * (ln high - ln low)^2 - (2 ln 2 - 1) * (ln close - ln open)^2
df['garman_klass_vol'] = (
    ((np.log(df['high']) - np.log(df['low'])) ** 2) / 2
    - (2 * np.log(2) - 1) * ((np.log(df['close']) - np.log(df['open'])) ** 2)
)

# 20-period RSI of the close.
df['rsi'] = pandas_ta.rsi(df['close'], length=20)

# Check RSI plot
# df['rsi'].plot(title='SPY RSI', figsize=(10, 5), color='blue')

# 20-period Bollinger Bands on log1p(close). The bands are computed once and
# the first three output columns are taken, by position, as lower/mid/upper.
bbands = pandas_ta.bbands(close=np.log1p(df['close']), length=20)
df['bb_low'] = bbands.iloc[:, 0]
df['bb_mid'] = bbands.iloc[:, 1]
df['bb_high'] = bbands.iloc[:, 2]


def compute_atr(stock_data):
    """Return the standardized (z-scored) 14-period ATR of an OHLC frame.

    Args:
        stock_data: Frame with 'high', 'low' and 'close' columns.

    Returns:
        The ATR series with its NaN entries removed, centred on its own mean
        and scaled by its own standard deviation.
    """
    raw_atr = pandas_ta.atr(
        high=stock_data['high'],
        low=stock_data['low'],
        close=stock_data['close'],
        length=14,
    )
    # Drop the NaN entries before computing the statistics.
    valid_atr = raw_atr.dropna()
    centred = valid_atr - valid_atr.mean()
    return centred / valid_atr.std()

# Attach the z-scored ATR; compute_atr drops its NaN rows, so those dates are
# filled back in as NaN by index alignment on assignment.
df['atr_zscore'] = compute_atr(df)

def compute_macd(close, fast=12, slow=26, signal=9):
    """Return the standardized (z-scored) MACD line of a price series.

    ``pandas_ta.macd`` is configured with ``fast``/``slow``/``signal`` periods
    and has no single ``length`` parameter, so the periods are passed
    explicitly. The defaults (12/26/9) are the library's own defaults.

    Args:
        close: Series of closing prices.
        fast: Period of the fast EMA.
        slow: Period of the slow EMA.
        signal: Period of the signal-line EMA.

    Returns:
        The first column of the ``pandas_ta.macd`` output (the MACD line),
        centred on its mean and scaled by its standard deviation.
    """
    macd = pandas_ta.macd(close=close, fast=fast, slow=slow, signal=signal).iloc[:, 0]
    return macd.sub(macd.mean()).div(macd.std())

df['macd'] = compute_macd(df['close'])
# Daily traded value, in millions.
df['dollar_volume'] = (df['close'] * df['volume']) / 1e6

# Columns sampled at month end: everything except the raw open/high/low/volume
# inputs and dollar_volume (which is averaged over the month instead).
excluded_cols = {'dollar_volume', 'open', 'high', 'low', 'volume'}
last_cols = [c for c in df.columns.unique() if c not in excluded_cols]

adjusted_volume = df['dollar_volume'].resample('ME').mean().to_frame('dollar_volume')
df_monthly = df[last_cols].resample('ME').last()

# dollar_volume only takes part in the NaN filter; it is not kept as a
# feature, so it is dropped again right after the incomplete months are
# removed.
data = pd.concat([adjusted_volume, df_monthly], axis=1).dropna()
data = data.drop(columns='dollar_volume')

def calculate_returns(df, outlier_cutoff=0.005, lags=(1, 2, 3, 6, 9, 12)):
    """Add per-period compounded return columns for several look-back lags.

    For each ``lag`` a column ``return_{lag}m`` is written to ``df``. It holds
    the ``lag``-row percentage change of ``df['close']``, clipped to the
    ``[outlier_cutoff, 1 - outlier_cutoff]`` quantiles of that change, then
    converted to a per-row geometric rate: ``(1 + r) ** (1 / lag) - 1``.

    Args:
        df: Frame with a 'close' column. It is modified in place.
        outlier_cutoff: Tail probability used for the clipping quantiles.
        lags: Iterable of positive integer look-back lengths, in rows.

    Returns:
        The same ``df`` object, with the return columns added.
    """
    for lag in lags:
        raw_change = df['close'].pct_change(lag)
        # Clip extreme observations before compounding.
        lower_bound = raw_change.quantile(outlier_cutoff)
        upper_bound = raw_change.quantile(1 - outlier_cutoff)
        df[f'return_{lag}m'] = (raw_change
                                .clip(lower=lower_bound, upper=upper_bound)
                                .add(1)
                                .pow(1 / lag)
                                .sub(1))
    return df

data = calculate_returns(data).dropna()

# Fama-French five-factor data set: first table of the reader's result, with
# the risk-free column removed.
factor_data = web.DataReader('F-F_Research_data_5_factors_2x3',
                             'famafrench',
                             start='2010')[0].drop('RF', axis=1)
factor_data.index = factor_data.index.to_timestamp()
# Re-stamp to month end so the index lines up with `data`. 'ME' is the same
# alias used for the price resampling; the older 'M' alias is deprecated.
# The division by 100 presumably converts percent to fractions -- confirm.
factor_data = factor_data.resample('ME').last().div(100)
factor_data.index.name = 'date'
# Left join: factor months without a return_1m value keep NaN in that column.
factor_data = factor_data.join(data['return_1m'])

# Rolling regression of the 1-month return on the five factors plus a constant.
endog = factor_data['return_1m']
exog = sm.add_constant(factor_data.drop(columns='return_1m'))
rolling_ols = RollingOLS(endog=endog,
                         exog=exog,
                         window=min(24, factor_data.shape[0]),
                         min_nobs=len(factor_data.columns) + 1)
rolling_betas = rolling_ols.fit(params_only=True).params.drop(columns='const')

factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']
# NOTE(review): the betas are joined without a lag and missing values are
# filled with each factor's full-sample mean. If these columns feed a
# predictive model this uses future information -- confirm this is intended.
data = data.join(rolling_betas)
data.loc[:, factors] = data[factors].apply(lambda x: x.fillna(x.mean()))
data



# print(df)
# print(type(df.index))       # Shows the index type
# print(df.index.nlevels) 
# print(df.index.names)
# print(df.columns)

[*********************100%***********************]  1 of 1 completed
  factor_data = web.DataReader('F-F_Research_data_5_factors_2x3',
  factor_data = web.DataReader('F-F_Research_data_5_factors_2x3',
  factor_data = factor_data.resample('M').last().div(100)


Unnamed: 0_level_0,close,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr_zscore,macd,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m,Mkt-RF,SMB,HML,RMW,CMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-06-30,243.564651,0.000039,48.337356,5.493515,5.511968,5.530421,-0.932291,-0.300600,0.005751,0.014988,0.011704,0.004159,0.010090,0.011193,0.975614,-0.120815,0.001841,0.061066,0.048432
2018-07-31,252.587845,0.000016,58.551657,5.505989,5.528161,5.550334,-1.077042,0.146520,0.037046,0.021279,0.022288,0.001076,0.011560,0.012545,0.975614,-0.120815,0.001841,0.061066,0.048432
2018-08-31,260.650421,0.000013,65.575161,5.533631,5.552774,5.571917,-1.167742,0.267922,0.031920,0.034480,0.024814,0.012564,0.011708,0.014953,0.975614,-0.120815,0.001841,0.061066,0.048432
2018-09-30,262.200043,0.000008,61.122781,5.556230,5.567797,5.579364,-1.189693,0.061130,0.005945,0.018850,0.024880,0.018270,0.011019,0.013768,0.975614,-0.120815,0.001841,0.061066,0.048432
2018-10-31,244.080917,0.000066,40.936119,5.465558,5.520493,5.575427,-0.082919,-1.397060,-0.069104,-0.032307,-0.011355,0.005326,-0.003085,0.005782,0.975614,-0.120815,0.001841,0.061066,0.048432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31,584.322266,0.000040,45.465250,6.368058,6.392563,6.417068,0.749386,-0.391777,-0.024060,0.016926,0.008235,0.013511,0.013804,0.018692,1.006646,-0.076709,-0.009596,0.082037,0.038711
2025-01-31,600.015015,0.000074,53.929670,6.358035,6.388460,6.418886,0.890063,0.454354,0.026856,0.001074,0.020225,0.015958,0.021454,0.019601,1.006248,-0.068132,-0.021423,0.089097,0.053525
2025-02-28,592.397949,0.000135,46.705757,6.376363,6.399316,6.422269,0.977047,-0.652904,-0.012695,0.006887,-0.003536,0.009902,0.014428,0.014207,0.999393,-0.072838,-0.014859,0.084312,0.056791
2025-03-31,559.390015,0.000198,41.669021,6.309989,6.338590,6.367191,1.776425,-1.807571,-0.055719,-0.034447,-0.014430,-0.003162,0.004111,0.006670,0.992770,-0.072943,-0.001891,0.081486,0.044489
