# Import Packages

In [4]:
import warnings

import pandas as pd

warnings.filterwarnings('ignore')

import pandas_datareader.data as web
import polars as pl

from functions.utils.func import *

# OHLCV

In [2]:
def create_ohclv():
    """Build the price and date-index parquet files from the raw S&P 500 dump.

    Reads ``ohclv_sp500_all.parquet``, capitalises the price/volume column
    names, drops the industry columns, re-indexes by (ticker, date), casts to
    float and writes ``data_price.parquet.brotli``. A second, column-less frame
    carrying only the (ticker, date) index is written to
    ``data_date.parquet.brotli``.
    """
    column_map = {'open': 'Open', 'high': 'High', 'low': 'Low', 'close': 'Close', 'vol': 'Volume'}
    price_columns = list(column_map.values())

    raw = pd.read_parquet(get_load_data_large_dir() / 'ohclv_sp500_all.parquet')
    prices = raw.rename(columns=column_map).drop(columns=['industry', 'subindustry'])

    # The raw index is named 'date' so it can be moved into a column and
    # combined with 'ticker' into the sorted two-level index.
    prices.index.name = 'date'
    prices = prices.reset_index('date')
    prices = prices.sort_values(['ticker', 'date'])
    prices = prices.set_index(['ticker', 'date']).astype(float)
    prices.to_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli', compression='brotli')

    # Dropping every value column leaves only the (ticker, date) index.
    index_only = prices.drop(columns=price_columns)
    index_only.to_parquet(get_load_data_parquet_dir() / 'data_date.parquet.brotli', compression='brotli')

In [None]:
# Rebuild the price/date parquet files, then load both back for inspection.
create_ohclv()

In [4]:
price_data = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')

In [None]:
date_data = pd.read_parquet(get_load_data_parquet_dir() / 'data_date.parquet.brotli')

# Fama

In [22]:
def create_fama():
    """Download the Fama-French 5-factor daily dataset and store it as parquet.

    Takes entry ``0`` of the ``famafrench`` reader result, renames ``Mkt-RF``
    to ``MARKET``, names the index ``date``, casts to float and writes
    ``data_fama.parquet.brotli``.
    """
    reader_result = web.DataReader('F-F_Research_Data_5_Factors_2x3_daily', 'famafrench', start=1990)
    factors = reader_result[0].rename(columns={'Mkt-RF': 'MARKET'})
    factors.index.names = ['date']
    factors = factors.astype(float)

    target = get_load_data_parquet_dir() / 'data_fama.parquet.brotli'
    factors.to_parquet(target, compression='brotli')

In [23]:
# Rebuild the Fama-French parquet file, then load it back.
create_fama()

In [None]:
fama = pd.read_parquet(get_load_data_parquet_dir() / 'data_fama.parquet.brotli')

# Industry

In [None]:
def create_ind():
    """Build the industry parquet file from the raw S&P 500 dump.

    Reads ``ohclv_sp500_all.parquet``, keeps only the industry columns (renamed
    to ``Industry`` / ``Subindustry``), re-indexes by (ticker, date), casts to
    float and writes ``data_ind.parquet.brotli``.
    """
    raw = pd.read_parquet(get_load_data_large_dir() / 'ohclv_sp500_all.parquet')
    industries = raw.rename(columns={'industry': 'Industry', 'subindustry': 'Subindustry'})
    industries = industries.drop(columns=['open', 'high', 'low', 'close', 'vol'])

    industries.index.name = 'date'
    industries = industries.reset_index('date')
    industries = industries.sort_values(['ticker', 'date'])
    industries = industries.set_index(['ticker', 'date'])

    # NOTE(review): the float cast presumably relies on the industry columns
    # holding numeric codes rather than names -- confirm against the raw file.
    industries = industries.astype(float)
    industries.to_parquet(get_load_data_parquet_dir() / 'data_ind.parquet.brotli', compression='brotli')

In [None]:
# Rebuild the industry parquet file, then load it back.
create_ind()

In [9]:
ind = pd.read_parquet(get_load_data_parquet_dir() / 'data_ind.parquet.brotli')

# Fundamental Ratio

In [50]:
def create_fund_ratio(self=None):
    """Build the fundamental-ratio parquet file from the raw WRDS CSV export.

    Reads ``wrds_fundamentals_raw_factor.csv``, parses ``date_index`` into a
    ``date`` column, indexes by (ticker, date), drops the columns listed below,
    casts to float and writes ``data_fund_ratio.parquet.brotli``.

    Args:
        self: Unused. It is optional so that the function can be called with no
            arguments, while still accepting one positional argument.
    """
    dropped_columns = [
        'fyearq', 'cash_flow', 'gross_assets', 'net_operating_assets',
        'total_debt', 'total_earning_assets', 'working_capital',
    ]

    fund_data = pd.read_csv(get_load_data_large_dir() / 'wrds_fundamentals_raw_factor.csv')
    fund_data['date_index'] = pd.to_datetime(fund_data['date_index'])
    fund_data = fund_data.rename(columns={'date_index': 'date'}).set_index(['ticker', 'date'])
    fund_data = fund_data.drop(dropped_columns, axis=1)
    fund_data = fund_data.astype(float)
    fund_data.to_parquet(get_load_data_parquet_dir() / 'data_fund_ratio.parquet.brotli', compression='brotli')

In [None]:
# Rebuild the fundamental-ratio parquet file, then load it back.
# NOTE(review): create_fund_ratio is declared with a `self` parameter; this
# zero-argument call only works if that parameter has a default -- confirm.
create_fund_ratio()

In [53]:
fund_ratio = pd.read_parquet(get_load_data_parquet_dir() / 'data_fund_ratio.parquet.brotli')

# Macro

In [58]:
def create_macro():
    """Build the macro factor parquet file from the CSV series under ``macro/``.

    For every series: read the CSV, index it by its parsed date column,
    optionally lag it by one row, turn the ``'.'`` placeholder into NaN, cast
    to float and add a ``<name>_pct`` percentage-change column. The series are
    then left-joined one after another onto the first one (``5YIF``), with a
    forward fill after each join.

    The output holds, per series, the rolling 30-row z-score of the latest
    ``<name>_pct`` value, plus the percentage change of two level ratios
    (``medCPI / rGDP`` and ``5YIF / medCPI``). Infinite values are replaced by
    NaN before writing ``data_macro.parquet.brotli``.
    """
    macro_dir = get_load_data_large_dir() / 'macro'
    zscore_window = 30

    # (CSV file stem, column name, lag by one row?). Order matters: the first
    # entry is the left side of every merge and fixes the output's date index.
    series_specs = [
        ('fiveYearIR', '5YIF', False),
        ('medianCPI', 'medCPI', True),
        ('realGDP', 'rGDP', False),
        ('realInterestRate', 'rIR', True),
        ('unemploymentRate', 'UR', True),
        ('TB', 'TB', True),
        ('PPI', 'PPI', True),
        ('retailSales', 'retailSales', True),
        ('indProdIndex', 'indProdIndex', True),
        ('realDispoIncome', 'realDispoIncome', True),
    ]

    def load_series(stem, name, lag):
        # Load one CSV as a date-indexed float frame with level and pct columns.
        frame = pd.read_csv(macro_dir / f'{stem}.csv')
        frame.columns = ['date', name]
        frame = frame.set_index(pd.to_datetime(frame['date'])).drop('date', axis=1)
        if lag:
            frame = frame.shift(1)
        frame = frame.replace('.', np.nan).astype(float)
        frame[f'{name}_pct'] = frame[name].pct_change()
        return frame

    def zscore_of_last(window):
        # Z-score of the window's last observation. `.iloc[-1]` is explicit
        # positional access; `window[-1]` on a date-indexed Series depends on
        # a deprecated integer-key fallback.
        return (window.iloc[-1] - window.mean()) / window.std()

    frames = [load_series(stem, name, lag) for stem, name, lag in series_specs]
    macro = frames[0]
    for frame in frames[1:]:
        macro = macro.merge(frame, left_index=True, right_index=True, how='left').ffill()

    pct_columns = [f'{name}_pct' for _, name, _ in series_specs]
    # Copy so the column assignments below act on an independent frame rather
    # than on a selection of `macro`.
    factor_macro = macro[pct_columns].copy()
    for column in pct_columns:
        factor_macro[column] = factor_macro[column].rolling(zscore_window).apply(zscore_of_last, raw=False)

    factor_macro['medCPI_div_rGDP'] = (macro['medCPI'] / macro['rGDP']).pct_change()
    factor_macro['5YIF_div_medCPI'] = (macro['5YIF'] / macro['medCPI']).pct_change()

    factor_macro = factor_macro.replace([np.inf, -np.inf], np.nan)

    factor_macro.to_parquet(get_load_data_parquet_dir() / 'data_macro.parquet.brotli', compression='brotli')

In [59]:
# Rebuild the macro factor parquet file, then load it back.
create_macro()

In [60]:
macro = pd.read_parquet(get_load_data_parquet_dir() / 'data_macro.parquet.brotli')

# ETF

In [20]:
def create_etf(start_date="1999-01-01", end_date="2023-03-20"):
    """Download ETF prices and store their daily close-to-close returns.

    The ticker list is the first column of ``tickers_etf.csv``. Prices are
    downloaded with ``yf.download``, reshaped to a (ticker, date) index, and
    the per-ticker percentage change of ``Close`` is pivoted into one column
    per ticker. The result is written to ``data_etf.parquet.brotli``.

    Args:
        start_date: First date passed to ``yf.download``.
        end_date: End date passed to ``yf.download``.
    """
    etf_tickers = pl.scan_csv(get_load_data_large_dir() / 'tickers_etf.csv').collect(
        streaming=True).to_series().to_list()
    etf_data = yf.download(etf_tickers, start=start_date, end=end_date)
    etf_data = etf_data.stack().swaplevel().sort_index()
    etf_data.index.names = ['ticker', 'date']
    etf_data = etf_data.astype(float)

    # Calculate returns of each ticker and rename each return column to ticker.
    # GroupBy.pct_change keeps the original (ticker, date) index on every
    # pandas version; groupby.apply(lambda x: x.pct_change()) prepends an extra
    # 'ticker' level on pandas >= 2.0, which makes the unstack below ambiguous.
    ret = etf_data.groupby('ticker')['Close'].pct_change()
    ret_df = ret.unstack(level='ticker')

    # Column-less frame holding the unique, sorted dates; concatenating onto it
    # aligns the return columns to that date index.
    dates = etf_data.reset_index('ticker').drop(
        ['ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], axis=1)
    dates = dates.loc[~dates.index.duplicated(keep='first')].sort_index()
    etf_data = pd.concat([dates, ret_df], axis=1)
    etf_data.to_parquet(get_load_data_parquet_dir() / 'data_etf.parquet.brotli', compression='brotli')

In [21]:
# Rebuild the ETF return parquet file (downloads prices via yf.download).
create_etf()

[*********************100%***********************]  10 of 10 completed


In [None]:
# Load the ETF return file written by create_etf().
etf = pd.read_parquet(get_load_data_parquet_dir() / 'data_etf.parquet.brotli')

# PCA Return

In [None]:
def create_pca_return():
    """Run a rolling PCA on standardised daily returns and store the result.

    Loads ``data_price.parquet.brotli``, restricts it with ``set_timeframe``
    and ``set_length(year=2)``, computes 1-period returns, pivots them to one
    column per ticker, standardises each column over the full sample and
    passes the matrix to ``rolling_pca`` (window 60, 5 components). The output
    is written to ``data_pca_return.parquet.brotli``.
    """
    window_size = 60
    num_components = 5

    # Same time frame and minimum-history filter as create_factor.py.
    prices = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
    prices = set_timeframe(prices, '2006-01-01', '2023-01-01')
    prices = set_length(prices, year=2)

    # One return column per ticker; NaNs in the first row are set to 0.
    returns = create_return(prices, windows=[1])['RET_01'].unstack('ticker')
    returns.iloc[0] = returns.iloc[0].fillna(0)
    returns = (returns - returns.mean()) / returns.std()

    factors = rolling_pca(data=returns, window_size=window_size, num_components=num_components, name='Return')
    factors.to_parquet(get_load_data_parquet_dir() / 'data_pca_return.parquet.brotli', compression='brotli')

In [None]:
# Rebuild the rolling-PCA return file, then load it back.
create_pca_return()

In [None]:
pca_return = pd.read_parquet(get_load_data_parquet_dir() / 'data_pca_return.parquet.brotli')

# PCA Loading Return

In [80]:
def create_pca_loading_return():
    """Compute rolling PCA loadings of standardised daily returns and store them.

    Prepares the return matrix exactly like ``create_pca_return`` (same time
    frame, history filter and full-sample standardisation), then calls
    ``rolling_pca_loading`` with a 60-row window and 5 components. The result
    is written to ``data_pca_loading_return.parquet.brotli``.
    """
    window_size = 60
    num_components = 5

    # Same time frame and minimum-history filter as create_factor.py.
    prices = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
    prices = set_timeframe(prices, '2006-01-01', '2023-01-01')
    prices = set_length(prices, year=2)

    # One return column per ticker; NaNs in the first row are set to 0.
    returns = create_return(prices, windows=[1])['RET_01'].unstack('ticker')
    returns.iloc[0] = returns.iloc[0].fillna(0)
    returns = (returns - returns.mean()) / returns.std()

    loadings = rolling_pca_loading(data=returns, window_size=window_size, num_components=num_components,
                                   name='Return')
    loadings.to_parquet(get_load_data_parquet_dir() / 'data_pca_loading_return.parquet.brotli',
                        compression='brotli')

In [81]:
# Rebuild the rolling PCA loading file, load it back and display it.
create_pca_loading_return()

In [78]:
pca_loading_return = pd.read_parquet(get_load_data_parquet_dir() / 'data_pca_loading_return.parquet.brotli')

In [79]:
pca_loading_return

Unnamed: 0_level_0,Unnamed: 1_level_0,pcaLoadingReturn_1,pcaLoadingReturn_2,pcaLoadingReturn_3,pcaLoadingReturn_4,pcaLoadingReturn_5
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,2006-03-29,-0.355348,0.108912,-0.124763,-0.076947,-0.040590
A,2006-03-30,-0.351288,0.107444,-0.126493,-0.075416,-0.041308
A,2006-03-31,-0.352231,0.111554,-0.128147,-0.084407,-0.048772
A,2006-04-03,-0.347243,0.088659,-0.114071,-0.127167,0.005377
A,2006-04-04,-0.348154,0.083710,-0.116153,-0.127305,0.008226
...,...,...,...,...,...,...
ZTS,2022-12-23,1.182809,0.242316,-0.514680,-0.609457,0.203406
ZTS,2022-12-27,1.174999,0.237338,-0.522731,-0.606563,0.200201
ZTS,2022-12-28,1.167263,0.220093,-0.550841,-0.584449,0.205198
ZTS,2022-12-29,1.187001,0.239578,-0.566540,-0.548541,0.208221


In [None]:
def rolling_pca_loading(data, window_size, num_components, name):
    """Compute PCA loadings over a rolling window of rows.

    For every window of ``window_size`` consecutive rows, columns that are NaN
    in more than half of the window are dropped, remaining NaNs are set to 0,
    a PCA is fitted and each component is scaled by the square root of its
    explained variance to obtain loadings.

    Args:
        data: DataFrame with one row per date and one column per ticker.
        window_size: Number of rows per window.
        num_components: Number of principal components to keep.
        name: Tag used in the output column names
            (``pcaLoading{name}_1`` ... ``pcaLoading{name}_{num_components}``).

    Returns:
        DataFrame indexed by (ticker, date), where the date is the last row of
        the window, grouped by ticker, with one column per component.

    Raises:
        ValueError: If ``data`` has fewer rows than ``window_size``.
    """
    n_windows = len(data) - window_size + 1
    if n_windows < 1:
        raise ValueError(
            f'rolling_pca_loading needs at least {window_size} rows, got {len(data)}')

    loading_columns = [f'pcaLoading{name}_{k + 1}' for k in range(num_components)]
    loadings_list = []

    for start in range(n_windows):
        # Build the window without in-place edits: an iloc slice may share
        # memory with `data`, and filling it in place could leak zeros into
        # the caller's frame and into the NaN counts of later windows.
        window_data = data.iloc[start:start + window_size]
        mostly_missing = window_data.isna().sum() > len(window_data) / 2
        window_data = window_data.loc[:, ~mostly_missing].fillna(0)

        # Fit the PCA and scale components into loadings
        pca = PCA(n_components=num_components)
        pca.fit(window_data)
        results_loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

        # Create a dataframe that matches loadings to ticker
        loadings_list.append(
            pd.DataFrame(results_loadings, columns=loading_columns, index=window_data.columns))

    # Concat all the window loadings, keyed by the last date of each window
    results_loadings_combined = pd.concat(loadings_list, keys=data.index[window_size - 1:]).swaplevel()
    results_loadings_combined.index.set_names(['ticker', 'date'], inplace=True)
    # Rearrange data to groupby ticker
    results_loadings_combined = pd.concat([df for ticker, df in results_loadings_combined.groupby(level='ticker')],
                                          axis=0)
    return results_loadings_combined

In [70]:
# Load prices, compute 1-period returns and pivot them to one column per
# ticker (no standardisation is applied here).
factor_data = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
factor_data = create_return(factor_data, windows=[1])['RET_01'].unstack('ticker')

In [62]:
def zscore_normalize(data, window):
    """Return the rolling z-score of every column of ``data``.

    Each value is centred on the rolling mean and divided by the rolling
    standard deviation of its own column over the trailing ``window`` rows.

    Args:
        data: DataFrame of numeric columns.
        window: Rolling window passed to ``Series.rolling``.

    Returns:
        DataFrame with the same index and column labels as ``data``; rows
        without a full window are NaN.
    """
    columns = {}
    for col in data.columns:
        series = data[col]
        rolling = series.rolling(window)  # built once, used for mean and std
        columns[col] = (series - rolling.mean()) / rolling.std()
    # Assemble in one shot: inserting columns one by one into an empty frame
    # fragments it when there are many columns.
    return pd.DataFrame(columns)

In [73]:
# Slice the return matrix to Nov-Dec 2012 and display it.
y = set_timeframe(factor_data, '2012-11-01', '2013-01-01')
y

ticker,A,AABA,AAL,AAMRQ,AAP,AAPL,ABBV,ABC,ABKFQ,ABMD,...,XOM,XRAY,XRX,XTO,XYL,YUM,ZBH,ZBRA,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-11-01,0.030842,0.006831,,,0.14181,0.002049,,0.037525,,-0.31332,...,0.004716,0.040988,0.032609,,0.046991,0.038654,0.020869,0.029223,0.013966,
2012-11-02,-0.002695,0.00944,,,-0.012716,-0.033091,,-0.010264,,0.018369,...,-0.01452,-0.008344,-0.027068,,0.009055,-0.009613,-0.007628,-0.023797,-0.016529,
2012-11-05,0.016486,0.015079,,,0.010254,0.013559,,-0.005926,,-0.027417,...,0.003988,0.021299,0.018547,,0.014436,0.002357,0.009685,0.01108,0.001867,
2012-11-06,0.017814,0.005297,,,-0.004332,-0.00303,,0.002484,,-0.010386,...,0.010813,0.021112,0.01214,,0.001923,0.011897,0.005024,0.049041,0.00932,
2012-11-07,-0.018286,-0.004009,,,-0.008454,-0.042631,,0.009911,,-0.02099,...,-0.037441,-0.025971,-0.046477,,-0.028407,-0.017362,-0.017876,-0.013319,-0.05771,
2012-11-08,-0.018627,-0.008626,,,-0.004514,-0.036294,,-0.01055,,-0.013017,...,-0.012594,-0.016309,-0.003145,,-0.015409,-0.010295,-0.002468,0.001323,-0.016169,
2012-11-09,0.000813,0.00116,,,-0.012469,0.017313,,0.003471,,0.028704,...,0.001614,0.017895,0.0,,-0.008427,0.011667,0.000309,0.00793,-0.00249,
2012-11-12,-0.002438,0.014484,,,-0.001785,-0.007732,,-0.010131,,0.0181,...,0.001261,-0.005688,-0.001577,,0.002023,0.010977,0.000155,-0.006294,-0.000499,
2012-11-13,-0.013308,0.019417,,,0.012776,0.000125,,0.016226,,0.002963,...,-0.009734,-0.00832,0.014218,,-0.009289,-0.000825,-0.004019,-0.006334,-0.018981,
2012-11-14,-0.00523,-0.001401,,,-0.017913,-0.011085,,-0.012282,,-0.002216,...,-0.004626,-0.004457,-0.020249,,-0.019568,-0.01912,-0.001552,-0.016467,-0.015784,


In [65]:

# Rolling 60-row z-score of the return matrix.
ret = zscore_normalize(factor_data, 60)

In [67]:
# NOTE(review): this rebinds `ret` from `factor_data`, so the z-scored frame
# from the previous cell is discarded -- confirm whether `ret` was intended
# as the input here.
ret = set_timeframe(factor_data, '2006-01-01', '2023-01-01')

In [68]:
x = set_timeframe(ret, '2012-11-29', '2013-01-01')

In [50]:
def _rolling_window(factor_data, window):
    dates = factor_data.index
    cols = factor_data.columns
    np_data = factor_data.to_numpy()
    shape = (np_data.shape[0] - window + 1, window, np_data.shape[1])
    strides = (np_data.strides[0], np_data.strides[0], np_data.strides[1])
    window_data = np.lib.stride_tricks.as_strided(np_data, shape=shape, strides=strides)
    window_data_dict = {date: window_data[idx] for idx, date in enumerate(dates[window - 1:])}
    window_df = [pd.DataFrame(data=item, index=[key] * len(item), columns=cols) for key, item in window_data_dict.items()]
    return window_df

In [51]:
# One 60-row window per end date.
ret_window = _rolling_window(ret, 60)

In [52]:
# Group consecutive windows into chunks of `splice_size`, stored under the
# keys 'splice1', 'splice2', ...; the last chunk may be shorter.
splice =1 
data_spliced = {}
splice_size=22
for i in range(0, len(ret_window), splice_size):
    name = f'splice{splice}'
    data_spliced[name] = ret_window[i:i + splice_size]
    splice += 1

In [53]:
# Display one chunk.
data_spliced['splice77']

[ticker             A      AABA  AAL  AAMRQ       AAP      AAPL  ABBV  \
 2012-11-14  0.480542 -0.219085  NaN    NaN  0.170430  1.270089   NaN   
 2012-11-14  0.075841 -0.262063  NaN    NaN -0.189374 -0.877890   NaN   
 2012-11-14  0.505852  0.311355  NaN    NaN  0.011610 -0.112741   NaN   
 2012-11-14 -0.520325 -0.375824  NaN    NaN -0.464810  1.220433   NaN   
 2012-11-14 -0.626561 -0.680219  NaN    NaN  0.525376 -0.341560   NaN   
 2012-11-14  0.290668  0.667047  NaN    NaN -0.517860 -0.395229   NaN   
 2012-11-14 -0.112677 -0.868794  NaN    NaN -0.314866 -1.307624   NaN   
 2012-11-14  0.213724 -0.052547  NaN    NaN  0.108373 -0.042196   NaN   
 2012-11-14 -0.103101  1.389893  NaN    NaN -0.657205  0.934611   NaN   
 2012-11-14 -0.382673  1.120276  NaN    NaN  0.001653 -0.769290   NaN   
 2012-11-14  1.044948  0.136668  NaN    NaN  0.936287  0.492661   NaN   
 2012-11-14  1.248391  0.609630  NaN    NaN -0.815298  0.252594   NaN   
 2012-11-14 -0.193938 -0.582474  NaN    NaN -0.4628

In [31]:
# Merge every 8 consecutive splices into one batch ('batch1', 'batch2', ...),
# each stored as a flat list of window DataFrames.
from itertools import chain

batch = []
factor_batch = {}
batch_num = 1
for item in data_spliced:
    batch.append(data_spliced[item])
    if len(batch) == 8:
        factor_batch[f'batch{batch_num}'] = list(chain.from_iterable(batch))
        batch_num += 1
        batch = []

# Excess data: flatten the leftover splices like every full batch, and skip
# the entry entirely when the splices divide evenly into batches.
if batch:
    factor_batch[f'batch{batch_num}'] = list(chain.from_iterable(batch))

In [40]:
# Display the first batch.
factor_batch['batch1']

[ticker             A      AABA  AAL       AAP      AAPL  ABBV       ABC  \
 2006-03-29  0.006308  0.044155  NaN  0.001381  0.039783   NaN  0.020773   
 2006-03-29  0.002687  0.001467  NaN  0.007123  0.002943   NaN -0.006626   
 2006-03-29  0.026198  0.013669  NaN  0.004791 -0.007870   NaN -0.012387   
 2006-03-29  0.005222  0.040453  NaN -0.001135  0.025813   NaN -0.011095   
 2006-03-29 -0.002886  0.004860  NaN  0.012957 -0.003277   NaN  0.000000   
 2006-03-29  0.013025 -0.010134  NaN  0.001122  0.063248   NaN  0.000244   
 2006-03-29 -0.000286 -0.025826  NaN -0.008518  0.037596   NaN -0.000244   
 2006-03-29 -0.022006 -0.023406  NaN  0.002939  0.004660   NaN  0.007805   
 2006-03-29 -0.007598 -0.024211  NaN  0.000225  0.015410   NaN  0.002420   
 2006-03-29 -0.005889  0.005263  NaN -0.019382 -0.010279   NaN -0.012313   
 2006-03-29  0.020735 -0.122914  NaN -0.008274 -0.026207   NaN  0.003911   
 2006-03-29  0.015670 -0.024159  NaN -0.022480 -0.041885   NaN -0.002678   
 2006-03-29 

# Kmean Return

In [2]:
def create_kmean_return():
    """Run rolling k-means on standardised smoothed returns and store the result.

    Loads ``data_price.parquet.brotli``, restricts it with ``set_timeframe``
    and ``set_length(year=2)``, computes smoothed 1-period returns via
    ``create_smooth_return`` (smoothing window 10), pivots them to one column
    per ticker, standardises each column over the full sample and passes the
    matrix to ``rolling_kmean`` (window 60, 15 clusters). The output is
    written to ``data_kmean_return.parquet.brotli``.
    """
    smooth_window = 10

    # Same time frame and minimum-history filter as create_factor.py.
    prices = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
    prices = set_timeframe(prices, '2006-01-01', '2023-01-01')
    prices = set_length(prices, year=2)

    smoothed = create_smooth_return(prices, windows=[1], window_size=smooth_window)
    returns = smoothed['RET_01'].unstack('ticker')

    # NaNs in the first `smooth_window + 1` rows are set to 0.
    warmup_rows = smooth_window + 1
    returns.iloc[:warmup_rows] = returns.iloc[:warmup_rows].fillna(0)
    returns = (returns - returns.mean()) / returns.std()

    clusters = rolling_kmean(returns, window_size=60, n_clusters=15, name='Return')
    clusters.to_parquet(get_load_data_parquet_dir() / 'data_kmean_return.parquet.brotli', compression='brotli')

In [None]:
# Rebuild the rolling k-means return file, then load it back.
create_kmean_return()

In [None]:
kmeanRet = pd.read_parquet(get_load_data_parquet_dir() / 'data_kmean_return.parquet.brotli')

# Kmean PCA Loading Return

In [None]:
def create_kmean_pca_loading_return():
    """Run rolling k-means on each PCA loading series and store the result.

    Loads ``data_pca_loading_return.parquet.brotli``, pivots it to one column
    per ticker, and for every ``pcaLoadingReturn_<k>`` component runs
    ``rolling_kmean`` (window 60, 15 clusters). The per-component results are
    concatenated column-wise, renamed ``kMeanPCALoadingReturn_1..n`` and
    written to ``data_kmean_pca_loading_return.parquet.brotli``.
    """
    pca_loading = pd.read_parquet(get_load_data_parquet_dir() / 'data_pca_loading_return.parquet.brotli')
    num_components = len(pca_loading.columns)

    # The pivot does not depend on the component, so do it once instead of
    # once per loop iteration.
    loading_by_ticker = pca_loading.unstack('ticker')

    all_loadings = []
    for loading in range(1, num_components + 1):
        print("--------------------------------------------------------------------------------")
        print(loading)
        # Copy so each rolling_kmean call gets its own frame.
        pca_loading_df = loading_by_ticker[f'pcaLoadingReturn_{loading}'].copy()
        results = rolling_kmean(pca_loading_df, window_size=60, n_clusters=15, name='PCALoadingReturn')
        all_loadings.append(results)

    kmean_pca_loading = pd.concat(all_loadings, axis=1)
    kmean_pca_loading.columns = [f'kMeanPCALoadingReturn_{i}' for i in range(1, len(kmean_pca_loading.columns) + 1)]
    kmean_pca_loading.to_parquet(get_load_data_parquet_dir() / 'data_kmean_pca_loading_return.parquet.brotli', compression='brotli')

In [None]:
# Rebuild the k-means-on-PCA-loadings file, then load it back.
create_kmean_pca_loading_return()

In [None]:
kmean_pca_loading_return = pd.read_parquet(get_load_data_parquet_dir() / 'data_kmean_pca_loading_return.parquet.brotli')

# Kmean Fundamental Ratio

In [4]:
def create_kmean_fund_ratio():
    """Run rolling k-means + PCA on groups of fundamental ratios and store the result.

    Loads the fundamental-ratio parquet file, restricts it with
    ``set_timeframe``, and for each category in ``categories_dict`` pivots the
    selected ratios to one column per (ticker, ratio), standardises every
    column over the full sample, and passes the matrix to
    ``rolling_kmean_pca`` (window 60, 10 clusters). The per-category results
    are concatenated column-wise and written to
    ``data_kmean_fund_ratio.parquet.brotli``.
    """
    start_date = '2006-01-01'
    end_date = '2023-01-01'
    # NOTE(review): create_fund_ratio writes 'data_fund_ratio.parquet.brotli';
    # confirm that 'data_fundamental_ratio.parquet.brotli' is produced elsewhere.
    fund_ratio = pd.read_parquet(get_load_data_parquet_dir() / 'data_fundamental_ratio.parquet.brotli')
    fund_ratio = set_timeframe(fund_ratio, start_date, end_date)

    # Full candidate lists. `revenue` and `balance_sheet` are currently unused:
    # only their "top 10" subsets below are clustered.
    revenue = [
        'equity_earnings_per_share', 'prefer_dvd_coverage',
        'tax_rate', 'eps', 'extraordinary_per_share',
        'extraordinary_discontinued_per_share', 'interest_expense_per_share',
        'non_operating_inc_per_share', 'op_income_after_deprec_per_share',
        'op_income_before_deprec_per_share', 'pretax_income_per_share',
        'rnd_per_share', 'cogs_to_sales', 'deprec_to_sales',
        'interest_to_sales', 'nonop_income_to_sales', 'rnd_to_sales',
        'sga_to_sales', 'cash_flow_margin', 'net_profit_margin',
        'op_profit_after_deprec', 'op_profit_before_deprec',
        'pretax_profit_margin', 'accounts_receivable_turnover',
        'receivables_to_sales', 'inventory_to_sales', 'inventory_turnover',
        'inventory_sales_ratio', 'days_of_sales_in_inventories',
        'days_of_sales_in_receivables', 'sales_per_common_equity',
        'sales_per_gross_assets', 'sales_per_invested_capital',
        'sales_per_assets', 'sales_per_ppe', 'sales_per_net_ppe',
        'sales_per_receivables', 'sales_per_stockholder_equity',
        'return_on_common_equity', 'return_on_gross_assets',
        'return_on_invested_capital', 'return_on_ppe',
        'return_on_stockholder_equity', 'fixed_asset_turnover',
        'annual_eps', 'pe_ratio', 'pb_ratio', 'ps_ratio'
    ]

    balance_sheet = [
        'common_equity_per_share', 'gross_assets_per_share',
        'invested_cap_per_share', 'net_operating_assets_per_share',
        'total_debt_per_share', 'total_earning_assets_per_share',
        'working_capital_per_share', 'retained_earnings',
        'cash_short_term_to_sales', 'sales_per_cash',
        'comman_equity_per_invested_capital', 'long_debt_per_invested_capital',
        'non_controlling_interest_per_invested_cap', 'preferred_stock_per_invested_cap',
        'total_debt_per_invested_capital', 'common_equity_per_invested_capital',
        'long_debt_per_investment_capital', 'cash_per_asset',
        'percent_other_assets', 'inventory_to_curr_asset',
        'receivables_to_curr_assets', 'other_assets_to_assets',
        'curr_assets_to_assets', 'ppe_gross_to_assets', 'ppe_net_to_assets',
        'total_debt_to_assets', 'short_debt_to_total', 'deferred_tax_credit_to_liabilities',
        'interest_to_liabilities', 'noncontrolling_interest_to_liabilities',
        'current_ratio', 'long_debt_to_equity', 'interest_coverage', 'quick_ratio'
    ]

    cash_flow = [
        'capx_per_share', 'cash_flow_per_share', 'depreciation_per_share',
        'discontinued_ops_per_share', 'deprec_expense'
    ]

    revenue_top_10 = [
        'eps', 'net_profit_margin', 'return_on_common_equity',
        'return_on_invested_capital', 'pe_ratio', 'pb_ratio',
        'tax_rate', 'inventory_turnover', 'fixed_asset_turnover',
        'cash_flow_margin'
    ]

    balance_sheet_top_10 = [
        'common_equity_per_share', 'gross_assets_per_share', 'total_debt_per_share',
        'working_capital_per_share', 'retained_earnings', 'cash_per_asset',
        'total_debt_to_assets', 'current_ratio', 'long_debt_to_equity', 'quick_ratio'
    ]

    def get_category(data, category):
        # Pivot the chosen ratios to columns, standardise, put ticker first.
        data_category = data[category].unstack('ticker')
        data_category = (data_category - data_category.mean()) / data_category.std()
        # i=None selects the unnamed (ratio) column level to swap with 'ticker'.
        data_category = data_category.swaplevel(i=None, j='ticker', axis=1)
        # Replace +/-inf through the pandas API: assigning into `.values` only
        # takes effect when it is a writable view, and fails on a read-only
        # array under Copy-on-Write.
        data_category = data_category.replace([np.inf, -np.inf], np.nan)
        return data_category

    categories_dict = {
        "revenue": revenue_top_10,
        "balance_sheet": balance_sheet_top_10,
        "cash_flow": cash_flow
    }

    collect = []
    for name, category in categories_dict.items():
        # Get this category's factors, unstack ticker, and normalize
        fund_category = get_category(fund_ratio, category)
        print('-------------------------------------------------------------')
        print(name)
        print('-------------------------------------------------------------')

        # Exec rollingKMean + PCA
        collect.append(rolling_kmean_pca(fund_category, window_size=60, n_clusters=10, name=name))

    kmeanFundRatio = pd.concat(collect, axis=1)
    kmeanFundRatio.to_parquet(get_load_data_parquet_dir() / 'data_kmean_fund_ratio.parquet.brotli', compression='brotli')

In [None]:
# Rebuild the k-means fundamental-ratio file.
create_kmean_fund_ratio()

# Industry Momentum

In [None]:
def create_ind_momentum():
    """Compute each stock's return relative to its industry's mean return.

    Joins the price and industry parquet files column-wise, restricts them
    with ``set_timeframe`` and ``set_length(year=2)``, computes 1-period
    returns, and divides each return by the mean return of the same
    ``Industry`` on the same date. The single ``IndMom_01`` column, indexed by
    (ticker, date), is written to ``data_ind_mom.parquet.brotli``.
    """
    horizon = 1
    ret_col = f'RET_{horizon:02}'
    mom_col = f'IndMom_{horizon:02}'

    prices = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
    industries = pd.read_parquet(get_load_data_parquet_dir() / 'data_ind.parquet.brotli')
    panel = pd.concat([prices, industries], axis=1)
    panel = set_timeframe(panel, '2006-01-01', '2023-01-01')
    panel = set_length(panel, year=2)

    returns = create_return(panel, windows=[horizon])[[ret_col, 'Industry']]

    # Mean return per (Industry, date), flattened for a column merge.
    date_level = returns.index.get_level_values('date')
    industry_mean = returns.groupby(['Industry', date_level])[ret_col].mean()
    industry_mean = industry_mean.rename('indRET').reset_index()

    merged = pd.merge(returns.reset_index(), industry_mean, on=['Industry', 'date'], how='left')
    # NOTE(review): an industry mean of zero gives +/-inf here and nothing
    # removes it -- confirm downstream consumers handle that.
    merged[mom_col] = merged[ret_col] / merged['indRET']
    ind_mom = merged.set_index(['ticker', 'date'])[[mom_col]]

    ind_mom.to_parquet(get_load_data_parquet_dir() / 'data_ind_mom.parquet.brotli', compression='brotli')

In [None]:
# Rebuild the industry momentum file, then load it back.
create_ind_momentum()

In [11]:
ind_mom = pd.read_parquet(get_load_data_parquet_dir() / 'data_ind_mom.parquet.brotli')

In [26]:
# Display a factor file from the factor data directory (not written by this notebook).
pd.read_parquet(get_factor_data_dir() / 'factor_volatility.parquet.brotli')

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,Volatility_01,Volatility_02,Volatility_03,Volatility_04,Volatility_05,Volatility_10,Volatility_20,Volatility_40,Volatility_60,Volatility_120,Volatility_210
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
A,2006-01-03,33.40,33.5800,32.8151,33.50,3796200.0,,,,,,,,,,,
A,2006-01-04,33.55,33.8300,33.3700,33.59,3001300.0,,,,,,,,,,,
A,2006-01-05,33.45,34.4700,33.4500,34.47,3458800.0,,,,,,,,,,,
A,2006-01-06,34.50,34.7900,34.0900,34.65,4396500.0,,,,,,,,,,,
A,2006-01-09,34.65,34.8000,34.4200,34.55,2920500.0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZTS,2022-12-23,144.51,145.8920,143.5400,145.76,1017604.0,0.025135,0.037196,0.042348,0.047378,0.051998,0.054684,0.064152,0.068663,0.066011,0.058986,0.063529
ZTS,2022-12-27,145.91,146.1474,143.5700,145.30,957513.0,0.025035,0.037048,0.042342,0.047457,0.052027,0.054563,0.063695,0.067612,0.066493,0.057131,0.063732
ZTS,2022-12-28,145.18,146.6400,143.7700,143.83,1443200.0,0.024919,0.037083,0.042347,0.047428,0.051981,0.055108,0.063645,0.066866,0.066665,0.056672,0.063981
ZTS,2022-12-29,145.20,148.5100,145.1378,148.15,1298851.0,0.025056,0.036728,0.042271,0.047465,0.051844,0.055127,0.063754,0.066537,0.067100,0.056522,0.064622


# Open Asset Pricing

In [4]:
def create_open_asset_pricing():
    """Build the Open Asset Pricing factor file keyed by (ticker, date).

    Reads the wide signed-predictor file and the ``permno.csv`` mapping, keeps
    the predictors in ``factors_to_use``, maps ``permno`` to a ticker, parses
    ``yyyymm`` into a date, and indexes the result by (ticker, date). Rows are
    then limited to tickers also present in
    ``tickers_to_train_fundamental.csv``. The surviving tickers are exported
    with ``export_ticker`` and the data is written to
    ``data_open_asset.parquet.brotli``.
    """
    oap_data = pd.read_parquet(get_load_data_large_dir() / 'signed_predictors_dl_wide.parquet.brotli')
    permno_codes = pd.read_csv(get_load_data_large_dir() / 'permno.csv')
    factors_to_use = ['DivSeason', 'ChTax', 'EarningsStreak', 'ResidualMomentum', 'AssetGrowth',
                      'NOA', 'SmileSlope', 'MomSeasonShort', 'InvestPPEInv', 'NetDebtFinance', 'InvGrowth',
                      'MomSeason11YrPlus']

    oap_data = oap_data[['permno', 'yyyymm'] + factors_to_use]
    permno_codes = permno_codes[['LPERMNO', 'tic']].rename(columns={'LPERMNO': 'permno'})

    # permno -> ticker lookup; if a permno maps to several tickers, the one
    # appearing last after sorting wins.
    permno_unique = permno_codes.drop_duplicates().sort_values(by='permno')
    permno_unique = dict(zip(permno_unique['permno'], permno_unique['tic']))

    # Copy the filtered rows so the column assignments below act on an
    # independent frame instead of a slice of `oap_data`.
    oap_filtered = oap_data[oap_data['permno'].isin(list(permno_unique))].copy()
    oap_filtered['ticker'] = oap_filtered['permno'].map(permno_unique)
    oap_filtered['date'] = pd.to_datetime(oap_filtered['yyyymm'], format='%Y%m')
    oap_filtered = (oap_filtered.drop(['permno', 'yyyymm'], axis=1)
                    .set_index(['ticker', 'date'])
                    .sort_index(level=['ticker', 'date']))

    # Find overlapping tickers
    current_tickers = read_ticker(get_load_data_large_dir() / 'tickers_to_train_fundamental.csv')
    oap_tickers = get_ticker_idx(oap_filtered)
    overlapping_tickers = list(set(oap_tickers) & set(current_tickers))

    # Filter DataFrame based on overlapping tickers
    oap_filtered = oap_filtered[oap_filtered.index.get_level_values('ticker').isin(overlapping_tickers)]

    export_ticker(oap_filtered, get_load_data_large_dir() / 'tickers_to_train_open.csv')
    oap_filtered.to_parquet(get_load_data_parquet_dir() / 'data_open_asset.parquet.brotli', compression='brotli')

In [5]:
# Rebuild the Open Asset Pricing file, then load it back.
create_open_asset_pricing()

In [11]:
open_asset = pd.read_parquet(get_load_data_parquet_dir() / 'data_open_asset.parquet.brotli')

# Lag Bond Returns

In [109]:
def create_lag_bond_return():
    """Download TLT/TIP prices and store their multi-horizon returns.

    Prices are downloaded with ``yf.download``, reshaped to a (ticker, date)
    index, and passed to ``create_return`` for horizons 1, 6 and 30. The price
    columns are dropped, the remaining columns are pivoted to one column per
    (ticker, horizon) and flattened to ``<ticker>_<column>`` names. The result
    is written to ``data_lag_bond_return.parquet.brotli``.
    """
    start_date = "2006-01-01"
    end_date = "2023-03-20"
    horizons = [1, 6, 30]

    bond_df = yf.download(['TLT', 'TIP'], start=start_date, end=end_date)
    bond_df = bond_df.stack().swaplevel().sort_index()
    bond_df.index.names = ['ticker', 'date']
    bond_df = bond_df.astype(float)

    bond_df = create_return(bond_df, horizons)
    bond_df = bond_df.drop(['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume'], axis=1)

    # One column per (ticker, return horizon), flattened to a single level.
    bond_df = bond_df.unstack('ticker').swaplevel(axis=1)
    bond_df.columns = ['_'.join(col).strip() for col in bond_df.columns.values]

    # Pass the codec explicitly, like every other writer in this notebook;
    # without it pandas uses its default codec despite the '.brotli' name.
    bond_df.to_parquet(get_load_data_parquet_dir() / 'data_lag_bond_return.parquet.brotli',
                       compression='brotli')

In [110]:
# Rebuild the bond return file (downloads prices via yf.download).
create_lag_bond_return()

[*********************100%***********************]  2 of 2 completed


In [5]:
# Load the bond return file written by create_lag_bond_return().
bond_lag = pd.read_parquet(get_load_data_parquet_dir() / 'data_lag_bond_return.parquet.brotli')

# All RF

In [116]:
def create_all_rf():
    """Combine the ETF, Fama-French and PCA return files into one parquet file.

    The three frames are concatenated column-wise (aligned on their indexes)
    and written to ``data_all_rf.parquet.brotli``.
    """
    parquet_dir_files = ('data_etf.parquet.brotli', 'data_fama.parquet.brotli', 'data_pca_return.parquet.brotli')
    frames = [pd.read_parquet(get_load_data_parquet_dir() / file_name) for file_name in parquet_dir_files]

    combined = pd.concat(frames, axis=1)
    combined.to_parquet(get_load_data_parquet_dir() / 'data_all_rf.parquet.brotli', compression='brotli')

In [117]:
# Rebuild the combined risk-factor file, then load it back.
create_all_rf()

In [118]:
all_rf = pd.read_parquet(get_load_data_parquet_dir() / 'data_all_rf.parquet.brotli')

# SPY Return

In [30]:
def create_spy_return():
    """Fetch SPY data via ``get_spy`` for 2006-2023 and store it as parquet.

    The index is named ``date`` before writing ``data_spy.parquet.brotli``.
    """
    spy = get_spy('2006-01-01', '2023-01-01')
    spy.index.name = 'date'

    target = get_load_data_parquet_dir() / 'data_spy.parquet.brotli'
    spy.to_parquet(target, compression='brotli')

In [32]:
# Rebuild the SPY file.
create_spy_return()

[*********************100%***********************]  1 of 1 completed


In [33]:
# Load the SPY file written by create_spy_return().
spy_return = pd.read_parquet(get_load_data_parquet_dir() / 'data_spy.parquet.brotli')