# Import Packages

In [1]:
import warnings

import pandas as pd

warnings.filterwarnings('ignore')

import pandas_datareader.data as web
import polars as pl

from functions.utils.func import *

# CRSP

In [50]:
def create_crsp(min_avg_market_cap=10_000_000_000, min_years=2):
    """Build the filtered CRSP universe and export price/ticker/share/date/cap tables.

    Reads 'price.parquet.brotli' from the large-data directory, selects the
    permnos that pass the market-cap, history-length and missing-price filters,
    writes that permno list to 'permno_to_train.csv', and then exports five
    parquet files (price, ticker, out, date, cap) restricted to that universe.

    Args:
        min_avg_market_cap: A permno is kept only if the mean of
            Close * out_share * 1000 over its whole history is strictly above
            this value. The default is the threshold the notebook has always
            used (10_000_000_000).
        min_years: Forwarded to ``set_length`` as the minimum history length.

    Returns:
        None. All results are written to disk.
    """
    crsp = pd.read_parquet(get_load_data_large_dir() / 'price.parquet.brotli')
    crsp['date'] = pd.to_datetime(crsp['date'])
    print('Finished pd.to_datetime')

    # Keep only stocks whose average market cap (over the stock's whole timeline)
    # exceeds `min_avg_market_cap`.
    # NOTE(review): the previous comment here said "$100 Million" while the code
    # compared against 10_000_000_000 -- confirm which threshold is intended.
    cap = crsp[['PERMNO', 'date', 'PRC', 'SHROUT']]
    cap = cap.rename(columns={'PERMNO': 'permno', 'PRC': 'Close', 'SHROUT': 'out_share'})
    cap = cap.set_index(['permno', 'date']).sort_index(level=['permno', 'date'])
    cap = cap[~cap.index.duplicated(keep='first')]
    cap = cap.astype(float)
    # NOTE(review): CRSP documents PRC as negative when it is a bid/ask average.
    # If the raw file keeps that convention, market_cap is negative on those rows
    # and drags the average down -- confirm whether abs(PRC) should be used.
    cap['market_cap'] = cap['Close'] * cap['out_share'] * 1000
    avg_cap = cap.groupby('permno')['market_cap'].mean()
    large_caps = avg_cap[avg_cap > min_avg_market_cap].index
    cap = cap[cap.index.get_level_values('permno').isin(large_caps)]
    print('Finished market capitalization')

    # Remove stocks that do not have at least `min_years` years of data
    cap = set_length(cap, min_years)

    # Remove stocks that have more than 1 NaN value in their Close column:
    #   - stocks that get delisted have 1 row of NaN values as their last row
    #   - stocks that switch ticker (WM to COOP: 81593) have several rows of NaN values
    # Afterwards, drop the remaining NaN rows (each surviving permno has at most one).
    nan_counts = cap['Close'].isna().groupby(level='permno').sum()
    valid_permnos = nan_counts[nan_counts <= 1].index.tolist()
    cap = cap[cap.index.get_level_values('permno').isin(valid_permnos)]
    cap = cap.dropna(subset='Close')
    cap = cap.drop(columns=['Close', 'out_share'])

    # Round-trip through the CSV so `stock` is exactly what later cells will read back.
    export_stock(cap, get_load_data_large_dir() / 'permno_to_train.csv')
    stock = read_stock(get_load_data_large_dir() / 'permno_to_train.csv')
    print('Finished exporting stock list')

    # There are some duplicates in WRD Indices (they are all the same value)
    data = crsp[['PERMNO', 'date', 'OPENPRC', 'ASKHI', 'BIDLO', 'PRC', 'VOL', 'TICKER', 'SHROUT']]
    data = data.rename(columns={'PERMNO': 'permno', 'OPENPRC': 'Open', 'ASKHI': 'High',
                                'BIDLO': 'Low', 'PRC': 'Close', 'VOL': 'Volume', 'TICKER': 'ticker',
                                'SHROUT': 'out_share'})

    data = data.set_index(['permno', 'date']).sort_index(level=['permno', 'date'])
    data = get_stocks_data(data, stock)
    data = data[~data.index.duplicated(keep='first')]
    data = data.dropna(subset='Close')

    # Price
    ohclv = data[['Open', 'High', 'Low', 'Close', 'Volume']]
    ohclv = ohclv.astype(float)
    ohclv = get_stocks_data(ohclv, stock)
    print('Finished Price')

    # Ticker
    ticker = data[['ticker']]
    print('Finished Ticker')

    # Outstanding Share
    # Multiply into a new frame instead of assigning into a slice of `data`
    # (the old chained assignment only worked because warnings were silenced).
    out = data[['out_share']] * 1000
    print('Finished Outstanding Share')

    # Date: an index-only frame (all columns removed)
    date = data.drop(columns=data.columns)
    print('Finished Date')

    print('Exporting...')
    ohclv.to_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
    ticker.to_parquet(get_load_data_parquet_dir() / 'data_ticker.parquet.brotli')
    out.to_parquet(get_load_data_parquet_dir() / 'data_out.parquet.brotli')
    date.to_parquet(get_load_data_parquet_dir() / 'data_date.parquet.brotli')
    cap.to_parquet(get_load_data_parquet_dir() / 'data_cap.parquet.brotli')

In [51]:
create_crsp()

Finished pd.to_datetime
Finished market capitalization
Finished exporting stock list
Finished Price
Finished Ticker
Finished Outstanding Share
Finished Date
Exporting...


In [52]:
ohclv = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')

In [53]:
ticker = pd.read_parquet(get_load_data_parquet_dir() / 'data_ticker.parquet.brotli')

In [54]:
out = pd.read_parquet(get_load_data_parquet_dir() / 'data_out.parquet.brotli')

In [55]:
date = pd.read_parquet(get_load_data_parquet_dir() / 'data_date.parquet.brotli')

In [56]:
cap = pd.read_parquet(get_load_data_parquet_dir() / 'data_cap.parquet.brotli')

In [57]:
stock = read_stock(get_load_data_large_dir() / 'permno_to_train.csv')

# Industry

In [81]:
def create_ind():
    """Label every (permno, date) row with an integer industry code and export it.

    Reads the raw CRSP price file, keeps the permnos listed in
    'permno_to_train.csv', maps each row's SIC code to one of the industry
    names in ``fama_ind`` and writes the factorized labels to
    'data_ind.parquet.brotli'.

    The stock list is read inside the function so the cell does not depend on a
    notebook-global ``stock`` variable being defined by an earlier cell.

    Returns:
        None. The result is written to disk with a single integer column 'ind'.
        Rows whose SIC code matches no range get -1 (``factorize`` codes missing
        values as -1); the remaining codes follow order of first appearance in
        the data, not the order of ``fama_ind``.
    """
    # Assign industry based off range
    def assign_label(df, column_name, sic_ranges, label):
        """Write `label` into `column_name` for rows whose 'sic' falls in `sic_ranges`.

        Each entry of `sic_ranges` is either an inclusive (low, high) tuple or a
        single SIC code. `df` is modified in place and returned.
        """
        for r in sic_ranges:
            if isinstance(r, tuple):
                matches = df['sic'].between(r[0], r[1])
            else:
                matches = df['sic'] == r
            df.loc[matches, column_name] = label
        return df

    price_data = pd.read_parquet(get_load_data_large_dir() / 'price.parquet.brotli')
    price_data['date'] = pd.to_datetime(price_data['date'])
    print('Finished pd.to_datetime')

    # Same universe file that create_crsp() exports.
    stock = read_stock(get_load_data_large_dir() / 'permno_to_train.csv')

    ind = price_data[['PERMNO', 'date', 'SICCD', 'PRC']]
    ind = ind.rename(columns={'PERMNO':'permno', 'SICCD':'sic'})
    ind = ind.set_index(['permno', 'date']).sort_index(level=['permno', 'date'])
    ind = ind[~ind.index.duplicated(keep='first')]
    ind = get_stocks_data(ind, stock)
    # PRC is only used to drop rows without a price; it is not part of the output.
    ind = ind.dropna(subset='PRC')
    ind = ind.drop('PRC', axis=1)

    # Industry name -> SIC codes / inclusive SIC ranges.
    # Later entries overwrite earlier ones if ranges overlap.
    fama_ind = {
    'agric' : [(100, 199), (200, 299), (700, 799), (910, 919), 2048],
    'food' : [(2000, 2009), (2010, 2019), (2020, 2029), (2030, 2039), (2040, 2046), (2050, 2059), (2060, 2063), (2070, 2079),(2090, 2092), 2095, (2098, 2099)],
    'soda' : [(2064, 2068), 2086, 2087, 2096, 2097],
    'beer' : [2080, 2082, 2083, 2084, 2085],
    'smoke' : [(2100, 2199)],
    'toys' : [(920, 999), (3650, 3651), 3652, 3732, (3930, 3931), (3940, 3949)],
    'fun' : [(7800, 7829), (7830, 7833), (7840, 7841), 7900, (7910, 7911),(7920, 7929), (7930, 7933), (7940, 7949), 7980, (7990, 7999)],
    'books' : [(2700, 2709), (2710, 2719), (2720, 2729), (2730, 2739), (2740, 2749), (2770, 2771), (2780, 2789), (2790, 2799)],
    'hshld' : [2047, (2391, 2392), (2510, 2519), (2590, 2599), (2840, 2843), 2844, (3160, 3161), (3170, 3171), 3172, (3190, 3199), 3229, 3260, (3262, 3263), 3269, (3230, 3231), (3630, 3639), (3750, 3751), 3800, (3860, 3861), (3870, 3873), (3910, 3911), 3914, 3915, (3960, 3962), 3991, 3995],
    'clths' : [(2300, 2390), (3020, 3021), (3100, 3111), (3130, 3131), (3140, 3149), (3150, 3151), (3963, 3965)],
    'hlth' : [(8000, 8099)],
    'medeq' : [3693, (3840, 3849), (3850, 3851)],
    'drugs' : [2830, 2831, 2833, 2834, 2835, 2836],
    'chems' : [(2800, 2809), (2810, 2819), (2820, 2829), (2850, 2859), (2860, 2869), (2870, 2879), (2890, 2899)],
    'rubbr' : [3031, 3041, (3050, 3053), (3060, 3069), (3070, 3079), (3080, 3089), (3090, 3099)],
    'txtls' : [(2200, 2269), (2270, 2279), (2280, 2284), (2290, 2295), 2297, 2298, 2299, (2393, 2395), (2397, 2399)],
    'bldmt' : [(800, 899), (2400, 2439), (2450, 2459), (2490, 2499), (2660, 2661), (2950, 2952), 3200, (3210, 3211), (3240, 3241), (3250, 3259), 3261, 3264, (3270, 3275), (3280, 3281), (3290, 3293), (3295, 3299), (3420, 3429), (3430, 3433), (3440, 3441), 3442, 3446, 3448, 3449, (3450, 3451), 3452, (3490, 3499), 3996],
    'cnstr' : [(1500, 1511), (1520, 1529), (1530, 1539), (1540, 1549), (1600, 1699), (1700, 1799)],
    'steel' : [(3300, 3300), (3310, 3317), (3320, 3325), (3330, 3339), (3340, 3341), (3350, 3357), (3360, 3369), (3370, 3379), (3390, 3399)],
    'fabpr' : [(3400, 3400), (3443, 3443), (3444, 3444), (3460, 3469), (3470, 3479)],
    'mach' : [(3510, 3519), (3520, 3529), (3530, 3530), (3531, 3531), (3532, 3532), (3533, 3533), (3534, 3534), (3535, 3535), (3536, 3536), (3538, 3538), (3540, 3549), (3550, 3559), (3560, 3569), (3580, 3580), (3581, 3581), (3582, 3582), (3585, 3585), (3586, 3586), (3589, 3589), (3590, 3599)],
    'elceq' : [(3600, 3600), (3610, 3613), (3620, 3621), (3623, 3629), (3640, 3644), (3645, 3645), (3646, 3646), (3648, 3649), (3660, 3660), (3690, 3690), (3691, 3692), (3699, 3699)],
    'autos' : [(2296, 2296), (2396, 2396), (3010, 3011), (3537, 3537), (3647, 3647), (3694, 3694), (3700, 3700), (3710, 3710), (3711, 3711), (3713, 3713), (3714, 3714), (3715, 3715), (3716, 3716), (3792, 3792), (3790, 3791), (3799, 3799)],
    'aero' : [(3720, 3720), (3721, 3721), (3723, 3724), (3725, 3725), (3728, 3729)],
    'ships' : [(3730, 3731), (3740, 3743)],
    'guns' : [(3760, 3769), (3795, 3795), (3480, 3489)],
    'gold' : [(1040, 1049)],
    'mines' : [(1000, 1009), (1010, 1019), (1020, 1029), (1030, 1039), (1050, 1059), (1060, 1069), (1070, 1079), (1080, 1089), (1090, 1099), (1100, 1119), (1400, 1499)],
    'coal' : [(1200, 1299)],
    'oil' : [(1300, 1300), (1310, 1319), (1320, 1329), (1330, 1339), (1370, 1379), (1380, 1380), (1381, 1381), (1382, 1382), (1389, 1389), (2900, 2912), (2990, 2999)],
    'util' : [(4900, 4900), (4910, 4911), (4920, 4922), (4923, 4923), (4924, 4925), (4930, 4931), (4932, 4932), (4939, 4939), (4940, 4942)],
    'telcm' : [(4800, 4800), (4810, 4813), (4820, 4822), (4830, 4839), (4840, 4841), (4880, 4889), (4890, 4890), (4891, 4891), (4892, 4892), (4899, 4899)],
    'persv' : [(7020, 7021), (7030, 7033), (7200, 7200), (7210, 7212), (7214, 7214), (7215, 7216), (7217, 7217), (7219, 7219), (7220, 7221), (7230, 7231), (7240, 7241), (7250, 7251), (7260, 7269), (7270, 7290), (7291, 7291), (7292, 7299), (7395, 7395), (7500, 7500), (7520, 7529), (7530, 7539), (7540, 7549), (7600, 7600), (7620, 7620), (7622, 7622), (7623, 7623), (7629, 7629), (7630, 7631), (7640, 7641), (7690, 7699), (8100, 8199), (8200, 8299), (8300, 8399), (8400, 8499), (8600, 8699), (8800, 8899), (7510, 7515)],
    'bussv' : [(2750, 2759), (3993, 3993), (7218, 7218), (7300, 7300), (7310, 7319), (7320, 7329), (7330, 7339),(7340, 7342), (7349, 7349), (7350, 7351), (7352, 7352), (7353, 7353), (7359, 7359), (7360, 7369),(7370, 7372), (7374, 7374), (7375, 7375), (7376, 7376), (7377, 7377), (7378, 7378), (7379, 7379),(7380, 7380), (7381, 7382), (7383, 7383), (7384, 7384), (7385, 7385), (7389, 7390), (7391, 7391),(7392, 7392), (7393, 7393), (7394, 7394), (7396, 7396), (7397, 7397), (7399, 7399), (7519, 7519),(8700, 8700), (8710, 8713), (8720, 8721), (8730, 8734), (8740, 8748), (8900, 8910), (8911, 8911),(8920, 8999), (4220, 4229)],
    'comps' : [(3570, 3579), (3680, 3680), (3681, 3681), (3682, 3682), (3683, 3683), (3684, 3684), (3685, 3685),(3686, 3686), (3687, 3687), (3688, 3688), (3689, 3689), (3695, 3695), (7373, 7373)],
    'chips' : [(3622, 3622), (3661, 3661), (3662, 3662), (3663, 3663), (3664, 3664), (3665, 3665), (3666, 3666),(3669, 3669), (3670, 3679), (3810, 3810), (3812, 3812)],
    'labeq' : [(3811, 3811), (3820, 3820), (3821, 3821), (3822, 3822), (3823, 3823), (3824, 3824), (3825, 3825),(3826, 3826), (3827, 3827), (3829, 3829), (3830, 3839)],
    'paper' : [(2520, 2549), (2600, 2639), (2670, 2699), (2760, 2761), (3950, 3955)],
    'boxes' : [(2440, 2449), (2640, 2659), (3220, 3221), (3410, 3412)],
    'whlsl' : [(5000, 5000), (5010, 5015), (5020, 5023), (5030, 5039), (5040, 5042), (5043, 5043), (5044, 5044), (5045, 5045), (5046, 5046), (5047, 5047), (5048, 5048), (5049, 5049), (5050, 5059), (5060, 5060), (5063, 5063), (5064, 5064), (5065, 5065), (5070, 5078), (5080, 5080), (5081, 5081), (5082, 5082), (5083, 5083), (5084, 5084), (5085, 5085), (5086, 5087), (5088, 5088), (5090, 5090), (5091, 5092), (5093, 5093), (5094, 5094), (5099, 5099), (5100, 5100), (5110, 5113), (5120, 5122), (5130, 5139), (5140, 5149), (5150, 5159), (5160, 5169), (5170, 5172), (5180, 5182), (5190, 5199)],
    'trans' : [(4000, 4013), (4040, 4049), (4100, 4100), (4110, 4119), (4120, 4121), (4130, 4131), (4140, 4142),(4150, 4151), (4170, 4173), (4190, 4199), (4200, 4200), (4210, 4219), (4230, 4231), (4240, 4249),(4400, 4499), (4500, 4599), (4600, 4699), (4700, 4700), (4710, 4712), (4720, 4729), (4730, 4739),(4740, 4749), (4780, 4780), (4782, 4782), (4783, 4783), (4784, 4784)],
    'rtail' : [(5200, 5200), (5210, 5219), (5220, 5229), (5230, 5231), (5250, 5251), (5260, 5261), (5270, 5271), (5300, 5300), (5310, 5311), (5320, 5320), (5330, 5331), (5334, 5334), (5340, 5349), (5390, 5399), (5400, 5400), (5410, 5411), (5412, 5412), (5420, 5429), (5430, 5439), (5440, 5449), (5450, 5459), (5460, 5469), (5490, 5499), (5500, 5500), (5510, 5529), (5530, 5539), (5540, 5549), (5550, 5559), (5560, 5569), (5570, 5579), (5590, 5599), (5600, 5699), (5700, 5700), (5710, 5719), (5720, 5722), (5730, 5733), (5734, 5734), (5735, 5735), (5736, 5736), (5750, 5799), (5900, 5900), (5910, 5912), (5920, 5929), (5930, 5932), (5940, 5940), (5941, 5941), (5942, 5942), (5943, 5943), (5944, 5944), (5945, 5945), (5946, 5946), (5947, 5947), (5948, 5948), (5949, 5949), (5950, 5959), (5960, 5969), (5970, 5979), (5980, 5989), (5990, 5990), (5992, 5992), (5993, 5993), (5994, 5994), (5995, 5995), (5999, 5999)],
    'meals' : [(5800, 5819), (5820, 5829), (5890, 5899), (7000, 7000), (7010, 7019), (7040, 7049), (7213, 7213)],
    'banks' : [(6000, 6000), (6010, 6019), (6020, 6020), (6021, 6021), (6022, 6022), (6023, 6024), (6025, 6025), (6026, 6026), (6027, 6027), (6028, 6029), (6030, 6036), (6040, 6059), (6060, 6062), (6080, 6082), (6090, 6099), (6100, 6100), (6110, 6111), (6112, 6113), (6120, 6129), (6130, 6139), (6140, 6149), (6150, 6159), (6160, 6169), (6170, 6179), (6190, 6199)],
    'insur' : [(6300, 6300), (6310, 6319), (6320, 6329), (6330, 6331), (6350, 6351), (6360, 6361), (6370, 6379), (6390, 6399), (6400, 6411)],
    'rlest' : [(6500, 6500), (6510, 6510), (6512, 6512), (6513, 6513), (6514, 6514), (6515, 6515), (6517, 6519), (6520, 6529), (6530, 6531), (6532, 6532), (6540, 6541), (6550, 6553), (6590, 6599), (6610, 6611)],
    'fin' : [(6200, 6299), (6700, 6700), (6710, 6719), (6720, 6722), (6723, 6723), (6724, 6724), (6725, 6725), (6726, 6726), (6730, 6733), (6740, 6779), (6790, 6791), (6792, 6792), (6793, 6793), (6794, 6794), (6795, 6795), (6798, 6798), (6799, 6799)]
    # 'other' : [(4950, 4959), (4960, 4961), (4970, 4971), (4990, 4991)]
    }

    for name, ranges in fama_ind.items():
        print('-'*60)
        print(name)
        ind = assign_label(ind, 'ind', ranges, name)

    # Replace the industry names by integer codes; the name lookup table that
    # factorize() also returns is not needed here.
    ind['ind'] = ind['ind'].factorize()[0]

    ind = ind.drop('sic', axis=1)
    ind.to_parquet(get_load_data_parquet_dir() / 'data_ind.parquet.brotli')

In [82]:
create_ind()

Finished pd.to_datetime
------------------------------------------------------------
agric
------------------------------------------------------------
food
------------------------------------------------------------
soda
------------------------------------------------------------
beer
------------------------------------------------------------
smoke
------------------------------------------------------------
toys
------------------------------------------------------------
fun
------------------------------------------------------------
books
------------------------------------------------------------
hshld
------------------------------------------------------------
clths
------------------------------------------------------------
hlth
------------------------------------------------------------
medeq
------------------------------------------------------------
drugs
------------------------------------------------------------
chems
---------------------------------------------

In [83]:
ind = pd.read_parquet(get_load_data_parquet_dir() / 'data_ind.parquet.brotli')

# Fama

In [69]:
def create_fama():
    """Download the daily Fama-French 5-factor dataset and cache it as parquet.

    The 'Mkt-RF' column is renamed to 'MARKET', the index is named 'date', all
    values are cast to float, and the table is written to
    'data_fama.parquet.brotli' in the parquet directory.
    """
    # DataReader returns a mapping of tables; entry 0 is the one exported here.
    downloaded = web.DataReader('F-F_Research_Data_5_Factors_2x3_daily', 'famafrench', start=2005)
    factors = downloaded[0].rename(columns={'Mkt-RF': 'MARKET'})
    factors.index.names = ['date']
    factors = factors.astype(float)

    target = get_load_data_parquet_dir() / 'data_fama.parquet.brotli'
    factors.to_parquet(target, compression='brotli')

In [70]:
create_fama()

In [71]:
fama = pd.read_parquet(get_load_data_parquet_dir() / 'data_fama.parquet.brotli')

# ETF

In [72]:
def create_etf(start_date='2005-01-01', end_date='2023-01-01'):
    """Download ETF prices and export their daily close-to-close returns.

    The ticker list is read from 'tickers_etf.csv' (the first entry is skipped),
    prices are downloaded with ``yf.download``, and one return column per ticker
    is written to 'data_etf.parquet.brotli', indexed by date.

    Args:
        start_date: First date passed to ``yf.download``.
        end_date: End date passed to ``yf.download``.

    Returns:
        None. The result is written to disk.
    """
    etf_tickers = read_stock(get_load_data_large_dir() / 'tickers_etf.csv')[1:]
    etf_data = yf.download(etf_tickers, start=start_date, end=end_date)
    # Move the ticker level from the columns into the index -> (ticker, date) rows.
    etf_data = etf_data.stack().swaplevel().sort_index()
    etf_data.index.names = ['ticker', 'date']
    etf_data = etf_data.astype(float)

    # Calculate returns of each ticker and rename each return column to ticker.
    # GroupBy.pct_change keeps the original (ticker, date) index on every pandas
    # version; groupby(...).apply(lambda x: x.pct_change()) prepends the group key
    # on pandas >= 2.0, which makes unstack(level='ticker') ambiguous.
    ret = etf_data.groupby('ticker')['Close'].pct_change()
    ret_df = ret.unstack(level='ticker')

    # Index-only frame of unique dates. Selecting zero columns avoids depending
    # on the exact set of price columns the download returns.
    dates = etf_data.reset_index('ticker')[[]]
    dates = dates.loc[~dates.index.duplicated(keep='first')].sort_index()
    etf_data = pd.concat([dates, ret_df], axis=1)
    etf_data.to_parquet(get_load_data_parquet_dir() / 'data_etf.parquet.brotli', compression='brotli')

In [73]:
create_etf()

[*********************100%***********************]  10 of 10 completed


In [74]:
etf = pd.read_parquet(get_load_data_parquet_dir() / 'data_etf.parquet.brotli')

# Macro

In [75]:
def create_macro():
    """Build the macro factor table (percentage changes) and export it.

    Each CSV under '<large dir>/macro' is loaded as a two-column (date, value)
    series, optionally shifted by one row, converted to float, and given a
    '<name>_pct' percentage-change column. The series are then left-merged one
    by one onto the first series' dates with a forward fill after every merge.
    Only the '*_pct' columns are written to 'data_macro.parquet.brotli', with
    +/-inf replaced by NaN.

    Returns:
        None. The result is written to disk.
    """
    # (csv file, column name, shift by one row before computing changes)
    # NOTE(review): '5YIF' and 'rGDP' are the only series that are not shifted;
    # presumably the shift lags a release to avoid look-ahead -- confirm that
    # leaving these two unshifted is intentional.
    series_specs = [
        ('fiveYearIR.csv', '5YIF', False),
        ('medianCPI.csv', 'medCPI', True),
        ('realGDP.csv', 'rGDP', False),
        ('realInterestRate.csv', 'rIR', True),
        ('unemploymentRate.csv', 'UR', True),
        ('TB.csv', 'TB', True),
        ('PPI.csv', 'PPI', True),
        ('retailSales.csv', 'retailSales', True),
        ('indProdIndex.csv', 'indProdIndex', True),
        ('realDispoIncome.csv', 'realDispoIncome', True),
    ]

    def load_series(filename, name, lag):
        """Load one macro CSV and return a date-indexed frame with `name` and `name`_pct."""
        series = pd.read_csv(get_load_data_large_dir() / 'macro' / filename)
        series.columns = ['date', name]
        series = series.set_index(pd.to_datetime(series['date'])).drop('date', axis=1)
        if lag:
            series = series.shift(1)
        # '.' is treated as a missing-value marker before casting to float.
        series = series.replace('.', np.nan).astype(float)
        series[f'{name}_pct'] = series[name].pct_change()
        return series

    frames = [load_series(filename, name, lag) for filename, name, lag in series_specs]

    # Left-merge everything onto the first series' index; forward-fill after each
    # merge so rows between two observations carry the last known values.
    macro = frames[0]
    for frame in frames[1:]:
        macro = macro.merge(frame, left_index=True, right_index=True, how='left').ffill()

    factor_macro = macro[[f'{name}_pct' for _, name, _ in series_specs]]

    # A zero previous value makes pct_change infinite; store those as missing.
    factor_macro = factor_macro.replace([np.inf, -np.inf], np.nan)
    factor_macro.to_parquet(get_load_data_parquet_dir() / 'data_macro.parquet.brotli', compression='brotli')

In [76]:
create_macro()

In [77]:
macro = pd.read_parquet(get_load_data_parquet_dir() / 'data_macro.parquet.brotli')

# PCA Return

In [99]:
def create_pca_return():
    """Run a rolling PCA on daily stock returns and export the result.

    Loads 'data_price.parquet.brotli', restricts it to 2005-01-01..2023-01-01,
    applies ``set_length(..., year=2)``, computes the 'RET_01' return column,
    pivots it to one column per entity, and passes that matrix to
    ``rolling_pca`` (window 60, 5 components). The output is written to
    'data_pca_ret.parquet.brotli'.
    """
    # Same time frame and minimum-length filter as create_factor.py
    prices = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
    prices = set_timeframe(prices, '2005-01-01', '2023-01-01')
    prices = set_length(prices, year=2)

    # One-period returns, pivoted so the first index level becomes the columns
    prices = create_return(prices, windows=[1])
    entity_level = prices.index.names[0]
    ret = prices['RET_01'].unstack(entity_level)
    # Only the first row's missing returns are replaced by 0
    ret.iloc[0] = ret.iloc[0].fillna(0)

    # Execute Rolling PCA
    pca_return = rolling_pca(data=ret, window_size=60, num_components=5, name='Return')
    pca_return.to_parquet(get_load_data_parquet_dir() / 'data_pca_ret.parquet.brotli', compression='brotli')

In [100]:
create_pca_return()

In [101]:
pca_ret = pd.read_parquet(get_load_data_parquet_dir() / 'data_pca_ret.parquet.brotli')

# All RF

In [104]:
def create_all_rf():
    """Combine the ETF, Fama, PCA-return and macro tables, run a rolling PCA, and export.

    The four parquet tables are concatenated column-wise, limited to
    2005-01-01..2023-01-01 and passed to ``rolling_pca`` (window 60,
    5 components). The 'RF' column of the Fama table is appended to the PCA
    output before writing 'data_all_rf.parquet.brotli'.
    """
    source_files = (
        'data_etf.parquet.brotli',
        'data_fama.parquet.brotli',
        'data_pca_ret.parquet.brotli',
        'data_macro.parquet.brotli',
    )
    etf_data, fama_data, pca_data, macro_data = (
        pd.read_parquet(get_load_data_parquet_dir() / file_name) for file_name in source_files
    )

    combined = pd.concat([etf_data, fama_data, pca_data, macro_data], axis=1)
    combined = set_timeframe(combined, '2005-01-01', '2023-01-01')
    fama_data = set_timeframe(fama_data, '2005-01-01', '2023-01-01')

    # Execute Rolling PCA
    pca_rf = rolling_pca(data=combined, window_size=60, num_components=5, name='RF')

    # Add risk-free rate
    pca_rf = pd.concat([pca_rf, fama_data['RF']], axis=1)
    pca_rf.to_parquet(get_load_data_parquet_dir() / 'data_all_rf.parquet.brotli', compression = 'brotli')

In [105]:
create_all_rf()

In [106]:
all_rf = pd.read_parquet(get_load_data_parquet_dir() / 'data_all_rf.parquet.brotli')

# SPY Return

In [107]:
def create_spy_return():
    """Fetch the table returned by ``get_spy`` for 2005-01-01..2023-01-01 and export it.

    The index is named 'date' and the table is written to
    'data_spy.parquet.brotli' in the parquet directory.
    """
    spy = get_spy('2005-01-01', '2023-01-01')
    spy.index.name = 'date'
    destination = get_load_data_parquet_dir() / 'data_spy.parquet.brotli'
    spy.to_parquet(destination, compression = 'brotli')

In [108]:
create_spy_return()

[*********************100%***********************]  1 of 1 completed


In [109]:
spy_return = pd.read_parquet(get_load_data_parquet_dir() / 'data_spy.parquet.brotli')

# Open Asset Pricing

In [4]:
def create_open_asset_pricing():
    """Export the selected Open Asset Pricing predictors indexed by (ticker, date).

    Reads the wide predictor file and the permno-to-ticker table, keeps the
    predictors in ``factors_to_use``, maps each permno to a ticker, keeps only
    tickers that also appear in 'tickers_to_train_fundamental.csv', then writes
    the ticker list to 'tickers_to_train_open.csv' and the data to
    'data_open_asset.parquet.brotli'.

    Returns:
        None. All results are written to disk.
    """
    oap_data = pd.read_parquet(get_load_data_large_dir() / 'signed_predictors_dl_wide.parquet.brotli')
    permno_codes = pd.read_csv(get_load_data_large_dir() / 'permno.csv')
    factors_to_use = ['DivSeason', 'ChTax', 'EarningsStreak', 'ResidualMomentum', 'AssetGrowth',
                  'NOA', 'SmileSlope', 'MomSeasonShort', 'InvestPPEInv', 'NetDebtFinance', 'InvGrowth', 'MomSeason11YrPlus']

    oap_data = oap_data[['permno', 'yyyymm'] + factors_to_use]
    permno_codes = permno_codes[['LPERMNO', 'tic']].rename(columns={'LPERMNO':'permno'})

    permno_unique = permno_codes.drop_duplicates().sort_values(by='permno')

    # NOTE(review): if a permno is listed with more than one ticker, the dict keeps
    # whichever pair comes last after the sort -- confirm that is acceptable.
    permno_to_ticker = dict(zip(permno_unique['permno'], permno_unique['tic']))

    # Take an explicit copy: the filtered frame is a slice of `oap_data`, and adding
    # columns to a slice is the chained-assignment pattern pandas warns about.
    oap_filtered = oap_data[oap_data['permno'].isin(permno_to_ticker.keys())].copy()
    oap_filtered['ticker'] = oap_filtered['permno'].map(permno_to_ticker)
    # 'yyyymm' is parsed as year+month, so every date is the first day of its month.
    oap_filtered['date'] = pd.to_datetime(oap_filtered['yyyymm'], format='%Y%m')
    oap_filtered = (
        oap_filtered
        .drop(['permno', 'yyyymm'], axis=1)
        .set_index(['ticker', 'date'])
        .sort_index(level=['ticker', 'date'])
    )

    # Find overlapping tickers
    current_tickers = read_ticker(get_load_data_large_dir() / 'tickers_to_train_fundamental.csv')
    oap_tickers = get_ticker_idx(oap_filtered)
    overlapping_tickers = list(set(oap_tickers) & set(current_tickers))

    # Filter DataFrame based on overlapping tickers
    oap_filtered = oap_filtered[oap_filtered.index.get_level_values('ticker').isin(overlapping_tickers)]

    export_ticker(oap_filtered, get_load_data_large_dir() / 'tickers_to_train_open.csv')
    oap_filtered.to_parquet(get_load_data_parquet_dir() / 'data_open_asset.parquet.brotli', compression='brotli')

In [5]:
create_open_asset_pricing()

In [11]:
open_asset = pd.read_parquet(get_load_data_parquet_dir() / 'data_open_asset.parquet.brotli')

# Dividend

In [334]:
def create_dividend():
    """Load the dividend CSV, blank out non-'12' distributions, and export it.

    After dropping 'PERMNO', the remaining columns are renamed positionally, the
    four date columns are parsed, and the table is indexed by (ticker, date).
    Rows with a missing ticker are removed. Every row whose 'distcode' (as text)
    does not start with '12' has all of its columns set to NaN. The result is
    written to 'data_dividend.parquet.brotli'.
    """
    dividend = pd.read_csv(get_load_data_large_dir() / 'dividend.csv').drop(['PERMNO'], axis=1)
    # Positional rename: relies on the CSV's column order after PERMNO is removed.
    dividend.columns = ['date', 'ticker', 'divdecdt', 'divpaydt', 'divrecdt', 'distcode', 'divpay']
    for date_col in ('date', 'divdecdt', 'divpaydt', 'divrecdt'):
        dividend[date_col] = pd.to_datetime(dividend[date_col])

    dividend = dividend.set_index(['ticker', 'date']).sort_index(level=['ticker', 'date'])
    has_ticker = dividend.index.get_level_values('ticker').notna()
    dividend = dividend[has_ticker]

    # Rows stay in the table (the index is kept); only their values are blanked.
    code_starts_with_12 = dividend['distcode'].astype(str).str.startswith('12')
    dividend[~code_starts_with_12] = np.nan
    dividend.to_parquet(get_load_data_parquet_dir() / 'data_dividend.parquet.brotli', compression='brotli')

In [335]:
create_dividend()

In [336]:
dividend = pd.read_parquet(get_load_data_parquet_dir() / 'data_dividend.parquet.brotli')

# Fund Ind

In [109]:
def create_fund_ind():
    """Run a rolling PCA per industry group on 'fund_ind.csv' and export the result.

    The CSV is indexed by (date, gicdesc), 'NFIRM' is dropped and missing values
    are filled with 0. For every 'gicdesc' group the cells are converted to
    float (percent strings are divided by 100), ``rolling_pca`` is applied
    (window 5, 5 components), and the per-group outputs are concatenated
    column-wise into 'data_fund_ind.parquet.brotli'.
    """
    def convert_to_float(cell):
        """Convert a cell to float; strings containing '%' are parsed as percentages."""
        is_percent = isinstance(cell, str) and '%' in cell
        return float(cell.replace('%', '')) / 100 if is_percent else float(cell)

    def connect(s):
        """Join the whitespace-separated words of `s` with underscores."""
        return '_'.join(s.split())

    def pca_for_group(group_name, group_df):
        """Return the rolling-PCA output for one industry group, indexed by date only."""
        label = connect(group_name)
        numeric = group_df.applymap(convert_to_float)
        # NOTE(review): the original comment on window_size=5 read "3 months" -- confirm.
        components = rolling_pca(data=numeric, window_size=5, num_components=5, name=f'FI_{label}')
        return components.reset_index('gicdesc').drop('gicdesc', axis=1)

    raw = pd.read_csv(get_load_data_large_dir() / 'fund_ind.csv')
    raw['public_date'] = pd.to_datetime(raw['public_date'])
    panel = (
        raw.rename(columns={'public_date': 'date'})
        .set_index(['date', 'gicdesc'])
        .drop(['NFIRM'], axis=1)
        .fillna(0)
    )

    collect = [pca_for_group(name, group) for name, group in panel.groupby('gicdesc')]

    fund_ind = pd.concat(collect, axis=1)
    fund_ind.to_parquet(get_load_data_parquet_dir() / 'data_fund_ind.parquet.brotli')

In [110]:
create_fund_ind()

In [111]:
fund_ind = pd.read_parquet(get_load_data_parquet_dir() / 'data_fund_ind.parquet.brotli')

# Fund Raw

In [195]:
def create_fund_raw():
    """Derive fundamental ratios from ``fund_raw.csv`` and save three parquet tables.

    Steps:
        1. Load the raw fundamentals, drop identifier/format columns and index
           the table by ``(permno, date)``.
        2. Drop columns where more than a third of the rows are missing, keep
           only the stocks listed in ``permno_to_train.csv`` and remove
           duplicated index entries.
        3. Split off the fiscal-quarter column (``fqtr``; missing values -> -1).
        4. Compute liquidity, leverage, profitability, efficiency, market and
           cash-flow ratios, then discard the raw input columns.
        5. Rank every ratio cross-sectionally within each date.

    Returns:
        None. Writes ``permno_to_train_fund.csv`` plus the brotli-compressed
        ``data_fund_q``, ``data_fund_raw`` and ``data_fund_rank`` parquet files.
    """
    fund_raw = pd.read_csv(get_load_data_large_dir() / 'fund_raw.csv')
    stock = read_stock(get_load_data_large_dir() / 'permno_to_train.csv')

    # Identifier / format columns that carry no signal.
    attributes = ['GVKEY', 'fyearq', 'indfmt', 'costat', 'consol', 'popsrc',
                  'datafmt', 'curcdq', 'datacqtr', 'datafqtr']
    fund_raw = fund_raw.drop(attributes, axis=1)
    fund_raw['datadate'] = pd.to_datetime(fund_raw['datadate'])
    fund_raw = fund_raw.rename(columns={'datadate': 'date', 'LPERMNO': 'permno'}).set_index(['permno', 'date'])
    fund_raw = fund_raw.sort_index(level=['permno', 'date'])
    # Discard columns that are missing in more than one third of the rows.
    fund_raw = fund_raw.drop(columns=fund_raw.columns[fund_raw.isna().sum() > len(fund_raw) / 3])
    fund_raw = get_stocks_data(fund_raw, stock)
    fund_raw = fund_raw[~fund_raw.index.duplicated(keep='first')]
    export_stock(fund_raw, get_load_data_large_dir() / 'permno_to_train_fund.csv')

    # Retrieve quarter category (-1 marks a missing fiscal quarter).
    fund_q = fund_raw[['fqtr']].fillna(-1).astype(int)
    fund_raw = fund_raw.drop('fqtr', axis=1)
    fund_raw = fund_raw.astype(float)
    fund_raw.columns = [col.upper() for col in fund_raw.columns]

    # Remember the raw input columns so that only the derived ratios are kept.
    before = fund_raw.columns

    # Liquidity Ratios
    fund_raw['current_ratio'] = (fund_raw['CHEQ'] + fund_raw['AOQ']) / (fund_raw['APQ'] + fund_raw['DLCQ'])
    fund_raw['quick_ratio'] = (fund_raw['CHEQ'] + fund_raw['AOQ'] - fund_raw['INVTQ']) / (fund_raw['APQ'] + fund_raw['DLCQ'])

    # Leverage Ratios
    fund_raw['total_debt'] = fund_raw['DLCQ'] + fund_raw['DLTTQ']
    fund_raw['total_equity'] = fund_raw['ATQ'] - fund_raw['LTQ']
    fund_raw['debt_to_equity_ratio'] = fund_raw['total_debt'] / fund_raw['total_equity']
    fund_raw['debt_ratio'] = fund_raw['total_debt'] / fund_raw['ATQ']

    # Profitability Ratios
    fund_raw['net_profit_margin'] = fund_raw['NIQ'] / fund_raw['SALEQ']
    fund_raw['return_on_assets'] = fund_raw['NIQ'] / fund_raw['ATQ']
    fund_raw['return_on_equity'] = fund_raw['NIQ'] / fund_raw['total_equity']
    fund_raw['gross_profit_margin'] = (fund_raw['SALEQ'] - fund_raw['COGSQ']) / fund_raw['SALEQ']
    fund_raw['operating_margin'] = (fund_raw['REVTQ'] - fund_raw['XSGAQ']) / fund_raw['REVTQ']
    fund_raw['return_on_investment'] = (fund_raw['NIQ'] + fund_raw['XINTQ']) / (fund_raw['LTQ'] + fund_raw['total_equity'])

    # Efficiency Ratios
    fund_raw['inventory_turnover'] = fund_raw['COGSQ'] / fund_raw['INVTQ']
    fund_raw['asset_turnover'] = fund_raw['SALEQ'] / fund_raw['ATQ']
    # NOTE(review): when a column is absent (never present, or removed by the
    # sparsity filter above) the .get() fallbacks below substitute a constant
    # (0.01 / 1 / 0), so the resulting ratio is not based on real data - confirm
    # that these columns exist, or switch the defaults to NaN.
    fund_raw['receivable_turnover'] = fund_raw['SALEQ'] / fund_raw.get('ARQ', 0.01)  # Assume ARQ is Account Receivable, replace with correct column if different

    # Market Ratios
    fund_raw['earnings_per_share'] = fund_raw['NIQ'] / fund_raw['CSHOQ']
    fund_raw['book_value_per_share'] = fund_raw['total_equity'] / fund_raw['CSHOQ']
    # fund_raw['price_to_earnings_ratio'] = fund_raw['stock_price'] / fund_raw['earnings_per_share']  # Uncomment if stock price is available

    # Cash Flow Ratios
    fund_raw['operating_cash_flow_ratio'] = fund_raw.get('OANCFQ', 1) / fund_raw['LTQ']  # Assume OANCFQ is Operating Activities Net Cash Flow, replace with correct column if different
    fund_raw['free_cash_flow'] = fund_raw.get('OANCFQ', 1) - fund_raw['DPQ'] - fund_raw.get('CAPEXQ', 0)  # Assume CAPEXQ is Capital Expenditure, replace with correct column if different

    # Keep only the derived ratios; divisions by zero produced +/-inf, store them as NaN.
    fund_raw = fund_raw.drop(columns=before)
    fund_raw = fund_raw.replace([np.inf, -np.inf], np.nan)

    # Rank every ratio within each date. One grouped rank over the whole frame
    # avoids assigning columns onto a slice of fund_raw.
    fund_rank = fund_raw.groupby('date').rank().add_suffix('_rank')

    # All three outputs carry the '.brotli' suffix, so write them all with brotli.
    fund_q.to_parquet(get_load_data_parquet_dir() / 'data_fund_q.parquet.brotli', compression='brotli')
    fund_raw.to_parquet(get_load_data_parquet_dir() / 'data_fund_raw.parquet.brotli', compression='brotli')
    fund_rank.to_parquet(get_load_data_parquet_dir() / 'data_fund_rank.parquet.brotli', compression='brotli')
    
#     fund_raw = fund_raw.mul(10**6)
#     # Change to daily interval
#     date_data = pd.read_parquet(get_load_data_parquet_dir() / 'data_date.parquet.brotli')
#     tickers = read_ticker(get_load_data_large_dir() / 'tickers_to_train_fundamental.csv')
#     date_data = set_timeframe(date_data, '2004-01-01', '2023-06-01')
    
#     fund_raw = pd.merge(date_data.loc[tickers], fund_raw, left_index=True, right_index=True, how='left')
#     fund_raw = fund_raw.loc[~fund_raw.index.duplicated(keep='first')]
#     fund_raw = fund_raw.ffill()
    
#     fund_q = pd.merge(date_data.loc[tickers], fund_q, left_index=True, right_index=True, how='left')
#     fund_q = fund_q.loc[~fund_q.index.duplicated(keep='first')]
#     fund_q = fund_q.ffill()
#     fund_raw = fund_raw.replace([np.inf, -np.inf], np.nan)
    
#     # Divide by outstanding shares
#     out_data = pd.read_parquet(get_load_data_parquet_dir() / 'data_out.parquet.brotli')
#     fund_raw = pd.merge(fund_raw, out_data.out, left_index=True, right_index=True, how='left')
#     fund_raw = fund_raw.loc[~fund_raw.index.duplicated(keep='first')]
#     fund_raw.iloc[:, :-1] = fund_raw.iloc[:, :-1].div(fund_raw.out, axis=0)
#     fund_raw = fund_raw.drop(['out'], axis=1)
#     fund_raw = fund_raw.replace([np.inf, -np.inf], np.nan)

#     # Divide by price
#     price_data = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
#     fund_raw = pd.merge(fund_raw, price_data.Close.loc[tickers], left_index=True, right_index=True, how='left')
#     fund_raw = fund_raw.loc[~fund_raw.index.duplicated(keep='first')]
#     fund_raw.iloc[:, :-1] = fund_raw.iloc[:, :-1].div(fund_raw.Close, axis=0)
#     fund_raw = fund_raw.drop(['Close'], axis=1)
#     fund_raw = fund_raw.replace([np.inf, -np.inf], np.nan)

In [196]:
# Rebuild the fundamental tables; writes data_fund_q, data_fund_raw and data_fund_rank.
create_fund_raw()

In [192]:
# Load the fiscal-quarter table written by create_fund_raw().
# NOTE(review): this cell's execution count (In [192]) is lower than the
# create_fund_raw() cell above (In [196]); re-run after regenerating the files.
fund_q = pd.read_parquet(get_load_data_parquet_dir() / 'data_fund_q.parquet.brotli')

In [193]:
# Load the derived fundamental ratios written by create_fund_raw().
fund_raw = pd.read_parquet(get_load_data_parquet_dir() / 'data_fund_raw.parquet.brotli')

In [194]:
# Load the per-date ranks of the fundamental ratios written by create_fund_raw().
fund_rank = pd.read_parquet(get_load_data_parquet_dir() / 'data_fund_rank.parquet.brotli')

# Fund Ratio

In [116]:
def create_fund_ratio():
    """Clean the financial-ratio table and save it together with per-date percentile ranks.

    Reads ``fund_ratio.csv``, indexes it by ``(permno, date)``, keeps only the
    stocks listed in ``permno_to_train.csv``, drops columns where more than a
    third of the rows are missing and converts every remaining column to float
    (percent strings such as ``'12%'`` become ``0.12``; anything unparseable
    becomes NaN).

    Returns:
        None. Writes ``data_fund_ratio.parquet.brotli`` and
        ``data_fund_ratio_rank.parquet.brotli`` to the parquet data directory.
    """
    def convert_column_to_numeric(column):
        """Return ``column`` as a float Series with the same index."""
        def to_number(element):
            try:
                return float(element)
            except (TypeError, ValueError):
                # Not directly parseable (e.g. None or a percent string).
                pass
            if isinstance(element, str) and element.endswith('%'):
                try:
                    return float(element.rstrip('%')) / 100
                except ValueError:
                    # Malformed percentage such as 'N/A%'.
                    return np.nan
            return np.nan

        return pd.Series([to_number(element) for element in column], index=column.index, dtype=float)

    fund_ratio = pd.read_csv(get_load_data_large_dir() / 'fund_ratio.csv')
    stock = read_stock(get_load_data_large_dir() / 'permno_to_train.csv')
    fund_ratio['public_date'] = pd.to_datetime(fund_ratio['public_date'])
    fund_ratio = fund_ratio.drop(['adate', 'qdate'], axis=1)
    fund_ratio = fund_ratio.rename(columns={'public_date': 'date'}).set_index(['permno', 'date'])
    fund_ratio = fund_ratio.sort_index(level=['permno', 'date'])
    fund_ratio = get_stocks_data(fund_ratio, stock)
    # Discard columns that are missing in more than one third of the rows.
    fund_ratio = fund_ratio.drop(columns=fund_ratio.columns[fund_ratio.isna().sum() > len(fund_ratio) / 3])
    for col in fund_ratio.columns:
        fund_ratio[col] = convert_column_to_numeric(fund_ratio[col])

#     # Multiply monthly data by monthly price
#     price_data = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
#     complete_date_range = pd.date_range(start=fund_ratio.index.get_level_values('date').min(), end=fund_ratio.index.get_level_values('date').max(), freq='D')
#     multi_idx = pd.MultiIndex.from_product([tickers, complete_date_range], names=['ticker', 'date'])
#     price_data = price_data.Close.reindex(multi_idx).ffill()
#     fund_ratio = pd.merge(fund_ratio, price_data, left_index=True, right_index=True, how='left')
#     fund_ratio = fund_ratio.loc[~fund_ratio.index.duplicated(keep='first')]

#     fund_ratio.iloc[:, :-1] = fund_ratio.iloc[:, :-1].mul(fund_ratio.Close, axis=0)
#     fund_ratio = fund_ratio.drop(['Close'], axis=1)
#     fund_ratio = fund_ratio.replace([np.inf, -np.inf], np.nan)

#     # Divide by price
#     price_data = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
#     fund_ratio = pd.merge(fund_ratio, price_data.Close.loc[tickers], left_index=True, right_index=True, how='left')
#     fund_ratio = fund_ratio.loc[~fund_ratio.index.duplicated(keep='first')]
#     fund_ratio.iloc[:, :-1] = fund_ratio.iloc[:, :-1].div(fund_ratio.Close, axis=0)
#     fund_ratio = fund_ratio.drop(['Close'], axis=1)
#     fund_ratio = fund_ratio.replace([np.inf, -np.inf], np.nan)

    # Percentile-rank every ratio within each date. Ranking the whole frame at
    # once does not depend on any particular column (such as 'CAPEI') having
    # survived the sparsity filter, and avoids assigning onto a slice.
    ratio_rank = fund_ratio.groupby('date').rank(pct=True).add_suffix('_rank')

    fund_ratio.to_parquet(get_load_data_parquet_dir() / 'data_fund_ratio.parquet.brotli', compression='brotli')
    ratio_rank.to_parquet(get_load_data_parquet_dir() / 'data_fund_ratio_rank.parquet.brotli', compression='brotli')

In [117]:
# Rebuild the ratio tables; writes data_fund_ratio and data_fund_ratio_rank.
create_fund_ratio()

In [118]:
# Load the cleaned financial ratios written by create_fund_ratio().
fund_ratio = pd.read_parquet(get_load_data_parquet_dir() / 'data_fund_ratio.parquet.brotli')

In [119]:
# Load the per-date percentile ranks written by create_fund_ratio().
fund_ratio_rank = pd.read_parquet(get_load_data_parquet_dir() / 'data_fund_ratio_rank.parquet.brotli')

# Test

In [21]:
def create_pca_return_test():
    """Run a rolling PCA over per-ticker returns and save the components.

    Prices are limited to 2002-01-01 .. 2023-01-01 and to the tickers listed in
    ``tickers_to_train_fundamental.csv``. The ``RET_01`` column produced by
    ``create_return(..., windows=[1])`` is pivoted to one column per ticker
    before the PCA.

    Returns:
        None. Writes ``data_pca_ret_test.parquet.brotli``.
    """
    # Load prices, then restrict them to the date window and ticker universe
    prices = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')
    universe = read_ticker(get_load_data_large_dir() / 'tickers_to_train_fundamental.csv')
    prices = set_timeframe(prices, '2002-01-01', '2023-01-01')
    prices = prices.loc[universe]

    # Wide return matrix: one row per date, one column per ticker
    prices = create_return(prices, windows=[1])
    returns_wide = prices['RET_01'].unstack('ticker')
    # Missing values in the first row are set to 0
    first_row = returns_wide.iloc[0].fillna(0)
    returns_wide.iloc[0] = first_row

    # Rolling PCA: 60-row window, 5 components
    components = rolling_pca(data=returns_wide, window_size=60, num_components=5, name='Return')
    target = get_load_data_parquet_dir() / 'data_pca_ret_test.parquet.brotli'
    components.to_parquet(target, compression='brotli')

def create_all_rf_test():
    """Combine the risk-factor tables, reduce them with a rolling PCA and save the result.

    The ETF, Fama, return-PCA and macro tables are concatenated column-wise and
    limited to 2002-01-01 .. 2023-01-01 before the PCA; the ``RF`` column of the
    Fama table is then appended to the components.

    Returns:
        None. Writes ``data_all_rf_test.parquet.brotli``.
    """
    start, end = '2002-01-01', '2023-01-01'

    # NOTE(review): this reads 'data_macro.parquet.brotli', whereas
    # create_macro_test() writes 'data_macro_test.parquet.brotli' - confirm
    # which macro table is intended here.
    source_files = (
        'data_etf.parquet.brotli',
        'data_fama.parquet.brotli',
        'data_pca_ret_test.parquet.brotli',
        'data_macro.parquet.brotli',
    )
    etf, fama, pca_ret, macro = (
        pd.read_parquet(get_load_data_parquet_dir() / file_name) for file_name in source_files
    )

    combined = pd.concat([etf, fama, pca_ret, macro], axis=1)
    combined = set_timeframe(combined, start, end)
    fama_window = set_timeframe(fama, start, end)

    # Rolling PCA: 60-row window, 5 components
    rf_components = rolling_pca(data=combined, window_size=60, num_components=5, name='RF')

    # Append the risk-free rate next to the components
    with_risk_free = pd.concat([rf_components, fama_window['RF']], axis=1)
    with_risk_free.to_parquet(get_load_data_parquet_dir() / 'data_all_rf_test.parquet.brotli',
                              compression='brotli')
    
def create_macro_test():
    """Build rolling z-scores of macro percent changes and save them to parquet.

    Ten single-series CSV files are read from ``<large data dir>/macro``. For
    each series the function optionally lags it by one row, adds a
    ``<name>_pct`` percent-change column, left-joins everything onto the index
    of the first series (forward-filling after each join) and finally replaces
    each ``_pct`` column with the z-score of its newest value relative to the
    trailing 30-row window.

    Returns:
        None. Writes ``data_macro_test.parquet.brotli``.
    """
    zscore_window = 30

    # (csv file, column name, rows to lag). Every series is lagged by one row
    # except 5YIF and rGDP.
    # NOTE(review): confirm that leaving those two series unlagged is intended.
    series_specs = [
        ('fiveYearIR.csv', '5YIF', 0),
        ('medianCPI.csv', 'medCPI', 1),
        ('realGDP.csv', 'rGDP', 0),
        ('realInterestRate.csv', 'rIR', 1),
        ('unemploymentRate.csv', 'UR', 1),
        ('TB.csv', 'TB', 1),
        ('PPI.csv', 'PPI', 1),
        ('retailSales.csv', 'retailSales', 1),
        ('indProdIndex.csv', 'indProdIndex', 1),
        ('realDispoIncome.csv', 'realDispoIncome', 1),
    ]

    def load_series(file_name, name, lag):
        """Load one two-column CSV as a date-indexed frame with a percent-change column."""
        frame = pd.read_csv(get_load_data_large_dir() / 'macro' / file_name)
        frame.columns = ['date', name]
        frame = frame.set_index(pd.to_datetime(frame['date'])).drop('date', axis=1)
        if lag:
            frame = frame.shift(lag)
        # A literal '.' in the file is treated as a missing value.
        frame = frame.replace('.', np.nan).astype(float)
        frame[f'{name}_pct'] = frame[name].pct_change()
        return frame

    def zscore_of_last(window):
        """Z-score of the newest observation in ``window``."""
        # .iloc makes the positional lookup explicit: integer keys passed to
        # Series[...] on a non-integer index are deprecated in recent pandas.
        return (window.iloc[-1] - window.mean()) / window.std()

    frames = [load_series(*spec) for spec in series_specs]

    # Left-join every series onto the first one's index; forward-fill after each
    # join so lower-frequency series carry their last value forward.
    macro = frames[0]
    for frame in frames[1:]:
        macro = macro.merge(frame, left_index=True, right_index=True, how='left').ffill()

    pct_columns = [f'{name}_pct' for _, name, _ in series_specs]
    # rolling(...).apply runs column by column; building a new frame avoids
    # assigning onto a slice of macro.
    factor_macro = macro[pct_columns].rolling(zscore_window).apply(zscore_of_last)

    # A zero standard deviation yields +/-inf; store those as NaN.
    factor_macro = factor_macro.replace([np.inf, -np.inf], np.nan)

    factor_macro.to_parquet(get_load_data_parquet_dir() / 'data_macro_test.parquet.brotli', compression='brotli')

In [None]:
# Rebuild the return-PCA table; writes data_pca_ret_test.parquet.brotli.
create_pca_return_test()

In [None]:
# Rebuild the combined risk-factor PCA; reads data_pca_ret_test.parquet.brotli,
# so run create_pca_return_test() first.
create_all_rf_test()

In [22]:
# Rebuild the macro factor table; writes data_macro_test.parquet.brotli.
create_macro_test()