# Import Packages

In [1]:
import warnings
import pandas as pd

warnings.filterwarnings('ignore')

import pandas_datareader.data as web
import polars as pl

from functions.utils.func import *

# CRSP

In [None]:
def create_crsp():
    """Clean the raw CRSP file and export the datasets derived from it.

    Reads ``crsp.parquet.brotli`` from the large-data directory, filters the
    stock universe, then writes the permno list (``permno_to_train.csv``) and
    four parquet files: the full cleaned table, the OHLCV columns, the bare
    (permno, date) index and the ticker column. Returns nothing.
    """
    # Read in crsp dataset
    crsp = pd.read_parquet(get_load_data_large_dir() / 'crsp.parquet.brotli')

    # Rename Columns
    # NOTE(review): 'bid' is mapped to High and 'ask' to Low -- confirm that
    # these source columns really hold the daily high and low.
    print('Rename columns')
    crsp.columns = crsp.columns.str.lower()
    crsp = crsp.rename(columns={'prc':'Close', 'bid':'High', 'ask':'Low', 'openprc':'Open', 'shrout':'outstanding', 'vol':'Volume', 'cfacpr':'adj_price'})

    # Adjust closing price (only Close is divided by the adjustment factor;
    # Open, High and Low are exported as read)
    print('Adjusting Close Price')
    crsp['Close'] = crsp['Close']/crsp['adj_price']

    # Set and sort index
    print('Set and sort indices')
    crsp.date = pd.to_datetime(crsp.date)
    crsp = crsp.set_index(['permno', 'date'])
    crsp = crsp.sort_index(level=['permno', 'date'])

    # Remove duplicate indices and replace all infinity with nan
    # (a zero adjustment factor above yields +/-inf, which becomes NaN here)
    print('Remove duplicate indices and infinity')
    crsp = crsp[~crsp.index.duplicated(keep='first')]
    crsp = crsp.replace([np.inf, -np.inf], np.nan)

    # Remove stocks that have more than 1 NAN value in their Close column:
    #   - stocks that get delisted have 1 row of NAN values as their last row
    #   - stocks that switch ticker (WM to COOP: 81593) have several rows of NAN values
    # Afterwards, drop all rows that have NAN in Close (every remaining permno
    # has at most 1 NAN in Close at this point)
    print('Remove stocks with NAN')
    # Vectorised count of missing closes per permno (no per-group Python call)
    nan_counts = crsp['Close'].isna().groupby(level='permno').sum()
    valid_permnos = nan_counts[nan_counts <= 1].index
    crsp = crsp[crsp.index.get_level_values('permno').isin(valid_permnos)]
    crsp = crsp.dropna(subset='Close')

    # Remove dates in stocks that have a negative closing price
    crsp = crsp[crsp['Close'] >= 0]

    # Remove stocks that do not have at least 3 years worth of data
    print('Set length to 3 years')
    crsp = set_length(crsp, 3)

    # Drop permnos whose average market cap is not above 6B
    # (presumably 'outstanding' is in thousands of shares, hence * 1000 -- confirm)
    crsp['market_cap'] = crsp['Close'] * crsp['outstanding'] * 1000
    avg_cap = crsp.groupby('permno')['market_cap'].mean()
    above_cap = avg_cap[avg_cap > 6_000_000_000].index
    crsp = crsp[crsp.index.get_level_values('permno').isin(above_cap)]

    # Export list of stocks
    print('Exporting stock list')
    print(f'Number of stocks: {len(get_stock_idx(crsp))}')
    export_stock(crsp, get_load_data_large_dir() / 'permno_to_train.csv')

    # Export crsp data
    print('Exporting Crsp')
    crsp.to_parquet(get_load_data_parquet_dir() / 'data_crsp.parquet.brotli', compression='brotli')

    # Export ohclv
    print('Exporting Price')
    ohclv = crsp[['Open', 'High', 'Low', 'Close', 'Volume']]
    ohclv = ohclv.astype(float)
    ohclv.to_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli', compression='brotli')

    # Export date: a frame with no columns that only carries the (permno, date) index
    print('Export Date')
    date = crsp.drop(columns=crsp.columns)
    date.to_parquet(get_load_data_parquet_dir() / 'data_date.parquet.brotli', compression='brotli')

    # Export ticker
    print('Exporting Tickers')
    ticker = crsp[['ticker']]
    ticker.to_parquet(get_load_data_parquet_dir() / 'data_ticker.parquet.brotli', compression='brotli')

In [None]:
create_crsp()

In [None]:
ohclv = pd.read_parquet(get_load_data_parquet_dir() / 'data_price.parquet.brotli')

In [None]:
ticker = pd.read_parquet(get_load_data_parquet_dir() / 'data_ticker.parquet.brotli')

In [None]:
date = pd.read_parquet(get_load_data_parquet_dir() / 'data_date.parquet.brotli')

In [None]:
stock = read_stock(get_load_data_large_dir() / 'permno_to_train.csv')

In [None]:
crsp = pd.read_parquet(get_load_data_parquet_dir() / 'data_crsp.parquet.brotli')

In [None]:
crsp.shape

In [None]:
ohclv.shape

In [None]:
len(stock)

# CRSP Compustat

In [2]:
def create_crsp_compustat():
    """Prepare the CRSP/Compustat table for the training universe and export industry codes.

    Reads ``crsp_compustat.parquet.brotli`` and the permno list written by the
    CRSP step, keeps only those permnos, and writes: the Compustat permno list
    (``permno_to_train_comp.csv``), the cleaned table, and one parquet file per
    industry classification aligned to the CRSP (permno, date) index.
    Returns nothing.
    """
    large_dir = get_load_data_large_dir()

    # Load the raw table and the permno universe
    ccm = pd.read_parquet(large_dir / 'crsp_compustat.parquet.brotli')
    stock = read_stock(large_dir / 'permno_to_train.csv')

    # Lower-case the headers, then map them onto the names used elsewhere
    print('Rename columns')
    ccm.columns = ccm.columns.str.lower()
    column_map = {
        'lpermno': 'permno', 'tic': 'ticker', 'datadate': 'date',
        'prccd': 'Close', 'prchd': 'High', 'prcld': 'Low', 'prcod': 'Open',
        'cshoc': 'outstanding', 'cshtrd': 'Volume',
        'gsubind': 'Subindustry', 'gind': 'Industry', 'naics': 'IndNAIC',
    }
    ccm = ccm.rename(columns=column_map)

    # Index by (permno, date), sort, and restrict to the training permnos
    print('Set and sort indices')
    ccm['date'] = pd.to_datetime(ccm['date'])
    ccm = ccm.set_index(['permno', 'date']).sort_index(level=['permno', 'date'])
    ccm = get_stocks_data(ccm, stock)

    # Keep the first row of every duplicated index entry; turn +/-inf into NaN
    print('Remove duplicate indices and infinity')
    is_repeat = ccm.index.duplicated(keep='first')
    ccm = ccm[~is_repeat]
    ccm = ccm.replace([np.inf, -np.inf], np.nan)

    # Missing industry codes become -1 so the columns can be cast to int below
    print('Fill industries with -1')
    industry_cols = ['Industry', 'Subindustry', 'IndNAIC']
    ccm[industry_cols] = ccm[industry_cols].fillna(-1)

    # Write the permno list, then read it back for the alignment step
    print('Exporting stock list')
    print(f'Number of stocks: {len(get_stock_idx(ccm))}')
    comp_list_path = large_dir / 'permno_to_train_comp.csv'
    export_stock(ccm, comp_list_path)
    stock_comp = read_stock(comp_list_path)

    # Full cleaned table
    print('Exporting Crsp Compustat')
    parquet_dir = get_load_data_parquet_dir()
    ccm.to_parquet(parquet_dir / 'data_crsp_compustat.parquet.brotli', compression='brotli')

    # One single-column integer frame per industry classification
    print('Exporting Industries')
    codes = {col: ccm[[col]].astype(int) for col in industry_cols}

    # Align each classification to the CRSP (permno, date) index.
    # The merge is a left join, so CRSP rows with no Compustat row on the same
    # date hold NaN in the exported frames.
    date = pd.read_parquet(parquet_dir / 'data_date.parquet.brotli')
    date = get_stocks_data(date, stock_comp)
    aligned = {
        col: pd.merge(date, frame, left_index=True, right_index=True, how='left')
        for col, frame in codes.items()
    }

    targets = {
        'Industry': 'data_ind.parquet.brotli',
        'Subindustry': 'data_ind_sub.parquet.brotli',
        'IndNAIC': 'data_ind_naic.parquet.brotli',
    }
    for col, filename in targets.items():
        aligned[col].to_parquet(parquet_dir / filename, compression='brotli')

In [None]:
create_crsp_compustat()

In [None]:
ind = pd.read_parquet(get_load_data_parquet_dir() / 'data_ind.parquet.brotli')

In [None]:
sub_ind = pd.read_parquet(get_load_data_parquet_dir() / 'data_ind_sub.parquet.brotli')

In [None]:
crsp_compustat = pd.read_parquet(get_load_data_parquet_dir() / 'data_crsp_compustat.parquet.brotli')

In [None]:
stock_comp = read_stock(get_load_data_large_dir() / 'permno_to_train_comp.csv')

In [None]:
ind.shape

In [None]:
get_stocks_data(ohclv, stock_comp).shape

In [None]:
len(stock_comp)

# Industry

In [None]:
def create_ind():
    """Assign a Fama-French 49 industry code to every (permno, date) row and export it.

    SIC codes are read from the cleaned CRSP table (``siccd``) and from the
    cleaned CRSP/Compustat table (``sic``). Each source is labelled with the
    ranges in ``fama_ind``; the Compustat label is preferred and the CRSP label
    is the fallback. The labels are then integer-encoded with ``factorize``
    (rows with no label get -1) and written to ``data_ind_fama.parquet.brotli``.
    Returns nothing.
    """
    # Assign Fama industries based off given ranges
    def assign_ind(df, column_name, sic_ranges, label):
        """Write ``label`` into ``{column_name}_crsp`` / ``{column_name}_comp``.

        ``sic_ranges`` mixes inclusive ``(low, high)`` tuples and single codes.
        ``df`` is modified in place and also returned.
        """
        for source in ('crsp', 'comp'):
            sic = df[f'sic_{source}']
            for r in sic_ranges:
                if isinstance(r, tuple):
                    mask = (sic >= r[0]) & (sic <= r[1])
                else:
                    mask = sic == r
                df.loc[mask, f'{column_name}_{source}'] = label
        return df

    # Read in CRSP sic
    crsp_sic = pd.read_parquet(get_load_data_parquet_dir() / 'data_crsp.parquet.brotli', columns=['siccd'])
    crsp_sic = crsp_sic.rename(columns={'siccd':'sic_crsp'})
    print('Finished crsp')

    # Read in Compustat sic
    comp_sic = pd.read_parquet(get_load_data_parquet_dir() / 'data_crsp_compustat.parquet.brotli', columns=['sic'])
    comp_sic = comp_sic.rename(columns={'sic':'sic_comp'})
    print('Finished compustat')

    # Merge CRSP and Compustat (left join keeps every CRSP row)
    combined = pd.merge(crsp_sic, comp_sic, left_index=True, right_index=True, how='left')
    combined = combined[~combined.index.duplicated(keep='first')]
    stock_comp = read_stock(get_load_data_large_dir() / 'permno_to_train_comp.csv')
    # Same helper as every other universe filter in this notebook
    combined = get_stocks_data(combined, stock_comp)

    # FF49 Industry ranges
    fama_ind = {
    'agric': [(100, 199), (200, 299), (700, 799), (910, 919), 2048],
    'food': [(2000, 2009), (2010, 2019), (2020, 2029), (2030, 2039), (2040, 2046), (2050, 2059), (2060, 2063), (2070, 2079), (2090, 2092), 2095, (2098, 2099)],
    'soda': [(2064, 2068), 2086, 2087, 2096, 2097],
    'beer': [2080, 2082, 2083, 2084, 2085],
    'smoke': [(2100, 2199)],
    'toys': [(920, 999), (3650, 3651), 3652, 3732, (3930, 3931), (3940, 3949)],
    'fun': [(7800, 7829), (7830, 7833), (7840, 7841), 7900, (7910, 7911), (7920, 7929), (7930, 7933), (7940, 7949), 7980, (7990, 7999)],
    'books': [(2700, 2709), (2710, 2719), (2720, 2729), (2730, 2739), (2740, 2749), (2770, 2771), (2780, 2789), (2790, 2799)],
    'hshld': [2047, (2391, 2392), (2510, 2519), (2590, 2599), (2840, 2843), 2844, (3160, 3161), (3170, 3171), 3172, (3190, 3199), 3229, 3260, (3262, 3263), 3269, (3230, 3231), (3630, 3639), (3750, 3751), 3800, (3860, 3861), (3870, 3873), (3910, 3911), 3914, 3915, (3960, 3962), 3991, 3995],
    'clths': [(2300, 2390), (3020, 3021), (3100, 3111), (3130, 3131), (3140, 3149), (3150, 3151), (3963, 3965)],
    'hlth': [(8000, 8099)],
    'medeq': [3693, (3840, 3849), (3850, 3851)],
    'drugs': [2830, 2831, 2833, 2834, 2835, 2836],
    'chems': [(2800, 2809), (2810, 2819), (2820, 2829), (2850, 2859), (2860, 2869), (2870, 2879), (2890, 2899)],
    'rubbr': [3031, 3041, (3050, 3053), (3060, 3069), (3070, 3079), (3080, 3089), (3090, 3099)],
    'txtls': [(2200, 2269), (2270, 2279), (2280, 2284), (2290, 2295), 2297, 2298, 2299, (2393, 2395), (2397, 2399)],
    'bldmt': [(800, 899), (2400, 2439), (2450, 2459), (2490, 2499), (2660, 2661), (2950, 2952), 3200, (3210, 3211), (3240, 3241), (3250, 3259), 3261, 3264, (3270, 3275), (3280, 3281), (3290, 3293), (3295, 3299), (3420, 3429), (3430, 3433), (3440, 3441), 3442, 3446, 3448, 3449, (3450, 3451), 3452, (3490, 3499), 3996],
    'cnstr': [(1500, 1511), (1520, 1529), (1530, 1539), (1540, 1549), (1600, 1699), (1700, 1799)],
    'steel': [3300, (3310, 3317), (3320, 3325), (3330, 3339), (3340, 3341), (3350, 3357), (3360, 3369), (3370, 3379), (3390, 3399)],
    'fabpr': [3400, 3443, 3444, (3460, 3469), (3470, 3479)],
    'mach': [(3510, 3519), (3520, 3529), 3530, 3531, 3532, 3533, 3534, 3535, 3536, 3538, (3540, 3549), (3550, 3559), (3560, 3569), 3580, 3581, 3582, 3585, 3586, 3589, (3590, 3599)],
    'elceq': [3600, (3610, 3613), (3620, 3621), (3623, 3629), (3640, 3644), 3645, 3646, (3648, 3649), 3660, 3690, (3691, 3692), 3699],
    'autos': [2296, 2396, (3010, 3011), 3537, 3647, 3694, 3700, 3710, 3711, 3713, 3714, 3715, 3716, 3792, (3790, 3791), 3799],
    'aero': [3720, 3721, (3723, 3724), 3725, (3728, 3729)],
    'ships': [(3730, 3731), (3740, 3743)],
    'guns': [(3760, 3769), 3795, (3480, 3489)],
    'gold': [(1040, 1049)],
    'mines': [(1000, 1009), (1010, 1019), (1020, 1029), (1030, 1039), (1050, 1059), (1060, 1069), (1070, 1079), (1080, 1089), (1090, 1099), (1100, 1119), (1400, 1499)],
    'coal': [(1200, 1299)],
    'oil': [1300, (1310, 1319), (1320, 1329), (1330, 1339), (1370, 1379), 1380, 1381, 1382, 1389, (2900, 2912), (2990, 2999)],
    'util': [4900, (4910, 4911), (4920, 4922), 4923, (4924, 4925), (4930, 4931), 4932, 4939, (4940, 4942)],
    'telcm': [4800, (4810, 4813), (4820, 4822), (4830, 4839), (4840, 4841), 4880, 4890, 4891, 4892, 4899],
    'persv': [(7020, 7021), (7030, 7033), 7200, (7210, 7212), 7214, (7215, 7216), 7217, 7219, (7220, 7221), (7230, 7231), (7240, 7241), (7250, 7251), (7260, 7269), (7270, 7290), 7291, (7292, 7299), 7395, 7500, (7520, 7529), (7530, 7539), (7540, 7549), 7600, 7620, 7622, 7623, 7629, 7630, 7640, (7690, 7699), (8100, 8199), (8200, 8299), (8300, 8399), (8400, 8499), (8600, 8699), (8800, 8899), (7510, 7515)],
    'bussv': [(2750, 2759), 3993, 7218, 7300, (7310, 7319), (7320, 7329), (7330, 7339), (7340, 7342), 7349, (7350, 7351), 7352, 7353, 7359, (7360, 7369), 7374, 7376, 7377, 7378, 7379, 7380, (7381, 7382), 7383, 7384, 7385, 7389, 7390, 7391, (7392, 7392), 7393, 7394, 7396, 7397, 7399, (7519, 7519), 8700, (8710, 8713), (8720, 8721), (8730, 8734), (8740, 8748), (8900, 8910), 8911, (8920, 8999), (4220, 4229)],
    'hardw': [(3570, 3579), 3680, 3681, 3682, 3683, 3684, 3685, 3686, 3687, 3688, 3689, 3695],
    'softw': [(7370, 7372), 7375, 7373],
    'chips': [3622, 3661, (3662, 3662), 3663, 3664, 3665, 3666, 3669, (3670, 3679), (3810, 3810), (3812, 3812)],
    'labeq': [3811, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3829, (3830, 3839)],
    'paper': [(2520, 2549), (2600, 2639), (2670, 2699), (2760, 2761), (3950, 3955)],
    'boxes': [(2440, 2449), (2640, 2659), (3220, 3221), (3410, 3412)],
    'trans': [(4000, 4013), (4040, 4049), 4100, (4110, 4119), (4120, 4121), (4130, 4131), (4140, 4142), (4150, 4151), (4170, 4173), (4190, 4199), 4200, (4210, 4219), (4230, 4231), (4240, 4249), (4400, 4499), (4500, 4599), (4600, 4699), 4700, (4710, 4712), (4720, 4729), (4730, 4739), (4740, 4749), 4780, 4782, 4783, 4784, 4785, 4789],
    'whlsl': [5000, (5010, 5015), (5020, 5023), (5030, 5039), (5040, 5042), 5043, 5044, 5045, 5046, 5047, 5048, 5049, (5050, 5059), 5060, 5063, 5064, 5065, (5070, 5078), 5080, 5081, 5082, 5083, 5084, 5085, (5086, 5087), 5088, 5090, (5091, 5092), 5093, 5094, 5099, 5100, (5110, 5113), (5120, 5122), (5130, 5139), (5140, 5149), (5150, 5159), (5160, 5169), (5170, 5172), (5180, 5182), (5190, 5199)],
    'rtail': [5200, (5210, 5219), (5220, 5229), (5230, 5231), (5250, 5251), (5260, 5261), (5270, 5271), 5300, 5310, 5320, (5330, 5331), 5334, (5340, 5349), (5390, 5399), 5400, (5410, 5411), 5412, (5420, 5429), (5430, 5439), (5440, 5449), (5450, 5459), (5460, 5469), (5490, 5499), 5500, (5510, 5529), (5530, 5539), (5540, 5549), (5550, 5559), (5560, 5569), (5570, 5579), (5590, 5599), (5600, 5699), 5700, (5710, 5719), (5720, 5722), (5730, 5733), 5734, 5735, 5736, (5750, 5799), 5900, (5910, 5912), (5920, 5929), (5930, 5932), 5940, 5941, 5942, 5943, 5944, 5945, 5946, 5947, 5948, 5949, (5950, 5959), (5960, 5969), (5970, 5979), (5980, 5989), 5990, 5992, 5993, 5994, 5995, 5999],
    'meals': [(5800, 5819), (5820, 5829), (5890, 5899), 7000, (7010, 7019), (7040, 7049), 7213],
    'banks': [6000, (6010, 6019), 6020, 6021, 6022, 6023, 6025, 6026, 6027, (6028, 6029), (6030, 6036), (6040, 6059), (6060, 6062), (6080, 6082), (6090, 6099), 6100, (6110, 6111), (6112, 6113), (6120, 6129), (6130, 6139), (6140, 6149), (6150, 6159), (6160, 6169), (6170, 6179), (6190, 6199)],
    'insur': [6300, (6310, 6319), (6320, 6329), (6330, 6331), (6350, 6351), (6360, 6361), (6370, 6379), (6390, 6399), (6400, 6411)],
    'rlest': [6500, 6510, 6512, 6513, 6514, 6515, (6517, 6519), (6520, 6529), (6530, 6531), 6532, (6540, 6541), (6550, 6553), (6590, 6599), (6610, 6611)],
    'fin': [(6200, 6299), 6700, (6710, 6719), (6720, 6722), 6723, 6724, 6725, 6726, (6730, 6733), (6740, 6779), 6790, 6791, 6792, 6793, 6794, 6795, 6798, 6799],
    'other': [(4950, 4959), (4960, 4961), (4970, 4971), (4990, 4991)]
    }

    # Iterate through each key
    for name, ranges in fama_ind.items():
        print('-'*60)
        print(name)
        combined = assign_ind(combined, 'IndustryFama', ranges, name)

    # Assign industry based off Compustat. If Compustat is NAN, then use CRSP
    combined['IndustryFama'] = combined['IndustryFama_comp'].combine_first(combined['IndustryFama_crsp'])
    # Integer-encode the labels in order of first appearance; the label list
    # returned by factorize is not needed here
    codes, _ = combined['IndustryFama'].factorize()
    combined['IndustryFama'] = codes
    print(combined)
    combined = combined[['IndustryFama']]

    # Export FF49 Industries
    combined.to_parquet(get_load_data_parquet_dir() / 'data_ind_fama.parquet.brotli')

In [None]:
create_ind()

In [None]:
ind_fama = pd.read_parquet(get_load_data_parquet_dir() / 'data_ind_fama.parquet.brotli')

In [None]:
ind_fama.shape

# Fama

In [None]:
def create_fama():
    """Download the 'F-F_Research_Data_5_Factors_2x3_daily' dataset and save it as parquet.

    Takes the first table of the reader's result, renames ``Mkt-RF`` to
    ``MARKET``, names the index ``date``, casts everything to float and writes
    ``data_fama.parquet.brotli``. Returns nothing.
    """
    # The reader returns several tables; entry 0 is the one used here
    tables = web.DataReader('F-F_Research_Data_5_Factors_2x3_daily', 'famafrench', start=2005)
    factors = tables[0].rename(columns={'Mkt-RF': 'MARKET'})

    factors.index.names = ['date']
    factors = factors.astype(float)

    destination = get_load_data_parquet_dir() / 'data_fama.parquet.brotli'
    factors.to_parquet(destination, compression='brotli')

In [None]:
create_fama()

In [None]:
fama = pd.read_parquet(get_load_data_parquet_dir() / 'data_fama.parquet.brotli')

In [None]:
fama.shape

# Macro

In [None]:
def create_macro():
    """Build the table of macro percentage changes and export it to parquet.

    Ten two-column CSV files under ``<large dir>/macro`` are loaded, their
    observation dates pushed forward by a per-series number of months, and a
    ``<name>_pct`` column is computed for each. Everything is left-merged onto
    the index of the first series (``5YIF``), forward-filled through
    ``ffill_max_days(..., 31)``, cut to 2005-01-01..2023-01-01 and written to
    ``data_macro.parquet.brotli``. Returns nothing.
    """
    macro_dir = get_load_data_large_dir() / 'macro'

    def load_series(filename, name, shift_months=0):
        """Read one CSV into a single-column frame indexed by its (shifted) date."""
        data = pd.read_csv(macro_dir / filename)
        data.columns = ['date', name]
        data['date'] = pd.to_datetime(data['date'])
        if shift_months:
            data['date'] = data['date'] + pd.DateOffset(months=shift_months)
        data = data.set_index('date')
        # Keep the first row of any repeated date
        return data[~data.index.duplicated(keep='first')]

    # Calculate percent change
    def pctChange(data, name):
        """Return a float copy of ``data`` with an added ``<name>_pct`` column."""
        # '.' entries in the CSV are treated as missing values; the caller's
        # frame is not modified
        data = data.replace('.', np.nan)
        data = data.astype(float)
        data[f'{name}_pct'] = data[name].pct_change()
        return data

    # (file, column name, months the observation date is pushed forward).
    # The first entry supplies the index every other series is merged onto.
    # rGDP is shifted by 3 months and is not expanded to monthly rows.
    specs = [
        ('fiveYearIR.csv', '5YIF', 0),
        ('medianCPI.csv', 'medCPI', 1),
        ('realGDP.csv', 'rGDP', 3),
        ('realInterestRate.csv', 'rIR', 1),
        ('unemploymentRate.csv', 'UR', 1),
        ('TB.csv', 'TB', 1),
        ('PPI.csv', 'PPI', 1),
        ('retailSales.csv', 'retailSales', 1),
        ('indProdIndex.csv', 'indProdIndex', 1),
        ('realDispoIncome.csv', 'realDispoIncome', 1),
    ]
    frames = [pctChange(load_series(filename, name, shift), name) for filename, name, shift in specs]

    # Merge all macro data together.
    # NOTE(review): these are left merges onto the 5YIF index, so an observation
    # whose shifted date is absent from that index is dropped before the
    # forward fill -- confirm this is intended.
    macro = frames[0]
    for frame in frames[1:]:
        macro = macro.merge(frame, left_index=True, right_index=True, how='left')
    factor_macro = macro[[f'{name}_pct' for _, name, _ in specs]]
    factor_macro = factor_macro.replace([np.inf, -np.inf], np.nan)

    # Forward Fill by max 31 days
    factor_macro = ffill_max_days(factor_macro, 31)

    # Set timeframe
    factor_macro = factor_macro.loc['2005-01-01': '2023-01-01']

    # Export data
    factor_macro.to_parquet(get_load_data_parquet_dir() / 'data_macro.parquet.brotli', compression='brotli')

In [None]:
create_macro()

In [None]:
macro = pd.read_parquet(get_load_data_parquet_dir() / 'data_macro.parquet.brotli')

In [None]:
macro.shape

# SPY Return

In [None]:
def create_spy_return():
    """Fetch the ``get_spy`` series for 2005-01-01..2023-01-01 and save it as parquet.

    The index is named ``date`` before the frame is written to
    ``data_spy.parquet.brotli``. Returns nothing.
    """
    period_start, period_end = '2005-01-01', '2023-01-01'
    spy = get_spy(period_start, period_end)
    spy.index.name = 'date'

    destination = get_load_data_parquet_dir() / 'data_spy.parquet.brotli'
    spy.to_parquet(destination, compression = 'brotli')

In [None]:
create_spy_return()

In [None]:
spy_return = pd.read_parquet(get_load_data_parquet_dir() / 'data_spy.parquet.brotli')

In [None]:
spy_return.shape

# Fund Raw

In [4]:
def create_fund_raw():
    """Turn the quarterly fundamentals CSV into a monthly (permno, date) table.

    Steps, in order:
      * keep one row per (gvkey, fiscal year, fiscal quarter);
      * compute the month the data is treated as available: ``datadate`` plus
        3 months, or the ``rdq`` month when that is later;
      * drop rows whose ``rdq`` is more than 6 thirty-day months after
        ``datadate``;
      * derive ``<col>q`` columns from four yearly columns;
      * repeat every row for 3 consecutive months;
      * restrict to the Compustat training permnos, keep numeric columns as
        float, export the permno list (``permno_to_train_fund.csv``) and the
        table (``data_fund_raw.parquet.brotli``).

    Returns nothing.
    """
    # Read in dataset
    fund_raw = pd.read_csv(get_load_data_large_dir() / 'crsp_compustat_fund.csv')

    # Rename columns
    fund_raw.columns = fund_raw.columns.str.lower()

    # Keep only the most recent data for each fiscal quarter
    # NOTE(review): GroupBy.last() takes the last non-null value of each column,
    # so a kept row can combine values from several source rows -- confirm.
    fund_raw = fund_raw.sort_values(by=['gvkey', 'fyearq', 'fqtr', 'datadate'])
    fund_raw = fund_raw.groupby(['gvkey', 'fyearq', 'fqtr']).last().reset_index()

    # Convert to datetime
    fund_raw['datadate'] = pd.to_datetime(fund_raw['datadate'])
    fund_raw['rdq'] = pd.to_datetime(fund_raw['rdq'])

    # Shift data 3 months forward; use the rdq month instead when it is later
    fund_raw['time_avail_m'] = (fund_raw['datadate'] + pd.DateOffset(months=3)).dt.to_period('M')
    fund_raw.loc[(~fund_raw['rdq'].isnull()) & (fund_raw['rdq'].dt.to_period('M') > fund_raw['time_avail_m']), 'time_avail_m'] = fund_raw['rdq'].dt.to_period('M')

    # Compute month difference (in 30-day units) and drop rows above 6
    month_diff = (fund_raw['rdq'] - fund_raw['datadate']).dt.days // 30
    fund_raw = fund_raw.drop(fund_raw[(month_diff > 6) & ~fund_raw['rdq'].isnull()].index)
    fund_raw = fund_raw.sort_values(by=['gvkey', 'time_avail_m', 'datadate'])

    # Keep most recent data
    fund_raw = fund_raw.groupby(['gvkey', 'time_avail_m']).last().reset_index()

    # Derive quarterly columns (suffix 'q') from the yearly ones: fiscal
    # quarter 1 keeps its value, other quarters subtract the previous row of
    # the same (gvkey, fyearq) group (presumably the inputs are year-to-date
    # totals -- confirm)
    for col in ['sstky', 'prstkcy', 'oancfy', 'fopty']:
        grouped = fund_raw.groupby(['gvkey', 'fyearq'])[col]
        condition = fund_raw['fqtr'] == 1
        new_values = np.where(condition, fund_raw[col], fund_raw[col] - grouped.shift(1))
        fund_raw[col + 'q'] = new_values

    # Convert index from quarterly to monthly: each row is repeated 3 times and
    # the copies are moved to availability month +0, +1 and +2
    fund_raw = fund_raw.loc[fund_raw.index.repeat(3)]
    fund_raw['tempTimeAvailM'] = fund_raw['time_avail_m']
    fund_raw = fund_raw.sort_values(by=['gvkey', 'tempTimeAvailM'])
    fund_raw['time_avail_m'] = fund_raw.groupby(['gvkey', 'tempTimeAvailM']).cumcount() + fund_raw['time_avail_m']

    # Sort values
    fund_raw = fund_raw.sort_values(by=['gvkey', 'time_avail_m', 'datadate'])
    # Keep most recent data
    fund_raw = fund_raw.groupby(['gvkey', 'time_avail_m']).last().reset_index()
    fund_raw = fund_raw.drop(columns=['tempTimeAvailM'])
    fund_raw = fund_raw.rename(columns={'datadate': 'datadateq', 'time_avail_m':'date', 'lpermno':'permno'})

    # Convert from YY-MM to YY-MM-DD (2012-01 to 2012-01-31)
    fund_raw.date = fund_raw.date.dt.to_timestamp("M")
    fund_raw = fund_raw.set_index(['permno', 'date'])

    # Read in the list of permnos to keep
    stock = read_stock(get_load_data_large_dir() / 'permno_to_train_comp.csv')

    # Convert data to numerical format (exclude columns that are not numerical format)
    fund_raw = get_stocks_data(fund_raw, stock)
    numeric_cols = fund_raw.select_dtypes(include=['number']).columns
    fund_raw[numeric_cols] = fund_raw[numeric_cols].astype(float)
    fund_raw_numeric = fund_raw[numeric_cols]

    # Export list of stocks (these will be used for training)
    export_stock(fund_raw, get_load_data_large_dir() / 'permno_to_train_fund.csv')
    fund_raw_numeric = fund_raw_numeric.sort_index(level=['permno', 'date'])

    # Forward fill yearly data (every numeric column whose name ends in 'y').
    # The fill is done per permno so the last values of one stock can never
    # spill into the leading gaps of the next one.
    cols_to_fill = [col for col in fund_raw_numeric.columns if col.endswith('y')]
    if cols_to_fill:
        fund_raw_numeric[cols_to_fill] = fund_raw_numeric.groupby(level='permno')[cols_to_fill].ffill()

    # Export data
    fund_raw_numeric.to_parquet(get_load_data_parquet_dir() / 'data_fund_raw.parquet.brotli', compression='brotli')

In [None]:
create_fund_raw()

In [None]:
fund_raw = pd.read_parquet(get_load_data_parquet_dir() / 'data_fund_raw.parquet.brotli')

In [None]:
stock_fund = read_stock(get_load_data_large_dir() / 'permno_to_train_fund.csv')

In [None]:
len(stock_fund)

# Fund Raw Annual

In [5]:
def create_fund_raw_a():
    """Turn the annual fundamentals CSV into a monthly (permno, date) table.

    Rows missing ``at``, ``prcc_c`` or ``ni`` are dropped, a few helper columns
    are derived, a list of columns has missing values set to 0, and each row is
    made available from ``datadate`` + 6 months for 12 consecutive months. The
    result is restricted to the permnos in ``permno_to_train_fund.csv`` and
    written to ``data_fund_raw_a.parquet.brotli``. Returns nothing.
    """
    # Read in annual
    annual = pd.read_csv(get_load_data_large_dir() / 'crsp_compustat_fund_a.csv')

    # Rename columns
    annual.columns = annual.columns.str.lower()
    annual = annual.rename(columns={'lpermno':'permno'})

    # Drop rows based on condition
    annual = annual.dropna(subset=['at', 'prcc_c', 'ni'])

    # Extract 6 digits from CUSIP
    annual['cnum'] = annual['cusip'].str[:6]

    # Replacing missing values
    # dr = drc + drlt; when only one of the two is present it is used alone,
    # and dr stays missing when both are missing
    annual['dr'] = annual['drc'].add(annual['drlt'], fill_value=0)
    annual.loc[(annual['dcpstk'] > annual['pstk']) & pd.notna(annual['dcpstk']) & pd.notna(annual['pstk']) & pd.isna(annual['dcvt']), 'dc'] = annual['dcpstk'] - annual['pstk']
    annual.loc[pd.isna(annual['pstk']) & pd.notna(annual['dcpstk']) & pd.isna(annual['dcvt']), 'dc'] = annual['dcpstk']
    annual.loc[pd.isna(annual['dc']), 'dc'] = annual['dcvt']
    annual['xint0'] = annual['xint'].fillna(0)
    annual['xsga0'] = annual['xsga'].fillna(0)
    # Negative xad becomes 0; missing xad stays missing
    annual['xad0'] = annual['xad'].mask(annual['xad'] < 0, 0)
    vars_list = ['nopi', 'dvt', 'ob', 'dm', 'dc', 'aco', 'ap', 'intan', 'ao', 'lco', 'lo', 'rect', 'invt', 'drc', 'spi', 'gdwl', 'che', 'dp', 'act', 'lct', 'tstkp', 'dvpa', 'scstkc', 'sstk', 'mib', 'ivao', 'prstkc', 'prstkcc', 'txditc', 'ivst']
    # Assign the filled columns back: an in-place fillna on a selected column
    # is not guaranteed to reach the parent frame
    annual[vars_list] = annual[vars_list].fillna(0)

    # Shift data forward by 6 months
    annual['date'] = pd.to_datetime(annual['datadate']).dt.to_period('M') + 6

    # Convert index from annually to monthly: each row is repeated 12 times and
    # the copies are moved to month +0 .. +11
    annual = annual.reindex(annual.index.repeat(12))
    annual['tempTime'] = annual.groupby(['gvkey', 'date']).cumcount()
    annual['date'] += annual['tempTime']
    annual = annual.drop(columns=['tempTime'])

    # Convert from YY-MM to YY-MM-DD (2012-01 to 2012-01-31)
    annual.date = annual.date.dt.to_timestamp("M")
    annual = annual.drop('datadate', axis=1)

    # Set index and remove duplicate indices
    annual = annual.set_index(['permno', 'date'])
    annual = annual.sort_index(level=['permno', 'date'])
    annual = annual[~annual.index.duplicated(keep='first')]

    # Read in list of stock and apply get_stocks_data
    stock = read_stock(get_load_data_large_dir() / 'permno_to_train_fund.csv')
    annual = get_stocks_data(annual, stock)

    # Export data
    annual.to_parquet(get_load_data_parquet_dir() / 'data_fund_raw_a.parquet.brotli', compression='brotli')

In [None]:
create_fund_raw_a()

In [None]:
annual = pd.read_parquet(get_load_data_parquet_dir() / 'data_fund_raw_a.parquet.brotli')

In [None]:
len(get_stock_idx(annual))

# Pension

In [3]:
def create_pension():
    """Lag the pension data by one observation per company and export it.

    Reads ``pension_compustat.csv``, keeps one row per (gvkey, datadate),
    indexes the table by date and moves every value column to the company's
    next row. The identifier columns (``gvkey``, ``indfmt``, ``datafmt``,
    ``consol``, ``popsrc``, ``ticker``) are left in place. The result is
    written to ``data_pension.parquet.brotli``. Returns nothing.
    """
    # Read in pension
    pension = pd.read_csv(get_load_data_large_dir() / 'pension_compustat.csv')

    # Rename columns
    pension.columns = pension.columns.str.lower()

    # Parse the dates first so the ordering below is chronological whatever
    # text format the CSV uses
    pension['datadate'] = pd.to_datetime(pension['datadate'])

    # Drop duplicate (gvkey, datadate) rows; the result is ordered by gvkey, datadate
    pension = pension.sort_values(by=['gvkey', 'datadate'])
    pension = pension.groupby(['gvkey', 'datadate']).last().reset_index()

    # Set index
    pension = pension.rename(columns = {'datadate': 'date', 'tic': 'ticker'})
    pension = pension.set_index('date')

    # Shift every value column one row forward within each gvkey (one year if
    # the file holds one row per company-year -- confirm). Identifier columns
    # must not move: shifting gvkey itself would corrupt the grouping key.
    id_cols = {'gvkey', 'indfmt', 'datafmt', 'consol', 'popsrc', 'ticker'}
    value_cols = [col for col in pension.columns if col not in id_cols]
    if value_cols:
        pension[value_cols] = pension.groupby('gvkey')[value_cols].shift(1)

    # Export data
    pension.to_parquet(get_load_data_parquet_dir() / 'data_pension.parquet.brotli', compression='brotli')

In [None]:
create_pension()

In [None]:
pension = pd.read_parquet(get_load_data_parquet_dir() / 'data_pension.parquet.brotli')

# Open Asset Pricing

In [None]:
def create_open_asset_pricing():
    """Index the open-asset dataset by (permno, date) and keep the training permnos.

    ``yyyymm`` is parsed with the ``%Y%m`` format into a ``date`` column, the
    table is indexed and sorted by (permno, date), filtered to the permnos in
    ``permno_to_train_fund.csv`` and written to
    ``data_open_asset.parquet.brotli``. Returns nothing.
    """
    large_dir = get_load_data_large_dir()
    signals = pd.read_parquet(large_dir / 'open_asset.parquet.brotli')

    # Replace the yyyymm column with a parsed date, then index and sort
    signals['date'] = pd.to_datetime(signals['yyyymm'], format='%Y%m')
    signals = (
        signals
        .drop(columns=['yyyymm'])
        .set_index(['permno', 'date'])
        .sort_index(level=['permno', 'date'])
    )

    # Keep only the permnos that are also in the fundamentals universe
    stocks = read_stock(large_dir / 'permno_to_train_fund.csv')
    signals = get_stocks_data(signals, stocks)

    destination = get_load_data_parquet_dir() / 'data_open_asset.parquet.brotli'
    signals.to_parquet(destination, compression='brotli')

In [None]:
create_open_asset_pricing()

In [None]:
open_asset = pd.read_parquet(get_load_data_parquet_dir() / 'data_open_asset.parquet.brotli')

In [None]:
oap_data = pd.read_parquet(get_load_data_large_dir() / 'open_asset.parquet.brotli')