In [18]:
import polars as pl
from factorlib.utils.system import get_data_dir

In [19]:
"""
ORIGINAL STATE: The ratios-history-full.csv is a csv full of common fundamental ratios of individual stocks from Wharton Research Data Services (WRDS). These fundamnetals correspond to stocks represented by words as GVKEYs instead of tickers. GVKEYs are numeric identification numbers for stocks that can be used to identify a stock rather than using the stocks ticker. Over long periods of time, these GVKEY's are more accurate because they do not change, where as a company's ticker can change anytime. However, our factor model uses tickers to identify stocks, and therefore needs a `ticker` column in all non general factors. The goal of this notebook is to add a tickers column to the data, where each ticker corresponds to its GVKEY in that row.

METHOD: Thankfully, we also have a tickers.csv that contains a unique list of 505 SP500 common tickers and their corresponding GVKEY. You can look at the data, but it has two columns, a ticker column, and a GVKEY column. We can use this dataframe as a map to match the tickers in tickers.csv to the GVKEYs in ratios-history-full.csv to create our factor. We use the polars method .join(data: pl.DataFrame, on: str, how: str) to do this. See polars documentation to understand how .join() works, and understand how different parameters of the function will affect how the two dataframes are merged.
"""

"\nORIGINAL STATE: The ratios-history-full.csv is a csv full of common fundamental ratios of individual stocks from Wharton Research Data Services (WRDS). These fundamnetals correspond to stocks represented by words as GVKEYs instead of tickers. GVKEYs are numeric identification numbers for stocks that can be used to identify a stock rather than using the stocks ticker. Over long periods of time, these GVKEY's are more accurate because they do not change, where as a company's ticker can change anytime. However, our factor model uses tickers to identify stocks, and therefore needs a `ticker` column in all non general factors. The goal of this notebook is to add a tickers column to the data, where each ticker corresponds to its GVKEY in that row.\n\nMETHOD: Thankfully, we also have a tickers.csv that contains a unique list of 505 SP500 common tickers and their corresponding GVKEY. You can look at the data, but it has two columns, a ticker column, and a GVKEY column. We can use this dataf

In [21]:
# load data
raw_data = get_data_dir() / 'raw'
ratios = pl.scan_csv(raw_data / 'ratios_history_full.csv', try_parse_dates=True).collect(streaming=True)
tickers = pl.scan_csv(raw_data / 'tickers.csv', try_parse_dates=True).collect(streaming=True)

In [22]:
# join data
ratios_with_tickers = tickers.join(ratios, on='gvkey', how='left')
ratios_with_tickers.sort('public_date')

gvkey,co_tic,adate,qdate,public_date,bm,pe_op_dil,pcf,gpm,roa,roe,aftret_invcapx,capital_ratio,quick_ratio,curr_ratio,ptb,PEG_1yrforward
i64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
9465,"""SLB""",,,,,,,,,,,,,,,
13498,"""CCL""",,,,,,,,,,,,,,,
28733,"""EQR""",,,,,,,,,,,,,,,
28034,"""CB""",,,,,,,,,,,,,,,
29389,"""SPG""",,,,,,,,,,,,,,,
30490,"""AIV""",,,,,,,,,,,,,,,
29984,"""PLD""",,,,,,,,,,,,,,,
11220,"""VNO""",,,,,,,,,,,,,,,
10096,"""PSA""",,,,,,,,,,,,,,,
24731,"""KIM""",,,,,,,,,,,,,,,


In [23]:
# remove columns that we don't want in this specific factor and rename the date column to 'date_index' to addhere to factorlib's requirements.
fundamental_factor_dir = get_data_dir() / 'fundamental'
final_factor = ratios_with_tickers.drop(['adate', 'qdate', 'gvkey'])
final_factor.rename({'public_date': 'date_index'})
final_factor.write_csv(fundamental_factor_dir / 'fundamentals_1.csv')
final_factor

co_tic,public_date,bm,pe_op_dil,pcf,gpm,roa,roe,aftret_invcapx,capital_ratio,quick_ratio,curr_ratio,ptb,PEG_1yrforward
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ABT""","""01/31/2001""",0.113,25.782,25.401,0.625,0.288,0.357,0.318,0.137,1.171,1.545,8.34,4.616
"""ABT""","""02/28/2001""",0.114,28.483,24.444,0.606,0.275,0.348,0.331,0.112,1.311,1.716,8.84,4.454
"""ABT""","""03/31/2001""",0.114,27.436,23.546,0.606,0.275,0.348,0.331,0.112,1.311,1.716,8.515,4.29
"""ABT""","""04/30/2001""",0.114,26.965,23.142,0.606,0.275,0.348,0.331,0.112,1.311,1.716,8.369,4.638
"""ABT""","""05/31/2001""",0.111,33.321,30.691,0.606,0.279,0.227,0.212,0.121,0.932,1.244,9.94,7.71
"""ABT""","""06/30/2001""",0.111,30.769,28.341,0.606,0.279,0.227,0.212,0.121,0.932,1.244,9.179,7.12
"""ABT""","""07/31/2001""",0.111,34.353,31.642,0.606,0.279,0.227,0.212,0.121,0.932,1.244,10.248,7.949
"""ABT""","""08/31/2001""",0.115,31.258,26.163,0.6,0.256,0.207,0.202,0.184,0.841,1.135,9.009,8.042
"""ABT""","""09/30/2001""",0.115,32.61,27.295,0.6,0.256,0.207,0.202,0.184,0.841,1.135,9.399,8.39
"""ABT""","""10/31/2001""",0.115,33.321,27.89,0.6,0.256,0.207,0.202,0.184,0.841,1.135,9.604,8.573
