In [59]:
import os

os.chdir("../../factorlib-polars")

FileNotFoundError: [Errno 2] No such file or directory: '../../factorlib-polars'

In [60]:
import polars as pl
from factorlib.utils.system import get_data_dir

In [61]:
"""
ORIGINAL STATE: The ratios-history-full.csv is a csv full of common fundamental ratios of individual stocks from Wharton Research Data Services (WRDS). These
fundamnetals correspond to stocks represented by words as GVKEYs instead of tickers. GVKEYs are numeric identification numbers for stocks that can be used to
identify a stock rather than using the stocks ticker. Over long periods of time, these GVKEY's are more accurate because they do not change, where as a
company's ticker can change anytime. However, our factor model uses tickers to identify stocks, and therefore needs a `ticker` column in all non general factors.
The goal of this notebook is to add a tickers column to the data, where each ticker corresponds to its GVKEY in that row.

METHOD: Thankfully, we also have a tickers.csv that contains a unique list of 505 SP500 common tickers and their corresponding GVKEY. You can look at the data,
but it has two columns, a ticker column, and a GVKEY column. We can use this dataframe as a map to match the tickers in tickers.csv to the GVKEYs in
ratios-history-full.csv to create our factor. We use the polars method .join(data: pl.DataFrame, on: str, how: str) to do this. See polars documentation to
understand how .join() works, and understand how different parameters of the function will affect how the two dataframes are merged.
"""

"\nORIGINAL STATE: The ratios-history-full.csv is a csv full of common fundamental ratios of individual stocks from Wharton Research Data Services (WRDS). These\nfundamnetals correspond to stocks represented by words as GVKEYs instead of tickers. GVKEYs are numeric identification numbers for stocks that can be used to\nidentify a stock rather than using the stocks ticker. Over long periods of time, these GVKEY's are more accurate because they do not change, where as a\ncompany's ticker can change anytime. However, our factor model uses tickers to identify stocks, and therefore needs a `ticker` column in all non general factors.\nThe goal of this notebook is to add a tickers column to the data, where each ticker corresponds to its GVKEY in that row.\n\nMETHOD: Thankfully, we also have a tickers.csv that contains a unique list of 505 SP500 common tickers and their corresponding GVKEY. You can look at the data,\nbut it has two columns, a ticker column, and a GVKEY column. We can use this 

In [64]:
# load data
raw_data = get_data_dir() / 'raw'
ratios = pl.scan_csv(raw_data / 'ratios_history_full.csv', try_parse_dates=True).collect(streaming=True)
tickers = pl.scan_csv(raw_data / 'tickers.csv', try_parse_dates=True).collect(streaming=True)

In [65]:
# join data
gvkeys_list = tickers.select(pl.col('gvkey')).to_series().to_list()
tickers_list = tickers.select('co_tic').to_series().to_list()
map_dict = dict(zip(gvkeys_list, tickers_list,))
ratios_with_tickers = ratios.select(pl.all().exclude('gvkey'),
                                    pl.col('gvkey').map_dict(map_dict).alias('ticker'))
ratios_with_tickers = ratios_with_tickers.drop_nulls(subset='ticker')
ratios_with_tickers.sort('public_date')

adate,qdate,public_date,bm,pe_op_dil,pcf,gpm,roa,roe,aftret_invcapx,capital_ratio,quick_ratio,curr_ratio,ptb,PEG_1yrforward,ticker
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
"""12/31/1999""","""09/30/2000""","""01/31/2001""",1.879,6.032,2.079,0.143,0.112,0.111,0.1,0.446,0.652,0.76,0.636,0.726,"""AAL"""
"""12/31/1999""","""09/30/2000""","""01/31/2001""",0.808,12.025,5.644,0.307,0.155,0.089,0.086,0.488,0.862,0.992,1.05,8.586,"""PNW"""
"""12/31/1999""","""09/30/2000""","""01/31/2001""",0.113,25.782,25.401,0.625,0.288,0.357,0.318,0.137,1.171,1.545,8.34,4.616,"""ABT"""
"""12/31/1999""","""09/30/2000""","""01/31/2001""",0.405,11.659,5.957,0.576,0.293,0.405,0.258,0.37,1.755,2.003,2.5,-0.439,"""AMD"""
"""09/30/2000""","""09/30/2000""","""01/31/2001""",0.467,15.354,7.377,0.415,0.173,0.034,0.061,0.481,0.962,1.313,2.404,6.792,"""APD"""
"""12/31/1999""","""09/30/2000""","""01/31/2001""",1.589,4.647,3.32,0.11,0.108,0.031,0.048,0.27,0.78,0.86,0.848,-0.07,"""ALK"""
"""12/31/1999""","""09/30/2000""","""01/31/2001""",0.364,17.058,14.234,0.312,0.21,0.158,0.159,0.271,0.805,1.254,3.68,7.468,"""HON"""
"""03/31/2000""","""09/30/2000""","""01/31/2001""",0.189,44.776,30.254,0.498,0.197,0.125,0.126,0.001,7.066,7.559,4.706,0.768,"""SWKS"""
"""12/31/1999""","""09/30/2000""","""01/31/2001""",0.685,3.056,3.48,0.295,0.264,0.227,0.173,0.385,0.989,1.131,1.506,0.011,"""HES"""
"""12/31/1999""","""09/30/2000""","""01/31/2001""",1.114,15.67,12.956,0.112,0.124,0.076,0.104,0.553,0.619,0.699,0.992,-5.745,"""AEP"""


In [69]:
# remove columns that we don't want in this specific factor and rename the date column to 'date_index' to addhere to factorlib's requirements.
fundamental_factor_dir = get_data_dir() / 'fundamental'
final_factor = ratios_with_tickers.drop(['adate', 'qdate'])
final_factor = final_factor.rename({'public_date': 'date_index'})
final_factor = final_factor.select(
    pl.col('date_index').str.to_datetime("%m/%d/%Y"),
    pl.all().exclude('date_index')
)
final_factor = final_factor.drop_nulls(subset='date_index').sort(['ticker', 'date_index'])
final_factor.write_csv(fundamental_factor_dir / 'fundamentals1_monthly.csv')
final_factor

date_index,bm,pe_op_dil,pcf,gpm,roa,roe,aftret_invcapx,capital_ratio,quick_ratio,curr_ratio,ptb,PEG_1yrforward,ticker
datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2001-01-31 00:00:00,0.25,33.466,29.707,0.535,0.226,0.175,0.224,0.0,1.379,2.05,4.728,1.231,"""A"""
2001-02-28 00:00:00,0.25,22.086,19.605,0.535,0.226,0.175,0.224,0.0,1.379,2.05,3.12,0.812,"""A"""
2001-03-31 00:00:00,0.222,19.206,54.617,0.468,0.236,0.157,0.157,0.0,1.345,2.045,2.533,-2.98,"""A"""
2001-04-30 00:00:00,0.222,24.381,69.333,0.468,0.236,0.157,0.157,0.0,1.345,2.045,3.216,-1.067,"""A"""
2001-05-31 00:00:00,0.222,20.963,59.611,0.468,0.236,0.157,0.157,0.0,1.345,2.045,2.765,-0.941,"""A"""
2001-06-30 00:00:00,0.315,29.279,34.546,0.545,0.178,0.139,0.139,0.0,1.217,1.876,2.645,-0.302,"""A"""
2001-07-31 00:00:00,0.315,25.775,30.411,0.545,0.178,0.139,0.139,0.0,1.217,1.876,2.329,-0.257,"""A"""
2001-08-31 00:00:00,0.315,23.874,28.168,0.545,0.178,0.139,0.139,0.0,1.217,1.876,2.157,-0.184,"""A"""
2001-09-30 00:00:00,0.416,45.465,13.812,0.539,0.126,0.065,0.065,0.0,1.133,1.771,1.641,-0.247,"""A"""
2001-10-31 00:00:00,0.416,51.791,15.734,0.539,0.126,0.065,0.065,0.0,1.133,1.771,1.87,-0.281,"""A"""


In [43]:
groups = final_factor.groupby('ticker')
for group in groups:
    print(len(group[1].select(pl.col('date_index'))))

1260
5244
19
5928
7524
1498
8346
2268
21
5928
6840
5016
4788
5016
5244
37
3552
504
31
22
6840
4332
7296
6384
5244
7296
5016
3100
1024
11856
2492
6970
4104
8436
5472
5700
6156
5928
4560
5472
3648
5244
5966
6384
7980
7752
37
8664
1044
4560
5700
20
4332
3420
5070
52
3298
5700
6290
840
4620
9120
6612
6384
5472
14
8436
6612
3876
1232
5472
6384
5016
47
1980
7752
9348
8892
1960
9576
41
7006
9804
2280
5928
8436
4104
5928
5016
7068
7980
6510
5928
2736
6612
35
7068
4100
5700
3984
5244
6840
5640
5700
3876
5016
5472
7752
6612
6612
5700
2716
6612
4560
7524
285
2808
4788
2224
3648
5016
5244
7980
5928
5472
4154
7752
7296
7068
4788
6384
32
32
5472
8664
10944
672
6847
4788
5016
7524
7296
4560
7524
5700
3051
9804
4560
5928
7752
6156
6384
5016
3128
8436
4104
8600
41
40
6156
4560
5016
8190
6384
5016
5244
5472
4250
3648
6588
5016
6510
5244
4560
10488
3926
24
1458
780
7980
4100
6280
7524
6612
6480
4864
44
7980
4332
8208
24
6897
4560
5700
3540
5016
4788
4788
6708
6825
7296
5984
12
6384
756
804
7752
6612
6156