In [None]:
import pandas as pd
import polars as pl
from datetime import datetime

from factorlib.utils.system import get_data_dir

In [35]:
raw_data_dir = get_data_dir() / 'raw'

# Raw price column -> output column name. Each raw price is divided by `ajexdi`
# (presumably the cumulative adjustment factor -- TODO confirm against the
# data dictionary) and stored as Float64.
price_columns = {
    'prcod': 'open',
    'prchd': 'high',
    'prcld': 'low',
    'prccd': 'close',
}
adjusted_prices = [
    (pl.col(raw_name) / pl.col('ajexdi')).cast(pl.Float64).alias(clean_name)
    for raw_name, clean_name in price_columns.items()
]

# Lazily scan the daily file, keep only the renamed/adjusted columns, then
# materialise the result. Volume (`cshtrd`) is passed through unadjusted.
daily_scan = pl.scan_csv(raw_data_dir / 'ohclv_daily.csv', try_parse_dates=True)
ohclv = daily_scan.select(
    pl.col('tic').alias('ticker'),
    pl.col('datadate').cast(pl.Datetime).alias('date_index'),
    *adjusted_prices,
    pl.col('cshtrd').alias('vol'),
).collect(streaming=True)
ohclv

ticker,date_index,open,high,low,close,vol
str,datetime[μs],f64,f64,f64,f64,f64
"""AIR""",2011-12-09 00:00:00,16.82,17.44,16.82,17.27,317485.0
"""AIR""",2000-02-17 00:00:00,,28.5,22.875,26.875,574500.0
"""AIR""",2014-10-17 00:00:00,24.5,24.55,24.03,24.09,373713.0
"""AIR""",2018-04-09 00:00:00,43.64,43.68,42.62,42.69,156637.0
"""AIR""",2008-04-11 00:00:00,22.2,22.53,20.65,20.82,1.409566e6
"""AIR""",1995-01-23 00:00:00,,9.25,9.083333,9.25,26700.0
"""AIR""",2012-05-03 00:00:00,15.13,15.32,15.1,15.11,266408.0
"""AIR""",1990-06-19 00:00:00,,13.0,12.666667,13.0,42700.0
"""AIR""",2016-05-31 00:00:00,24.52,24.66,24.31,24.41,106061.0
"""AIR""",1996-11-27 00:00:00,,19.916667,19.75,19.833333,133100.0


In [36]:
# Ticker symbols that define the small universe: the filter cell below keeps only
# OHLCV rows whose `ticker` is in this list.
# NOTE(review): the selection criteria for these 100 symbols are not recorded here -- confirm provenance.
tickers = ["OPRA", "SMCI", "LMB", "MLTX", "YPF", "CABA", "WEAV", "ELF", "EDN", "ACLS", "INTT", "ETNB", "CIR", "RCL", "NVDA", "DAKT", "TCMD", "DMAC", "IMVT", "MMMB", "ENIC", "WFRD", "IPDN", "STRL", "RMBS", "MOD", "NGL", "TDW", "TAYD", "VIST", "EXTR", "SYM", "CCL", "CMT", "CBAY", "TGLS", "BELFB", "VECT", "AEHR", "CUK", "UFPT", "AUGX", "ISEE", "TAST", "COCO", "VRT", "BWMN", "ONCY", "BLDR", "ODC", "ATEC", "NVTS", "RMTI", "AVDL", "IRS", "DFH", "CVRX", "PEN", "TGS", "GRBK", "PLPC", "SKYW", "USAP", "ACVA", "RETA", "BTBT", "TROO", "POWL", "PPSI", "FTI", "DO", "SGML", "GGAL", "PCYG", "NETI", "TRHC", "ARDX", "STVN", "NFLX", "INTA", "MORF", "RXST", "HGBL", "GE", "BZH", "BBAR", "PESI", "RIG", "NU", "TK", "JBL", "ERO", "SMHI", "IRON", "EVLV", "GENI", "ELTK", "ENVX", "META", "NCLH"]

In [40]:
# Keep only rows for the small ticker universe dated strictly after
# 2005-01-01 00:00. `ohclv` is already an in-memory DataFrame, so a single
# eager filter with the combined predicate replaces the two separate
# lazy()/collect() round trips; the selected rows and their order are the same.
in_universe = pl.col('ticker').is_in(tickers)
after_start = pl.col('date_index') > datetime(2005, 1, 1)
ohclv_filtered_pl = ohclv.filter(in_universe & after_start)

# The cells below work in pandas, so `ohclv_filtered` is bound once, to the
# pandas frame, rather than being rebound from a polars to a pandas object.
ohclv_filtered = ohclv_filtered_pl.to_pandas()

# Persist the filtered universe with `date_index` as the CSV's first (index) column.
ohclv_filtered.set_index('date_index').to_csv(raw_data_dir / 'small_universe_ohclv.csv')

In [54]:
# Per-ticker close-to-close simple returns. Rows are sorted by (ticker, date)
# only for the computation; the resulting Series keeps the original row labels,
# so it aligns back onto the unsorted frame by index.
close_returns = (
    ohclv_filtered
    .sort_values(['ticker', 'date_index'])
    .groupby('ticker')['close']
    .pct_change()
)

# `assign` returns a NEW frame. The previous `with_returns = ohclv_filtered`
# was only an alias, so adding `ret` also mutated `ohclv_filtered` and
# re-running the export cell would have written a CSV with an extra column.
with_returns = ohclv_filtered.assign(ret=close_returns)
with_returns

Unnamed: 0,ticker,date_index,open,high,low,close,vol,ret
1970-01-01 00:00:00.000000000,BELFB,2023-06-06,51.62,55.390,51.6200,55.39,195255.0,0.069099
1970-01-01 00:00:00.000000001,BELFB,2023-06-02,51.00,52.690,50.5300,52.60,176159.0,0.044272
1970-01-01 00:00:00.000000002,BELFB,2023-06-01,49.37,50.530,49.0000,50.37,123219.0,0.023989
1970-01-01 00:00:00.000000003,BELFB,2023-05-31,49.35,49.490,47.9752,49.19,119064.0,-0.003646
1970-01-01 00:00:00.000000004,BELFB,2023-05-26,48.17,49.780,47.9750,49.35,190291.0,0.028982
...,...,...,...,...,...,...,...,...
1970-01-01 00:00:00.000269213,VIST,2020-01-24,8.01,8.010,7.5200,7.80,71710.0,-0.035847
1970-01-01 00:00:00.000269214,VIST,2019-10-14,5.69,5.710,5.4000,5.45,74308.0,-0.053819
1970-01-01 00:00:00.000269215,VIST,2020-01-09,7.88,7.910,7.6200,7.80,89185.0,-0.016393
1970-01-01 00:00:00.000269216,VIST,2020-07-02,3.05,3.230,3.0500,3.13,22830.0,0.009677


In [55]:
# Pivot the long returns table to wide form (rows: date_index, columns: ticker),
# make sure the row index is a DatetimeIndex, then resample onto a business-day
# ('B') frequency taking the first non-null value in each bin.
returns_unstacked = (
    with_returns
    .set_index(['date_index', 'ticker'])['ret']
    .unstack()
    .pipe(lambda wide: wide.set_axis(pd.to_datetime(wide.index), axis=0))
    .resample('B')
    .first()
)
returns_unstacked

ticker,ACLS,ACVA,AEHR,ARDX,ATEC,AUGX,AVDL,BBAR,BELFB,BLDR,...,TRHC,TROO,UFPT,USAP,VECT,VIST,VRT,WEAV,WFRD,YPF
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-03,,,,,,,,,,,...,,,,,,,,,,
2005-01-04,-0.063371,,0.010127,,,,-0.002545,-0.005690,-0.061343,,...,,,0.000000,-0.002016,,,,,,-0.003460
2005-01-05,-0.058187,,-0.110276,,,,-0.050000,-0.034335,-0.013255,,...,,,-0.024324,-0.057239,,,,,,0.001852
2005-01-06,-0.002874,,0.008169,,,,0.021482,0.000000,0.005936,,...,,,-0.000277,0.003571,,,,,,0.007625
2005-01-07,-0.021614,,-0.019279,,,,0.016824,0.013333,-0.013975,,...,,,-0.030202,-0.002776,,,,,,-0.000229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-01,0.028816,0.000000,0.166869,0.047904,0.010554,0.029730,0.010830,0.036530,0.023989,-0.000776,...,0.024482,0.035191,0.024769,-0.018122,0.000000,0.041122,0.005181,-0.008951,0.050850,0.028207
2023-06-02,0.001666,0.025822,0.073449,0.005714,0.002611,0.091864,0.007143,-0.006608,0.044272,0.026843,...,0.023897,0.002805,0.008835,0.000000,0.001208,0.006041,0.020619,-0.010323,0.056820,0.015929
2023-06-05,0.002895,0.013158,-0.010155,0.031250,0.001953,-0.033654,0.021277,0.035477,-0.015019,0.011431,...,0.046679,-0.045171,0.010071,0.030201,0.001811,0.017090,0.013636,0.005215,0.008456,0.021777
2023-06-06,0.013880,0.016375,0.038837,0.027548,-0.003249,0.009950,0.030556,0.122056,0.069099,0.029835,...,0.032590,-0.029586,0.008856,-0.018730,0.000000,0.025886,-0.003488,-0.001297,-0.011707,0.062234


In [56]:
# Write the wide (date x ticker) returns table next to the other raw files.
returns_csv_path = raw_data_dir / 'small_universe_returns.csv'
returns_unstacked.to_csv(returns_csv_path)