In [129]:
import polars as pl
import pandas as pd
import numpy as np
import datetime as dt

from factorlib.utils.system import get_data_dir

In [29]:
raw_data_dir = get_data_dir() / 'raw'
ohclv_raw = pl.scan_csv(raw_data_dir / 'ohclv_daily.csv', try_parse_dates=True).collect(streaming=True)
ohclv_raw

GVKEY,iid,datadate,tic,conm,divd,cshtrd,eps,busdesc,gind,gsector,gsubind,sic,prcod,prchd,prcld,prccd
i64,i64,date,str,str,f64,f64,f64,str,i64,i64,i64,i64,f64,f64,f64,f64
1004,1,2013-10-24,"""AIR""","""AAR CORP""",,351238.0,1.37,"""AAR Corp. prov…",201010,20,20101010,5080,29.21,29.67,29.08,29.38
1004,1,2002-03-01,"""AIR""","""AAR CORP""",,75300.0,-1.6,"""AAR Corp. prov…",201010,20,20101010,5080,,7.45,7.15,7.44
1004,1,2007-11-30,"""AIR""","""AAR CORP""",,354300.0,1.7,"""AAR Corp. prov…",201010,20,20101010,5080,32.99,33.75,32.81,33.02
1004,1,2015-06-30,"""AIR""","""AAR CORP""",,248059.0,1.23,"""AAR Corp. prov…",201010,20,20101010,5080,32.1,32.17,31.73,31.87
1004,1,2019-11-08,"""AIR""","""AAR CORP""",,106407.0,2.37,"""AAR Corp. prov…",201010,20,20101010,5080,44.08,44.29,43.14,43.35
1004,1,2022-09-14,"""AIR""","""AAR CORP""",,303898.0,2.19,"""AAR Corp. prov…",201010,20,20101010,5080,41.39,41.81,40.31,41.0
1004,1,2003-07-11,"""AIR""","""AAR CORP""",,55900.0,-0.39,"""AAR Corp. prov…",201010,20,20101010,5080,,6.9,6.27,6.89
1004,1,2014-01-13,"""AIR""","""AAR CORP""",,281396.0,1.44,"""AAR Corp. prov…",201010,20,20101010,5080,26.86,26.88,26.65,26.84
1004,1,2013-01-07,"""AIR""","""AAR CORP""",,303724.0,1.73,"""AAR Corp. prov…",201010,20,20101010,5080,20.03,20.16,19.68,19.75
1004,1,2021-07-22,"""AIR""","""AAR CORP""",,392224.0,1.31,"""AAR Corp. prov…",201010,20,20101010,5080,37.16,37.3,36.32,36.38


In [30]:
relevant_data = ohclv_raw.lazy().select(pl.col('datadate'), pl.col('tic'), pl.col('prccd'), pl.col('gind'),
                                        pl.col('gsubind')).collect(streaming=True)
relevant_data

datadate,tic,prccd,gind,gsubind
date,str,f64,i64,i64
2013-10-24,"""AIR""",29.38,201010,20101010
2002-03-01,"""AIR""",7.44,201010,20101010
2007-11-30,"""AIR""",33.02,201010,20101010
2015-06-30,"""AIR""",31.87,201010,20101010
2019-11-08,"""AIR""",43.35,201010,20101010
2022-09-14,"""AIR""",41.0,201010,20101010
2003-07-11,"""AIR""",6.89,201010,20101010
2014-01-13,"""AIR""",26.84,201010,20101010
2013-01-07,"""AIR""",19.75,201010,20101010
2021-07-22,"""AIR""",36.38,201010,20101010


In [31]:
pattern = r'^[a-zA-Z]+$'
mask = relevant_data.lazy().with_columns(pl.col('tic').str.contains(pattern).alias('mask')).collect(streaming=True)
alpha_only = mask.lazy().filter(pl.col('mask') == True).collect(streaming=True)
alpha_only = alpha_only.rename(
    {
        'tic': 'ticker',
        'datadate': 'date_index'
    }
)
alpha_only = alpha_only.lazy().sort(['ticker', 'date_index']).collect(streaming=True)
alpha_only

date_index,ticker,prccd,gind,gsubind,mask
date,str,f64,i64,i64,bool
2015-11-11,"""AAAP""",24.5,352010,35201010,true
2015-11-12,"""AAAP""",25.0,352010,35201010,true
2015-11-13,"""AAAP""",25.26,352010,35201010,true
2015-11-16,"""AAAP""",25.02,352010,35201010,true
2015-11-17,"""AAAP""",24.62,352010,35201010,true
2015-11-18,"""AAAP""",25.0,352010,35201010,true
2015-11-19,"""AAAP""",25.9,352010,35201010,true
2015-11-20,"""AAAP""",25.2,352010,35201010,true
2015-11-23,"""AAAP""",25.15,352010,35201010,true
2015-11-24,"""AAAP""",25.62,352010,35201010,true


In [98]:
sp500_tickers = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500_tickers = pl.Series(sp500_tickers['Symbol'])
sp500_only = alpha_only.lazy().filter(pl.col('ticker').is_in(sp500_tickers)).collect(streaming=True)
sp500_only = sp500_only.to_pandas().set_index('date_index').groupby('ticker').resample('D').ffill().reset_index(level=0, drop=True)
sp500_only

Unnamed: 0_level_0,ticker,prccd,gind,gsubind,mask
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-12-09,AAL,24.60,203020,20302010,True
2013-12-10,AAL,24.88,203020,20302010,True
2013-12-11,AAL,25.99,203020,20302010,True
2013-12-12,AAL,25.45,203020,20302010,True
2013-12-13,AAL,26.23,203020,20302010,True
...,...,...,...,...,...
2023-02-27,ZTS,165.47,352020,35202010,True
2023-02-28,ZTS,167.00,352020,35202010,True
2023-03-01,ZTS,167.57,352020,35202010,True
2023-03-02,ZTS,168.58,352020,35202010,True


In [99]:
with_returns = sp500_only
with_returns['ret'] = with_returns.groupby('ticker')['prccd'].pct_change()
with_returns = with_returns.dropna()
with_returns

Unnamed: 0_level_0,ticker,prccd,gind,gsubind,mask,ret
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-12-10,AAL,24.88,203020,20302010,True,0.011382
2013-12-11,AAL,25.99,203020,20302010,True,0.044614
2013-12-12,AAL,25.45,203020,20302010,True,-0.020777
2013-12-13,AAL,26.23,203020,20302010,True,0.030648
2013-12-14,AAL,26.23,203020,20302010,True,0.000000
...,...,...,...,...,...,...
2023-02-27,ZTS,165.47,352020,35202010,True,-0.005170
2023-02-28,ZTS,167.00,352020,35202010,True,0.009246
2023-03-01,ZTS,167.57,352020,35202010,True,0.003413
2023-03-02,ZTS,168.58,352020,35202010,True,0.006027


In [185]:
window = 200
tickers_momentum_short = with_returns
tickers_momentum_short['momentum_short'] = tickers_momentum_short.groupby('ticker')['ret'].rolling(window).sum().reset_index(level=0, drop=True)
tickers_momentum_short = tickers_momentum_short.sort_index().reset_index()
tickers_momentum_short


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,date_index,ticker,prccd,gind,gsubind,mask,ret,momentum_short
0,2002-01-03,AMGN,54.34,352010,35201010,True,-0.036525,
1,2002-01-03,PXD,18.21,101020,10102020,True,-0.013543,
2,2002-01-03,CVX,179.00,101020,10102010,True,0.000224,
3,2002-01-03,TSN,11.50,302020,30202030,True,0.019504,
4,2002-01-03,MS,56.82,402030,40203020,True,0.028975,
...,...,...,...,...,...,...,...,...
2308726,2023-03-30,STT,90.42,402030,40203010,True,0.000000,0.244650
2308727,2023-03-30,GL,120.58,403010,40301020,True,0.000000,0.170874
2308728,2023-03-30,APD,294.78,151010,15101040,True,0.000000,0.157649
2308729,2023-03-30,CAH,75.20,351020,35102010,True,0.000000,0.088432


In [186]:
industry_average_momentum = tickers_momentum_short[['gind', 'ticker', 'momentum_short', 'date_index']]
industry_average_momentum['industry_average'] = industry_average_momentum.groupby(['gind', 'date_index'])['momentum_short'].transform('mean')
industry_average_momentum.drop(columns=['momentum_short'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [187]:
combined_momentums = tickers_momentum_short.merge(industry_average_momentum, on=['date_index', 'gind', 'ticker'], how='inner')
combined_momentums = combined_momentums.sort_values(['ticker', 'date_index'])
combined_momentums

Unnamed: 0,date_index,ticker,prccd,gind,gsubind,mask,ret,momentum_short,industry_average
1240876,2013-12-10,AAL,24.88,203020,20302010,True,0.011382,,0.280690
1241177,2013-12-11,AAL,25.99,203020,20302010,True,0.044614,,0.253906
1241272,2013-12-12,AAL,25.45,203020,20302010,True,-0.020777,,0.273445
1241664,2013-12-13,AAL,26.23,203020,20302010,True,0.030648,,0.278505
1241988,2013-12-14,AAL,26.23,203020,20302010,True,0.000000,,0.295786
...,...,...,...,...,...,...,...,...,...
2306162,2023-02-27,ZTS,165.47,352020,35202010,True,-0.005170,-0.015095,0.035245
2306611,2023-02-28,ZTS,167.00,352020,35202010,True,0.009246,-0.020227,0.013802
2307006,2023-03-01,ZTS,167.57,352020,35202010,True,0.003413,-0.016814,0.015495
2307052,2023-03-02,ZTS,168.58,352020,35202010,True,0.006027,-0.010786,0.014337


In [188]:
industry_relative_momentum = combined_momentums
industry_relative_momentum['industry_relative'] = industry_relative_momentum['momentum_short'] - industry_relative_momentum['industry_average']
industry_relative_momentum = industry_relative_momentum[['date_index', 'ticker', 'industry_relative']]
industry_relative_momentum = industry_relative_momentum.dropna().sort_values(['date_index', 'ticker']).set_index('date_index')
industry_relative_momentum

Unnamed: 0_level_0,ticker,industry_relative
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-07-21,AAPL,0.057472
2002-07-21,ABT,-0.237851
2002-07-21,ADBE,0.264195
2002-07-21,ADI,0.023836
2002-07-21,ADM,-0.041327
...,...,...
2023-03-30,APD,0.000000
2023-03-30,CAH,0.000000
2023-03-30,GL,0.000000
2023-03-30,RJF,-0.114303


In [189]:
momentum_data_dir = get_data_dir() / 'momentum'
industry_relative_momentum.to_csv(momentum_data_dir / f'gind_rel_{window}day_mom.csv')