In [122]:
import pandas as pd
import polars as pl
import numpy as np
import yfinance as yf

from factorlib.utils.system import get_data_dir
from sklearn.linear_model import LinearRegression

In [54]:
raw_data_dir = get_data_dir() / 'raw'
ex_sh_cds_prc = pd.read_csv(raw_data_dir / 'ex_sh_cds_prc.csv')
ex_sh_cds_prc = ex_sh_cds_prc.rename(columns={'date': 'date_index', 'TICKER': 'ticker'})
ex_sh_cds_prc['date_index'] = pd.to_datetime(ex_sh_cds_prc['date_index'])
ex_sh_cds_prc['PRC'] = ex_sh_cds_prc['PRC'] / ex_sh_cds_prc['CFACPR']
ex_sh_cds_prc = ex_sh_cds_prc.drop(columns=['CFACPR'])
ex_sh_cds_prc

Unnamed: 0,PERMNO,date_index,SHRCD,EXCHCD,ticker,PRC
0,10001,2001-01-02,11.0,3.0,EWST,6.583333
1,10001,2001-01-03,11.0,3.0,EWST,6.375000
2,10001,2001-01-04,11.0,3.0,EWST,6.583333
3,10001,2001-01-05,11.0,3.0,EWST,-6.541667
4,10001,2001-01-08,11.0,3.0,EWST,6.583333
...,...,...,...,...,...,...
40426429,93436,2022-12-23,11.0,3.0,TSLA,123.150000
40426430,93436,2022-12-27,11.0,3.0,TSLA,109.100000
40426431,93436,2022-12-28,11.0,3.0,TSLA,112.710000
40426432,93436,2022-12-29,11.0,3.0,TSLA,121.820000


In [79]:
returns = ex_sh_cds_prc
returns['ret'] = ex_sh_cds_prc['PRC'].pct_change()
returns = returns[['date_index', 'ticker', 'PERMNO', 'ret']]
returns = returns.dropna()
returns

Unnamed: 0,date_index,ticker,PERMNO,ret
1,2001-01-03,EWST,10001,-0.031646
2,2001-01-04,EWST,10001,0.032680
3,2001-01-05,EWST,10001,-1.993671
4,2001-01-08,EWST,10001,-2.006369
5,2001-01-09,EWST,10001,0.006329
...,...,...,...,...
40426429,2022-12-23,TSLA,93436,-0.017551
40426430,2022-12-27,TSLA,93436,-0.114089
40426431,2022-12-28,TSLA,93436,0.033089
40426432,2022-12-29,TSLA,93436,0.080827


In [69]:
outstanding_shares = pd.read_csv(raw_data_dir / 'shares_outstanding.csv')
outstanding_shares = outstanding_shares.rename(columns={'datadate': 'date_index', 'tic': 'ticker'})
outstanding_shares['date_index'] = pd.to_datetime(outstanding_shares['date_index'])
outstanding_shares = outstanding_shares.set_index(['date_index', 'ticker'])
outstanding_shares = outstanding_shares.loc[~outstanding_shares.index.duplicated(keep='first')]
outstanding_shares = outstanding_shares.reset_index().set_index('date_index')
outstanding_shares = outstanding_shares.groupby('ticker').resample('D', convention='start').ffill()
outstanding_shares = outstanding_shares.reset_index(level=0, drop=True).reset_index()
outstanding_shares

Unnamed: 0,date_index,PRC
40423284,2010-06-29,1.592667
40423285,2010-06-30,1.588667
40423286,2010-07-01,1.464000
40423287,2010-07-02,1.280000
40423288,2010-07-06,1.074000
...,...,...
40426429,2022-12-23,123.150000
40426430,2022-12-27,109.100000
40426431,2022-12-28,112.710000
40426432,2022-12-29,121.820000


In [71]:
trend_factor_raw_data = outstanding_shares[['gvkey', 'date_index', 'ticker', 'csho']].merge(ex_sh_cds_prc[['PERMNO', 'date_index', 'ticker', 'SHRCD', 'EXCHCD', 'PRC', ]], on=['date_index', 'ticker'], how='inner')

In [110]:
trend_raw_complete = trend_factor_raw_data
trend_raw_complete['mve_c'] = trend_raw_complete['csho'] * trend_raw_complete['PRC']
trend_raw_complete = trend_raw_complete.drop(columns='csho')
trend_raw_complete[trend_raw_complete['ticker']=='TSLA']

Unnamed: 0,gvkey,date_index,ticker,PERMNO,SHRCD,EXCHCD,PRC,mve_c
20488790,184996,2010-06-29,TSLA,93436,11.0,3.0,1.592667,11.600984
20488791,184996,2010-06-30,TSLA,93436,11.0,3.0,1.588667,11.571848
20488792,184996,2010-07-01,TSLA,93436,11.0,3.0,1.464000,10.663776
20488793,184996,2010-07-02,TSLA,93436,11.0,3.0,1.280000,9.323520
20488794,184996,2010-07-06,TSLA,93436,11.0,3.0,1.074000,7.823016
...,...,...,...,...,...,...,...,...
20491179,184996,2019-12-24,TSLA,93436,11.0,3.0,28.350000,4893.295050
20491180,184996,2019-12-26,TSLA,93436,11.0,3.0,28.729333,4958.769121
20491181,184996,2019-12-27,TSLA,93436,11.0,3.0,28.692000,4952.325276
20491182,184996,2019-12-30,TSLA,93436,11.0,3.0,27.646667,4771.897722


In [111]:
with_moving_avgs = trend_raw_complete
lags = [3, 5, 10, 20, 50, 100, 200, 400, 600, 800, 1000]
for lag in lags:
    with_moving_avgs[f'A_{lag}'] = with_moving_avgs.groupby('PERMNO')['PRC'].transform(lambda x: x.rolling(window=lag).mean())

# Normalization
for lag in lags:
    with_moving_avgs[f'A_{lag}'] = with_moving_avgs[f'A_{lag}'] / with_moving_avgs['PRC']

with_moving_avgs

Unnamed: 0,gvkey,date_index,ticker,PERMNO,SHRCD,EXCHCD,PRC,mve_c,A_3,A_5,A_10,A_20,A_50,A_100,A_200,A_400,A_600,A_800,A_1000
0,126554,2002-10-31,A,87432,11.0,1.0,9.387492,4383.958654,,,,,,,,,,,
1,126554,2002-11-01,A,87432,11.0,1.0,9.954155,4648.590340,,,,,,,,,,,
2,126554,2002-11-04,A,87432,11.0,1.0,10.261382,4792.065351,0.961632,,,,,,,,,,
3,126554,2002-11-05,A,87432,11.0,1.0,10.288691,4804.818685,0.988277,,,,,,,,,,
4,126554,2002-11-06,A,87432,11.0,1.0,10.213591,4769.747016,1.004011,0.981150,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22784669,8101,2012-11-26,ZZ,91205,11.0,1.0,2.190000,221.006040,1.000000,0.999087,0.998630,1.009132,0.996986,0.889315,0.856918,0.876632,0.999444,1.117340,1.041324
22784670,8101,2012-11-27,ZZ,91205,11.0,1.0,2.190000,221.006040,1.000000,0.999087,0.998630,1.008219,0.999817,0.890868,0.858562,0.875959,0.999186,1.116849,1.041701
22784671,8101,2012-11-28,ZZ,91205,11.0,1.0,2.190000,221.006040,1.000000,1.000000,0.998630,1.007306,1.001918,0.892055,0.860091,0.875251,0.998813,1.116381,1.042103
22784672,8101,2012-11-29,ZZ,91205,11.0,1.0,2.200000,222.015200,0.996970,0.996364,0.995000,1.002045,0.998727,0.889455,0.857636,0.870398,0.993909,1.110750,1.037777


In [112]:
moving_avgs_monthly = with_moving_avgs.set_index('date_index')
moving_avgs_monthly = moving_avgs_monthly.groupby('PERMNO').resample('MS').first()
moving_avgs_monthly = moving_avgs_monthly.reset_index(level=0, drop=True)
moving_avgs_monthly

Unnamed: 0_level_0,gvkey,ticker,PERMNO,SHRCD,EXCHCD,PRC,mve_c,A_3,A_5,A_10,A_20,A_50,A_100,A_200,A_400,A_600,A_800,A_1000
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2009-08-01,12994.0,EGAS,10001.0,11.0,3.0,8.600000,36.954200,0.987733,1.035545,0.998645,0.998563,,,,,,,
2009-09-01,12994.0,EGAS,10001.0,11.0,3.0,8.320000,35.751040,1.012019,1.021632,1.029323,1.023284,,,,,,,
2009-10-01,12994.0,EGAS,10001.0,11.0,3.0,8.599900,36.953770,0.996132,0.993770,0.986543,0.977931,0.946515,,,,,,
2009-11-01,12994.0,EGAS,10001.0,11.0,3.0,8.820000,37.899540,1.005669,1.009932,1.017892,1.017806,0.981435,,,,,,
2009-12-01,12994.0,EGAS,10001.0,11.0,3.0,8.850000,38.028450,1.000151,0.997153,0.794678,0.905305,0.960569,0.833668,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-08-01,184996.0,TSLA,93436.0,11.0,3.0,15.590001,2690.880885,1.023049,1.010502,1.047364,1.043327,0.955431,1.031534,1.202437,1.271260,1.317739,1.222115,1.167799
2019-09-01,184996.0,TSLA,93436.0,11.0,3.0,15.000666,2589.159954,0.996000,0.979512,0.976508,0.995680,1.032723,1.021570,1.205833,1.294103,1.360450,1.271201,1.211909
2019-10-01,184996.0,TSLA,93436.0,11.0,3.0,16.312667,2815.615205,0.991309,0.979975,0.978422,0.973663,0.951713,0.926082,1.062040,1.169902,1.241065,1.170699,1.113493
2019-11-01,184996.0,TSLA,93436.0,11.0,3.0,20.887333,3605.216395,1.003522,1.013163,0.950732,0.875681,0.797171,0.769999,0.806612,0.906007,0.960905,0.918314,0.873408


In [113]:
with_returns = moving_avgs_monthly.merge(returns, on=['date_index', 'ticker'], how='inner')
with_returns = with_returns.drop(columns=['PERMNO_y', 'PERMNO_x'])
with_returns

Unnamed: 0,date_index,gvkey,ticker,SHRCD,EXCHCD,PRC,mve_c,A_3,A_5,A_10,A_20,A_50,A_100,A_200,A_400,A_600,A_800,A_1000,ret
0,2009-09-01,12994.0,EGAS,11.0,3.0,8.320000,35.751040,1.012019,1.021632,1.029323,1.023284,,,,,,,,-0.025761
1,2009-10-01,12994.0,EGAS,11.0,3.0,8.599900,36.953770,0.996132,0.993770,0.986543,0.977931,0.946515,,,,,,,0.000000
2,2009-12-01,12994.0,EGAS,11.0,3.0,8.850000,38.028450,1.000151,0.997153,0.794678,0.905305,0.960569,0.833668,,,,,,-0.005394
3,2010-02-01,12994.0,EGAS,11.0,2.0,9.990000,43.576380,1.007674,1.002134,1.005872,1.018401,0.929245,0.903324,,,,,,-0.006958
4,2010-03-01,12994.0,EGAS,11.0,2.0,10.000000,43.620000,1.001613,1.001368,1.001809,1.003631,1.005987,0.933095,,,,,,-0.000839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696377,2019-05-01,184996.0,TSLA,11.0,3.0,15.600666,2692.721754,1.017293,1.022982,1.079757,1.123610,1.178277,1.282927,1.310882,1.339924,1.323521,1.221655,1.175778,-0.019607
696378,2019-07-01,184996.0,TSLA,11.0,3.0,15.144667,2614.014901,0.988203,0.979443,0.981111,0.951387,0.975787,1.115952,1.258394,1.327810,1.359160,1.255322,1.203991,0.016602
696379,2019-08-01,184996.0,TSLA,11.0,3.0,15.590001,2690.880885,1.023049,1.010502,1.047364,1.043327,0.955431,1.031534,1.202437,1.271260,1.317739,1.222115,1.167799,-0.032118
696380,2019-10-01,184996.0,TSLA,11.0,3.0,16.312667,2815.615205,0.991309,0.979975,0.978422,0.973663,0.951713,0.926082,1.062040,1.169902,1.241065,1.170699,1.113493,0.015859


In [142]:
filtered = with_returns
qu10 = filtered[filtered['EXCHCD'] == 1].groupby('date_index')['mve_c'].quantile(0.1)
qu10.name = 'qu10'
filtered = filtered.merge(qu10, on='date_index', how='inner')
filtered = filtered[(filtered['EXCHCD'].isin([1, 2, 3])) & (filtered['SHRCD'].isin([10, 11])) & (filtered['PRC'].abs()>=5) & (filtered['mve_c'] >= filtered['qu10'])]
filtered = filtered.sort_values(by='date_index')
filtered

Unnamed: 0,date_index,gvkey,ticker,SHRCD,EXCHCD,PRC,mve_c,A_3,A_5,A_10,A_20,A_50,A_100,A_200,A_400,A_600,A_800,A_1000,ret,qu10
542419,2002-02-01,62667.0,PSS,11.0,1.0,19.536667,434.749443,1.010964,1.015839,1.002203,0.966302,,,,,,,,0.007045,81.779220
542418,2002-02-01,11511.0,WSM,11.0,1.0,22.390000,1281.670770,1.032987,1.045765,0.975034,0.975689,,,,,,,,-0.026522,81.779220
542417,2002-02-01,61763.0,RDEN,11.0,3.0,9.750000,171.853500,1.109283,1.168219,1.069475,1.015405,,,,,,,,-0.030815,81.779220
542414,2002-02-01,66278.0,TSA,11.0,1.0,7.990000,85.724710,0.985366,0.972216,0.972082,0.931778,,,,,,,,-0.007453,81.779220
542413,2002-02-01,30219.0,BTH,11.0,1.0,42.400000,1987.669600,1.000787,1.012632,0.993037,0.996818,,,,,,,,0.000472,81.779220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696305,2020-05-01,11600.0,WOR,11.0,1.0,25.460000,1412.215280,1.054857,1.027965,0.968500,0.976551,1.033574,1.302930,1.379073,1.447715,1.561400,1.626304,1.663513,-0.037065,65.202048
696288,2020-05-01,5071.0,GIS,11.0,1.0,59.580000,35861.202000,1.005763,1.009433,1.009785,0.992397,0.927348,0.907617,0.901509,0.843741,0.833221,0.858599,0.904526,-0.005176,65.202048
696284,2020-05-01,12142.0,ORCL,11.0,1.0,51.790000,173962.610000,1.021047,1.024175,1.018884,1.016692,0.965213,1.007310,1.033030,1.013584,0.985193,0.968696,0.929130,-0.022277,65.202048
696291,2020-05-01,7401.0,MLHR,11.0,3.0,21.540000,1266.422760,1.051223,1.039786,0.967363,0.943802,1.122043,1.537166,1.818325,1.742199,1.723117,1.674877,1.634458,-0.044366,65.202048


In [136]:
# Linear regression model
model = LinearRegression()
training = filtered
training = training[[f'A_{lag}' for lag in lags]]
X = training.shift(1).dropna()  # lagged moving averages
y = filtered['ret'][1:]
model.fit(X, y)

In [137]:
# Calculate beta coefficients
betas = filtered
betas[[f'beta_{lag}' for lag in lags]] = pd.Series(model.coef_)
betas

Unnamed: 0,date_index,gvkey,ticker,SHRCD,EXCHCD,PRC,mve_c,A_3,A_5,A_10,...,beta_5,beta_10,beta_20,beta_50,beta_100,beta_200,beta_400,beta_600,beta_800,beta_1000
545602,2002-10-01,9311.0,SBEI,11.0,3.0,67.000000,2.718860e+02,0.352388,0.222699,0.125371,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
546999,2002-11-01,9311.0,SBEI,11.0,3.0,50.000000,2.029000e+02,0.792933,0.490936,0.264258,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
323064,2003-04-01,61414.0,SNUS,11.0,3.0,93600.000000,1.281571e+06,1.012821,1.011111,1.005556,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
323549,2003-04-01,65243.0,NUVO,11.0,3.0,41818.181818,9.660418e+05,0.971014,0.971739,0.932500,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
321776,2003-04-01,9311.0,SBEI,11.0,3.0,32.000000,1.298560e+02,0.307292,0.650000,0.189844,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696305,2020-05-01,11600.0,WOR,11.0,1.0,25.460000,1.412215e+03,1.054857,1.027965,0.968500,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
696288,2020-05-01,5071.0,GIS,11.0,1.0,59.580000,3.586120e+04,1.005763,1.009433,1.009785,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
696284,2020-05-01,12142.0,ORCL,11.0,1.0,51.790000,1.739626e+05,1.021047,1.024175,1.018884,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
696291,2020-05-01,7401.0,MLHR,11.0,3.0,21.540000,1.266423e+03,1.051223,1.039786,0.967363,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095


In [150]:
# NOTHING IS ALIGNED LOOK TMRW
rolling_betas = betas
for lag in lags:
    rolling_betas[f'EBeta_{lag}'] = rolling_betas[f'beta_{lag}'].rolling(window=12).mean()

# Calculate expected return E[r] = \sum E[\beta_i]A_L_i
rolling_betas['TrendFactor'] = sum(rolling_betas[f'EBeta_{lag}'] * rolling_betas[f'A_{lag}'] for lag in lags)
rolling_betas = rolling_betas[['date_index', 'ticker', 'TrendFactor']]
rolling_betas = rolling_betas.set_index('date_index').groupby('ticker').resample('MS').first()
rolling_betas = rolling_betas.reset_index(level=0, drop=True).reset_index().ffill()
rolling_betas

Unnamed: 0,date_index,ticker,TrendFactor
0,2006-11-01,A,0.288263
1,2006-12-01,A,0.299175
2,2007-01-01,A,0.299175
3,2007-02-01,A,0.291252
4,2007-03-01,A,0.297710
...,...,...,...
246268,2019-09-01,ZUMZ,0.305350
246269,2019-10-01,ZUMZ,0.282010
246270,2019-11-01,ZUMZ,0.303787
246271,2019-07-01,ZYNE,0.304919


In [152]:
momentum_dir = get_data_dir() / 'momentum'
rolling_betas.to_csv(momentum_dir / 'trend_factor.csv')