In [57]:
import pandas as pd
import polars as pl

from factorlib.utils.system import get_data_dir
from sklearn.linear_model import LinearRegression

In [105]:
raw_data_dir = get_data_dir() / 'raw'
returns = pd.read_csv(raw_data_dir / 'training_returns.csv').set_index('date_index')
returns = returns.stack()
returns = returns.reset_index(level=1)
returns.index = pd.to_datetime(returns.index)
returns.columns = ['ticker', 'ret']
returns = returns.reset_index()
returns.sort_values(['ticker', 'date_index'])
len(returns['ticker'].unique())

258

In [102]:
prices = pl.scan_csv(raw_data_dir / 'prices_wrds.csv').collect(streaming=True)
prices = (
    prices.lazy()
    .select(
        pl.col('datadate').alias('date_index'),
        pl.col('tic').alias('ticker'),
        (pl.col('prccd').cast(pl.Float64) / pl.col('ajexdi').cast(pl.Float64)).alias('price')
    )
    .collect(streaming=True)
)
prices = prices.to_pandas()
prices.sort_values(['ticker', 'date_index'])

Unnamed: 0,date_index,ticker,price
41650550,1996-12-19,0081A,12.625
41650551,1996-12-20,0081A,12.375
41650552,1996-12-23,0081A,12.125
41650553,1996-12-24,0081A,12.000
41650554,1996-12-26,0081A,12.000
...,...,...,...
9326840,2013-03-11,ZZ,2.200
9326841,2013-03-12,ZZ,2.210
9326842,2013-03-13,ZZ,2.210
9326843,2013-03-14,ZZ,2.210


In [88]:
income_statements = pd.read_csv(raw_data_dir / 'income_statements.csv', index_col=0)
trainable_tickers = pd.read_csv(raw_data_dir / 'tickers_to_train.csv')['ticker'].to_list()
income_statements = income_statements.loc[income_statements['symbol'].isin(trainable_tickers)]
# balance_sheets = balance_sheets[['symbol', 'date', 'marketCap']]
income_statements = income_statements[['date', 'symbol', 'eps', 'netIncome']]
income_statements.columns = ['date_index', 'ticker', 'eps', 'netIncome']
income_statements['sharesOutstanding'] = income_statements['netIncome'] / income_statements['eps']
income_statements = income_statements[['date_index', 'ticker', 'sharesOutstanding']]
income_statements

Unnamed: 0,date_index,ticker,sharesOutstanding
0,2022-09-30,AAME,2.041791e+07
1,2022-06-30,AAME,2.040097e+07
2,2022-03-31,AAME,2.186154e+07
3,2021-12-31,AAME,2.050000e+07
4,2021-09-30,AAME,1.830000e+07
...,...,...,...
126,1990-03-31,ZIXI,2.276776e+07
127,1989-12-31,ZIXI,
128,1989-09-30,ZIXI,-0.000000e+00
129,1989-06-30,ZIXI,-0.000000e+00


In [89]:
mve_c = income_statements.merge(prices, how='inner',
                                on=['date_index', 'ticker'])
mve_c['mve_c'] = mve_c['sharesOutstanding'] * mve_c['price']
mve_c = mve_c[['date_index', 'ticker', 'mve_c', 'price']]
mve_c

Unnamed: 0,date_index,ticker,mve_c,price
0,2022-09-30,AAME,5.839522e+07,2.860000
1,2022-06-30,AAME,5.447060e+07,2.670000
2,2022-03-31,AAME,6.842662e+07,3.130000
3,2021-12-31,AAME,5.022500e+07,2.450000
4,2021-09-30,AAME,7.594500e+07,4.150000
...,...,...,...,...
99165,1992-06-30,ZIXI,2.100000e+08,12.600000
99166,1992-03-31,ZIXI,4.125000e+08,22.000000
99167,1991-12-31,ZIXI,1.861888e+08,14.933333
99168,1991-09-30,ZIXI,2.390606e+08,10.200000


In [101]:
returns['date_index'] = pd.to_datetime(returns['date_index'])
mve_c['date_index'] = pd.to_datetime(mve_c['date_index'])
trend_factor_raw_data = returns.merge(mve_c, on=['date_index', 'ticker'], how='inner')
trend_factor_raw_data.sort_values(['ticker', 'date_index'])

Unnamed: 0,date_index,ticker,ret,mve_c,price
3213,1995-06-30,ABC,-0.008130,5.275506e+08,2.851625
3541,1996-09-30,ABC,0.034884,1.056875e+09,5.562500
3767,1996-12-31,ABC,0.078212,1.138700e+09,6.031250
4011,1997-03-31,ABC,-0.069149,1.011719e+09,5.468750
4241,1997-06-30,ABC,0.075472,1.122187e+09,6.234375
...,...,...,...,...,...
21914,2021-12-31,ZION,0.000475,9.678475e+09,63.160000
22149,2022-03-31,ZION,-0.026867,1.047928e+10,65.560000
22383,2022-06-30,ZION,-0.020589,8.009845e+09,50.900000
22624,2022-09-30,ZION,-0.017767,7.883300e+09,50.860000


In [99]:
with_moving_avgs = trend_factor_raw_data
lags = [3, 5, 10, 20, 50, 100, 200, 400, 600, 800, 1000]
for lag in lags:
    with_moving_avgs[f'A_{lag}'] = with_moving_avgs.groupby('ticker')['price'].transform(lambda x: x.rolling(window=lag).mean())

# Normalization
for lag in lags:
    with_moving_avgs[f'A_{lag}'] = with_moving_avgs[f'A_{lag}'] / with_moving_avgs['price']

with_moving_avgs

22898

In [97]:
filtered = with_moving_avgs
qu10 = filtered.groupby('date_index')['mve_c'].quantile(0.1)
qu10.name = 'qu10'
qu10
filtered = filtered.merge(qu10, on='date_index', how='inner')
filtered
# filtered = filtered[(filtered['price'].abs()>=5) & (filtered['mve_c'] >= filtered['qu10'])]
# filtered = filtered.sort_values(by='date_index')
# filtered = filtered.dropna()
# filtered

Unnamed: 0,date_index,ticker,ret,mve_c,price,A_3,A_5,A_10,A_20,A_50,A_100,A_200,A_400,A_600,A_800,A_1000,qu10
0,1990-01-31,ADSK,0.012821,9.495192e+08,4.937500,,,,,,,,,,,,4.581203e+08
1,1990-01-31,COO,0.000000,,6.750675,,,,,,,,,,,,4.581203e+08
2,1990-01-31,DE,0.005736,4.779051e+09,10.958333,,,,,,,,,,,,4.581203e+08
3,1990-01-31,HPQ,0.008475,1.072240e+10,5.578125,,,,,,,,,,,,4.581203e+08
4,1990-01-31,LOW,0.038095,9.885955e+08,0.851562,,,,,,,,,,,,4.581203e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22893,2023-03-31,XOM,0.001553,4.337072e+11,109.660000,0.859049,0.777658,0.619798,0.637557,0.685572,,,,,,,1.120741e+10
22894,2023-03-31,XRAY,0.018672,8.213091e+09,39.280000,0.877122,1.060947,1.246385,1.311049,1.122195,,,,,,,1.120741e+10
22895,2023-03-31,ZION,-0.012211,4.590767e+09,29.930000,1.466644,1.740127,1.679853,1.448363,1.201828,,,,,,,1.120741e+10
22896,2023-05-31,AZO,-0.027621,5.165423e+10,2386.840000,0.340997,0.208798,0.109950,,,,,,,,,6.914361e+10


In [94]:
# Linear regression model
model = LinearRegression()
training = filtered
training = training[[f'A_{lag}' for lag in lags]].dropna()
X = training.shift(1).dropna()  # lagged moving averages
y = filtered['ret'][1:]
model.fit(X, y)

ValueError: Found array with 0 sample(s) (shape=(0, 11)) while a minimum of 1 is required by LinearRegression.

In [24]:
# Calculate beta coefficients
betas = filtered
betas[[f'beta_{lag}' for lag in lags]] = pd.Series(model.coef_)
betas

Unnamed: 0,date_index,gvkey,ticker,SHRCD,EXCHCD,PRC,mve_c,A_3,A_5,A_10,...,beta_5,beta_10,beta_20,beta_50,beta_100,beta_200,beta_400,beta_600,beta_800,beta_1000
545602,2002-10-01,9311.0,SBEI,11.0,3.0,67.000000,2.718860e+02,0.352388,0.222699,0.125371,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
546999,2002-11-01,9311.0,SBEI,11.0,3.0,50.000000,2.029000e+02,0.792933,0.490936,0.264258,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
323064,2003-04-01,61414.0,SNUS,11.0,3.0,93600.000000,1.281571e+06,1.012821,1.011111,1.005556,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
323549,2003-04-01,65243.0,NUVO,11.0,3.0,41818.181818,9.660418e+05,0.971014,0.971739,0.932500,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
321776,2003-04-01,9311.0,SBEI,11.0,3.0,32.000000,1.298560e+02,0.307292,0.650000,0.189844,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696305,2020-05-01,11600.0,WOR,11.0,1.0,25.460000,1.412215e+03,1.054857,1.027965,0.968500,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
696288,2020-05-01,5071.0,GIS,11.0,1.0,59.580000,3.586120e+04,1.005763,1.009433,1.009785,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
696284,2020-05-01,12142.0,ORCL,11.0,1.0,51.790000,1.739626e+05,1.021047,1.024175,1.018884,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095
696291,2020-05-01,7401.0,MLHR,11.0,3.0,21.540000,1.266423e+03,1.051223,1.039786,0.967363,...,-0.169574,0.330387,-0.20903,0.179096,-0.112885,0.013443,-0.003145,0.002012,-0.000574,0.000095


In [25]:
# NOTHING IS ALIGNED LOOK TMRW
rolling_betas = betas
for lag in lags:
    rolling_betas[f'EBeta_{lag}'] = rolling_betas[f'beta_{lag}'].rolling(window=12).mean()

# Calculate expected return E[r] = \sum E[\beta_i]A_L_i
rolling_betas['TrendFactor'] = sum(rolling_betas[f'EBeta_{lag}'] * rolling_betas[f'A_{lag}'] for lag in lags)
rolling_betas = rolling_betas[['date_index', 'ticker', 'TrendFactor']]
rolling_betas = rolling_betas.set_index('date_index').groupby('ticker').resample('MS').first()
rolling_betas = rolling_betas.reset_index(level=0, drop=True).reset_index().ffill()
rolling_betas['date_index'] = pd.to_datetime(rolling_betas['date_index'])
rolling_betas = rolling_betas.set_index('date_index')
rolling_betas

Unnamed: 0_level_0,ticker,TrendFactor
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-11-01,A,0.288263
2006-12-01,A,0.299175
2007-01-01,A,0.299175
2007-02-01,A,0.291252
2007-03-01,A,0.297710
...,...,...
2019-09-01,ZUMZ,0.305350
2019-10-01,ZUMZ,0.282010
2019-11-01,ZUMZ,0.303787
2019-07-01,ZYNE,0.304919


In [30]:
# added this after testing because it seemed like there were duplicates
no_duplicates = rolling_betas.reset_index().set_index(['date_index', 'ticker'])
no_duplicates = no_duplicates.loc[~rolling_betas.index.duplicated(keep='first')]
no_duplicates

Unnamed: 0_level_0,Unnamed: 1_level_0,TrendFactor
date_index,ticker,Unnamed: 2_level_1
2006-11-01,A,0.288263
2006-12-01,A,0.299175
2007-01-01,A,0.299175
2007-02-01,A,0.291252
2007-03-01,A,0.297710
...,...,...
2002-11-01,SBEI,0.299518
2002-12-01,SBEI,0.299518
2003-01-01,SBEI,0.299518
2003-02-01,SBEI,0.299518


In [32]:
momentum_dir = get_data_dir() / 'momentum'
no_duplicates.to_csv(momentum_dir / 'trend_factor.csv')