In [40]:
import pandas as pd
import numpy as np
import polars as pl
from arch import arch_model
from scipy.stats import norm

from factorlib.utils.system import get_data_dir

In [2]:
raw_data_dir = get_data_dir() / 'raw'
prices = pl.scan_csv(raw_data_dir / 'prices_wrds.csv', try_parse_dates=True).collect(streaming=True)
prices = (
    prices.lazy()
    .select(
        pl.col('datadate').alias('date_index'),
        pl.col('tic').alias('ticker'),
        (pl.col('prccd').cast(pl.Float64) / pl.col('ajexdi').cast(pl.Float64)).alias('price')
    )
    .collect(streaming=True)
)
prices

date_index,ticker,price
date,str,f64
1990-01-02,"""AIR""",24.0
1990-01-03,"""AIR""",24.166667
1990-01-04,"""AIR""",23.666667
1990-01-05,"""AIR""",23.916667
1990-01-08,"""AIR""",23.5
1990-01-09,"""AIR""",23.416667
1990-01-10,"""AIR""",23.416667
1990-01-11,"""AIR""",22.916667
1990-01-12,"""AIR""",22.5
1990-01-15,"""AIR""",22.166667


In [3]:
training_tickers = pl.scan_csv(raw_data_dir / 'tickers_to_train.csv').collect(streaming=True)
training_tickers = training_tickers.select(pl.col('ticker')).to_series().to_list()
training_prices = prices.lazy().filter(pl.col('ticker').is_in(training_tickers)).collect(streaming=True)
len(training_prices.select(pl.col('ticker').unique()))

1152

In [4]:
no_duplicates = training_prices.to_pandas().set_index(['ticker', 'date_index'])
no_duplicates = no_duplicates.loc[~no_duplicates.index.duplicated(keep='first')].reset_index(level=0)
no_duplicates

Unnamed: 0_level_0,ticker,price
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1
1990-01-02,AIR,24.000000
1990-01-03,AIR,24.166667
1990-01-04,AIR,23.666667
1990-01-05,AIR,23.916667
1990-01-08,AIR,23.500000
...,...,...
2023-06-02,BVN,6.850000
2023-06-05,BVN,6.890000
2023-06-06,BVN,6.910000
2023-06-07,BVN,7.280000


In [5]:
training_returns = no_duplicates
training_returns['ret'] = no_duplicates.groupby('ticker').pct_change()
training_returns.dropna(inplace=True)
training_returns

Unnamed: 0_level_0,ticker,price,ret
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1990-01-03,AIR,24.166667,0.006944
1990-01-04,AIR,23.666667,-0.020690
1990-01-05,AIR,23.916667,0.010563
1990-01-08,AIR,23.500000,-0.017422
1990-01-09,AIR,23.416667,-0.003546
...,...,...,...
2023-06-02,BVN,6.850000,-0.005806
2023-06-05,BVN,6.890000,0.005839
2023-06-06,BVN,6.910000,0.002903
2023-06-07,BVN,7.280000,0.053546


In [6]:
raw_prices = pl.scan_csv(raw_data_dir / 'ohclv_daily.csv', try_parse_dates=True).collect(
    streaming=True)
industry_codes = raw_prices.lazy().select(
    pl.col('datadate').cast(pl.Datetime).alias('date_index'),
    pl.col('tic').alias('ticker'),
    pl.col('gind').alias('industry')
)
industry_codes = industry_codes.lazy().filter(pl.col('ticker').is_in(training_tickers)).sort(['ticker', 'date_index']).collect(streaming=True)
industry_codes = industry_codes.to_pandas().set_index('date_index')
industry_codes

Unnamed: 0_level_0,ticker,industry
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1
1990-01-02,AAME,403010
1990-01-03,AAME,403010
1990-01-04,AAME,403010
1990-01-05,AAME,403010
1990-01-08,AAME,403010
...,...,...
2021-12-17,ZIXI,451030
2021-12-20,ZIXI,451030
2021-12-21,ZIXI,451030
2021-12-22,ZIXI,451030


In [12]:
returns_by_industry = training_returns.merge(industry_codes, on=['ticker', 'date_index'], how='left')
returns_by_industry

Unnamed: 0_level_0,ticker,price,ret,industry
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-03,AIR,24.166667,0.006944,201010.0
1990-01-04,AIR,23.666667,-0.020690,201010.0
1990-01-05,AIR,23.916667,0.010563,201010.0
1990-01-08,AIR,23.500000,-0.017422,201010.0
1990-01-09,AIR,23.416667,-0.003546,201010.0
...,...,...,...,...
2023-06-02,BVN,6.850000,-0.005806,151040.0
2023-06-05,BVN,6.890000,0.005839,151040.0
2023-06-06,BVN,6.910000,0.002903,151040.0
2023-06-07,BVN,7.280000,0.053546,


In [13]:
from statsmodels.tsa.stattools import adfuller

result = adfuller(returns_by_industry[returns_by_industry['ticker'] == 'AAME']['ret'])
print('ADF Stat: ', result[0])

print('p-stat: ', round(result[1], 4))

ADF Stat:  -16.382919044678456
p-stat:  0.0


In [25]:
def fit_egarch(data):
    model = arch_model(data, vol='EGARCH', p=1, q=1)
    results = model.fit(options={'maxiter': 1000}, disp=False)
    return results.forecast(start=0).variance.dropna().values.flatten()

In [31]:
from arch.__future__ import reindexing

window = 252
forecasted_variance = returns_by_industry.reset_index().sort_values(['ticker', 'date_index']).set_index('date_index')
egarch_tickers = np.unique(forecasted_variance['ticker'])
forecasted_variance['ret'] = forecasted_variance['ret'] * 100
grouped = returns_by_industry.groupby('ticker')
forecasted_groups = []
for ticker in egarch_tickers:
    ticker_group = grouped.get_group(ticker)
    ticker_group['forecasted_variance'] = fit_egarch(ticker_group['ret']) / 100
    forecasted_groups.append(ticker_group)

forecasted_df = pd.DataFrame()
for group in forecasted_groups:
    forecasted_df = pd.concat([forecasted_df, group])

forecasted_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/

Unnamed: 0_level_0,ticker,price,ret,industry,forecasted_variance
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-03,AAME,2.500,0.000000,403010.0,0.131907
1990-01-04,AAME,2.500,0.000000,403010.0,0.117112
1990-01-05,AAME,2.375,-5.000000,403010.0,0.139173
1990-01-08,AAME,2.375,0.000000,403010.0,0.123470
1990-01-09,AAME,2.375,0.000000,403010.0,0.109723
...,...,...,...,...,...
2021-12-16,ZIXI,8.480,0.000000,451030.0,0.018855
2021-12-17,ZIXI,8.420,-0.707547,451030.0,0.019086
2021-12-20,ZIXI,8.460,0.475059,451030.0,0.018532
2021-12-21,ZIXI,8.490,0.354610,451030.0,0.017823


In [39]:
egarch_stdv = forecasted_df[['ticker', 'forecasted_variance']]
egarch_stdv['stdv'] = np.sqrt(egarch_stdv['forecasted_variance'])
egarch_stdv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,ticker,forecasted_variance,stdv
date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1990-01-03,AAME,0.131907,0.363190
1990-01-04,AAME,0.117112,0.342216
1990-01-05,AAME,0.139173,0.373059
1990-01-08,AAME,0.123470,0.351383
1990-01-09,AAME,0.109723,0.331245
...,...,...,...
2021-12-16,ZIXI,0.018855,0.137314
2021-12-17,ZIXI,0.019086,0.138151
2021-12-20,ZIXI,0.018532,0.136133
2021-12-21,ZIXI,0.017823,0.133504


In [42]:
egarch_with_stats = egarch_stdv
confidence_level = 0.95  # 95% confidence interval
egarch_with_stats['var_95'] = norm.ppf(1 - confidence_level) * egarch_with_stats['stdv']
egarch_with_stats['cvar_95'] = (1 - confidence_level)**-1 * norm.pdf(norm.ppf(1 - confidence_level)) * egarch_with_stats['stdv']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [43]:
statistics_data_dir = get_data_dir() / 'statistical'
egarch_factor = egarch_with_stats
egarch_factor.to_csv(statistics_data_dir / 'egarch_variance.csv')