In [317]:
import pandas as pd
import numpy as np
import os, sys

# Now you can import the library as usual
import investos as inv
from scipy.stats import ttest_ind
from sklearn.linear_model import LinearRegression
from datetime import datetime
from scipy.stats.mstats import winsorize
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm

In [631]:
# Step 1: loading the dataframe and join them together
# loading
df_loadings = pd.read_parquet('john_rbics_industry.parquet')
df_actual_returns = pd.read_parquet('john_return_prev_1d_open.parquet').rename(columns={"value": "return_1d"})
df_sizes = pd.read_parquet('john_market_cap_open_dil.parquet').rename(columns={"value": "size"})
# log and then winsorize
df_sizes['size'] = np.log(df_sizes['size'])
df_values = pd.read_parquet('john_market_cap_open_dil_to_net_income_ltm.parquet').rename(columns={"value": "value"})
df_loadings = pd.get_dummies(df_loadings, columns=["industry"])

# merge these dataframe into one
df_values = df_values.merge(df_sizes, how='left', on=['id','datetime'])
df_actual_returns = df_actual_returns.merge(df_values, how='left', on=['id','datetime'])
df_loadings = pd.merge(df_actual_returns, df_loadings, how='left', on=None, left_on='id', right_on='id', left_index=False)

for col in df_loadings.columns:
    if col.startswith("industry_"):
        df_loadings[col] = df_loadings[col].astype(int)

In [632]:
df_loadings[(df_loadings.id == 'XR7GZL-R') & (df_loadings.datetime == '2024-01-31')]

Unnamed: 0,id,datetime,return_1d,value,size,industry_Business Services,industry_Consumer Cyclicals,industry_Consumer Non-Cyclicals,industry_Consumer Services,industry_Energy,industry_Finance,industry_Healthcare,industry_Industrials,industry_Non-Corporate,industry_Non-Energy Materials,industry_Other,industry_Technology,industry_Telecommunications,industry_Utilities
24411632,XR7GZL-R,2024-01-31,0.013623,20.462871,7.755486,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [633]:
# Step 2: Calculate momentum (e.g., 12-month cumulative return)

# sliding window size for volatiltiy and momentum 
#look_back_period_volatility = 252  # Approx. 12 months for daily data
look_back_period_momentum = 30 
#df_loadings['momentum'] = df_loadings.groupby('id')['return_1d'].apply(lambda x: x.rolling(window=look_back_period_momentum).apply(lambda y: np.prod(1 + y) - 1))
#df_loadings['volatility'] = df_loadings.groupby('id')['return_1d'].apply(lambda x: x.rolling(window=look_back_period_volatility).std()).reset_index(level = 'id').rename(columns={"return_1d": "volatility"}).volatility
df_loadings['momentum'] = df_loadings.groupby('id')['return_1d'].rolling(window=look_back_period_momentum).apply(lambda x: np.prod(1 + x) - 1, raw = True).reset_index(level = 'id').rename(columns={"return_1d": "momentum"}).momentum
# drop those nan values less than then window size
df_loadings['momentum'] = df_loadings['momentum'].shift(periods = 1)
df_loadings['country'] = 1

df_loadings.dropna(inplace = True)
df_loadings.replace([np.inf, -np.inf], 0, inplace=True)

In [634]:
df_loadings[(df_loadings.id == 'XR7GZL-R') & (df_loadings.datetime == '2024-01-31')]

Unnamed: 0,id,datetime,return_1d,value,size,industry_Business Services,industry_Consumer Cyclicals,industry_Consumer Non-Cyclicals,industry_Consumer Services,industry_Energy,...,industry_Healthcare,industry_Industrials,industry_Non-Corporate,industry_Non-Energy Materials,industry_Other,industry_Technology,industry_Telecommunications,industry_Utilities,momentum,country
24411632,XR7GZL-R,2024-01-31,0.013623,20.462871,7.755486,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.09365,1


In [635]:
# Step 3. Winsorize and standardize

# Identify numeric columns excluding 'id' and 'datetime'
startDate = '2020-01-01'
df_loadings = df_loadings[df_loadings.datetime >= startDate]
cols = [col for col in df_loadings if col not in ['datetime','id','country','return_1d','momentum','period'] and not col.startswith('industry_')]

# Function to winsorize and standardize within each group
def process_group(group):
    for col in cols:
        # Winsorizing numeric columns at 5th and 95th percentiles within the group
        group[col] = winsorize(group[col], limits=[0.05, 0.05])
    
    # Ensure no infinite values are left
    group[cols] = np.nan_to_num(group[cols], nan=0.0, posinf=0.0, neginf=0.0)
    
    # Standardizing numeric columns within the group
    scaler = StandardScaler()
    group[cols] = scaler.fit_transform(group[cols])
    
    return group

# Apply the processing function to each group defined by 'datetime'
df_loadings = df_loadings.groupby('datetime').apply(process_group, include_groups=False)
df_loadings = df_loadings.reset_index().drop(columns=['level_1'])

In [636]:
df_loadings[(df_loadings.id == 'XR7GZL-R') & (df_loadings.datetime == '2024-01-31')]

Unnamed: 0,datetime,id,return_1d,value,size,industry_Business Services,industry_Consumer Cyclicals,industry_Consumer Non-Cyclicals,industry_Consumer Services,industry_Energy,...,industry_Healthcare,industry_Industrials,industry_Non-Corporate,industry_Non-Energy Materials,industry_Other,industry_Technology,industry_Telecommunications,industry_Utilities,momentum,country
3383986,2024-01-31,XR7GZL-R,0.013623,0.281468,-0.007366,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.09365,1


In [637]:
#df_loadings

In [638]:
# identify factor effectiveness
# Step 1: calculate t statistics 
def get_t_statistics(model,X,y):
    predictions = model.predict(X)     
    residuals = y - predictions
    # Calculate the residual standard error
    rss = (residuals ** 2).sum()

    n = len(y)      # number of sample
    p = X.shape[1]  # number of factors
    rmse = (rss/n)**0.5
    rse = (rss / (n - p - 1)) ** 0.5
    # Calculate t-values for coefficients
    t_values = model.coef_ / (rse/n**0.5)

    return t_values, residuals

In [639]:
# cols = ['industry_Business Services', 'industry_Consumer Cyclicals', 'industry_Consumer Non-Cyclicals', 'industry_Consumer Services', 'industry_Energy', 'industry_Finance', 'industry_Healthcare', 'industry_Industrials', 'industry_Non-Corporate', 'industry_Non-Energy Materials', 'industry_Other', 'industry_Technology', 'industry_Telecommunications', 'industry_Utilities']

# df_current[cols]

In [641]:

"""
1.
it comes out with an warning like this:
# /var/folders/sg/1nh38dx53_zc3kxcq1cq1f000000gn/T/ipykernel_29543/1917117034.py:11: RuntimeWarning: invalid value encountered in divide
#  t_values = model.coef_ / rse

this is caused by t_values = model.coef_ / rse while certain model.coef_ value is too low (about e-17 level). Overall the code is fine.
We can just mute this warning later

2. some results shows [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] 0.0 which model.coef_ is a zero list, and rse is zero 
as well
"""
cols = [col for col in df_loadings.columns if col.startswith("industry_")] + ['momentum', 'size', 'value']

factor_returns = {}
# change to freq 2Y to any period you want
df_loadings["period"] = df_loadings["datetime"].dt.to_period(freq = 'Y')

for d in df_loadings["datetime"].unique():
    # Isolating the data for the current date
    df_current = df_loadings[df_loadings["datetime"] == d]

    # outlier modification and factor standarization
    # identify factor effectiveness
    y = df_current["return_1d"]
    X = df_current[cols]

    #Performing linear regression
    model = LinearRegression(fit_intercept=False).fit(X, y)

    t_values, residuals = get_t_statistics(model,X,y)
    #model = sm.OLS(y, X).fit()
    #print(model.summary())
    # Print t-values for each factor
    # print("T-Values:")
    # for i, factor in enumerate(X.columns):
    #     print(f"{factor}: {t_values[i]}")
    
    # Storing the coefficients and intercept
    factor_returns[d] = {
        "coefficients": model.coef_,
        "intercept": model.intercept_,
        "feature_names": model.feature_names_in_,
        "r2": model.score(X, y),
         # t_values added to the factor_returns dataframe
        "t-values": abs(t_values),
        "residuals": residuals,
        "period":df_current.period.unique()
    }

  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)
  t_values = model.coef_ / (rse/n**0.5)


In [642]:
factor_returns

{Timestamp('2020-01-01 00:00:00'): {'coefficients': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  'intercept': 0.0,
  'feature_names': array(['industry_Business Services', 'industry_Consumer Cyclicals',
         'industry_Consumer Non-Cyclicals', 'industry_Consumer Services',
         'industry_Energy', 'industry_Finance', 'industry_Healthcare',
         'industry_Industrials', 'industry_Non-Corporate',
         'industry_Non-Energy Materials', 'industry_Other',
         'industry_Technology', 'industry_Telecommunications',
         'industry_Utilities', 'momentum', 'size', 'value'], dtype=object),
  'r2': 1.0,
  't-values': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan]),
  'residuals': 0       0.0
  1       0.0
  2       0.0
  3       0.0
  4       0.0
         ... 
  3039    0.0
  3040    0.0
  3041    0.0
  3042    0.0
  3043    0.0
  Name: return_1d, Length: 3044, dtype: float64,
  'period': <Pe

In [580]:
list_to_insert = [
    [k, factor_returns[k]["r2"], *factor_returns[k]["t-values"],*factor_returns[k]["coefficients"], factor_returns[k]["period"][0]]
    for k in factor_returns    # iterative through the factor returns dict
]
#print(list_to_insert)
#cols = df_loadings.drop(columns=["return_1d", "datetime", "id"]).columns
#cols = ['country','size','value','momentum']
#cols = factor_returns[0]['feature_names']
# update
cols_with_return = ['returns_' + col for col in cols]
cols_with_t_values = ['t_values_' + col for col in cols]

# df_factor_returns = df.append(pd.Series(list_to_insert, index=['date', 'r2', *cols]), ignore_index=True)  # using append
df_factor_returns = pd.DataFrame(list_to_insert, columns=["datetime", "r2", *cols_with_t_values, *cols_with_return, "period"])

# generate df_returns_residual df


In [660]:
residual_list = [factor_returns[k]["residuals"] for k in factor_returns]
pd.concat(residual_list)
df_residuals = df_loadings[["datetime","id"]].merge(pd.concat(residual_list), left_index = True, right_index = True).rename(columns = {"return_1d":"residuals"})

In [661]:
df_residuals

Unnamed: 0,datetime,id,residuals
0,2020-01-01,B00FG1-R,0.000000
1,2020-01-01,B01HWF-R,0.000000
2,2020-01-01,B04XY5-R,0.000000
3,2020-01-01,B06HD3-R,0.000000
4,2020-01-01,B0TXKG-R,0.000000
...,...,...,...
3383982,2024-01-31,XQF8XY-R,0.003404
3383983,2024-01-31,XQFML8-R,0.015875
3383984,2024-01-31,XQN3ZC-R,0.024537
3383985,2024-01-31,XR00QK-R,0.008522


In [663]:
for d in df_residuals["datetime"].unique():
    print(df_residuals[df_residuals["datetime"] == d]['residuals'].var())
    

0.0
0.0006653017687210244
0.0006981677642964903
0.0006611820550913612
0.0007757355050212405
0.0005418296086203554
0.0007031649753673212
0.0010254592578787217
0.0006983707939816298
0.0010296164067895846
0.0007937146928843099
0.001074318336425916
0.0006634279073346698
0.0
0.0007927294891001571
0.0008663402973727766
0.001115548015581239
0.0004845033233677611
0.0024912372049648674
0.00085348555320735
0.0009322672522507455
0.0008436013479148111
0.0008039930156901618
0.0006661581818018503
0.0007856469121024109
0.0007845300095898771
0.0012934525874673658
0.0013848103819029778
0.0006245619168411438
0.0009560450133161686
0.0007846405174928273
0.00076382688764746
0.0009634858867780253
0.0
0.0006781905896986496
0.0009996492971195746
0.001124896065839523
0.0011750710768371315
0.001168160585199069
0.0023285578739012836
0.001174804240787132
0.001816916812251063
0.0028180507506176435
0.0022693079448045045
0.0015001558140794232
0.0012473435511671945
0.0008909752109166529
0.002024644859161111
0.0039218

In [647]:
pd.DataFrame(pd.concat(residual_list),columns = ['residuals'])

Unnamed: 0,residuals


In [649]:
type(pd.concat(residual_list))

pandas.core.series.Series

In [581]:
df_factor_returns.columns

Index(['datetime', 'r2', 't_values_industry_Business Services',
       't_values_industry_Consumer Cyclicals',
       't_values_industry_Consumer Non-Cyclicals',
       't_values_industry_Consumer Services', 't_values_industry_Energy',
       't_values_industry_Finance', 't_values_industry_Healthcare',
       't_values_industry_Industrials', 't_values_industry_Non-Corporate',
       't_values_industry_Non-Energy Materials', 't_values_industry_Other',
       't_values_industry_Technology', 't_values_industry_Telecommunications',
       't_values_industry_Utilities', 't_values_momentum', 't_values_size',
       't_values_value', 'returns_industry_Business Services',
       'returns_industry_Consumer Cyclicals',
       'returns_industry_Consumer Non-Cyclicals',
       'returns_industry_Consumer Services', 'returns_industry_Energy',
       'returns_industry_Finance', 'returns_industry_Healthcare',
       'returns_industry_Industrials', 'returns_industry_Non-Corporate',
       'returns_indu

In [582]:
industry_cols = [col for col in df_factor_returns.columns if col.startswith('t_values_')] + ['r2']
df_factor_returns[industry_cols].mean(axis = 0)

t_values_industry_Business Services         1.772860e+01
t_values_industry_Consumer Cyclicals        2.420559e+01
t_values_industry_Consumer Non-Cyclicals    1.674066e+01
t_values_industry_Consumer Services         2.305906e+01
t_values_industry_Energy                    2.995068e+01
t_values_industry_Finance                   1.888085e+01
t_values_industry_Healthcare                2.228077e+01
t_values_industry_Industrials               2.155898e+01
t_values_industry_Non-Corporate             8.304093e-15
t_values_industry_Non-Energy Materials      2.112414e+01
t_values_industry_Other                     6.257786e-15
t_values_industry_Technology                2.289067e+01
t_values_industry_Telecommunications        1.747097e+01
t_values_industry_Utilities                 1.548236e+01
t_values_momentum                           2.646948e+01
t_values_size                               4.099401e+00
t_values_value                              2.305364e+00
r2                             

In [583]:
df_factor_returns.groupby('period').mean()

Unnamed: 0_level_0,datetime,r2,t_values_industry_Business Services,t_values_industry_Consumer Cyclicals,t_values_industry_Consumer Non-Cyclicals,t_values_industry_Consumer Services,t_values_industry_Energy,t_values_industry_Finance,t_values_industry_Healthcare,t_values_industry_Industrials,...,returns_industry_Industrials,returns_industry_Non-Corporate,returns_industry_Non-Energy Materials,returns_industry_Other,returns_industry_Technology,returns_industry_Telecommunications,returns_industry_Utilities,returns_momentum,returns_size,returns_value
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020,2020-07-01 16:45:48.091602944,0.10922,19.136968,24.005417,16.869357,25.431078,32.528274,21.482666,19.844433,21.604769,...,0.000655,6.095534e-19,0.001441,-6.926272e-20,0.001775,0.001013,-0.00015,-0.003966,5.2e-05,-0.000249
2021,2021-07-02 19:07:35.172413696,0.109318,16.898568,23.592889,15.091105,21.952353,31.412535,16.674006,21.551373,21.09639,...,0.000894,-1.232316e-18,0.001305,3.5724669999999995e-19,0.000864,0.00098,0.000422,-0.001525,0.000128,0.00015
2022,2022-07-02 12:00:00.000000000,0.126439,21.037716,30.214536,21.023334,28.620449,35.010632,20.26849,30.363273,25.673875,...,-0.001571,-2.1007150000000003e-17,0.000304,-1.598354e-17,-3.8e-05,-0.003112,0.005951,-0.095238,-0.002781,-0.002093
2023,2023-07-01 12:00:00.000000000,0.089313,14.247296,19.440667,14.265594,16.798187,21.778209,17.523644,17.565662,18.240132,...,0.001083,3.2580059999999996e-19,0.00062,-4.485365e-19,0.001284,8.6e-05,-0.00038,-0.001843,0.000349,-0.000217
2024,2024-01-15 15:39:07.826086912,0.149477,12.613654,18.873785,13.261683,15.824696,18.167513,13.589075,19.912635,16.885486,...,-0.001415,-4.85534e-19,-0.002719,-8.001883e-19,-0.000912,0.000141,-0.002878,-0.002635,0.001047,0.000176


In [584]:
df_factor_returns.head()

Unnamed: 0,datetime,r2,t_values_industry_Business Services,t_values_industry_Consumer Cyclicals,t_values_industry_Consumer Non-Cyclicals,t_values_industry_Consumer Services,t_values_industry_Energy,t_values_industry_Finance,t_values_industry_Healthcare,t_values_industry_Industrials,...,returns_industry_Non-Corporate,returns_industry_Non-Energy Materials,returns_industry_Other,returns_industry_Technology,returns_industry_Telecommunications,returns_industry_Utilities,returns_momentum,returns_size,returns_value,period
0,2020-01-01,1.0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020
1,2020-01-02,0.042633,22.353142,29.666148,34.181877,20.4483,45.52173,17.333699,36.477993,24.854178,...,-8.673616999999999e-19,0.012262,3.469447e-18,0.016917,0.010388,0.005673,0.001771,-0.001989,-0.001348,2020
2,2020-01-03,0.06873,20.262788,34.308947,46.077407,25.902691,8.163698,30.106112,49.567043,25.704554,...,-1.734723e-18,-0.013419,-3.469447e-18,-0.00578,-0.011253,-0.015129,-0.001534,0.000423,0.000551,2020
3,2020-01-06,0.013387,1.825378,8.536786,1.123627,6.119165,1.134268,4.886859,0.583701,10.976945,...,-1.301043e-18,-0.004573,-8.673616999999999e-19,-0.0016,-0.003968,0.004443,0.001861,-0.000513,0.000512,2020
4,2020-01-07,0.062333,15.41607,18.755486,9.685412,15.016929,20.279447,6.649213,35.428669,10.122473,...,5.2041700000000004e-18,-0.002166,-3.469447e-18,0.016135,0.007271,-0.001347,-0.001786,-0.002363,-0.001082,2020


#### 4.4 Calculate CoVar Factor Matrix

In [590]:
from typing import Sequence

import numpy as np
from matplotlib import pyplot as plt
from sklearn.neighbors import KernelDensity


def ewa(arr: Sequence, half_life: int | None = None) -> float | np.ndarray:
    """Exponential Weighted Average (EWA)

    Parameters
    ----------
    arr : Sequence
        Array of numbers or arrays (later one on axis=0 weights more)
    half_life : int | None, optional
        Steps it takes for weight to reduce to half of the original value, by default
        None, meaning that weights are all equal

    Returns
    -------
    float | np.ndarray
        Exponential weighted average of elements in `arr`
    """
    arr = np.array(arr)
    alpha = 1.0 if half_life is None else 0.5 ** (1 / half_life)
    weights = alpha ** np.arange(len(arr) - 1, -1, -1)
    w_shape = tuple([arr.shape[0]] + [1] * (len(arr.shape) - 1))
    weights = weights.reshape(w_shape)
    sum_weight = len(arr) - 1 if half_life is None else np.sum(weights)
    return (weights * arr).sum(axis=0) / sum_weight


def cov_ewa(data: np.ndarray, half_life: int | None = None, lag: int = 0) -> np.ndarray:
    """Calculate the covariance matrix as an exponential weighted average of range

    Parameters
    ----------
    data : np.ndarray
        Data matrix (K features * T periods)
    half_life : int | None, optional
        Argument in ewa(), by default None
    lag : int, optional
        Difference between to terms of fator, cov(t-lag, t), when lag is opposite, the
        result is transposed, by default 0

    Returns
    -------
    np.ndarray
        Covariance matrix
    """
    if not isinstance(data, np.ndarray):
        raise Exception("data matrix should be an ndarray or pd.DataFrame")
    if data.shape[0] > data.shape[1]:
        raise Exception("data matrix should not have less columns than rows")
    if lag >= data.shape[1]:
        raise Exception("lag must be smaller than the number of columns of matrix")
    data = data.astype("float64")
    f_bar = data.mean(axis=1)
    data = data - f_bar.reshape(data.shape[0], -1)
    t_range = range(lag, data.shape[1]) if lag > 0 else range(data.shape[1] + lag)
    #print(t_range)
    elements = np.array([np.outer(data[:, t - lag], data[:, t]) for t in t_range])
    return ewa(elements, half_life)

In [591]:
# import numpy as np
# from bias_statistics import BiasStatsCalculator
# from utils import cov_ewa


class FactorCovAdjuster:
    """Adjustments on factor covariance matrix"""

    def __init__(self, FRM: pd.DataFrame, window: int | None = None) -> None:
        """Initialization

        Parameters
        ----------
        FRM : np.ndarray
            Factor return matrix (T*K)
        """
        self.T, self.K = FRM.shape
        if self.K > self.T:
            raise Exception("number of periods must be larger than number of factors")
        self.FRM = FRM.astype("float64")

        if window and window > self.T:
            raise Exception("number of window must be larger than number of periods")
    
        if window:

            self.window = window
            self.first_level_index = FRM.index[self.window-1:]
        else:
            self.first_level_index = FRM.index
            self.window = 0
            
        self.FCM = None
        self.second_level_index = FRM.columns
        self.first_level_index = FRM.index[self.window-1:]

    def calc_fcm_raw(self, half_life: int | None = None) -> pd.DataFrame:
        """Calculate the factor covariance matrix, FCM (K*K)

        Parameters
        ----------
        half_life : int
            Steps it takes for weight in EWA to reduce to half of the original value

        Returns
        -------
        np.ndarray
            FCM, denoted by `F_Raw`
        """
        cov_matrices = []
        if not self.window:
            cov_matrices.append(cov_ewa(self.FRM.T.to_numpy(), half_life))
        else:
            for i in range(len(self.FRM)-window+1):
                cov_matrices.append(cov_ewa(self.FRM.iloc[i:i+window,:].T.to_numpy(), half_life))
                #print(cov_ewa(test.iloc[i:i+window,:].T.to_numpy()))
        cov_matrices = np.array(cov_matrices)
        #print(cov_matrices)
        #self.FCM = cov_ewa(self.FRM, half_life).astype("float64")
        multi_index = pd.MultiIndex.from_product([self.first_level_index, self.second_level_index], names=['datetime', 'factor'])
        #print(multi_index)
        return pd.DataFrame(cov_matrices.reshape((-1, cov_matrices.shape[-1])), index = multi_index, columns = self.second_level_index)


    def newey_west_adjust(self,
        FRM: np.ndarray, half_life: int, max_lags: int, multiplier: int
    ) -> np.ndarray:
        """Apply Newey-West adjustment on `F_Raw`

        Parameters
        ----------
        half_life : int
            Steps it takes for weight in EWA to reduce to half of the original value
        max_lags : int
            Maximum Newey-West correlation lags
        multiplier : int
            Number of periods a FCM with new frequence contains

        Returns
        -------
        np.ndarray
            Newey-West adjusted FCM, denoted by `F_NW`
        """
        FCM = 0
        for D in range(1, max_lags + 1):
            C_pos_delta = cov_ewa(FRM, half_life, D)
            FCM += (1 - D / (1 + max_lags)) * (C_pos_delta + C_pos_delta.T)
        D, U = np.linalg.eigh(FCM * multiplier)
        D[D <= 0] = 1e-14  # fix numerical error
        self.FCM = U.dot(np.diag(D)).dot(U.T)
        D, U = np.linalg.eigh(FCM)
        return FCM

    def calc_newey_west_frm(self, max_lags:int, multiplier: int, half_life: int | None = None) -> pd.DataFrame:

        cov_matrices = []
        if not self.window:
            cov_matrices.append(self.newey_west_adjust(self.FRM.T.to_numpy(), 
                                                  half_life = half_life, 
                                                  max_lags = max_lags, 
                                                  multiplier = multiplier))
        else:
            for i in range(len(self.FRM)-window+1):
                cov_matrices.append(self.newey_west_adjust(self.FRM.iloc[i:i+window,:].T.to_numpy(), 
                                            half_life = half_life,
                                            max_lags = max_lags, 
                                            multiplier = multiplier))

        cov_matrices = np.array(cov_matrices)
        #print(cov_matrices)
        #self.FCM = cov_ewa(self.FRM, half_life).astype("float64")
        multi_index = pd.MultiIndex.from_product([self.first_level_index, self.second_level_index], names=['datetime', 'factor'])
        #print(multi_index)
        return pd.DataFrame(cov_matrices.reshape((-1, cov_matrices.shape[-1])), index = multi_index, columns = self.second_level_index)



In [592]:
test.shape

(1066, 17)

In [593]:
factorcovadjuster = FactorCovAdjuster(test)

In [594]:
type(factorcovadjuster.FRM.T.to_numpy())

numpy.ndarray

In [595]:
factorcovadjuster.calc_fcm_raw()

Unnamed: 0_level_0,Unnamed: 1_level_0,returns_industry_Business Services,returns_industry_Consumer Cyclicals,returns_industry_Consumer Non-Cyclicals,returns_industry_Consumer Services,returns_industry_Energy,returns_industry_Finance,returns_industry_Healthcare,returns_industry_Industrials,returns_industry_Non-Corporate,returns_industry_Non-Energy Materials,returns_industry_Other,returns_industry_Technology,returns_industry_Telecommunications,returns_industry_Utilities,returns_momentum,returns_size,returns_value
datetime,factor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-31,returns_industry_Business Services,0.0002076908,0.0002585162,-0.0002642917,0.0002562457,0.0002706373,0.0002182466,0.0001844511,0.0002316459,1.4014309999999999e-19,0.0002065279,1.0634959999999999e-19,0.0001948145,0.0001664834,9.790059e-05,0.000590425,8.30809e-06,1.771054e-06
2024-01-31,returns_industry_Consumer Cyclicals,0.0002585162,0.0006957879,-0.008832709,0.0006648779,0.001203431,0.0004447882,0.0002886553,0.0004254043,2.964223e-18,0.000181296,2.2211920000000002e-18,3.874619e-05,0.0004599082,-0.0007453745,0.01343653,0.0004664041,0.0002914525
2024-01-31,returns_industry_Consumer Non-Cyclicals,-0.0002642917,-0.008832709,0.2487967,-0.009042535,-0.02495463,-0.004789607,-0.001079296,-0.003268914,-8.137948000000001e-17,0.002901155,-6.103660000000001e-17,0.006825772,-0.007137549,0.02515573,-0.3716309,-0.01334037,-0.008568731
2024-01-31,returns_industry_Consumer Services,0.0002562457,0.0006648779,-0.009042535,0.0007137778,0.001238296,0.0004485555,0.0002807854,0.0004192537,3.034635e-18,0.0001651107,2.270437e-18,2.017959e-05,0.0004644822,-0.0007728134,0.01372188,0.0004771054,0.0002983321
2024-01-31,returns_industry_Energy,0.0002706373,0.001203431,-0.02495463,0.001238296,0.003138139,0.0007745602,0.0003381396,0.0006455882,8.240796e-18,3.643558e-05,6.184981e-18,-0.0004500137,0.0009287246,-0.002366189,0.03754261,0.001320383,0.0008477575
2024-01-31,returns_industry_Finance,0.0002182466,0.0004447882,-0.004789607,0.0004485555,0.0007745602,0.0003532505,0.000212038,0.0003162532,1.625127e-18,0.0001830299,1.223154e-18,7.272412e-05,0.0003160438,-0.0003363825,0.007368434,0.0002483678,0.000157412
2024-01-31,returns_industry_Healthcare,0.0001844511,0.0002886553,-0.001079296,0.0002807854,0.0003381396,0.000212038,0.0002948365,0.0002388102,4.1259239999999995e-19,0.0001948134,3.068079e-19,0.000220335,0.0001958148,1.503678e-05,0.001775873,5.121342e-05,2.274046e-05
2024-01-31,returns_industry_Industrials,0.0002316459,0.0004254043,-0.003268914,0.0004192537,0.0006455882,0.0003162532,0.0002388102,0.0003348825,1.135148e-18,0.0002278359,8.535810999999999e-19,0.0001561428,0.0002826533,-0.0001868562,0.00511482,0.0001671632,0.0001019982
2024-01-31,returns_industry_Non-Corporate,1.4014309999999999e-19,2.964223e-18,-8.137948000000001e-17,3.034635e-18,8.240796e-18,1.625127e-18,4.1259239999999995e-19,1.135148e-18,2.66964e-32,-8.852342e-19,1.9975180000000002e-32,-2.173954e-18,2.390091e-18,-8.191932e-18,1.216096e-16,4.361107e-18,2.8003e-18
2024-01-31,returns_industry_Non-Energy Materials,0.0002065279,0.000181296,0.002901155,0.0001651107,3.643558e-05,0.0001830299,0.0001948134,0.0002278359,-8.852342e-19,0.0003199502,-6.606526999999999e-19,0.0003029681,9.877003e-05,0.0004354817,-0.004100243,-0.000163028,-0.0001094085


In [596]:
factorcovadjuster.calc_newey_west_frm( max_lags = 2, multiplier = 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,returns_industry_Business Services,returns_industry_Consumer Cyclicals,returns_industry_Consumer Non-Cyclicals,returns_industry_Consumer Services,returns_industry_Energy,returns_industry_Finance,returns_industry_Healthcare,returns_industry_Industrials,returns_industry_Non-Corporate,returns_industry_Non-Energy Materials,returns_industry_Other,returns_industry_Technology,returns_industry_Telecommunications,returns_industry_Utilities,returns_momentum,returns_size,returns_value
datetime,factor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-01-31,returns_industry_Business Services,3.636988e-05,4.011302e-05,-4.255118e-05,4.588188e-05,4.90496e-05,4.219274e-05,1.812925e-05,3.863008e-05,2.61517e-20,3.268625e-05,1.9148869999999998e-20,2.214432e-05,3.174126e-05,2.235236e-05,0.0001114545,-7.094422e-06,-1.655034e-07
2024-01-31,returns_industry_Consumer Cyclicals,4.011302e-05,4.86022e-05,-5.115743e-05,5.565745e-05,3.69789e-05,4.258653e-05,1.66809e-05,4.047293e-05,3.352919e-20,3.304021e-05,2.900477e-20,2.74797e-05,3.166747e-05,2.648308e-05,0.0001099419,-1.030276e-05,5.144448e-07
2024-01-31,returns_industry_Consumer Non-Cyclicals,-4.255118e-05,-5.115743e-05,-0.0006177953,-3.111829e-05,0.000163898,9.493616e-05,6.540663e-05,-5.457631e-05,2.0041729999999998e-19,9.213887e-06,7.426470000000001e-22,-0.0001257352,0.0001109258,-0.0001318966,0.001007696,6.718418e-05,1.50814e-05
2024-01-31,returns_industry_Consumer Services,4.588188e-05,5.565745e-05,-3.111829e-05,7.152806e-05,5.163544e-05,4.858083e-05,2.130695e-05,4.922579e-05,2.81818e-20,4.00162e-05,2.413498e-20,3.425988e-05,3.537914e-05,2.619708e-05,6.999309e-05,-1.216569e-05,-1.250887e-06
2024-01-31,returns_industry_Energy,4.90496e-05,3.69789e-05,0.000163898,5.163544e-05,4.712048e-05,3.736638e-05,-1.135242e-05,4.766246e-05,-3.346105e-20,4.469486e-05,-1.2798279999999999e-20,3.44003e-05,3.291673e-05,5.315374e-05,-0.0001957256,-2.476254e-05,-4.890009e-06
2024-01-31,returns_industry_Finance,4.219274e-05,4.258653e-05,9.493616e-05,4.858083e-05,3.736638e-05,4.205509e-05,1.353359e-05,4.259822e-05,-1.5604969999999998e-20,3.738672e-05,-1.1618469999999999e-20,2.938422e-05,2.880691e-05,4.076588e-05,-9.497555e-05,-1.509377e-05,-4.497453e-06
2024-01-31,returns_industry_Healthcare,1.812925e-05,1.66809e-05,6.540663e-05,2.130695e-05,-1.135242e-05,1.353359e-05,1.100888e-05,1.484991e-05,-7.830333e-21,1.483637e-05,-1.237655e-20,1.881613e-05,1.543704e-05,2.664331e-05,-8.782564e-05,-1.166464e-05,-4.921064e-06
2024-01-31,returns_industry_Industrials,3.863008e-05,4.047293e-05,-5.457631e-05,4.922579e-05,4.766246e-05,4.259822e-05,1.484991e-05,4.012337e-05,3.246138e-20,3.406014e-05,2.328631e-20,2.374545e-05,3.228654e-05,2.224081e-05,0.000111645,-8.621667e-06,2.968327e-07
2024-01-31,returns_industry_Non-Corporate,2.61517e-20,3.352919e-20,2.0041729999999998e-19,2.81818e-20,-3.346105e-20,-1.5604969999999998e-20,-7.830333e-21,3.246138e-20,-6.114545e-35,1.1746969999999999e-20,7.0371269999999995e-37,5.305963e-20,-2.141978e-20,5.422944e-20,-3.025169e-19,-2.5544709999999998e-20,-5.477206e-21
2024-01-31,returns_industry_Non-Energy Materials,3.268625e-05,3.304021e-05,9.213887e-06,4.00162e-05,4.469486e-05,3.738672e-05,1.483637e-05,3.406014e-05,1.1746969999999999e-20,3.415627e-05,5.142782e-21,2.142635e-05,3.248236e-05,2.598603e-05,2.898281e-05,-1.002219e-05,-1.796966e-06


In [597]:
type(factorcovadjuster.FRM)

pandas.core.frame.DataFrame

In [599]:
#.equals(test.cov().dropna())

In [600]:
test.cov().dropna()

Unnamed: 0,returns_industry_Business Services,returns_industry_Consumer Cyclicals,returns_industry_Consumer Non-Cyclicals,returns_industry_Consumer Services,returns_industry_Energy,returns_industry_Finance,returns_industry_Healthcare,returns_industry_Industrials,returns_industry_Non-Corporate,returns_industry_Non-Energy Materials,returns_industry_Other,returns_industry_Technology,returns_industry_Telecommunications,returns_industry_Utilities,returns_momentum,returns_size,returns_value
returns_industry_Business Services,0.0002076908,0.0002585162,-0.0002642917,0.0002562457,0.0002706373,0.0002182466,0.0001844511,0.0002316459,1.4014309999999999e-19,0.0002065279,1.0634959999999999e-19,0.0001948145,0.0001664834,9.790059e-05,0.000590425,8.30809e-06,1.771054e-06
returns_industry_Consumer Cyclicals,0.0002585162,0.0006957879,-0.008832709,0.0006648779,0.001203431,0.0004447882,0.0002886553,0.0004254043,2.964223e-18,0.000181296,2.2211920000000002e-18,3.874619e-05,0.0004599082,-0.0007453745,0.01343653,0.0004664041,0.0002914525
returns_industry_Consumer Non-Cyclicals,-0.0002642917,-0.008832709,0.2487967,-0.009042535,-0.02495463,-0.004789607,-0.001079296,-0.003268914,-8.137948000000001e-17,0.002901155,-6.103660000000001e-17,0.006825772,-0.007137549,0.02515573,-0.3716309,-0.01334037,-0.008568731
returns_industry_Consumer Services,0.0002562457,0.0006648779,-0.009042535,0.0007137778,0.001238296,0.0004485555,0.0002807854,0.0004192537,3.034635e-18,0.0001651107,2.270437e-18,2.017959e-05,0.0004644822,-0.0007728134,0.01372188,0.0004771054,0.0002983321
returns_industry_Energy,0.0002706373,0.001203431,-0.02495463,0.001238296,0.003138139,0.0007745602,0.0003381396,0.0006455882,8.240796e-18,3.643558e-05,6.184981e-18,-0.0004500137,0.0009287246,-0.002366189,0.03754261,0.001320383,0.0008477575
returns_industry_Finance,0.0002182466,0.0004447882,-0.004789607,0.0004485555,0.0007745602,0.0003532505,0.000212038,0.0003162532,1.625127e-18,0.0001830299,1.223154e-18,7.272412e-05,0.0003160438,-0.0003363825,0.007368434,0.0002483678,0.000157412
returns_industry_Healthcare,0.0001844511,0.0002886553,-0.001079296,0.0002807854,0.0003381396,0.000212038,0.0002948365,0.0002388102,4.1259239999999995e-19,0.0001948134,3.068079e-19,0.000220335,0.0001958148,1.503678e-05,0.001775873,5.121342e-05,2.274046e-05
returns_industry_Industrials,0.0002316459,0.0004254043,-0.003268914,0.0004192537,0.0006455882,0.0003162532,0.0002388102,0.0003348825,1.135148e-18,0.0002278359,8.535810999999999e-19,0.0001561428,0.0002826533,-0.0001868562,0.00511482,0.0001671632,0.0001019982
returns_industry_Non-Corporate,1.4014309999999999e-19,2.964223e-18,-8.137948000000001e-17,3.034635e-18,8.240796e-18,1.625127e-18,4.1259239999999995e-19,1.135148e-18,2.66964e-32,-8.852342e-19,1.9975180000000002e-32,-2.173954e-18,2.390091e-18,-8.191932e-18,1.216096e-16,4.361107e-18,2.8003e-18
returns_industry_Non-Energy Materials,0.0002065279,0.000181296,0.002901155,0.0001651107,3.643558e-05,0.0001830299,0.0001948134,0.0002278359,-8.852342e-19,0.0003199502,-6.606526999999999e-19,0.0003029681,9.877003e-05,0.0004354817,-0.004100243,-0.000163028,-0.0001094085


In [442]:
window = 252
second_level_index = test.columns
first_level_index = test.index[window-1:]


In [447]:
cov_matries = []
for i in range(len(test)-window+1):
    cov_matries.append(cov_ewa(test.iloc[i:i+window,:].T.to_numpy()))
    #print(cov_ewa(test.iloc[i:i+window,:].T.to_numpy()))

In [456]:
arr = np.array(cov_matries)

In [449]:
len(cov_matries)

815

In [452]:
#index = pd.MultiIndex.from_tuples((first_level_index, second_level_index), names=["datetime", "factors"])

In [None]:
# Reshape the 3D array into a 2D array with MultiIndex
reshaped_array = data_array_3d.reshape((-1, data_array_3d.shape[-1]))

# Create a MultiIndex for rows
multi_index = pd.MultiIndex.from_product([row_labels_level_1, row_labels_level_2], names=['Level_1', 'Level_2'])

# Convert the reshaped array to a DataFrame with MultiIndex
df_multiindex = pd.DataFrame(reshaped_array, index=multi_index, columns=column_labels)


In [454]:
multi_index = pd.MultiIndex.from_product([first_level_index, second_level_index], names=['datetime', 'factor'])


In [459]:
pd.DataFrame(arr.reshape((-1, arr.shape[-1])), index = multi_index, columns = second_level_index)

Unnamed: 0_level_0,Unnamed: 1_level_0,returns_industry_Business Services,returns_industry_Consumer Cyclicals,returns_industry_Consumer Non-Cyclicals,returns_industry_Consumer Services,returns_industry_Energy,returns_industry_Finance,returns_industry_Healthcare,returns_industry_Industrials,returns_industry_Non-Corporate,returns_industry_Non-Energy Materials,returns_industry_Other,returns_industry_Technology,returns_industry_Telecommunications,returns_industry_Utilities,returns_momentum,returns_size,returns_value
datetime,factor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-12-17,returns_industry_Business Services,0.000410,0.000450,0.000329,0.000483,0.000516,0.000445,0.000285,0.000427,-7.716348e-21,0.000405,4.804385e-21,3.050502e-04,0.000298,0.000327,3.832317e-05,-0.000029,-1.702681e-05
2020-12-17,returns_industry_Consumer Cyclicals,0.000450,0.000621,0.000413,0.000618,0.000600,0.000521,0.000359,0.000523,-1.858336e-20,0.000497,2.739166e-21,3.937769e-04,0.000342,0.000372,-4.628389e-05,-0.000023,-2.528456e-05
2020-12-17,returns_industry_Consumer Non-Cyclicals,0.000329,0.000413,0.000321,0.000432,0.000455,0.000377,0.000278,0.000378,-2.122376e-21,0.000373,2.905569e-21,2.931848e-04,0.000274,0.000295,2.802972e-05,-0.000018,-1.710766e-05
2020-12-17,returns_industry_Consumer Services,0.000483,0.000618,0.000432,0.000767,0.000686,0.000561,0.000363,0.000561,-5.785942e-21,0.000516,-1.323471e-22,4.015017e-04,0.000375,0.000372,-8.708851e-05,-0.000030,-2.860872e-05
2020-12-17,returns_industry_Energy,0.000516,0.000600,0.000455,0.000686,0.001189,0.000609,0.000391,0.000607,6.232203e-21,0.000615,1.307819e-20,4.167474e-04,0.000414,0.000403,4.879846e-05,-0.000053,-3.538261e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-31,returns_industry_Telecommunications,0.000085,0.000114,0.000084,0.000096,0.000104,0.000114,0.000097,0.000108,5.216761e-21,0.000107,3.346920e-21,9.024264e-05,0.000146,0.000084,-1.575975e-05,-0.000017,-6.320616e-06
2024-01-31,returns_industry_Utilities,0.000074,0.000096,0.000080,0.000080,0.000085,0.000107,0.000096,0.000096,1.141249e-20,0.000095,3.073717e-21,7.754855e-05,0.000084,0.000135,-2.573476e-05,-0.000014,-6.017063e-06
2024-01-31,returns_momentum,-0.000016,-0.000015,-0.000022,-0.000011,-0.000020,-0.000032,-0.000032,-0.000020,7.868664e-21,-0.000015,1.314039e-21,-5.208151e-07,-0.000016,-0.000026,3.494440e-04,0.000001,9.027373e-08
2024-01-31,returns_size,-0.000014,-0.000022,-0.000017,-0.000016,-0.000018,-0.000020,-0.000022,-0.000021,-1.914590e-21,-0.000019,-8.302971e-22,-1.970250e-05,-0.000017,-0.000014,1.258967e-06,0.000009,2.174603e-06


In [557]:
#factorcovadjuster.FRM.rolling(window=window).cov()
#cov_ewa(test.T.to_numpy())

In [558]:
#factorcovadjuster.calc_fcm_raw(90)

In [310]:
# access data by one of the index
#print(df_factor_covar.xs('2020-12-17', level='datetime'))

#### 4.5 Calculate Idiosyncratic Returns variance matrix

In [601]:
df_loadings

Unnamed: 0,datetime,id,return_1d,value,size,industry_Business Services,industry_Consumer Cyclicals,industry_Consumer Non-Cyclicals,industry_Consumer Services,industry_Energy,...,industry_Industrials,industry_Non-Corporate,industry_Non-Energy Materials,industry_Other,industry_Technology,industry_Telecommunications,industry_Utilities,momentum,country,period
0,2020-01-01,B00FG1-R,0.000000,0.014547,-0.371694,0,0,0,0,1,...,0,0,0,0,0,0,0,-0.019683,1,2020
1,2020-01-01,B01HWF-R,0.000000,-0.442699,-1.358461,0,0,1,0,0,...,0,0,0,0,0,0,0,-0.151665,1,2020
2,2020-01-01,B04XY5-R,0.000000,-1.719625,-0.694700,0,0,0,0,0,...,0,0,1,0,0,0,0,0.031746,1,2020
3,2020-01-01,B06HD3-R,0.000000,-0.097482,-0.852544,0,0,0,0,0,...,0,0,0,0,0,0,0,0.050132,1,2020
4,2020-01-01,B0TXKG-R,0.000000,1.091488,0.566158,0,0,0,1,0,...,0,0,0,0,0,0,0,0.066271,1,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3383982,2024-01-31,XQF8XY-R,0.004628,0.042221,1.488157,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.022630,1,2024
3383983,2024-01-31,XQFML8-R,0.032251,-0.256071,0.135506,0,0,0,0,1,...,0,0,0,0,0,0,0,-0.040797,1,2024
3383984,2024-01-31,XQN3ZC-R,0.011494,-0.085134,-0.600385,0,0,0,0,0,...,0,0,0,0,0,0,0,0.025865,1,2024
3383985,2024-01-31,XR00QK-R,0.005945,0.613000,-0.403807,0,0,0,1,0,...,0,0,0,0,0,0,0,0.033170,1,2024


In [615]:
factor_cols = [
    col
    for col in df_loadings.columns if f"returns_{col}" in df_factor_returns.columns
    #if col in df_loadings.columns and col != "datetime"
]

# Merge the DataFrames on 'date'
df_idio = pd.merge(
    df_loadings, df_factor_returns, on="datetime", suffixes=("", "_factor_returns")
)

In [616]:
df_loadings.columns

Index(['datetime', 'id', 'return_1d', 'value', 'size',
       'industry_Business Services', 'industry_Consumer Cyclicals',
       'industry_Consumer Non-Cyclicals', 'industry_Consumer Services',
       'industry_Energy', 'industry_Finance', 'industry_Healthcare',
       'industry_Industrials', 'industry_Non-Corporate',
       'industry_Non-Energy Materials', 'industry_Other',
       'industry_Technology', 'industry_Telecommunications',
       'industry_Utilities', 'momentum', 'country', 'period'],
      dtype='object')

In [617]:
df_factor_returns.columns

Index(['datetime', 'r2', 't_values_industry_Business Services',
       't_values_industry_Consumer Cyclicals',
       't_values_industry_Consumer Non-Cyclicals',
       't_values_industry_Consumer Services', 't_values_industry_Energy',
       't_values_industry_Finance', 't_values_industry_Healthcare',
       't_values_industry_Industrials', 't_values_industry_Non-Corporate',
       't_values_industry_Non-Energy Materials', 't_values_industry_Other',
       't_values_industry_Technology', 't_values_industry_Telecommunications',
       't_values_industry_Utilities', 't_values_momentum', 't_values_size',
       't_values_value', 'returns_industry_Business Services',
       'returns_industry_Consumer Cyclicals',
       'returns_industry_Consumer Non-Cyclicals',
       'returns_industry_Consumer Services', 'returns_industry_Energy',
       'returns_industry_Finance', 'returns_industry_Healthcare',
       'returns_industry_Industrials', 'returns_industry_Non-Corporate',
       'returns_indu

In [618]:
df_idio.columns

Index(['datetime', 'id', 'return_1d', 'value', 'size',
       'industry_Business Services', 'industry_Consumer Cyclicals',
       'industry_Consumer Non-Cyclicals', 'industry_Consumer Services',
       'industry_Energy', 'industry_Finance', 'industry_Healthcare',
       'industry_Industrials', 'industry_Non-Corporate',
       'industry_Non-Energy Materials', 'industry_Other',
       'industry_Technology', 'industry_Telecommunications',
       'industry_Utilities', 'momentum', 'country', 'period', 'r2',
       't_values_industry_Business Services',
       't_values_industry_Consumer Cyclicals',
       't_values_industry_Consumer Non-Cyclicals',
       't_values_industry_Consumer Services', 't_values_industry_Energy',
       't_values_industry_Finance', 't_values_industry_Healthcare',
       't_values_industry_Industrials', 't_values_industry_Non-Corporate',
       't_values_industry_Non-Energy Materials', 't_values_industry_Other',
       't_values_industry_Technology', 't_values_indust

In [619]:
factor_cols

['value',
 'size',
 'industry_Business Services',
 'industry_Consumer Cyclicals',
 'industry_Consumer Non-Cyclicals',
 'industry_Consumer Services',
 'industry_Energy',
 'industry_Finance',
 'industry_Healthcare',
 'industry_Industrials',
 'industry_Non-Corporate',
 'industry_Non-Energy Materials',
 'industry_Other',
 'industry_Technology',
 'industry_Telecommunications',
 'industry_Utilities',
 'momentum']

In [623]:
# Multiplying matching columns
for col in factor_cols:
    df_idio[f"{col}_calc_f_r"] = df_idio[col] * df_idio[f"{col}_factor_returns"]

KeyError: 'value_factor_returns'

In [624]:
# # Dropping the extra columns
df_idio = df_idio.drop(columns=factor_cols)   # drop factor columns
df_idio = df_idio.drop(columns=[f"{col}_factor_returns" for col in factor_cols])   


KeyError: "['value_factor_returns', 'size_factor_returns', 'industry_Business Services_factor_returns', 'industry_Consumer Cyclicals_factor_returns', 'industry_Consumer Non-Cyclicals_factor_returns', 'industry_Consumer Services_factor_returns', 'industry_Energy_factor_returns', 'industry_Finance_factor_returns', 'industry_Healthcare_factor_returns', 'industry_Industrials_factor_returns', 'industry_Non-Corporate_factor_returns', 'industry_Non-Energy Materials_factor_returns', 'industry_Other_factor_returns', 'industry_Technology_factor_returns', 'industry_Telecommunications_factor_returns', 'industry_Utilities_factor_returns', 'momentum_factor_returns'] not found in axis"

In [625]:
df_idio.columns

Index(['datetime', 'id', 'return_1d', 'country', 'period', 'r2',
       't_values_industry_Business Services',
       't_values_industry_Consumer Cyclicals',
       't_values_industry_Consumer Non-Cyclicals',
       't_values_industry_Consumer Services', 't_values_industry_Energy',
       't_values_industry_Finance', 't_values_industry_Healthcare',
       't_values_industry_Industrials', 't_values_industry_Non-Corporate',
       't_values_industry_Non-Energy Materials', 't_values_industry_Other',
       't_values_industry_Technology', 't_values_industry_Telecommunications',
       't_values_industry_Utilities', 't_values_momentum', 't_values_size',
       't_values_value', 'returns_industry_Business Services',
       'returns_industry_Consumer Cyclicals',
       'returns_industry_Consumer Non-Cyclicals',
       'returns_industry_Consumer Services', 'returns_industry_Energy',
       'returns_industry_Finance', 'returns_industry_Healthcare',
       'returns_industry_Industrials', 'return

In [626]:
#### 4.3 Calc idiosyncratic risk

# factor_cols = [
#     col
#     for col in df_factor_returns.columns
#     if col in df_loadings.columns and col != "datetime"
# ]

# # Merge the DataFrames on 'date'
# df_idio = pd.merge(
#     df_loadings, df_factor_returns, on="datetime", suffixes=("", "_factor_returns")
# )

# # Multiplying matching columns
# for col in factor_cols:
#     df_idio[f"{col}_calc_f_r"] = df_idio[col] * df_idio[f"{col}_factor_returns"]

# # # Dropping the extra columns
# df_idio = df_idio.drop(columns=factor_cols)   # drop factor columns
# df_idio = df_idio.drop(columns=[f"{col}_factor_returns" for col in factor_cols])   

# Summing the specified columns
df_idio["factor_return_1d"] = df_idio[[f"{col}_calc_f_r" for col in factor_cols]].sum(
    axis=1
)
df_idio["factor_return_1d_error"] = df_idio["factor_return_1d"] - df_idio["return_1d"]

df_idio = df_idio[
    ["datetime", "id", "return_1d", "factor_return_1d", "factor_return_1d_error"]
]

In [627]:
df_idio

Unnamed: 0,datetime,id,return_1d,factor_return_1d,factor_return_1d_error
0,2020-01-01,B00FG1-R,0.000000,0.000000,0.000000
1,2020-01-01,B01HWF-R,0.000000,0.000000,0.000000
2,2020-01-01,B04XY5-R,0.000000,0.000000,0.000000
3,2020-01-01,B06HD3-R,0.000000,0.000000,0.000000
4,2020-01-01,B0TXKG-R,0.000000,0.000000,0.000000
...,...,...,...,...,...
3383982,2024-01-31,XQF8XY-R,0.004628,0.001224,-0.003404
3383983,2024-01-31,XQFML8-R,0.032251,0.016376,-0.015875
3383984,2024-01-31,XQN3ZC-R,0.011494,-0.013042,-0.024537
3383985,2024-01-31,XR00QK-R,0.005945,-0.002576,-0.008522


In [None]:
# Calculate idiosync_return_var from return error
df_idio["idio_risk_252d"] = (
    df_idio.sort_values(by=["id", "datetime"])
    .groupby("id")["factor_return_1d_error"]
    .rolling(window=252)
    .var()
    .reset_index(level=0, drop=True)
)
df_idio = df_idio.set_index("datetime").pivot(columns="id", values="idio_risk_252d")


In [629]:
# Calculate idiosync_return_var from return error
df_idio.loc[:,"idio_risk_252d"] = (
    df_idio.sort_values(by=["id", "datetime"])
    .groupby("id")["factor_return_1d_error"]
    .rolling(window=252)
    .var()
    .reset_index(level=0, drop=True)
)
df_idio = df_idio.set_index("datetime").pivot(columns="id", values="idio_risk_252d")


KeyError: 'id'

In [630]:
df_idio

id,B00FG1-R,B01HWF-R,B04XY5-R,B06HD3-R,B0TXKG-R,B0VZ2Z-R,B0ZCK4-R,B12BZP-R,B14762-R,B15GK7-R,...,XQC8FD-R,XQCN4R-R,XQCWLZ-R,XQCXP0-R,XQF8XY-R,XQFML8-R,XQGHLF-R,XQN3ZC-R,XR00QK-R,XR7GZL-R
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,,,,,,,,,,,...,,,,,,,,,,
2020-01-02,,,,,,,,,,,...,,,,,,,,,,
2020-01-03,,,,,,,,,,,...,,,,,,,,,,
2020-01-06,,,,,,,,,,,...,,,,,,,,,,
2020-01-07,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-25,,0.001568,,0.000935,0.000272,0.000554,0.000272,,0.000287,0.000529,...,0.000295,0.000125,0.013638,0.000293,0.000086,0.000512,,0.000304,0.000287,0.000344
2024-01-26,,0.001574,,0.000937,0.000273,0.000553,0.000265,,0.000287,0.000535,...,0.000295,0.000125,0.013639,0.000293,0.000086,0.000522,,0.000307,0.000287,0.000343
2024-01-29,,0.001575,,0.000941,0.000273,0.000556,0.000267,,0.000288,0.000536,...,0.000295,0.000125,0.013638,0.000293,0.000086,0.000491,,0.000309,0.000287,0.000344
2024-01-30,,0.001570,,0.000941,0.000275,0.000557,0.000267,,0.000288,0.000535,...,0.000293,0.000125,0.013633,0.000302,0.000086,0.000486,,0.000309,0.000287,0.000344
