In [809]:
import numpy as np
import pandas as pd
from fredmd import FredMD
import sklearn.pipeline as skpipe
import sklearn.decomposition as skd
import sklearn.preprocessing as skp
import math
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
from sklearn.utils.extmath import randomized_svd

In [810]:
from datetime import datetime

# Data Loaders

In [811]:
class SPCAData:
    
    def __init__(self, Nfactor=None, vintage=None, maxfactor=8, standard_method=2, ic_method=2,
                 target=None, train_test_split=[('1960-01-01', '1984-12-01'),('1985-01-01', '2019-12-01')], 
                 nlags=1, drop_cols=["ACOGNO", "ANDENOx", "TWEXAFEGSMTHx", "UMCSENTx", "VXOCLSx"]) -> None:
        """
        Create fredmd object
        Auguments:
        1) Nfactor = None: Number of factors to estimate. If None then estimate number of true factors via information critea
        2) vintage = None: Vinatege of data to use in "year-month" format (e.g. "2020-10"). If None use current vintage
        3) maxfactor = 8: Maximimum number of factors to test against information critea. If Nfactor is a number, then this is ignored
        4) standard_method = 2: method to standardize data before factors are estimate. 0 = Identity transform, 1 = Demean only, 2 = Demean and stardize to unit variance. Default = 2.
        5) ic_method = 2: information critea penalty term. Se
        e http://www.columbia.edu/~sn2294/pub/ecta02.pdf page 201, equation 9 for options.
        """
        self.drop_cols = drop_cols
        self.train_test_split = [[datetime.strptime(x, '%Y-%m-%d') for x in split] for split in train_test_split]
        # Make sure arguments are valid
        if standard_method not in [0, 1, 2]:
            raise ValueError(f"standard_method must be in [0, 1, 2], got {standard_method}")
        if ic_method not in [1, 2, 3]:
            raise ValueError(f"ic_method must be in [1, 2, 3], got {ic_method}")
        # Download data
        self.rawseries, self.transforms, self.target, self.train_mask, self.test_mask = self.download_data(vintage, 
                                                                       target if target is not None else "UNRATE",
                                                                         self.train_test_split)
        

        self.target_name = target
        self.standard_method = standard_method
        self.ic_method = ic_method
        self.maxfactor = maxfactor
        self.Nfactor = Nfactor

        self.nlags = nlags

    @staticmethod
    def download_data(vintage, tgt, train_test_split):
        if vintage is None:
            url = 'https://s3.amazonaws.com/files.fred.stlouisfed.org/fred-md/monthly/current.csv'
        else:
            url = f'https://s3.amazonaws.com/files.fred.stlouisfed.org/fred-md/monthly/{vintage}.csv'
        transforms = pd.read_csv(
            url, header=0, nrows=1, index_col=0).transpose()
        transforms.index.rename("series", inplace=True)
        transforms.columns = ['transform']
        transforms = transforms.to_dict()['transform']
        data = pd.read_csv(url, names=transforms.keys(), skiprows=2, index_col=0,
                           skipfooter=1, engine='python', parse_dates=True, infer_datetime_format=True)
        
        train_mask = (data.index > train_test_split[0][0]) & (data.index <= train_test_split[0][1])
        test_mask = (data.index > train_test_split[1][0]) & (data.index <= train_test_split[1][1])
        
        target = None
        if "FB-yeild" in tgt:
            bond_data = pd.read_csv("data/bond_data.csv", engine='python', parse_dates=True, infer_datetime_format=True,
                       skiprows=range(1,45), index_col=4)
            if tgt == 'FB-yeild-1':
                bond_data = bond_data.loc[bond_data['TTERMTYPE'] == 5001]
            elif tgt == 'FB-yeild-2':
                bond_data = bond_data.loc[bond_data['TTERMTYPE'] == 5002]
            elif tgt == 'FB-yeild-3':
                bond_data = bond_data.loc[bond_data['TTERMTYPE'] == 5003]
            elif tgt == 'FB-yeild-4':
                bond_data = bond_data.loc[bond_data['TTERMTYPE'] == 5004]
            elif tgt == 'FB-yeild-5':
                bond_data = bond_data.loc[bond_data['TTERMTYPE'] == 5005]
            bond_data.index = bond_data.index.to_period('M') 
            intersection = bond_data.index.intersection(data.index.to_period('M'))
            target = bond_data[bond_data.index.isin(intersection)]["TMYTM"]
            
        return data, transforms, target, train_mask, test_mask

    @staticmethod
    def factor_standardizer_method(code):
        """
        Outputs the sklearn standard scaler object with the desired features
        codes:
        0) Identity transform
        1) Demean only
        2) Demean and standardized
        """
        if code == 0:
            return skp.StandardScaler(with_mean=False, with_std=False)
        elif code == 1:
            return skp.StandardScaler(with_mean=True, with_std=False)
        elif code == 2:
            return skp.StandardScaler(with_mean=True, with_std=True)
        else:
            raise ValueError("standard_method must be in [0, 1, 2]")

    @staticmethod
    def data_transforms(series, transform):
        """
        Transforms a single series according to its transformation code
        Inputs:
        1) series: pandas series to be transformed
        2) transfom: transform code for the series
        Returns:
        transformed series
        """
        if transform == 1:
            # level
            return series
        elif transform == 2:
            # 1st difference
            return series.diff()
        elif transform == 3:
            # second difference
            return series.diff().diff()
        elif transform == 4:
            # Natural log
            return np.log(series)
        elif transform == 5:
            # log 1st difference
            return np.log(series).diff()
        elif transform == 6:
            # log second difference
            return np.log(series).diff().diff()
        elif transform == 7:
            # First difference of percent change
            return series.pct_change().diff()
        else:
            raise ValueError("Transform must be in [1, 2, ..., 7]")

    def apply_transforms(self):
        """
        Apply the transformation to each series to make them stationary and drop the first 2 rows that are mostly NaNs
        Save results to self.series
        """
        self.series = pd.DataFrame({key: self.data_transforms(
            self.rawseries[key], value) for (key, value) in self.transforms.items()})

    def remove_outliers(self):
        """
        Removes outliers from each series in self.series
        Outlier definition: a data point x of a series X is considered an outlier if abs(x-median)>10*interquartile_range.
        """
        Z = abs((self.series - self.series.median()) /
                (self.series.quantile(0.75) - self.series.quantile(0.25))) > 10
        for col, _ in self.series.iteritems():
            self.series[col][Z[col]] = np.nan
        

    def get_data(self):
        """
        """
        # Define our estimation pipelines
        self.apply_transforms()
        self.remove_outliers()
        
        self.series = self.series.loc[self.series.index >= '1960-01-01']
        self.series = self.series.loc[self.series.index < '2020-01-01']
        
        pipe = skpipe.Pipeline([('Standardize', self.factor_standardizer_method(self.standard_method))])

        actual_data = self.series.to_numpy(copy=True)
        intial_nas = self.series.isna().to_numpy(copy=True)
        working_data = self.series.fillna(value=self.series.mean(), axis='index').to_numpy(copy=True)

        last_timestep = np.sum(self.train_mask) + np.sum(self.test_mask)     

        if self.target is None:
            idx = self.series.columns.get_loc(self.target_name)
            target_data = np.copy(working_data[:,idx])

        else:
            target_data = self.target
            target_data = target_data.loc[target_data.index >= '1960-01-01']
            target_data = target_data.loc[target_data.index < '2020-01-01'].to_numpy(copy=True)
        
        bad_cols = np.isin(self.series.columns.to_numpy(), self.drop_cols)
        
        return working_data[:-1,~bad_cols], target_data[1:], np.sum(self.train_mask), working_data[:last_timestep]
        


In [812]:
class ChenZData:
    
    def __init__(self, start_date='1976-03', end_date='2019-09', tgt_factor='mkt'):
        chen_z_port_data = pd.read_csv("data/allportretbase.csv", engine='python', 
                                       parse_dates=True, infer_datetime_format=True,
                               index_col=0)
        chen_z_port_data = chen_z_port_data.loc[chen_z_port_data.date >= start_date]
        chen_z_port_data = chen_z_port_data.loc[chen_z_port_data.date < end_date]

        chen_z_port_data['port_id'] = chen_z_port_data.signalname.add(chen_z_port_data.port.astype(str))
        chen_z_port_data.port_id = chen_z_port_data.port_id
        portfolios = sorted(chen_z_port_data.port_id.unique())
        dates = chen_z_port_data.date.unique()
        portfolios = [p for p in portfolios if 
                     np.sum(chen_z_port_data['port_id'] == p) == len(dates)]
        return_panel = np.zeros((len(dates), len(portfolios)), dtype=float)
        for i, pname in enumerate(portfolios):
            return_panel[:, i] = chen_z_port_data.loc[chen_z_port_data['port_id'] == pname].ret.to_numpy()
        tgt = None
        if tgt_factor in {'mkt', 'SMB', 'HML', 'CMA', 'RMW'}:
            int_start_date = int(start_date.replace('-', ''))
            int_end_date = int(end_date.replace('-', ''))
            ff_factors = pd.read_csv('data/F-F_Research_Data_5_Factors_2x3.CSV', skiprows=0,
                                       engine='python', parse_dates=True, infer_datetime_format=True)
            ff_factors = ff_factors.loc[ff_factors.date >= int_start_date] #NOTE not offsetting by one here across data sources
            ff_factors = ff_factors.loc[ff_factors.date < int_end_date]
            if tgt_factor == 'mkt':
                tgt = ff_factors['Mkt-RF'].to_numpy()
            if tgt_factor == 'SMB':
                tgt = ff_factors['SMB'].to_numpy()
            if tgt_factor == 'HML':
                tgt = ff_factors['HML'].to_numpy()
            if tgt_factor == 'CMA':
                tgt = ff_factors['CMA'].to_numpy()
            if tgt_factor == 'RMW':
                tgt = ff_factors['RMW'].to_numpy()
        self.data = return_panel, tgt, int(tgt.shape[0]/2)

# Models


##  SPCA

In [827]:
def scale(X, ref=None):
    if ref is None:
        ref = X
    norm = np.std(ref, axis=0, keepdims=True)
    return (X - np.mean(ref, axis=0, keepdims=True)) / norm

class SPCA:
    
    def __init__(self, data_panel, target_panel, n_train,
                N_factor=6) -> None:
        """
        Auguments:
        1) data_panel: numpy 2d array of data, normalizations and transforms should have already been applied
        2) target_panel: numpy 2d array of target data, normalizations and transforms should have already been applied
        3) n_train: index of start of oos
        4) N_factors: number of factors
        """
        self.n_factors = N_factor
        self.test_start = n_train
        self.data_series = np.copy(data_panel)
        self.tgt_series = np.copy(target_panel)
        
    def fit(self, nlags, true_oos=False, pca=False, raw_data=None,
           stack_lags=False, plot_resids=False, fit_factors_epanding_window=True):
        T, N = self.data_series.shape
        
        
        
        lags = np.zeros((T, nlags))
        if stack_lags:
            for t in range(T):
                lags[t,max(0, nlags-t):] = self.tgt_series[max(0, t-nlags):t]
                
        # 
        scaled_data = scale(np.copy(self.data_series[:self.test_start,:]))
        scaled_data_full = scale(np.copy(self.data_series), 
                                 ref=self.data_series[:self.test_start,:])
        gamma_is = SPCA.get_gamma_is(scaled_data, self.tgt_series[:self.test_start])
        if pca:
            gamma_is[:] = 1
        gamma_is = np.diag(gamma_is)
        _, loadings = SPCA.fit_factors(scaled_data@gamma_is, self.n_factors)

        factors = (1/N ) * scaled_data_full @ loadings
                    
        #In sample R2
        A_fit = np.concatenate([factors[:self.test_start], np.ones((self.test_start, 1))], axis=1)
        fit_target = self.tgt_series[:self.test_start]
        reg_loadings = np.linalg.lstsq(A_fit, fit_target, rcond=None)[0]
        preds = A_fit@reg_loadings
        is_r2 = np.sum(np.square(preds - fit_target)) / np.sum(np.square(fit_target - np.mean(fit_target)))
        print("is r2:", is_r2)
        
        preds = []
        ar_preds = []
        gts = []
        mean_preds = []
        for t in range(self.test_start, self.tgt_series.shape[0]):
            if fit_factors_epanding_window:
                scaled_data = scale(np.copy(self.data_series[:t,:]))
                scaled_data_ext = scale(np.copy(self.data_series[:t+1,:]))
                assert np.sum(np.isnan(scaled_data)) == 0 and np.sum(np.isnan(scaled_data_ext)) == 0 
                gamma_is = SPCA.get_gamma_is(scaled_data, self.tgt_series[:t])

                if pca:
                    gamma_is[:] = 1
                gamma_is = np.diag(gamma_is)
                if true_oos:
                    _, loadings = SPCA.fit_factors(scaled_data@gamma_is, self.n_factors)
                    factors = (1/N ) * scaled_data_ext @ loadings
                    fit_factors = factors[:t]
                    test_factors = factors[t:t+1]
                else:
                    factors, _ = SPCA.fit_factors(scaled_data_ext@gamma_is, self.n_factors, raw_data=raw_data)
                    fit_factors = factors[:t]
                    test_factors = factors[t:]
            else:
                fit_factors = factors[:t]
                test_factors = factors[t:t+1]
                
            
            if stack_lags:
                A_fit = np.concatenate([fit_factors, np.ones((t, 1)), lags[:t]], axis=1)
                A_test = np.concatenate([test_factors, np.ones((1, 1)), lags[t:t+1]], axis=1)
            else:
                A_fit = np.concatenate([fit_factors, np.ones((t, 1))], axis=1)
                A_test = np.concatenate([test_factors, np.ones((1, 1))], axis=1)
            loadings = np.linalg.lstsq(A_fit, self.tgt_series[:t], rcond=None)[0]

            
            sim_ar_model = AutoReg(self.tgt_series[:t], lags=nlags, old_names=False,
                                  trend='n')
            sim_ar_model_fit = sim_ar_model.fit()
            
            gts.append(self.tgt_series[t])
            mean_preds.append(np.mean(self.tgt_series[:t]))
            ar_forcast = sim_ar_model_fit.forecast()
            ar_preds.append(ar_forcast)
            preds.append(A_test@loadings)
        preds = np.array(preds).squeeze()
        ar_preds = np.array(ar_preds).squeeze()
        gts = np.array(gts)
        mean_preds = np.array(mean_preds)

        if plot_resids:
            plt.plot(preds - gts, label='factor resids')
            plt.plot(ar_preds - gts, label='ar resids')
            plt.legend()
            plt.show()
        
        print("r2 vs ar model", 1 - np.sum(np.square(preds - gts)) / np.sum(np.square(gts - ar_preds)))
        print("ar r2 vs mean model", 1 - np.sum(np.square(gts - ar_preds)) / np.sum(np.square(gts - mean_preds)))
        print("r2 vs mean model", 1 - np.sum(np.square(preds - gts)) / np.sum(np.square(gts - mean_preds)))
        print("EV", 1 - np.sum(np.square(preds - gts)) / np.sum(np.square(gts)))
        

        
    def fit_ts_reg(self, plot_resids=False):
        self.test_start
        gamma_is = SPCA.get_gamma_is(self.data_series[:self.test_start,:], self.tgt_series[:self.test_start])
        factors = SPCA.fit_factors(self.data_series*gamma_is, self.n_factors)[self.test_start:]
        loadings = np.linalg.lstsq(factors, self.tgt_series[self.test_start:], rcond=None)[0]
        preds = factors.dot(loadings)
        gts = self.tgt_series[self.test_start:]
        if plot_resids:
            plt.plot(preds - gts, label='factor resids')
            plt.legend()
            plt.show()
        
        print("r2 vs mean model", 1 - np.sum(np.square(preds - gts)) / np.sum(np.square(gts - np.mean(gts))))
        print("EV", 1 - np.sum(np.square(preds - gts)) / np.sum(np.square(gts)))
    

    @staticmethod
    def fit_factors(scaled_data, n_factors):
        T,N = scaled_data.shape

        fit_pipe = skpipe.Pipeline([('loadings', skd.TruncatedSVD(n_factors, algorithm='arpack'))])
        objective = scaled_data.T.dot(scaled_data)
        loadings = fit_pipe.fit_transform(objective)
        factors = (1/N ) * scaled_data.dot(loadings)
        
        return factors, loadings
        
    
    @staticmethod
    def get_gamma_is(data, target):
        loadings = []
        T = target.shape[0]
        
        for i in range(data.shape[1]):
            A = np.stack([data[:,i], np.ones(T)], axis=1)
            loading = np.linalg.lstsq(A, target, rcond=None)[0][0]
            loadings.append(loading)
        return np.array(loadings)

    

In [847]:


class GX_SPCA:
    
    def __init__(self, data_panel, target_panel, test_start, N_factors=6):
        """
        Auguments:
        1) data_panel: numpy 2d array of data, normalizations and transforms should have already been applied
        2) target_panel: numpy 2d array of target data, normalizations and transforms should have already 
            been applied
        3) n_train: index of start of oos
        4) N_factors: number of factors
        """
        T,N = data_panel.shape
        self.all_tgt = target_panel
        self.all_train = scale(data_panel, ref=data_panel[:test_start])
        self.train_data = scale(data_panel[:test_start])
        self.train_target = target_panel[:test_start]
        self.test_data = scale(data_panel[test_start:], ref=data_panel[:test_start])
        self.test_target = target_panel[test_start:]
        self.n_factors = N_factors
        
    def fit(self, nlags, true_oos=False, pca=False, raw_data=None,
           stack_lags=False, plot_resids=False, quantile=None, print_res=False,
           fit_factors_epanding_window=False):
        
        if quantile is None:
            qtile_test_start = int(self.train_data.shape[0]/3)
            quantiles = [i*(1/50) for i in range(50)]
            qtuile_r2s = [GX_SPCA(self.train_data, self.train_target, 
                                  qtile_test_start, self.n_factors).fit(nlags=nlags, true_oos=True,
                                                                       quantile=q)[0] for q in quantiles]
            quantile = quantiles[np.argmax(qtuile_r2s)]
            
        betas = GX_SPCA.get_portfolio_loadings(self.train_data, self.train_target, self.n_factors,
                                              quantile=quantile)
        
        train_factors = self.train_data @ betas
        test_factors = self.test_data @ betas
        
        #In sample R2
        A_fit = np.concatenate([train_factors, np.ones((train_factors.shape[0], 1))], axis=1)
        fit_target = self.train_target
        loadings = np.linalg.lstsq(A_fit, fit_target, rcond=None)[0]
        predicted = A_fit@loadings
        is_r2 = np.sum(np.square(predicted - fit_target)) / np.sum(np.square(fit_target - np.mean(fit_target)))
        if print_res:
            print("is r2:", is_r2)
        
        preds = []
        mean_preds = []
        gts = []
        for t in range(self.test_data.shape[0]):
            if fit_factors_epanding_window and t > 0:
                betas = GX_SPCA.get_portfolio_loadings(self.all_train[:t + self.train_target.shape[0]], 
                                                       self.all_tgt[:t + self.train_target.shape[0]], 
                                                       self.n_factors,
                                                       quantile=quantile)
                
                train_factors = self.train_data @ betas
                test_factors = self.test_data @ betas
            
            if t > 0:
                fit_factors = np.concatenate([train_factors, test_factors[:t]], axis=0)
                fit_target = np.concatenate([self.train_target, self.test_target[:t]], axis=0)
            else:
                fit_factors = train_factors
                fit_target = self.train_target
            
            eval_factors = test_factors[t:t+1]
            
            A_fit = np.concatenate([fit_factors, np.ones((fit_factors.shape[0], 1))], axis=1)
            A_test = np.concatenate([eval_factors, np.ones((1, 1))], axis=1)
            
            loadings = np.linalg.lstsq(A_fit, fit_target, rcond=None)[0]
        
            gts.append(self.test_target[t])
            mean_preds.append(np.mean(self.all_tgt[:t + self.train_target.shape[0]]))
        
            preds.append(A_test@loadings)

        preds = np.array(preds).squeeze()
        mean_preds = np.array(mean_preds)
        gts = np.array(gts)
        r2 =  1 - np.sum(np.square(preds - gts)) / np.sum(np.square(gts - mean_preds))
        ev = 1 - np.sum(np.square(preds - gts)) / np.sum(np.square(gts))
        if print_res:
            print("r2 vs mean model", r2)
            print("EV", ev)
        else:
            return is_r2, r2, ev
        
    
    
    @staticmethod
    def get_portfolio_loadings(train_data, train_tgt, nfactor, quantile):
        T, N = train_data.shape
        loadings = np.zeros((N, nfactor))
        
        R_k = np.copy(train_data.T)
        G_k = np.copy(train_tgt.T)
        rbar_k = np.mean(R_k, axis=1)
        
        eta_hats = []
        gamma_hats = []
        v_hats = []
        beta_hats = []
        
        for k in range(nfactor):
            vhat, gamma_hat, eta_hat, beta_hat, R_k, rbar_k, G_k = GX_SPCA.get_next_factor_loading(R_k, G_k, rbar_k,
                                                                                                  quantile=quantile)
            eta_hats.append(eta_hat)
            gamma_hats.append(gamma_hat)
            v_hats.append(vhat)
            beta_hats.append(beta_hat)
        
        betas = np.concatenate(beta_hats, axis=1)
        return betas
        
    @staticmethod
    def get_next_factor_loading(train_data, train_tgt, rbar, quantile):
        #train_data: n times T
        #train_tgt: 1 times T
        N,T = train_data.shape 
        cors = (1/T)*(train_data @ train_tgt.T)
        thresh = np.quantile(cors, quantile) # FIXME
        selected_inds = np.argwhere(cors >= thresh).squeeze()
#         print(cors.shape, selected_inds.shape)
        selected_data = train_data[selected_inds,:] #
        
        # algo 1
        psi, singval, xi = randomized_svd(selected_data, n_components=1,
                                           n_iter=10, random_state=None)
        vhat = np.sqrt(T) * xi #factors
        gamma_hat = (singval/T)*psi.T @ rbar[selected_inds]
        eta_hat = (1/T) * train_tgt @ vhat.T
        #END algo 1
        
        beta_hat = (1/T) * train_data @ vhat.T
        
        train_data_perp = train_data - beta_hat @ vhat
        rbar_perp = rbar - beta_hat.squeeze() * gamma_hat
        target_perp = train_tgt - eta_hat * vhat.squeeze()
        
        return vhat, gamma_hat, eta_hat, beta_hat, train_data_perp, rbar_perp, target_perp

# Metrics

In [815]:
from statsmodels.tools.eval_measures import bic

In [816]:
def get_lags(target_series_train, max_lags):
    bics = []
    for i in range(max_lags):
        i += 1
        model = AutoReg(target_series_train, lags=i, old_names=False, trend='n')
        model_fit = model.fit()
        bics.append(model_fit.bic)
    return np.argmax(bics) + 1

In [832]:
for tgt in ["VXOCLSx", "UNRATE", "INDPRO", "CPIAUCSL"]:
    spca_data = SPCAData(target=tgt, 
                         drop_cols=["ACOGNO", "ANDENOx", "TWEXAFEGSMTHx", "UMCSENTx", "VXOCLSx"]
#                          drop_cols=["ACOGNO", "ANDENOx", "TWEXAFEGSMTHx", "UMCSENTx"]
                        )
    data_panel, tgt_data, test_start, raw_data = spca_data.get_data()
    opt_lags = get_lags(tgt_data[:test_start], max_lags=5)
    print(tgt, opt_lags)
    spca = SPCA(data_panel, tgt_data, test_start, N_factor=4)
    print("SPCA")
    spca.fit(opt_lags, true_oos=True, stack_lags=False, fit_factors_epanding_window=True)
    print("PCA")
    spca.fit(opt_lags, true_oos=True, stack_lags=False, pca=True, fit_factors_epanding_window=True)
    print("giglio xui")
    spca = GX_SPCA(data_panel, tgt_data, test_start, N_factors=4)
    spca.fit(1, true_oos=True, stack_lags=False, print_res=True, fit_factors_epanding_window=True)
    print()

VXOCLSx 1
SPCA
is r2: 0.7085049492535421
r2 vs ar model -2.5139666147096356
ar r2 vs mean model 0.77591764844881
r2 vs mean model 0.21258209770349001
EV 0.8840031842719243
PCA
is r2: 0.7791087292486113
r2 vs ar model -3.017413181746897
ar r2 vs mean model 0.77591764844881
r2 vs mean model 0.09976860708140667
EV 0.8673843016618573
giglio xui
is r2: 0.7977254529473414
r2 vs mean model 0.12019280880567207
EV 0.8703930500747307

UNRATE 1
SPCA
is r2: 0.7208798651997318
r2 vs ar model 0.17136344902024536
ar r2 vs mean model 0.003061115352460897
r2 vs mean model 0.1739000010880597
EV 0.17126251458179143
PCA
is r2: 0.736258995746545
r2 vs ar model 0.11190827681038173
ar r2 vs mean model 0.003061115352460897
r2 vs mean model 0.11462682801863089
EV 0.11180010026510412
giglio xui
is r2: 0.767426223764381
r2 vs mean model 0.0038285768101509188
EV 0.00064810387702674

INDPRO 5
SPCA
is r2: 0.708835098011172
r2 vs ar model -0.033832142694584455
ar r2 vs mean model 0.11179035309279373
r2 vs mean model

In [833]:
for i in range(5):
    tgt = f'FB-yeild-{i+1}'
    spca_data = SPCAData(target=tgt, 
                         drop_cols=["ACOGNO", "ANDENOx", "TWEXAFEGSMTHx", "UMCSENTx", "VXOCLSx"]
#                          drop_cols=["ACOGNO", "ANDENOx", "TWEXAFEGSMTHx", "UMCSENTx"]
                        )
    data_panel, tgt_data, test_start, raw_data = spca_data.get_data()
    opt_lags = get_lags(tgt_data[:test_start], max_lags=5)
    print(tgt, opt_lags)
    spca = SPCA(data_panel, tgt_data, test_start, N_factor=5)
    spca.fit(opt_lags, true_oos=True, stack_lags=False, fit_factors_epanding_window=False)
    spca.fit(opt_lags, true_oos=True, pca=True, fit_factors_epanding_window=False)
    print("giglio xui")
    spca = GX_SPCA(data_panel, tgt_data, test_start, N_factors=5)
    spca.fit(1, true_oos=True, stack_lags=False, print_res=True, fit_factors_epanding_window=False)
    print()

FB-yeild-1 4
is r2: 0.34018291226984765
r2 vs ar model -133.8502482058935
ar r2 vs mean model 0.9942046744466398
r2 vs mean model 0.21849891069542626
EV 0.5737199299702239
is r2: 0.7230648305605135
r2 vs ar model -132.90968282372003
ar r2 vs mean model 0.9942046744466398
r2 vs mean model 0.22394979328934217
EV 0.5766931857284804
giglio xui
is r2: 0.8736392676469477
r2 vs mean model 0.06980615686125025
EV 0.4926135074906899

FB-yeild-2 5
is r2: 0.3681442768242849
r2 vs ar model -121.96975569462843
ar r2 vs mean model 0.9930864643215482
r2 vs mean model 0.14984420663469367
EV 0.591802473948146
is r2: 0.7824238249656462
r2 vs ar model -118.43739651310442
ar r2 vs mean model 0.9930864643215482
r2 vs mean model 0.17426529786526646
EV 0.6035281236485921
giglio xui
is r2: 0.8968141643153564
r2 vs mean model 0.054661570149194594
EV 0.5461010659948172

FB-yeild-3 5
is r2: 0.33707553413050606


KeyboardInterrupt: 

In [849]:
for tgt in 'mkt', 'SMB', 'HML', 'CMA', 'RMW' :
    print(tgt)
    chen_z_data_panel, tgt_factor, weak_test_start = ChenZData(tgt_factor=tgt).data
    spca = SPCA(chen_z_data_panel, tgt_factor, weak_test_start, N_factor=5)
    print("spca")
    spca.fit(1, true_oos=True, stack_lags=False, fit_factors_epanding_window=True)
    print("pca")
    spca.fit(1, true_oos=True, stack_lags=False, fit_factors_epanding_window=True,
            pca=True)
    print("gx")
    spca = GX_SPCA(chen_z_data_panel, tgt_factor, weak_test_start, N_factors=5)
    spca.fit(1, true_oos=True, stack_lags=False, fit_factors_epanding_window=True, print_res=True)
    print()
    

mkt
spca
is r2: 0.012614124066276839
r2 vs ar model 0.9718509840151561
ar r2 vs mean model -0.008956470002243533
r2 vs mean model 0.9715988681978952
EV 0.9719244931250737
pca
is r2: 0.01551556351788213
r2 vs ar model 0.9644800742586304
ar r2 vs mean model -0.008956470002243533
r2 vs mean model 0.9641619411092459
EV 0.9645728319638017
gx
is r2: 0.016001418016670686
r2 vs mean model 0.9475370577616697
EV 0.9481385563873217

SMB
spca
is r2: 0.33218597951784756
r2 vs ar model 0.49062316852052357
ar r2 vs mean model -0.03806164735615991
r2 vs mean model 0.4712354471893536
EV 0.47138946206509436
pca
is r2: 0.12360302984378417
r2 vs ar model 0.7212726830207307
ar r2 vs mean model -0.03806164735615991
r2 vs mean model 0.710663862173337
EV 0.7107481379990856
gx
is r2: 0.4229352285059945
r2 vs mean model 0.5243967320680396
EV 0.5245352624930144

HML
spca
is r2: 0.27586590642717107
r2 vs ar model 0.5842041191457537
ar r2 vs mean model 0.021473849942505452
r2 vs mean model 0.5931328574979295
EV 0.

In [844]:
for tgt in 'mkt', 'SMB', 'HML', 'CMA', 'RMW' :
    print(tgt)
    chen_z_data_panel, tgt_factor, weak_test_start = ChenZData(tgt_factor=tgt).data
    spca = SPCA(chen_z_data_panel, tgt_factor, weak_test_start, N_factor=5)
    print("spca")
    spca.fit(1, true_oos=True, stack_lags=False, fit_factors_epanding_window=False)
    print("pca")
    spca.fit(1, true_oos=True, stack_lags=False, fit_factors_epanding_window=False,
            pca=True)
    print("gx")
    spca = GX_SPCA(chen_z_data_panel, tgt_factor, weak_test_start, N_factors=5)
    spca.fit(1, true_oos=True, stack_lags=False, fit_factors_epanding_window=False, print_res=True)
    print()

mkt
spca
is r2: 0.0126141240662768
r2 vs ar model 0.9729738598080271
ar r2 vs mean model -0.008956470002243533
r2 vs mean model 0.9727318009941213
EV 0.9730444366094005
pca
is r2: 0.015515563517882042
r2 vs ar model 0.9691723205840495
ar r2 vs mean model -0.008956470002243533
r2 vs mean model 0.9688962133981217
EV 0.9692528248288838
gx
is r2: 0.014972537304636156
r2 vs mean model 0.9708260092003416
EV 0.9711604951178691

SMB
spca
is r2: 0.3321859795178906
r2 vs ar model 0.5125360009809733
ar r2 vs mean model -0.03806164735615991
r2 vs mean model 0.49398231815148763
EV 0.4941297074762414
pca
is r2: 0.12360302984378978
r2 vs ar model 0.74536102149714
ar r2 vs mean model -0.03806164735615991
r2 vs mean model 0.7356690424942314
EV 0.7357460349842881
gx
is r2: 0.42293522850726134
r2 vs mean model 0.33838857084242513
EV 0.3385812804359649

HML
spca
is r2: 0.27586590642717174
r2 vs ar model 0.627852320996273
ar r2 vs mean model 0.021473849942505452
r2 vs mean model 0.6358437644116508
EV 0.632