# APCA with Oblimin

In [25]:
import numpy as np
import pandas as pd
from factor_analyzer.rotator import Rotator

import matplotlib.pyplot as plt

In [26]:
# Number of stocks in the universe analysed by this run.
stock_sample = 250

# Lookup table: factors[i] holds the number of factors per period for a
# universe of stock_samples[i] stocks (one entry per period, in period order).
stock_samples = [25, 50, 75, 100, 250]
factors = [[4, 3, 3, 3], [5, 4, 4, 5], [7, 6, 6, 8], [10, 8, 7, 10], [23, 17, 16, 20]]

# Derive the per-period factor counts from the table instead of hard-coding
# them, so that changing `stock_sample` cannot leave a stale factor list behind.
if stock_sample not in stock_samples:
    raise ValueError(
        f"stock_sample={stock_sample!r} has no entry in the factor table; "
        f"expected one of {stock_samples}"
    )
n_factors_list = list(factors[stock_samples.index(stock_sample)])

# Input directory with the walk-forward CSV files and default output workbook.
directory = f'Final_Data/Walkforward_Sets/{stock_sample}_stocks_seed42'
excel_filename = f'Final_Data/Final_Results/apca_obli_stability_results_{stock_sample}.xlsx'

## Walk-forward Import

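This section loads the walk-forward in-sample (`is`) and out-of-sample (`os`) return files and their correlation matrices from `directory`. The import is done; estimation and evaluation follow below. **TODO:** once this workflow is final, copy it over to the other notebooks.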

In [27]:
import os
import re
import pandas as pd

files = os.listdir(directory)

pattern_corr = re.compile(r'^(df\d{2}[a-z])_varresid_(is|os)_corr_(is|os)\.csv$')
pattern_resid = re.compile(r'^(df\d{2}[a-z])_varresid_(is|os)\.csv$')

# (pattern, key builder) pairs, tried in order; the first pattern that matches wins.
#   df00a_varresid_is_corr_is.csv -> corr00a_is   (correlation matrix)
#   df00a_varresid_is.csv         -> df00a_is     (returns / residual frame)
key_builders = (
    (pattern_corr, lambda base, inout: f"corr{base[2:]}_{inout}"),
    (pattern_resid, lambda base, inout: f"{base}_{inout}"),
)

for fname in files:
    # Covariance matrices (anything with 'cov_' in the name) are not loaded.
    if 'cov_' in fname:
        continue
    full_path = os.path.join(directory, fname)

    for pattern, build_key in key_builders:
        m = pattern.match(fname)
        if m is None:
            continue
        # group(1) is the base name (e.g. "df00a"), group(2) is "is" or "os".
        new_key = build_key(m.group(1), m.group(2))
        # Each frame is published as a notebook-level variable named `new_key`.
        globals()[new_key] = pd.read_csv(full_path, index_col=0)
        break

## Evaluation

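Rotation is not needed for the explained-variance metrics, because an invertible rotation of the loadings does not change the projection-based explained variance. It only affects the loading-based metrics (loading correlation, congruence, and sparsity).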

In [28]:
from estimators import APCA, fit_apca

In [29]:
from sklearn.decomposition import PCA  # Only used for residual PCA step and for residual structure analysis
import numpy as np
import pandas as pd
from datetime import datetime
import scipy.stats as stats

# -------- Utility Functions --------

def congruence_coefficient(v1, v2):
    """
    Tucker's coefficient of congruence between two loading vectors.

    This is the uncentred cosine similarity: sum(v1 * v2) divided by the
    product of the Euclidean norms of v1 and v2.
    """
    cross_product = np.sum(v1 * v2)
    norm_first = np.sqrt(np.sum(v1**2))
    norm_second = np.sqrt(np.sum(v2**2))
    return cross_product / (norm_first * norm_second)

def compute_sparsity_index(loadings):
    """
    Average normalized sparsity of the factor columns of a loadings DataFrame.

    For every column v the score is
        (sqrt(n) - ||v||_1 / ||v||_2) / (sqrt(n) - 1),
    with n the number of rows (assets) of `loadings`. A column with zero
    Euclidean norm gets a NaN score, which makes the returned mean NaN.

    Returns the mean score over all columns.
    """
    root_n = np.sqrt(loadings.shape[0])

    def _column_score(vec):
        # L1/L2 ratio of one factor column, rescaled with sqrt(n)
        norm_l1 = np.linalg.norm(vec, 1)
        norm_l2 = np.linalg.norm(vec, 2)
        if norm_l2 != 0:
            l1_over_l2 = norm_l1 / norm_l2
        else:
            l1_over_l2 = np.nan
        return (root_n - l1_over_l2) / (root_n - 1)

    return np.mean([_column_score(loadings[name].values) for name in loadings.columns])

def oblimin_rotation(loadings_df, gamma=0.0, tol=1e-6, max_iter=1000):
    """
    Applies an oblimin (quartimin when gamma=0) rotation to the factor loadings.

    The rotation is delegated to `factor_analyzer.rotator.Rotator`, which the
    notebook already imports. The previous hand-written loop never used
    `gamma` and took unconstrained fixed-size gradient steps on the
    transformation matrix, so it did not minimise the oblimin criterion.

    Parameters:
        loadings_df (DataFrame): Unrotated loadings, shape (n_assets, n_factors).
        gamma (float): Oblimin gamma; 0 gives quartimin.
        tol (float): Convergence tolerance passed to the rotator.
        max_iter (int): Maximum number of rotator iterations.

    Returns:
        rotated_df (DataFrame): Rotated loadings with the index and columns of
            `loadings_df`.
        T (ndarray): (n_factors, n_factors) matrix with
            loadings @ T ~= rotated loadings (least-squares solution).
    """
    loadings = loadings_df.to_numpy(dtype=float)    # (n_assets, n_factors)
    n_factors = loadings.shape[1]

    if n_factors < 2:
        # A single factor (or none) cannot be rotated: return it unchanged.
        rotated = loadings.copy()
        T = np.eye(n_factors)
    else:
        rotator = Rotator(method='oblimin', gamma=gamma, max_iter=max_iter, tol=tol)
        rotated = np.asarray(rotator.fit_transform(loadings), dtype=float)
        # Recover the transformation from the result itself, so the returned
        # matrix does not depend on the library's internal convention.
        T = np.linalg.lstsq(loadings, rotated, rcond=None)[0]

    rotated_df = pd.DataFrame(rotated, index=loadings_df.index, columns=loadings_df.columns)
    return rotated_df, T

def calc_proj_ev(df_std, loadings):
    """
    Explained variance of standardized returns projected on a set of loadings.

    Factor scores are obtained by least-squares regression of each observation
    on the loadings, (L'L)^-1 L' x. The fitted returns are scores times L', and
    the result is 1 - mean(residual variance) / mean(total variance), where
    variances are taken per asset (column) and then averaged.
    """
    gram_inverse = np.linalg.inv(loadings.T.dot(loadings))
    cross_terms = loadings.T.dot(df_std.T)
    scores = gram_inverse.dot(cross_terms).T  # shape: (n_obs, n_factors)
    fitted = scores.dot(loadings.T)

    mean_resid_var = (df_std - fitted).var(axis=0).mean()
    mean_total_var = df_std.var(axis=0).mean()
    return 1 - mean_resid_var / mean_total_var

def fit_apca(corr_matrix, returns, n_factors, tickers):
    """
    Placeholder factor-model fit: principal components of a correlation matrix.

    The loadings are the leading eigenvectors of `corr_matrix`. The earlier
    version called `PCA.fit(corr_matrix)`, which treats the correlation matrix
    as a sample matrix (rows as observations) and re-centres its columns, so
    its components were not the eigenvectors of the correlation matrix.

    NOTE(review): this definition shadows the `fit_apca` imported from
    `estimators` earlier in the notebook; confirm which one is intended.

    Parameters:
        corr_matrix (array-like): Square (n_assets, n_assets) correlation matrix.
        returns: Unused by this placeholder; kept for interface compatibility.
        n_factors (int): Number of factors to extract (1 <= n_factors <= n_assets).
        tickers (sequence): Asset labels, one per row of `corr_matrix`.

    Returns:
        loadings (DataFrame): (n_assets, n_factors) unit-norm eigenvectors,
            columns 'PC1', 'PC2', ... ordered by decreasing eigenvalue.
        factors: Always None (not estimated here).
        eigenvals (ndarray): The n_factors largest eigenvalues, descending.
        r2_scores: Always None (not estimated here).

    Raises:
        ValueError: If the matrix is not square, contains non-finite values,
            does not match `tickers`, or `n_factors` is out of range.
    """
    corr = np.asarray(corr_matrix, dtype=float)
    if corr.ndim != 2 or corr.shape[0] != corr.shape[1]:
        raise ValueError(f"corr_matrix must be square, got shape {corr.shape}")
    n_assets = corr.shape[0]
    if len(tickers) != n_assets:
        raise ValueError(f"Expected {n_assets} tickers, got {len(tickers)}")
    if not 1 <= n_factors <= n_assets:
        raise ValueError(f"n_factors must be between 1 and {n_assets}, got {n_factors}")
    if not np.isfinite(corr).all():
        raise ValueError("corr_matrix contains NaN or infinite values")

    # Remove round-off asymmetry before the symmetric eigen-decomposition.
    corr = (corr + corr.T) / 2.0
    all_vals, all_vecs = np.linalg.eigh(corr)  # eigenvalues in ascending order
    keep = np.argsort(all_vals)[::-1][:n_factors]
    eigenvals = all_vals[keep]
    vecs = all_vecs[:, keep]

    # Eigenvectors are only defined up to sign: make the largest-magnitude
    # entry of every column positive so repeated runs give the same loadings.
    pivot_rows = np.abs(vecs).argmax(axis=0)
    signs = np.sign(vecs[pivot_rows, np.arange(n_factors)])
    signs[signs == 0] = 1.0
    vecs = vecs * signs

    loadings = pd.DataFrame(vecs, index=tickers,
                            columns=[f'PC{i+1}' for i in range(n_factors)])
    factors = None
    r2_scores = None
    return loadings, factors, eigenvals, r2_scores

# -------- Revised Evaluation Function --------

def evaluate_apca_os(corr_is, df_os, df_is, n_factors):
    """
    Fit an APCA factor model in-sample and score it in- and out-of-sample.

    Steps:
      1. Restrict all inputs to the assets they have in common (sorted).
      2. Standardize both return samples with the in-sample mean and std.
      3. Fit and rotate loadings on the in-sample correlation matrix, then
         compute projection-based explained variance on both samples.
      4. Fit and rotate loadings on the out-of-sample correlation matrix and
         compare them factor by factor with the in-sample loadings
         (absolute correlation and absolute Tucker congruence).
      5. Project the standardized out-of-sample returns on the in-sample
         loadings, keep the residuals, and measure the share of residual
         variance captured by their first principal component.
      6. Compute the sparsity index of the rotated in-sample loadings.

    Returns:
        metrics (dict): Dictionary of evaluation metrics.
        rotated_loadings_is (DataFrame): Rotated in-sample loadings (factor weights).
        resid (DataFrame): Out-of-sample residuals with asset names as columns.

    Raises:
        ValueError: If the three inputs share no asset.
    """
    # --- 1. common asset universe ---------------------------------------
    common_assets = sorted(
        set(df_os.columns.tolist())
        & set(corr_is.index.tolist())
        & set(df_is.columns.tolist())
    )
    if not common_assets:
        raise ValueError("No common assets between matrices")

    corr_is_aligned = corr_is.loc[common_assets, common_assets]
    df_os_aligned = df_os[common_assets]
    df_is_aligned = df_is[common_assets]

    # --- 2. standardize with in-sample moments only ----------------------
    is_means = df_is_aligned.mean()
    is_stds = df_is_aligned.std()
    df_is_std = (df_is_aligned - is_means) / is_stds
    df_os_std = (df_os_aligned - is_means) / is_stds

    def _fit_and_rotate(corr_values, returns_transposed):
        # Fit APCA, relabel the factor columns, then apply the oblimin rotation
        fitted, _, _, _ = fit_apca(corr_values, returns_transposed, n_factors, tickers=common_assets)
        fitted.columns = [f'PC{i+1}' for i in range(n_factors)]
        rotated, _ = oblimin_rotation(fitted, gamma=0.0)
        return rotated

    # --- 3. in-sample model and explained variance -----------------------
    rotated_loadings_is = _fit_and_rotate(corr_is_aligned.to_numpy(), df_is_aligned.T)
    explained_variance_is = calc_proj_ev(df_is_std, rotated_loadings_is)
    explained_variance_os = calc_proj_ev(df_os_std, rotated_loadings_is)

    # --- 4. out-of-sample model and factor-by-factor comparison ----------
    rotated_loadings_os = _fit_and_rotate(df_os_std.corr().to_numpy(), df_os_aligned.T)

    factor_pairs = [
        (rotated_loadings_is.iloc[:, k], rotated_loadings_os.iloc[:, k])
        for k in range(n_factors)
    ]
    correlations = [abs(np.corrcoef(vec_is, vec_os)[0, 1]) for vec_is, vec_os in factor_pairs]
    congruences = [abs(congruence_coefficient(vec_is, vec_os)) for vec_is, vec_os in factor_pairs]
    avg_loading_corr = np.mean(correlations)
    avg_congruence = np.mean(congruences)

    # --- 5. out-of-sample residual structure -----------------------------
    gram_inverse = np.linalg.inv(rotated_loadings_is.T.dot(rotated_loadings_is))
    scores_os = (gram_inverse.dot(rotated_loadings_is.T.dot(df_os_std.T))).T
    fitted_os = scores_os.dot(rotated_loadings_is.T)
    # Wrapped in a DataFrame so the residuals keep their asset names
    resid = pd.DataFrame(df_os_std - fitted_os, index=df_os_std.index, columns=common_assets)

    residual_pca = PCA(n_components=1)
    residual_pca.fit(resid)
    residual_first_pc_ev = residual_pca.explained_variance_ratio_[0]

    # --- 6. sparsity of the factor weights -------------------------------
    sparsity_index = compute_sparsity_index(rotated_loadings_is)

    metrics = {
        'n_factors': n_factors,
        'explained_variance_is': explained_variance_is,
        'explained_variance_os': explained_variance_os,
        'residual_first_pc_ev': residual_first_pc_ev,
        'avg_loading_correlation': avg_loading_corr,
        'avg_congruence': avg_congruence,
        'sparsity_index': sparsity_index,
    }
    metrics.update(
        {f'factor_{k}_correlation': round(value, 3) for k, value in enumerate(correlations, start=1)}
    )
    metrics.update(
        {f'factor_{k}_congruence': round(value, 3) for k, value in enumerate(congruences, start=1)}
    )

    return metrics, rotated_loadings_is, resid

# -------- USAGE PART --------

# Configuration
# Names this cell expects to exist already:
#   n_factors_list: number of factors per period, in the order of `periods`
#   stock_sample:   identifier of the stock sample
#   correlation and returns DataFrames stored in globals() under keys such as
#   "corr00a_is", "df00a_is", "df00b_os", etc.

periods = ["00", "05", "10", "15"]
# Walk-forward pairings within a period: fit on the first window, test on the next
pairings = [("a", "b"), ("b", "c"), ("c", "d"), ("d", "e")]

print("Using the following number of factors per period:")
for period, n in zip(periods, n_factors_list):
    print(f"Period {period}: {n} factors")
print("\n")

results_list = []         # one metrics dict per completed pairing
factor_weights_list = []  # rotated in-sample loadings per pairing
residuals_dict = {}       # out-of-sample residuals per pairing

for period_pos, period in enumerate(periods):
    for ins, oos in pairings:
        corr_key = f"corr{period}{ins}_is"   # in-sample correlation matrix
        is_key = f"df{period}{ins}_is"       # in-sample returns
        os_key = f"df{period}{oos}_os"       # out-of-sample returns

        required_keys = [corr_key, is_key, os_key]
        missing = [key for key in required_keys if key not in globals()]
        if missing:
            print(f"Skipping {period} pairing {ins}-{oos} – missing data: {missing}")
            continue

        try:
            metrics, loadings_is, residuals = evaluate_apca_os(
                globals()[corr_key],
                globals()[os_key],
                globals()[is_key],
                n_factors=n_factors_list[period_pos]
            )
            metrics.update(period=period, in_sample=ins, oos_sample=oos)
            results_list.append(metrics)

            # Keep the factor weights together with their asset identifiers
            loadings_is_reset = (
                loadings_is
                .reset_index()
                .rename(columns={'index': 'Asset'})
                .assign(period=period, in_sample=ins)
            )
            factor_weights_list.append(loadings_is_reset)

            # Residuals are keyed by a pairing label for later aggregation
            label = f"{period}_{ins}_vs_{oos}"
            residuals_dict[label] = residuals

            print(f"Completed {period}: in-sample '{ins}' vs out-of-sample '{oos}'")
        except Exception as e:
            # Best effort: report the failure and carry on with the next pairing
            print(f"Error in {period} for pairing {ins}-{oos}: {str(e)}")

def _factor_column_key(col):
    """Sort key for 'factor_<i>_<metric>' columns: numeric factor index first, then name."""
    parts = col.split('_')
    if len(parts) > 1 and parts[1].isdigit():
        return (int(parts[1]), col)
    return (float('inf'), col)


if results_list:
    results_df = pd.DataFrame(results_list)
    column_order = [
        'period', 'in_sample', 'oos_sample', 'n_factors',
        'explained_variance_is', 'explained_variance_os',
        'residual_first_pc_ev', 'avg_loading_correlation',
        'avg_congruence', 'sparsity_index'
    ]
    # Order per-factor columns by factor number; a plain string sort would
    # put 'factor_10_*' ahead of 'factor_2_*'.
    factor_cols = sorted(
        (col for col in results_df.columns if 'factor_' in col),
        key=_factor_column_key,
    )
    column_order.extend(factor_cols)
    results_df = results_df[column_order]

    numeric_columns = results_df.select_dtypes(include=[np.number]).columns
    results_df[numeric_columns] = results_df[numeric_columns].round(3)
    results_df = results_df.sort_values(['period', 'in_sample', 'oos_sample'])

    # Combine factor weights from all pairings if available
    if factor_weights_list:
        factor_weights_df = pd.concat(factor_weights_list, ignore_index=True)
        factor_weight_cols = [col for col in factor_weights_df.columns if col.startswith('PC')]
        factor_weights_df = factor_weights_df[['period', 'in_sample', 'Asset'] + factor_weight_cols]

    # Write results to Excel with separate sheets including the residuals sheet.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # not used in this cell
    # NOTE(review): this replaces the `excel_filename` set in the configuration cell.
    excel_filename = f"Final_Data/Final_Results/apca_{stock_sample}.xlsx"
    # Make sure the output folder exists before opening the writer.
    os.makedirs(os.path.dirname(excel_filename), exist_ok=True)

    # Reset explicitly so a value left over from an earlier run of this cell
    # is never reported as the output of the current run.
    all_residuals = None

    with pd.ExcelWriter(excel_filename) as writer:
        results_df.to_excel(writer, sheet_name='Results', index=False)

        # Per-period mean and standard deviation of the headline metrics
        summary_stats = results_df.groupby('period').agg({
            'explained_variance_is': ['mean', 'std'],
            'explained_variance_os': ['mean', 'std'],
            'residual_first_pc_ev': ['mean', 'std'],
            'avg_loading_correlation': ['mean', 'std'],
            'avg_congruence': ['mean', 'std'],
            'sparsity_index': ['mean', 'std']
        }).round(3)
        summary_stats.columns = ['EV_IS_mean', 'EV_IS_std',
                                 'EV_OS_mean', 'EV_OS_std',
                                 'Residual_PC1_mean', 'Residual_PC1_std',
                                 'Corr_mean', 'Corr_std',
                                 'Congruence_mean', 'Congruence_std',
                                 'Sparsity_Index_mean', 'Sparsity_Index_std']
        summary_stats.to_excel(writer, sheet_name='Summary_Stats')

        # Per-period mean/std of every per-factor correlation and congruence column
        factor_summary = results_df[['period'] + factor_cols].groupby('period').agg(['mean', 'std']).round(3)
        factor_summary.to_excel(writer, sheet_name='Factor_Correlations')

        if factor_weights_list:
            factor_weights_df.to_excel(writer, sheet_name='Factor_Weights', index=False)

        # Add residuals sheet: join residuals across all pairings
        if residuals_dict:
            for label, resid in residuals_dict.items():
                # Rename columns to include the pairing label for clarity
                resid_copy = resid.copy()
                resid_copy.columns = [f"{label}_{col}" for col in resid_copy.columns]
                if all_residuals is None:
                    all_residuals = resid_copy
                else:
                    all_residuals = all_residuals.join(resid_copy, how='outer')
            all_residuals.to_excel(writer, sheet_name='Residuals', index=True)

        # Aggregated results sheet: mean, std, count and 95% t-interval of
        # every numeric metric across all pairings
        agg_metrics = {}
        metric_columns = [col for col in results_df.columns if col not in ['period', 'in_sample', 'oos_sample'] and np.issubdtype(results_df[col].dtype, np.number)]
        for col in metric_columns:
            data = results_df[col].dropna()
            n = len(data)
            if n > 1:
                mean_val = data.mean()
                std_val = data.std()
                sem = std_val / np.sqrt(n)
                t_stat = stats.t.ppf(1-0.025, df=n-1)
                ci_lower = mean_val - t_stat * sem
                ci_upper = mean_val + t_stat * sem
            else:
                # A single observation has no spread, so no interval is reported
                mean_val = data.mean()
                std_val = np.nan
                ci_lower = np.nan
                ci_upper = np.nan
            agg_metrics[col] = {
                "mean": round(mean_val, 3),
                "std": round(std_val, 3) if pd.notnull(std_val) else std_val,
                "count": n,
                "CI_lower": round(ci_lower, 3) if pd.notnull(ci_lower) else ci_lower,
                "CI_upper": round(ci_upper, 3) if pd.notnull(ci_upper) else ci_upper
            }
        agg_df = pd.DataFrame(agg_metrics).T.reset_index().rename(columns={'index': 'Metric'})
        agg_df = agg_df[['Metric', 'mean', 'std', 'count', 'CI_lower', 'CI_upper']]
        agg_df.to_excel(writer, sheet_name='Results_Aggregated', index=False)

    print(f"\nResults saved to: {excel_filename}")
    print("\nResults DataFrame:")
    print(results_df)
    print("\nSummary Statistics by Period:")
    print(summary_stats)
    print("\nResiduals sheet saved with columns:")
    if all_residuals is not None:
        print(list(all_residuals.columns))
else:
    print("No results were generated. Check your input data.")

Using the following number of factors per period:
Period 00: 23 factors
Period 05: 17 factors
Period 10: 16 factors
Period 15: 20 factors


Completed 00: in-sample 'a' vs out-of-sample 'b'
Completed 00: in-sample 'b' vs out-of-sample 'c'
Completed 00: in-sample 'c' vs out-of-sample 'd'
Completed 00: in-sample 'd' vs out-of-sample 'e'
Completed 05: in-sample 'a' vs out-of-sample 'b'
Completed 05: in-sample 'b' vs out-of-sample 'c'
Completed 05: in-sample 'c' vs out-of-sample 'd'
Completed 05: in-sample 'd' vs out-of-sample 'e'
Completed 10: in-sample 'a' vs out-of-sample 'b'
Completed 10: in-sample 'b' vs out-of-sample 'c'
Completed 10: in-sample 'c' vs out-of-sample 'd'
Completed 10: in-sample 'd' vs out-of-sample 'e'
Completed 15: in-sample 'a' vs out-of-sample 'b'
Completed 15: in-sample 'b' vs out-of-sample 'c'
Completed 15: in-sample 'c' vs out-of-sample 'd'
Completed 15: in-sample 'd' vs out-of-sample 'e'

Results saved to: Final_Data/Final_Results/apca_250.xlsx

Results DataFrame