In [None]:
import pandas as pd
import numpy as np
from ipca_classes_update import IPCA_v1

In [None]:
# === STEP 1: Load your dataset ===
# Read the sample CSV with 'date' parsed as datetime, then discard every row
# that contains at least one missing value.
df = pd.read_csv(
    '/teamspace/studios/this_studio/goup_project_sample_v3.csv',
    parse_dates=['date'],
).dropna()

In [None]:
# Split the columns into identifiers/target and factor characteristics:
# every column that is not listed in non_factor_cols is treated as a factor.
non_factor_cols = ['date', 'permno', 'exret']
factor_cols = df.columns[~df.columns.isin(non_factor_cols)].tolist()

In [None]:
# === STEP 2: Rank-normalize daily ===
# Within each date, every factor column is replaced by its dense rank
# (starting at 0) divided by the largest rank and shifted by -0.5, so values
# lie in [-0.5, 0.5]. A column whose largest rank is not positive on a given
# date (e.g. all values equal) is set to 0 for that date.
normalized_groups = []
for date, group in df.groupby('date'):
    g = group.copy()  # work on a copy so df itself is left untouched
    for var in factor_cols:
        ranks = g[var].rank(method='dense') - 1
        max_val = ranks.max()
        g[var] = (ranks / max_val - 0.5) if max_val > 0 else 0
    normalized_groups.append(g)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously accumulated rows on every date, which is quadratic in the number
# of dates. An empty input still yields an empty DataFrame, as before.
adj_data = (
    pd.concat(normalized_groups, ignore_index=True)
    if normalized_groups
    else pd.DataFrame()
)

In [None]:
# === STEP 3: Format for IPCA ===
# Order the rows by date then permno, and move those two columns into a
# (date, permno) MultiIndex.
adj_data = (
    adj_data
    .sort_values(['date', 'permno'])
    .set_index(['date', 'permno'])
)

In [None]:
# === STEP 4: Loop over K to select best via OOS R² ===
# Fit one model per candidate K and keep the fit whose R2_Pred is highest.
best_k, best_r2 = None, float('-inf')
# Initialised explicitly so that, when no fit succeeds, a stale value left in
# the kernel by an earlier run can never be picked up by later cells.
best_results = None
r2_scores = {}
K_range = range(5, 21)  # You can adjust this range

print("Finding best K using out-of-sample R²...")

for K in K_range:
    try:
        # The out-of-sample settings are passed straight through to
        # IPCA_v1.fit; their exact semantics are defined in ipca_classes_update.
        ipca = IPCA_v1(adj_data, return_column='exret', add_constant=True)
        results = ipca.fit(K=K, OOS=True, OOS_window='recursive', OOS_window_specs=120)

        r2_oos = results['rfits']['R2_Pred']
        r2_scores[K] = r2_oos

        print(f"K = {K}, R²_Pred = {r2_oos:.4f}")

        if r2_oos > best_r2:
            best_k, best_r2 = K, r2_oos
            best_results = results
    except Exception as e:
        # Best-effort search: a failure for one K is reported and the loop
        # moves on to the next candidate.
        print(f"Failed at K = {K}: {e}")

# Stop here with a clear message instead of letting the next cell fail on an
# undefined (or stale) best_results.
if best_results is None:
    raise RuntimeError(
        "No IPCA fit produced a usable R²_Pred for any K in K_range; "
        "see the 'Failed at K' messages above."
    )

print(f"\n✅ Best K: {best_k} with R²_Pred = {best_r2:.4f}")

In [None]:
# === STEP 5: Save best latent factor predictions ===
# Take the 'Fits_Pred' entry of the best run, drop rows with missing values,
# turn the index back into ordinary columns and write the result to CSV.
fits_pred = best_results['rfits']['Fits_Pred']
factors = fits_pred.dropna().reset_index()
factors.to_csv(
    '/teamspace/studios/this_studio/ipca_latent_factors.csv',
    index=False,
)
print("Saved latent factors to: ipca_latent_factors.csv")