In [None]:
import csv
import os
import pickle
from itertools import product
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from stage1 import lasso_rolling_window, calculate_r_squared
from stage2 import estimate_kappa_curve_fit, compute_alm_returns, compute_stage2_r_squared
from grid_search import estimate_single_config, grid_search

In [None]:
# Output directory: taken from the LASSO_OUTPUT_DIR environment variable,
# falling back to the relative folder "output" when the variable is not set.
base_dir = Path(os.environ.get("LASSO_OUTPUT_DIR", "output"))

In [None]:
# Input locations.
# The pickles are read from `base_dir` (configured from LASSO_OUTPUT_DIR in the
# cell above, default "output") instead of a second hard-coded "output" folder.
features_path = base_dir / "features.pkl"
response_path = base_dir / "response.pkl"
# The returns CSV folder can be overridden with LASSO_DATA_DIR; this replaces the
# former fallback to a user-specific absolute Windows path, which only worked on
# one machine.
return_path = Path(os.getenv("LASSO_DATA_DIR", "data")) / "return84_20.csv"

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
with features_path.open("rb") as f:
    X = pickle.load(f)

with response_path.open("rb") as f:
    y = pickle.load(f)

# `return_path` is not read in this cell; just report a missing file early.
if not return_path.exists():
    print("Returns file not found:", return_path, "(set LASSO_DATA_DIR to the folder containing it)")

# Transform the response to log(1 + y). log1p is the numerically accurate form
# of np.log(y + 1) for values close to zero.
# NOTE(review): presumably y holds simple returns, making this a log return - confirm.
y = np.log1p(y)


### We have now estimated the 1st stage under the assumption that the agent's PLM is estimated by LASSO.

The next step is now to use these forecasted returns to estimate the ALM.
The ALM in the 2nd stage is specified as: 

$$
r_{t+1} = \log(\varepsilon_{t+1}) 
+ \log(1 - \kappa e^{x'_t \beta}) 
- \log(1 - \kappa e^{x'_{t+1} \beta})
$$

$$
\kappa := \delta a^{-\gamma} \phi
$$

Here, $x'_t \beta$ and $x'_{t+1} \beta$ are the agent's return forecasts from the 1st stage, made at times $t$ and $t+1$ respectively.

In [None]:
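# NOTE: this entire cell is disabled (commented out). The plotting cell further
# down reads `active_predictors_df`, which only this cell defines.
# # --- Configuration ---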
# lambda_values = [0.01, 0.001]
# WINDOW_SIZE = 30
# N_LAGS = 3

# def compute_alm_returns(predictions, kappa, intercept):
#     """Compute ALM-implied realized returns."""
#     pred_t, pred_t1 = predictions[:-1], predictions[1:]
#     valid = (1 - kappa * np.exp(pred_t) > 0) & (1 - kappa * np.exp(pred_t1) > 0)
    
#     if not np.all(valid):
#         valid_idx = np.where(valid)[0]
#         pred_t, pred_t1 = pred_t[valid_idx], pred_t1[valid_idx]
    
#     return np.log(1 - kappa * np.exp(pred_t)) - np.log(1 - kappa * np.exp(pred_t1)) + intercept

# def calculate_r_squared(y_true, y_pred):
#     """Calculate R-squared."""
#     ss_total = np.sum((y_true - np.mean(y_true))**2)
#     ss_residual = np.sum((y_true - y_pred)**2)
#     return 1 - (ss_residual / ss_total)

# def process_lambda(lam, X, y):
#     """Process a single lambda value through both stages."""
#     print(f"\nProcessing λ = {lam:.6f}...")
    
#     # Stage 1: Rolling LASSO
#     lasso_results = lasso_rolling_window(
#         X=X, y=y, window_size=WINDOW_SIZE, n_lags=N_LAGS,
#         lambda_mode="fixed", fixed_lambda=lam, verbose=False
#     )
    
#     preds = np.array(lasso_results["predictions"])
#     y_valid = y[-len(preds):] if not isinstance(y, pd.Series) else y.iloc[-len(preds):]
#     y_vals = y_valid.values if isinstance(y_valid, pd.Series) else y_valid
    
#     # Stage 1 metrics
#     residuals = y_vals - preds
#     r2_stage1 = calculate_r_squared(y_vals, preds)
#     phi = np.exp(0.5 * np.var(residuals))
#     insample_r2 = np.mean(lasso_results['insample_r_squareds'])
    
#     # Stage 2: Estimate kappa
#     stage2_input = pd.DataFrame({"vwretd": y_vals, "predictions": preds})
#     kappa, intercept, kappa_tstat, intercept_tstat, r2_stage2 = np.nan, np.nan, np.nan, np.nan, np.nan
    
#     try:
#         popt, pcov = estimate_kappa_curve_fit(stage2_input)
#         kappa, intercept = popt
#         se = np.sqrt(np.diag(pcov))
#         kappa_tstat, intercept_tstat = kappa / se[0], intercept / se[1]
        
#         # Stage 2 R-squared
#         alm_rets = compute_alm_returns(preds, kappa, intercept)
#         r2_stage2 = calculate_r_squared(y_vals[1:len(alm_rets)+1], alm_rets)
        
#         # Store ALM returns
#         alm_df = pd.DataFrame({
#             "date": y_valid.index[1:len(alm_rets)+1] if isinstance(y_valid, pd.Series) else range(len(alm_rets)),
#             "r_hat": alm_rets,
#             "lambda": lam
#         })
#     except Exception as e:
#         print(f"⚠️ Kappa estimation failed: {e}")
#         alm_df = None
    
#     # Active predictors
#     coefs = np.array(lasso_results["coefficients"])
#     active_counts = np.count_nonzero(coefs, axis=1)
#     dates = pd.to_datetime(lasso_results.get("window_end_dates", pd.RangeIndex(len(active_counts))))
    
#     active_df = pd.DataFrame({
#         "date": dates,
#         "active_predictors": active_counts,
#         "lambda": lam
#     })
    
#     return {
#         "summary": {
#             "lambda": lam,
#             "kappa": kappa,
#             "kappa_tstat": kappa_tstat,
#             "intercept": intercept,
#             "intercept_tstat": intercept_tstat,
#             "avg_active_predictors": np.mean(active_counts),
#             "insample_r_squared": insample_r2,
#             "r_squared_stage_1": r2_stage1,
#             "phi_stage_1": phi,
#             "r_squared_stage_2": r2_stage2
#         },
#         "active_df": active_df,
#         "alm_df": alm_df
#     }

# # --- Main Execution ---
# results = [process_lambda(lam, X, y) for lam in lambda_values]

# # Aggregate results
# results_df = pd.DataFrame([r["summary"] for r in results])
# active_predictors_df = pd.concat([r["active_df"] for r in results], ignore_index=True)
# alm_returns_df = pd.concat([r["alm_df"] for r in results if r["alm_df"] is not None], ignore_index=True)

# if not alm_returns_df.empty:
#     alm_returns_df = alm_returns_df.pivot(index="date", columns="lambda", values="r_hat").sort_index()

# print("\n=== Results Summary ===")
# print(results_df)

In [None]:
# Draw a random subset of 60 feature columns (axis=1 samples columns, not rows).
# A fixed random_state makes the draw reproducible: without it every kernel
# restart picks different columns, so downstream grid-search results could not
# be reproduced.
# NOTE(review): presumably the subset is used to keep the grid search fast - confirm.
X_subset = X.sample(n=60, axis=1, random_state=42)

In [None]:
# Hyper-parameter grid handed to grid_search: rolling-window lengths,
# lag counts and LASSO penalty values.
param_grid = dict(
    window_sizes=[30, 80, 160, 300, 500],
    n_lags=[3, 7, 12],
    lambdas=[0.001, 0.0001],
)

# Run the search on the sampled feature columns with progress output enabled.
results_df3 = grid_search(
    X_subset,
    y,
    param_grid,
    verbose=True,
)

# Persist the results table without the index column.
# NOTE(review): the next cell reads "grid_search_results.csv" (no "2"), which is
# a different file from the one written here - confirm this is intended.
results_df3.to_csv(
    'grid_search_results2.csv',
    index=False,
)

In [None]:
# Read "grid_search_results.csv" from the current working directory.
# NOTE(review): `no_fix` is not referenced by any later cell visible in this
# notebook - confirm whether it is still needed.
no_fix = pd.read_csv(filepath_or_buffer="grid_search_results.csv")

In [None]:
# Stack two result frames row-wise and renumber the index from 0.
# NOTE(review): neither `results_df1` nor `result_df` is assigned anywhere in the
# visible notebook (the cells above create `results_df3` and `no_fix`), so this
# line raises NameError after Restart & Run All. Confirm which frames are meant
# to be combined.
final_df = pd.concat([results_df1, result_df], ignore_index=True)

In [None]:
# --- Plotting: monthly average number of active predictors, one line per lambda ---
# `active_predictors_df` is only created by the commented-out stage-1/stage-2
# cell above, so on a fresh kernel it does not exist. Skip with a message rather
# than failing with a NameError.
if "active_predictors_df" not in globals():
    print("⚠️ active_predictors_df is not defined — skipping plot.")
elif pd.api.types.is_datetime64_any_dtype(active_predictors_df["date"]):
    # Bucket each date into its calendar month (as a month-start timestamp) on a
    # derived frame, so the source frame is not mutated by re-running this cell.
    monthly_avg = (
        active_predictors_df
        .assign(month=lambda frame: frame["date"].dt.to_period("M").dt.to_timestamp())
        .groupby(["lambda", "month"])["active_predictors"]
        .mean()
        .reset_index()
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    # Take the lambdas from the data itself (in order of first appearance) so the
    # plot does not depend on a separate `lambda_values` variable being in scope.
    for lam in active_predictors_df["lambda"].unique():
        subset = monthly_avg[monthly_avg["lambda"] == lam]
        ax.plot(subset["month"], subset["active_predictors"], marker="o", label=f"λ={lam}")

    ax.set_title("Average Active Predictors per Month")
    ax.set_xlabel("Month")
    ax.set_ylabel("Average Active Predictors")
    ax.legend(title="Lambda")
    ax.grid(True, linestyle="--", alpha=0.6)
    fig.tight_layout()
    plt.show()
else:
    print("⚠️ No datetime index — skipping plot.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# ---- Load results ----
# `final_df` is the frame built by the concat cell above.


def _shared_r2_limits(df, columns):
    """Return (min, max) across all `columns` so related plots share one y-scale.

    NaN entries are ignored by the pandas reductions.
    """
    values = df[columns]
    return values.min().min(), values.max().max()


def _plot_r2_vs_window(df, column, title, y_limits):
    """Plot `column` against window size, one line per lambda.

    Parameters
    ----------
    df : DataFrame with 'window_size', 'lambda' and `column`; rows are plotted in
        the order they appear, so `df` should already be sorted by window size.
    column : name of the R² column to draw on the y-axis.
    title : figure title.
    y_limits : (low, high) tuple applied to the y-axis.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # NOTE(review): the grid search above also varies `n_lags`. If the results
    # frame carries an `n_lags` column with several values, a single line per
    # lambda would pass through multiple points at the same window size, so
    # draw one line per (lambda, n_lags) instead. Confirm the column name
    # against grid_search's output; without it the plot is one line per lambda.
    split_by_lags = "n_lags" in df.columns and df["n_lags"].nunique() > 1

    for lam in sorted(df["lambda"].unique()):
        lam_rows = df[df["lambda"] == lam]
        if split_by_lags:
            for lag in sorted(lam_rows["n_lags"].unique()):
                rows = lam_rows[lam_rows["n_lags"] == lag]
                ax.plot(rows["window_size"], rows[column], marker="o", label=f"λ={lam}, lags={lag}")
        else:
            ax.plot(lam_rows["window_size"], lam_rows[column], marker="o", label=f"λ={lam}")

    ax.set_title(title)
    ax.set_xlabel("Window Size")
    ax.set_ylabel("R²")
    ax.set_ylim(*y_limits)
    ax.grid(True)
    ax.legend()
    plt.show()


display(final_df)

# Sort for consistent plotting (lines connect points in row order)
final_df = final_df.sort_values(['window_size', 'lambda'])

window_sizes = sorted(final_df['window_size'].unique())
lambdas = sorted(final_df['lambda'].unique())

# ============================================================
# Compute global y-limits for consistency
# ============================================================

# In-sample R² min/max across both stages
r2_insample_min, r2_insample_max = _shared_r2_limits(
    final_df, ['r2_insample_stage1', 'r2_insample_stage2']
)

# OOS R² min/max across both stages
r2_oos_min, r2_oos_max = _shared_r2_limits(
    final_df, ['r2_oos_stage1', 'r2_oos_stage2']
)

# ============================================================
# 2. LINE PLOTS: OOS R² vs WINDOW SIZE (same y-scale)
# ============================================================

_plot_r2_vs_window(final_df, 'r2_oos_stage1', "Stage 1 OOS R² vs Window Size", (r2_oos_min, r2_oos_max))
_plot_r2_vs_window(final_df, 'r2_oos_stage2', "Stage 2 OOS R² vs Window Size", (r2_oos_min, r2_oos_max))

# ============================================================
# 3. LINE PLOTS: IN-SAMPLE R² vs WINDOW SIZE (same y-scale)
# ============================================================

_plot_r2_vs_window(final_df, 'r2_insample_stage1', "Stage 1 In-Sample R² vs Window Size", (r2_insample_min, r2_insample_max))
_plot_r2_vs_window(final_df, 'r2_insample_stage2', "Stage 2 In-Sample R² vs Window Size", (r2_insample_min, r2_insample_max))