# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import PredefinedSplit
from xgboost import XGBRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings("ignore")

# === Step 1: Load Data ===
# The sample file supplies the observations; the factor list file supplies the
# names of the predictor columns (its "variable" column).
df = pd.read_csv("goup_project_sample_v3.csv", parse_dates=["date"])
factor_table = pd.read_csv("factor_char_list.csv")
factors = factor_table["variable"].tolist()

# Snap every date to the first day of its calendar month.
month_period = df["date"].dt.to_period("M")
df["date"] = month_period.dt.to_timestamp()

# Compute EPS surprise: forecast error scaled by the dispersion of estimates.
# Missing or zero dispersion is replaced by 1e-6 so the division is defined.
# NOTE(review): dividing by 1e-6 yields very large surprises for those rows —
# confirm this is intended rather than dropping them.
eps_stdevest_adj = df["eps_stdevest"].fillna(1e-6)
eps_stdevest_adj = eps_stdevest_adj.replace(0, 1e-6)
forecast_error = df["eps_actual"].sub(df["eps_meanest"])
df["eps_surprise"] = forecast_error.div(eps_stdevest_adj)

# Keep only rows where the target and every factor are present.
df = df.dropna(subset=["eps_surprise", *factors])

# === Step 2: Expanding Window with Bayesian Optimization ===
def _rows_between(frame, start, end):
    """Return the rows of ``frame`` whose "date" falls in ``[start, end)``."""
    dates = frame["date"]
    return frame[dates.ge(start) & dates.lt(end)]


results = []
starting = pd.to_datetime("20000101")
counter = 0

# Window ``counter`` trains on the first 8 + counter years after ``starting``,
# validates on the following 2 years and tests on the year after that, so the
# training span grows by one year per pass.  The loop stops once the test
# year would end after 2024-01-01.
while pd.to_datetime("20240101") >= starting + pd.DateOffset(years=11 + counter):
    # Boundaries: train start, train end / val start, val end / test start,
    # test end.
    cutoff = [starting] + [
        starting + pd.DateOffset(years=span + counter) for span in (8, 10, 11)
    ]
    train, val, test = (
        _rows_between(df, lower, upper) for lower, upper in zip(cutoff, cutoff[1:])
    )

    # Skip windows for which any of the three splits has no rows.
    if any(part.empty for part in (train, val, test)):
        counter += 1
        continue

    # === Preprocessing ===
    # Scaling statistics come from the training rows only and are then applied
    # unchanged to the validation and test rows.
    scaler = StandardScaler().fit(train[factors])
    X_train, X_val, X_test = (
        scaler.transform(part[factors]) for part in (train, val, test)
    )
    y_train, y_val, y_test = (
        part["eps_surprise"].values for part in (train, val, test)
    )

    # Combine for tuning
    X_comb = np.concatenate((X_train, X_val), axis=0)
    y_comb = np.concatenate((y_train, y_val))
    # -1 keeps a row on the training side of the single split; 0 places it in
    # the held-out fold used for scoring.
    split_idx = np.repeat([-1, 0], [len(X_train), len(X_val)]).tolist()
    ps = PredefinedSplit(test_fold=split_idx)

    # === Bayesian Optimization ===
    base_model = XGBRegressor(
        objective="reg:squarederror", random_state=42, n_jobs=-1
    )
    search_spaces = dict(
        learning_rate=Real(0.01, 0.3, prior='log-uniform'),
        max_depth=Integer(3, 7),
        subsample=Real(0.5, 1.0),
        colsample_bytree=Real(0.5, 1.0),
        n_estimators=Integer(50, 300),
    )
    bayes_cv = BayesSearchCV(
        base_model,
        search_spaces=search_spaces,
        n_iter=20,
        scoring='neg_mean_squared_error',
        cv=ps,
        refit=True,  # best configuration is refit on the combined train + val rows
        random_state=42,
        verbose=0,
    )
    bayes_cv.fit(X_comb, y_comb)
    model = bayes_cv.best_estimator_

    # === Predict and store ===
    test_pred = model.predict(X_test)
    temp_df = test[["permno", "date"]].assign(actual=y_test, pred=test_pred)
    results.append(temp_df)

    print(f"✅ Window {counter + 1} — Best params: {bayes_cv.best_params_}")
    counter += 1

# === Step 3: Evaluate OOS R² ===
# Pool the out-of-sample predictions of every window and compare the model's
# squared error with that of a constant forecast equal to the pooled mean of
# the realised values.
if not results:
    # pd.concat raises an opaque "No objects to concatenate" on an empty list;
    # fail with a message that names the actual cause instead.
    raise ValueError(
        "No out-of-sample predictions to evaluate: `results` is empty "
        "(no window had non-empty train, validation and test splits)."
    )

df_eval = pd.concat(results).dropna()
y_true = df_eval["actual"].values
y_pred = df_eval["pred"].values

numerator = np.sum((y_true - y_pred) ** 2)
denominator = np.sum((y_true - np.mean(y_true)) ** 2)
# R² is undefined when the realised values have no variation (or when no rows
# survive dropna); report NaN rather than dividing by zero.
oos_r2 = 1 - numerator / denominator if denominator > 0 else float("nan")

print(f"\n✅ EPS Surprise — OOS R² (vs mean): {oos_r2:.4%}")

