In [452]:
import pandas as pd
import numpy as np
# Load the results CSV that the following cells filter and summarize as `df`.
df = pd.read_csv('plots/2008/svm_qp_results.csv')

In [453]:
# Keep only the rows whose lambda_hinge value is exactly 0.
df = df.loc[df["lambda_hinge"].eq(0.0)]

In [454]:
import pandas as pd
import numpy as np

def export_insample_frontier(df: pd.DataFrame,
                             goals: list,
                             out_csv: str,
                             select_split: str,
                             select_metric: str,
                             stats_split: str,
                             goal_col: str | None = None) -> pd.DataFrame:
    """
    For each goal in `goals`:
      1) Choose grid_case that maximizes `select_metric` on `select_split`.
      2) Extract NN/SVM mean return & vol from `stats_split` for that grid_case.
      3) Save rows to `out_csv` and return the DataFrame.

    Output columns:
      ['return_goal','grid_case','nn_mean_ret','nn_vol','svm_mean_ret','svm_vol']
    """

    # --- resolve goal column name ---
    if goal_col is None:
        if "goal_monthly" in df.columns:
            goal_col = "goal_monthly"
        elif "return_goal" in df.columns:
            goal_col = "return_goal"
        else:
            raise ValueError("Could not find goal column ('goal_monthly' or 'return_goal').")

    required = {goal_col, "grid_case", "split",
                "nn_mean_ret", "nn_vol", "svm_mean_ret", "svm_vol", select_metric}
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Input df missing required columns: {missing}")

    # clean bad rows
    df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=list(required))

    rows = []
    for g in goals:
        # 1) choose the best grid_case on the selection split
        cand = df[(df[goal_col] == g) & (df["split"] == select_split)]
        if cand.empty:
            print(f"[warn] No rows for goal={g} on split='{select_split}'. Skipping.")
            continue

        best_row = cand.sort_values(select_metric, ascending=True).iloc[0]
        chosen_case = best_row["grid_case"]

        # 2) fetch stats from stats_split for that case
        stats = df[(df["split"] == stats_split) &
                   (df[goal_col] == g) &
                   (df["grid_case"] == chosen_case)]

        if stats.empty:
            print(f"[warn] No {stats_split} stats for goal={g}, grid_case={chosen_case}. Skipping.")
            continue

        # If duplicates exist, take the one with highest nn_mean_ret (arbitrary but stable)
        stats = stats.sort_values("nn_mean_ret", ascending=False).iloc[0]

        rows.append({
            "return_goal": float(g),
            "grid_case": chosen_case,
            "nn_mean_ret": float(stats["nn_mean_ret"]),
            "nn_vol": float(stats["nn_vol"]),
            "svm_mean_ret": float(stats["svm_mean_ret"]),
            "svm_vol": float(stats["svm_vol"]),
        })

    out = pd.DataFrame(rows).sort_values("return_goal").reset_index(drop=True)
    out.to_csv(out_csv, index=False)
    print(f"Saved {out_csv} with {len(out)} rows.")
    return out

In [455]:
goals = [0.002, 0.0025, 0.003, 0.0035, 0.004, 0.0045, 0.005, 0.0055, 0.006, 0.0065, 0.007, 0.0075]
#goals = [0.005, 0.0065, 0.0075, 0.009, 0.0105, 0.012, 0.013, 0.0145, 0.016, 0.017, 0.0185, 0.02]  # 2024 goals
out_csv = "plots/2008/insample.csv"
#out_csv = "plots/2008/insample_2008_select.csv"
insample_frontier = export_insample_frontier(
    df, goals, out_csv,
    select_split="val",      # pick best case by validation
    select_metric="nn_vol",  # ranked ascending by the function, so the LOWEST validation nn_vol wins (not a Sharpe maximization)
    stats_split="train"      # pull stats from training (in-sample)
)

Saved plots/2008/insample.csv with 12 rows.


In [456]:
# Reload the exported frontier and append mean-return / vol ratios for both models.
outputs = pd.read_csv(out_csv).assign(
    nn_sharpe=lambda t: t["nn_mean_ret"] / t["nn_vol"],
    svm_sharpe=lambda t: t["svm_mean_ret"] / t["svm_vol"],
)
outputs

Unnamed: 0,return_goal,grid_case,nn_mean_ret,nn_vol,svm_mean_ret,svm_vol,nn_sharpe,svm_sharpe
0,0.002,Q,0.011702,0.018266,0.00878,0.01806,0.640638,0.48614
1,0.0025,Q,0.01118,0.018164,0.008992,0.018064,0.615491,0.497788
2,0.003,Q,0.011405,0.018077,0.009219,0.018077,0.630918,0.510015
3,0.0035,Q,0.012073,0.018123,0.009448,0.0181,0.66618,0.521995
4,0.004,Q,0.012316,0.018259,0.009675,0.018145,0.674529,0.533209
5,0.0045,Q,0.012695,0.018226,0.010003,0.018133,0.696517,0.551638
6,0.005,Q,0.013273,0.01831,0.010437,0.01812,0.724866,0.575985
7,0.0055,Q,0.013635,0.018386,0.010916,0.018226,0.741584,0.598924
8,0.006,Q,0.014003,0.018566,0.011414,0.018463,0.754195,0.618194
9,0.0065,Q,0.014291,0.018473,0.0119,0.018782,0.773631,0.63359


In [457]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Defaults for plot_frontier_best_only(); they are captured as default argument
# values when the function is defined, so re-run the `def` after editing them.

# Frontier table read by the plot function.
CSV_PATH= "plots/2008/insample.csv"

# Image file the figure is saved to.
OUTFILE  = "plots/2008/in_sample_frontier.png"

ANNUALIZE   = False     # True → annualize return & vol (mean returns ×12, vols ×sqrt(12))
SMOOTH_CURVE = True     # True → draw a smooth visual curve (PCHIP if available)
N_SMOOTH     = 200      # points on the smooth curve

def _pct(x, _): return f"{x*100:.1f}%"

def _require_cols(df, cols):
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"CSV missing columns: {missing}")

def _unique_sorted_xy(x, y):
    """ensure strictly increasing x and collapse duplicates by averaging y"""
    order = np.argsort(x)
    x, y = np.asarray(x)[order], np.asarray(y)[order]
    # collapse duplicate x's
    ux, idx = np.unique(x, return_index=True)
    if len(ux) == len(x):
        return x, y
    y_collapsed = np.array([y[x == v].mean() for v in ux])
    return ux, y_collapsed

def _smooth_xy(x, y, n=200):
    """Resample (x, y) onto `n` evenly spaced x values for a smooth guide curve.

    The points are first sorted and de-duplicated with `_unique_sorted_xy`.
    PCHIP interpolation is used when SciPy can be imported; otherwise (or when
    fewer than two distinct x values remain) linear interpolation is used.

    Returns (xs, ys): the `n` sample positions spanning [min(x), max(x)] and the
    interpolated values at those positions.

    Only a missing SciPy triggers the fallback; errors raised by the
    interpolation itself propagate instead of being silently replaced by a
    linear curve.
    """
    x, y = _unique_sorted_xy(x, y)
    xs = np.linspace(x.min(), x.max(), n)

    # PCHIP needs at least two distinct knots; a single knot interpolates to a constant.
    if len(x) < 2:
        return xs, np.interp(xs, x, y)

    try:
        from scipy.interpolate import PchipInterpolator
    except ImportError:
        # SciPy not installed: fall back to piecewise-linear interpolation
        return xs, np.interp(xs, x, y)

    return xs, PchipInterpolator(x, y)(xs)

def plot_frontier_best_only(csv_path=CSV_PATH, outfile=OUTFILE,
                            annualize=ANNUALIZE, smooth=SMOOTH_CURVE,
                            n_smooth=N_SMOOTH):
    """Plot the SVM and NN (volatility, mean return) points of a frontier CSV.

    Reads `csv_path`, which must provide 'return_goal' (or 'goal_monthly'),
    'nn_mean_ret', 'nn_vol', 'svm_mean_ret' and 'svm_vol'. Rows with missing or
    infinite values in those columns are dropped. Each model is drawn as a curve
    (smoothed when `smooth` is set and at least three rows remain, otherwise a
    polyline with markers) plus its observed points. For every row an arrow runs
    from the SVM point to the NN point, and the NN point is labelled with the goal.
    With `annualize`, mean returns are multiplied by 12 and vols by sqrt(12).
    The figure is written to `outfile` and closed; nothing is returned.
    """
    needed = ["return_goal", "nn_mean_ret", "nn_vol", "svm_mean_ret", "svm_vol"]

    frame = pd.read_csv(csv_path)
    # accept the alternative goal column name
    if "return_goal" not in frame.columns and "goal_monthly" in frame.columns:
        frame = frame.rename(columns={"goal_monthly": "return_goal"})

    _require_cols(frame, needed)

    # discard incomplete rows, then order by goal so labels come out consistently
    frame = (frame.replace([np.inf, -np.inf], np.nan)
                  .dropna(subset=needed)
                  .sort_values("return_goal")
                  .copy())

    if annualize:
        scale = (("nn_mean_ret", 12.0), ("svm_mean_ret", 12.0),
                 ("nn_vol", np.sqrt(12.0)), ("svm_vol", np.sqrt(12.0)))
        for col, factor in scale:
            frame[col] = frame[col] * factor

    # (vol, ret, legend label) per model; SVM first so draw order stays fixed
    svm_vol, svm_ret = frame["svm_vol"].values, frame["svm_mean_ret"].values
    nn_vol, nn_ret = frame["nn_vol"].values, frame["nn_mean_ret"].values
    series = (
        (svm_vol, svm_ret, "SVM+MVO (two-stage)"),
        (nn_vol, nn_ret, "End-to-end (NN)"),
    )

    # --- plot ---
    fig, ax = plt.subplots(figsize=(7.5, 5.5))

    use_smooth = smooth and len(frame) >= 3
    for vol, ret, label in series:
        if use_smooth:
            # smooth guide, purely visual
            cx, cy = _smooth_xy(vol, ret, n=n_smooth)
            ax.plot(cx, cy, lw=2, alpha=0.85, label=label)
        else:
            # polyline through the observed points
            cx, cy = _unique_sorted_xy(vol, ret)
            ax.plot(cx, cy, marker="o", lw=1.6, label=label)

    # the observed best-per-goal points themselves
    for vol, ret, _label in series:
        ax.scatter(vol, ret, s=28, zorder=3, label=None)

    # arrow from the SVM point to the NN point, goal label next to the NN point
    goal_values = frame["return_goal"].values
    for goal, n_v, n_r, s_v, s_r in zip(goal_values, nn_vol, nn_ret, svm_vol, svm_ret):
        ax.annotate("", xy=(n_v, n_r),
                    xytext=(s_v, s_r),
                    arrowprops=dict(arrowstyle="->", lw=1, alpha=0.7))
        ax.annotate(f"{goal*100:.2f}%",
                    xy=(n_v, n_r),
                    xytext=(5, 5), textcoords="offset points", fontsize=9)

    if annualize:
        unit = "annualized"
    else:
        unit = "monthly"
    ax.set_xlabel(f"Realized volatility ({unit})")
    ax.set_ylabel(f"Realized mean return ({unit})")
    ax.set_title(f"In-sample decision frontier ({unit})")

    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_formatter(FuncFormatter(_pct))
    ax.grid(True, linewidth=0.6, alpha=0.5)
    ax.legend(frameon=False, loc="lower right")
    fig.tight_layout()
    fig.savefig(outfile, dpi=300)
    plt.close(fig)
    print(f"Saved {outfile}")

# Render the frontier figure with the default CSV path, output file and options.
plot_frontier_best_only()

Saved plots/2008/in_sample_frontier.png


In [458]:
# Re-read the exported frontier CSV to inspect what was written to disk.
output = pd.read_csv(out_csv)
output

Unnamed: 0,return_goal,grid_case,nn_mean_ret,nn_vol,svm_mean_ret,svm_vol
0,0.002,Q,0.011702,0.018266,0.00878,0.01806
1,0.0025,Q,0.01118,0.018164,0.008992,0.018064
2,0.003,Q,0.011405,0.018077,0.009219,0.018077
3,0.0035,Q,0.012073,0.018123,0.009448,0.0181
4,0.004,Q,0.012316,0.018259,0.009675,0.018145
5,0.0045,Q,0.012695,0.018226,0.010003,0.018133
6,0.005,Q,0.013273,0.01831,0.010437,0.01812
7,0.0055,Q,0.013635,0.018386,0.010916,0.018226
8,0.006,Q,0.014003,0.018566,0.011414,0.018463
9,0.0065,Q,0.014291,0.018473,0.0119,0.018782


In [447]:
# Reload the full results CSV. This rebinds `df`, so the lambda_hinge == 0
# filter applied earlier in the notebook no longer applies from here on.
df = pd.read_csv('plots/2008/svm_qp_results.csv')

In [448]:
# One frame per value of the 'split' column.
df_train, df_val, df_test = (
    df[df['split'] == split_name] for split_name in ('train', 'val', 'test')
)

In [449]:
goal = 0.0075
# Narrow the validation rows to this goal and rank them by nn_vol, lowest first.
# Note: `df_val` is overwritten with the narrowed frame, so changing `goal`
# requires re-running the split cell before this one.
df_val = df_val.loc[df_val['goal_monthly'].eq(goal)]
df_val_sorted = df_val.sort_values("nn_vol")
df_val_sorted

Unnamed: 0,grid_case,C_svm_init,tau,lambda_hinge,goal_monthly,split,train_feas_pct,val_feas_pct,nn_mean_ret,nn_vol,nn_sharpe,svm_mean_ret,svm_vol,svm_sharpe
320,U,0.1,0.1,0.0,0.0075,val,100.0,100.0,0.005012,0.012322,0.406737,0.00442,0.012535,0.35265
317,Q,0.1,0.05,0.0,0.0075,val,100.0,100.0,0.00509,0.012469,0.408241,0.00442,0.012535,0.35265
308,E,0.01,0.05,0.0,0.0075,val,100.0,100.0,0.004513,0.012507,0.360852,0.00442,0.012535,0.35265
311,I,0.01,0.1,0.0,0.0075,val,100.0,100.0,0.004439,0.012529,0.354316,0.00442,0.012535,0.35265
299,A,0.01,0.01,0.0,0.0075,val,100.0,100.0,0.004589,0.012553,0.365587,0.00442,0.012535,0.35265
305,AG,1.0,0.1,0.0,0.0075,val,100.0,100.0,0.005662,0.01322,0.428275,0.00442,0.012535,0.35265
302,AC,1.0,0.05,0.0,0.0075,val,100.0,100.0,0.007502,0.014399,0.521018,0.00442,0.012535,0.35265
314,M,0.1,0.01,0.0,0.0075,val,100.0,100.0,0.007941,0.015965,0.497407,0.00442,0.012535,0.35265
323,Y,1.0,0.01,0.0,0.0075,val,100.0,100.0,0.008986,0.017341,0.518204,0.00442,0.012535,0.35265


In [450]:
# Training-split row(s) for grid_case 'Q' at the goal set above.
df_train[(df_train['grid_case']== 'Q') & (df_train['goal_monthly'] == goal)]

Unnamed: 0,grid_case,C_svm_init,tau,lambda_hinge,goal_monthly,split,train_feas_pct,val_feas_pct,nn_mean_ret,nn_vol,nn_sharpe,svm_mean_ret,svm_vol,svm_sharpe
316,Q,0.1,0.05,0.0,0.0075,train,100.0,100.0,0.014897,0.018377,0.810605,0.012883,0.019545,0.659138


In [451]:
# Test-split row(s) for grid_case 'U' at the goal set above.
# NOTE(review): the training lookup uses grid_case 'Q' while this one uses 'U' — confirm which case is intended.
df_test[(df_test['grid_case']== 'U') & (df_test['goal_monthly'] == goal)]

Unnamed: 0,grid_case,C_svm_init,tau,lambda_hinge,goal_monthly,split,train_feas_pct,val_feas_pct,nn_mean_ret,nn_vol,nn_sharpe,svm_mean_ret,svm_vol,svm_sharpe
318,U,0.1,0.1,0.0,0.0075,test,100.0,100.0,0.025471,0.030701,0.829653,0.021201,0.027793,0.762809
