
Public Demo Notebook (Confidentiality-Safe)
==========================================

This notebook is a PUBLIC, reproducible demo that mirrors the structure of an internal research pipeline,
but replaces any proprietary signals/data (e.g., "Swan Beta") with PUBLIC proxies.

Footnotes / Demo Notes
----------------------
[DEMO-1] Proprietary preselection features (e.g., internal Swan metrics) are replaced with a public tail-risk proxy:
        rolling Historical CVaR computed from yfinance prices.

[DEMO-2] Any internal universe/coverage list is replaced with an explicitly public ticker universe
        (you may edit the list, but do NOT use internal lists).

[DEMO-3] Optional modeling blocks (HMM regime features, clustering) are kept to reflect the workflow structure,
        but they use ONLY public returns.

[DEMO-4] The final MVO (Mean-Variance Optimization) uses canonical, explainable inputs (expected returns mu and a
        covariance matrix) computed from public price data with PyPortfolioOpt. This choice is intentional: it is
        defensible and easy to audit.

Dependencies
------------
pip install yfinance numpy pandas scikit-learn hmmlearn PyPortfolioOpt



In [None]:
import numpy as np
import pandas as pd
import yfinance as yf

from sklearn.cluster import KMeans
from sklearn import metrics
from hmmlearn.hmm import GaussianHMM

# MVO / Mean-Variance
from pypfopt import expected_returns, risk_models
from pypfopt.efficient_frontier import EfficientFrontier


# 1) User Parameters

In [None]:
# --- Data window for the demo ---
# Passed as the start/end of the price download.
start_date = "2020-11-15"
end_date   = "2022-01-01"

# --- Preselection size ---
# Maximum number of tickers kept after ranking by the tail-risk score.
selected_stock_number = 22

# --- Tail-risk proxy settings (CVaR) ---
cvar_alpha  = 0.05     # left-tail probability
cvar_window = 60       # rolling window length (trading days)
min_obs_req = 120      # minimum non-NaN returns for eligibility

# --- Optional feature blocks (structure-only; public returns) ---
use_hmm_features = True      # [DEMO-3] add per-ticker HMM state frequencies as features
n_states         = 3         # number of hidden states (one feature column per state)
hmm_n_iter       = 200       # iteration cap handed to the HMM fit
hmm_random_state = 0         # seed for the HMM fit

use_clustering      = True   # [DEMO-3] add a KMeans "Group" label to the feature table
n_clusters          = 7      # number of KMeans clusters requested
kmeans_random_state = 1      # seed for KMeans

# --- MVO constraints ---
max_weight = 0.17            # upper bound on any single asset weight (lower bound is 0)
rf_rate    = 0.01            # risk-free rate for max Sharpe

# If you want to force some tickers to 0 weight (e.g., benchmark ETFs),
# add them here. Example: {"SPY", "IWD"}
# Only tickers that are still in the MVO universe after cleaning are constrained.
force_zero_weight = set()    # e.g., {"SPY", "IWD"}

# Public demo universe (edit freely, but keep it PUBLIC)  [DEMO-2]
tickers_universe = [
    "SPY","QQQ","IWM","TLT","GLD","VNQ",
    "XLF","XLK","XLE","XLV","XLP","XLY","XLI","XLB","XLU",
    "AAPL","MSFT","AMZN","GOOGL","META","NVDA","JPM","UNH","HD","KO","COST","PEP","V","MA"
]


# 2) Helper Functions

In [None]:
def download_prices(tickers, start, end):
    """
    Fetch auto-adjusted closing prices for ``tickers`` through yfinance.

    The "Close" field of the download is returned as a DataFrame. A result
    that comes back as a single Series is promoted to a one-column frame,
    and columns are sorted so their order is stable between runs.
    """
    raw = yf.download(
        tickers,
        start=start,
        end=end,
        auto_adjust=True,
        progress=False,
    )
    close = raw["Close"]

    # Callers always receive a DataFrame, even for a lone Series.
    frame = close.to_frame() if isinstance(close, pd.Series) else close

    return frame.sort_index(axis=1)


def compute_returns(prices: pd.DataFrame) -> pd.DataFrame:
    """Period-over-period simple returns, minus rows where every column is NaN."""
    simple_returns = prices.pct_change()
    # The first row (no prior price) is all-NaN and is removed here.
    return simple_returns.dropna(axis=0, how="all")


def rolling_hist_cvar(r: pd.Series, window: int = 60, alpha: float = 0.05, min_obs: "int | None" = None) -> pd.Series:
    """
    Rolling Historical CVaR (left-tail mean).

    For each rolling window the returns are sorted and the mean of the worst
    ``floor(alpha * n)`` observations (at least one) is returned.
    CVaR is typically negative for equity-like return series;
    more negative => worse tail risk.

    Parameters
    ----------
    r : pd.Series
        Return series.
    window : int
        Rolling window length in observations.
    alpha : float
        Left-tail probability.
    min_obs : int, optional
        Minimum number of non-NaN observations a window needs to produce a
        value. ``None`` (default) requires a full window, which is what
        ``rolling(window)`` does on its own.

    Returns
    -------
    pd.Series
        Same index as ``r``; NaN wherever the window has too few observations.

    Raises
    ------
    ValueError
        If ``min_obs`` is given and is not in ``[1, window]``.
    """
    if min_obs is not None and not (1 <= min_obs <= window):
        raise ValueError(f"min_obs must be between 1 and window ({window}), got {min_obs}")

    def _cvar(x):
        # x is a raw float ndarray; NaNs only appear when min_obs < window.
        x = x[~np.isnan(x)]
        if x.size == 0:
            return np.nan
        xs = np.sort(x)
        k = max(int(np.floor(alpha * xs.size)), 1)  # tail count
        return xs[:k].mean()

    # The minimum-observation rule lives in min_periods rather than in a fixed
    # "size < 50" guard, which made every window shorter than 50 return NaN.
    return r.rolling(window, min_periods=min_obs).apply(_cvar, raw=True)


def get_state_distribution(series: pd.Series, n_states: int = 3, n_iter: int = 200, random_state: int = 0):
    """
    Fit a 1D Gaussian HMM to returns and output state frequency distribution.
    [DEMO-3] Uses ONLY public returns. Optional block to mimic the internal workflow structure.

    Parameters
    ----------
    series : pd.Series
        Return series; NaN and +/-inf entries are ignored.
    n_states : int
        Number of hidden states.
    n_iter : int
        Iteration cap passed to the HMM fit.
    random_state : int
        Seed passed to the HMM.

    Returns
    -------
    list of float
        ``n_states`` frequencies that sum to 1. States are ordered by their
        fitted mean return (lowest first), so position ``i`` means the same
        thing for every ticker. If fewer than ``max(80, 20 * n_states)``
        usable observations are available, a uniform distribution is returned
        without fitting.

    Raises
    ------
    ValueError
        If ``n_states`` is smaller than 1.
    """
    if n_states < 1:
        raise ValueError(f"n_states must be >= 1, got {n_states}")

    # Non-finite returns (e.g. from a zero price) would break the fit.
    s = series.replace([np.inf, -np.inf], np.nan).dropna()
    if len(s) < max(80, n_states * 20):
        return [1.0 / n_states] * n_states

    X = s.to_numpy(dtype=float).reshape(-1, 1)

    model = GaussianHMM(
        n_components=n_states,
        covariance_type="diag",
        n_iter=n_iter,
        random_state=random_state
    )
    model.fit(X)
    hidden_states = model.predict(X)

    counts = np.bincount(hidden_states, minlength=n_states)

    # HMM state labels are arbitrary per fit. Ordering by fitted mean makes
    # the per-state columns comparable across tickers.
    order = np.argsort(model.means_.ravel(), kind="stable")

    total = len(hidden_states)
    return [float(counts[i]) / total for i in order]


def safe_fillna_for_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Defensive fill: replace NaNs in numeric columns with the column mean.

    A numeric column with no valid values has a NaN mean, so it is filled
    with 0.0 instead; otherwise the NaNs would survive and break downstream
    estimators. Non-numeric columns are left untouched. The input frame is
    not modified; a filled copy is returned.
    """
    out = df.copy()
    numeric_cols = out.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        return out

    numeric = out[numeric_cols]
    # First pass: column means. Second pass: 0.0 for all-NaN columns.
    out[numeric_cols] = numeric.fillna(numeric.mean()).fillna(0.0)
    return out


# 3) STEP 1 - Preselection (PUBLIC DEMO)
##    CVaR-based tail-risk selection + optional HMM + optional clustering

In [None]:
# [DEMO-1] Replace proprietary preselection with public tail-risk proxy (rolling CVaR).
prices_all = download_prices(tickers_universe, start_date, end_date)
R_all      = compute_returns(prices_all)

# Eligibility filter: enough observations
eligible_mask = (R_all.notna().sum(axis=0) >= min_obs_req)
eligible_tickers = R_all.columns[eligible_mask].tolist()

# Fail early with a clear message instead of an obscure error further down.
if not eligible_tickers:
    raise ValueError(
        f"No ticker has at least {min_obs_req} return observations; "
        "check the download, the date window, or min_obs_req."
    )

R = R_all[eligible_tickers].copy()
prices = prices_all[eligible_tickers].copy()

print(f"Eligible tickers: {len(eligible_tickers)} / {len(tickers_universe)}")

# 1) CVaR scoring
scores = {}
cvar_last = {}

for tkr in R.columns:
    r = R[tkr].dropna()
    # Drop the warm-up NaNs once; median() skips NaN anyway, so the score is unchanged.
    rcvar = rolling_hist_cvar(r, window=cvar_window, alpha=cvar_alpha).dropna()

    # robust score: median of rolling CVaR
    # More negative => worse tail risk
    scores[tkr] = rcvar.median()
    cvar_last[tkr] = rcvar.iloc[-1] if not rcvar.empty else np.nan

# Tickers without a usable score are dropped: sort_values() would put their NaN
# last, and head() could still pick them when few tickers are eligible.
tail_scores = pd.Series(scores, name="CVaR_median").dropna().sort_values()  # ascending => worst first
if tail_scores.empty:
    raise ValueError(
        "No ticker produced a rolling CVaR value; "
        "check cvar_window against the available history."
    )
selected = tail_scores.head(selected_stock_number).index.tolist()

print("\nSelected tickers (worst tail risk first):")
print(selected)

# 2) Feature table for the selected assets.
# Columns are built positionally in the order of `selected`:
# CVaR statistics first, then plain return statistics (public data only).
feat = pd.DataFrame(
    {
        "CVaR_median": tail_scores.loc[selected].to_numpy(),
        "CVaR_last": pd.Series(cvar_last).loc[selected].to_numpy(),
        "mean_ret": R[selected].mean().to_numpy(),
        "vol": R[selected].std().to_numpy(),
    },
    index=selected,
)

# 3) Optional: HMM features (one state-frequency column per hidden state)
if use_hmm_features:
    hmm_features = [
        get_state_distribution(
            R[tkr],
            n_states=n_states,
            n_iter=hmm_n_iter,
            random_state=hmm_random_state,
        )
        for tkr in selected
    ]

    state_columns = [f"HMM_state{i}" for i in range(n_states)]
    hmm_df = pd.DataFrame(hmm_features, index=selected, columns=state_columns)
    feat = pd.concat([feat, hmm_df], axis=1)

# 4) Optional: clustering for diversity / grouping
if use_clustering:
    X = safe_fillna_for_features(feat).values

    # KMeans raises when asked for more clusters than there are samples, which
    # happens whenever few tickers survive preselection. Clamp to the sample count.
    n_clusters_eff = min(n_clusters, X.shape[0])

    if n_clusters_eff < 2:
        # Nothing to separate: put every asset in group 0.
        labels = np.zeros(X.shape[0], dtype=int)
        feat["Group"] = labels
        print("\nClustering skipped: need at least 2 clusters and 2 assets.")
    else:
        # NOTE(review): features are on very different scales (CVaR/returns vs.
        # state frequencies); consider standardising X before KMeans.
        km = KMeans(n_clusters=n_clusters_eff, random_state=kmeans_random_state, n_init=10)
        km.fit(X)
        labels = km.labels_
        feat["Group"] = labels

        try:
            db = metrics.davies_bouldin_score(X, labels)
            print(f"\nDavies-Bouldin score (lower is better): {db:.4f}")
        except ValueError as e:
            # Raised when the label count is outside the range the score supports.
            print("DB score skipped:", e)

# 5) Final preselected_stock output
# TailScore is the ascending rank of the median CVaR, so rank 1 is the most
# negative value (worst tail risk).
feat["TailScore"] = feat.loc[:, "CVaR_median"].rank(method="average", ascending=True)
preselected_stock = feat.sort_values(by="TailScore").copy()

print("\npreselected_stock (top rows):")
display(preselected_stock.iloc[:10])

# 4) STEP 2 - Data Prep for MVO (PUBLIC)

In [None]:
# The optimiser only sees the assets that came out of preselection.
selected_tickers = preselected_stock.index.tolist()

# Cleaning pipeline, in order:
#   1. slice prices to the selected tickers
#   2. drop columns that are entirely NaN (rare, but can happen with yfinance)
#   3. forward-fill minor gaps
#   4. drop rows that are still entirely NaN
prices_sel = (
    prices[selected_tickers]
    .copy()
    .dropna(axis=1, how="all")
    .ffill()
    .dropna(axis=0, how="all")
)

print(f"\nMVO universe size after cleaning: {prices_sel.shape[1]}")

# Column order of the cleaned frame; later used to index optimiser weights.
selected_tickers_clean = prices_sel.columns.tolist()

# 5) STEP 3 - MVO (Mean-Variance Optimization)  [DEMO-4]

In [None]:
"""
[DEMO-4] In the internal version, expected returns / risk inputs may use proprietary forecasts, forward signals,
        or special risk models. For a public demo, we use canonical, auditable choices:

- mu: mean historical return (annualized)
- S : covariance matrix (shrinkage recommended)

This makes the demo defensible, reproducible, and standard for MVO.
"""

# 1) Expected returns (annualized)
mu = expected_returns.mean_historical_return(prices_sel, frequency=252)

# 2) Risk model (Ledoit-Wolf shrinkage covariance)
S = risk_models.CovarianceShrinkage(prices_sel, frequency=252).ledoit_wolf()

# Feasibility check: weights must sum to 1, so the investable assets (those not
# forced to zero) need enough combined capacity under the per-asset cap.
# Without this the solver fails with an opaque infeasibility error.
n_investable = sum(1 for name in selected_tickers_clean if name not in force_zero_weight)
if n_investable * max_weight < 1.0 - 1e-9:
    raise ValueError(
        f"Infeasible constraints: {n_investable} investable assets with "
        f"max_weight={max_weight} cannot sum to 1. "
        "Raise max_weight or select more assets."
    )

# 3) Efficient Frontier with constraints
ef = EfficientFrontier(mu, S, weight_bounds=(0.0, max_weight))

# Force some tickers to 0 if requested.
# sorted() keeps the constraint order deterministic; the default argument binds
# the position now, avoiding the late-binding closure pitfall.
for tkr in sorted(force_zero_weight):
    if tkr in selected_tickers_clean:
        ef.add_constraint(lambda w, ix=selected_tickers_clean.index(tkr): w[ix] == 0.0)

# 4) Optimize for max Sharpe
raw_weights = ef.max_sharpe(risk_free_rate=rf_rate)
cleaned_weights = ef.clean_weights()

print("\nOptimized weights (cleaned):")
print(cleaned_weights)

perf = ef.portfolio_performance(verbose=True, risk_free_rate=rf_rate)

# Weights dict -> one-column DataFrame, largest weight first, positives only
w = pd.Series(cleaned_weights).sort_values(ascending=False)
weights_df = w.to_frame(name="weight")
weights_df = weights_df.loc[weights_df["weight"] > 0]
display(weights_df)

# Persist both tables next to the notebook
preselected_stock.to_csv("preselected_stock_public_demo.csv")
weights_df.to_csv("mvo_weights_public_demo.csv")

print("\nSaved:")
print("- preselected_stock_public_demo.csv")
print("- mvo_weights_public_demo.csv")


# ============================
# 6) (Optional) Simple Backtest of MVO Weights on Price Window
# ============================

"""
This is a simple, transparent backtest:
- Buy-and-hold from first available date to last available date.
- Portfolio value = sum_i w_i * (P_i / P_i0)

Note: This is NOT a full transaction-cost / rebalancing engine — kept intentionally simple for demo clarity.
"""

if weights_df.shape[0] > 0:
    tickers_bt = weights_df.index.tolist()
    w_bt = weights_df["weight"].values

    # Keep only dates on which every weighted ticker has a (forward-filled) price.
    px_bt = prices_sel[tickers_bt].copy()
    px_bt = px_bt.dropna(how="all").ffill().dropna()

    if px_bt.empty:
        # px_bt.iloc[0] below would raise IndexError on an empty frame.
        print("\nBacktest skipped: no dates on which every weighted ticker has a price.")
    else:
        # Normalize prices to 1 at start
        px_norm = px_bt / px_bt.iloc[0]

        # Portfolio value = sum_i w_i * (P_i / P_i0)
        # NOTE(review): w_bt holds the cleaned weights with non-positive entries
        # removed, so the starting value may differ slightly from 1.
        port_val = (px_norm.values @ w_bt)
        port_val = pd.Series(port_val, index=px_norm.index, name="portfolio_value")

        # Benchmark (optional) - SPY if available
        bench = None
        if "SPY" in prices_all.columns:
            spy = prices_all["SPY"].reindex(px_norm.index).ffill().dropna()
            if not spy.empty:
                bench = (spy / spy.iloc[0]).rename("SPY_norm")

        out = pd.DataFrame({"portfolio_value": port_val})
        if bench is not None:
            # Assign by index so the benchmark is kept even when SPY starts later
            # than the portfolio; dates before its first price stay NaN.
            out["SPY_norm"] = bench

        display(out.head())
        display(out.tail())

        out.to_csv("mvo_portfolio_path_public_demo.csv")
        print("\nSaved: mvo_portfolio_path_public_demo.csv")
else:
    print("\nNo positive weights produced (check constraints / data).")
