In [5]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, brier_score_loss, confusion_matrix


In [7]:
# CSV file read by load_and_basic_clean when run_svm_comparison is executed.
DATA_PATH = "combined_cleaned.csv"
# NOTE(review): LOOKBACK is not referenced by any code visible in this section.
LOOKBACK = 1
# Quantile of the training-window realized volatility used as the cutoff for
# the binary "high volatility" label.
HIGH_VOL_Q = 0.5
# Probability cutoffs scanned by tune_threshold on the validation window.
THRESHOLDS = [0.40, 0.45, 0.50, 0.55, 0.60]

# Features expected to be present in the cleaned CSV (or added by
# add_market_regime in the case of regime_high_vol).
BASE_FEATURES = [
    "log_return",
    "lag1",
    "lag3",
    "lag5",
    "MA5",
    "MA20",
    "vol_5",
    "Volume",
    "rel_vol",
    "regime_high_vol",
]
# Base features followed by the interaction and cross-sectional features that
# add_interaction_features / add_cross_sectional_features create.
CANDIDATE_FEATURES = BASE_FEATURES + [
    "MA_diff",
    "return_over_vol",
    "volume_spike",
    "log_return_z_cs",
    "MA5_z_cs",
    "MA20_z_cs",
    "vol_5_z_cs",
    "Volume_z_cs",
    "rel_vol_z_cs",
    "log_return_rank_cs",
    "vol_5_rank_cs",
    "Volume_rank_cs",
]

# SVM hyper-parameters: regularisation strength per kernel and the RBF width.
SVM_C_LINEAR = 1.0
SVM_C_RBF = 1.0
SVM_GAMMA_RBF = 0.01
def load_and_basic_clean(path):
    """Read the CSV at ``path`` and return a date-parsed, de-duplicated frame.

    The ``Date`` column is parsed day-first; entries that cannot be parsed
    become NaT and their rows are discarded. The remaining rows are ordered by
    ``Ticker`` then ``Date`` and only the first row of every (Ticker, Date)
    pair is kept.
    """
    raw = pd.read_csv(path)
    raw["Date"] = pd.to_datetime(raw["Date"], dayfirst=True, errors="coerce")
    parsed = raw[raw["Date"].notna()].copy()
    ordered = parsed.sort_values(["Ticker", "Date"])
    return ordered.drop_duplicates(subset=["Ticker", "Date"])

def ensure_log_return(df):
    """Return a copy of ``df`` that carries a ``log_return`` column.

    An existing ``log_return`` column is left as is. Otherwise it is computed
    per ticker as log(Close) minus the log of that ticker's previous Close, so
    the first row of every ticker is NaN.
    """
    out = df.copy()
    if "log_return" in out.columns:
        return out

    log_close = np.log(out["Close"])
    log_prev_close = np.log(out.groupby("Ticker")["Close"].shift(1))
    out["log_return"] = log_close - log_prev_close
    return out

def add_market_regime(df, vol_window=20, min_periods=10):
    """Return a copy of ``df`` with ``market_vol`` and ``regime_high_vol`` added.

    The "market" return of a date is the mean ``log_return`` over all rows of
    that date. ``market_vol`` is the rolling standard deviation of that series
    (``vol_window`` dates, at least ``min_periods`` observations), and
    ``regime_high_vol`` is 1 where ``market_vol`` exceeds its median, else 0.
    Dates whose rolling window is still too short have NaN ``market_vol`` and
    therefore a regime flag of 0.
    """
    out = df.copy()

    daily_market_ret = out.groupby("Date")["log_return"].mean().sort_index()
    rolling_vol = daily_market_ret.rolling(
        window=vol_window, min_periods=min_periods
    ).std()

    # NOTE(review): the median is taken over every date in the frame, so the
    # flag assigned to an early date depends on later data.
    cutoff = rolling_vol.median()

    out["market_vol"] = out["Date"].map(rolling_vol)
    out["regime_high_vol"] = out["market_vol"].gt(cutoff).astype(int)
    return out

def add_realized_vol_5(df, horizon=5):
    """Return a copy of ``df`` with the forward realized volatility added.

    For every row the new ``realized_vol_5`` column holds
    ``sqrt(mean(log_return[t+1 .. t+horizon] ** 2))`` computed within the
    row's ticker, in the frame's existing row order. Rows that have fewer than
    ``horizon`` later rows for the same ticker (or a NaN return in that
    window) get NaN.

    Parameters
    ----------
    df : DataFrame with ``Ticker`` and ``log_return`` columns; rows of a
        ticker are expected to be in chronological order already.
    horizon : number of future rows in the window. The output column keeps the
        name ``realized_vol_5`` whatever the horizon, because downstream code
        looks it up by that name.

    Raises
    ------
    ValueError
        If ``horizon`` is smaller than 1.
    """
    if horizon < 1:
        raise ValueError("horizon must be a positive integer")

    out = df.copy()
    returns_by_ticker = out.groupby("Ticker")["log_return"]

    # One grouped shift per step replaces the per-ticker Python loop. Working
    # on plain arrays keeps the write-back positional, so the result does not
    # depend on the frame having unique index labels.
    forward_sq_sum = None
    for step in range(1, horizon + 1):
        squared_ahead = returns_by_ticker.shift(-step).to_numpy() ** 2
        forward_sq_sum = (
            squared_ahead if forward_sq_sum is None else forward_sq_sum + squared_ahead
        )

    out["realized_vol_5"] = np.sqrt(forward_sq_sum / horizon)
    return out

def add_interaction_features(df):
    """Return a copy of ``df`` with interaction features added where possible.

    Each feature is created only if its input columns exist:

    * ``MA_diff``: ``MA5 - MA20``.
    * ``return_over_vol``: ``log_return`` divided by ``vol_5`` plus a small
      epsilon; rows whose ``vol_5`` is exactly 0 become NaN.
    * ``volume_spike``: ``Volume`` divided by its per-ticker trailing mean over
      20 rows (at least 5 observations); a zero mean yields NaN.
    """
    out = df.copy()

    if "MA5" in out.columns and "MA20" in out.columns:
        out["MA_diff"] = out["MA5"] - out["MA20"]

    if "log_return" in out.columns and "vol_5" in out.columns:
        # Zeros are blanked first, so the epsilon only nudges non-zero values.
        denominator = out["vol_5"].replace(0, np.nan) + 1e-6
        out["return_over_vol"] = out["log_return"] / denominator

    if "Volume" in out.columns:
        out["volume_spike"] = np.nan
        for _, volume in out.groupby("Ticker")["Volume"]:
            trailing_mean = volume.rolling(window=20, min_periods=5).mean()
            ratio = volume / trailing_mean.replace(0, np.nan)
            out.loc[volume.index, "volume_spike"] = ratio

    return out

def add_cross_sectional_features(df):
    """Return a copy of ``df`` with per-date z-scores and percentile ranks.

    For each listed column that exists, ``<col>_z_cs`` is the value minus the
    same-date mean, divided by the same-date standard deviation. Undefined
    z-scores (zero or missing deviation, missing input) are set to 0.0.
    ``<col>_rank_cs`` is the percentile rank of the value among rows sharing
    the same date.
    """
    out = df.copy()

    for col in ("log_return", "MA5", "MA20", "vol_5", "Volume", "rel_vol"):
        if col in out.columns:
            same_date = out.groupby("Date")[col]
            centered = out[col] - same_date.transform("mean")
            # A zero spread would divide by zero; blank it so the z-score
            # falls through to the 0.0 fill below.
            spread = same_date.transform("std").replace(0, np.nan)
            out[f"{col}_z_cs"] = (centered / spread).fillna(0.0)

    for col in ("log_return", "vol_5", "Volume"):
        if col in out.columns:
            out[f"{col}_rank_cs"] = out.groupby("Date")[col].rank(pct=True)

    return out

def get_feature_columns(df):
    """Pick the usable feature columns and drop rows missing any of them.

    Returns ``(clean_df, feature_cols)`` where ``feature_cols`` are the names
    from ``CANDIDATE_FEATURES`` that exist in ``df`` (in that list's order) and
    ``clean_df`` is a copy of ``df`` without rows that have a NaN in any of
    those columns.

    Raises ``ValueError`` when none of the candidate features is present.
    """
    available = [name for name in CANDIDATE_FEATURES if name in df.columns]
    if len(available) == 0:
        raise ValueError("No candidate features found in dataframe.")

    # dropna only removes rows, so every name in `available` is still a column.
    complete_rows = df.dropna(subset=available).copy()
    return complete_rows, available

def define_folds():
    """Return the three expanding-window folds used for evaluation.

    Every fold trains from 2009-01-01, validates on the calendar year before
    its test year and tests on one calendar year (2015, 2016, 2017). Each fold
    is a dict with ``name`` and ``<split>_start`` / ``<split>_end`` entries
    holding pandas timestamps.
    """
    first_train_year = 2009
    folds = []
    for number, test_year in enumerate((2015, 2016, 2017), start=1):
        val_year = test_year - 1
        year_spans = (
            ("train", first_train_year, val_year - 1),
            ("val", val_year, val_year),
            ("test", test_year, test_year),
        )
        fold = {"name": f"fold{number}"}
        for split, start_year, end_year in year_spans:
            fold[f"{split}_start"] = pd.to_datetime(f"{start_year}-01-01")
            fold[f"{split}_end"] = pd.to_datetime(f"{end_year}-12-31")
        folds.append(fold)
    return folds

def build_single_day_data(df, feature_cols):
    """
    Convert the frame into aligned NumPy arrays, one row per (ticker, day).

    Rows are ordered by Ticker then Date, and rows whose ``realized_vol_5`` is
    NaN are left out of every returned array.

    Returns (X, rv5, dates, tickers) with shapes (N, d), (N,), (N,), (N,).
    """
    ordered = df.sort_values(["Ticker", "Date"])
    realized_vol = ordered["realized_vol_5"].values
    has_target = np.logical_not(np.isnan(realized_vol))

    columns = (
        ordered[feature_cols].values,
        realized_vol,
        ordered["Date"].values,
        ordered["Ticker"].values,
    )
    return tuple(column[has_target] for column in columns)

def evaluate_metrics(y_true, y_pred, p_pred):
    """Compute classification metrics for binary labels coded as 0 / 1.

    Parameters
    ----------
    y_true : true labels.
    y_pred : hard predictions (after thresholding).
    p_pred : predicted probability of class 1.

    Returns
    -------
    dict with ``precision``, ``recall``, ``f1``, ``roc_auc``, ``pr_auc``,
    ``brier`` and the confusion-matrix counts ``tn``, ``fp``, ``fn``, ``tp``.
    ``roc_auc`` / ``pr_auc`` are NaN when scikit-learn raises ``ValueError``
    for them.
    """
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    try:
        roc_auc = roc_auc_score(y_true, p_pred)
    except ValueError:
        roc_auc = np.nan
    try:
        pr_auc = average_precision_score(y_true, p_pred)
    except ValueError:
        pr_auc = np.nan
    brier = brier_score_loss(y_true, p_pred)
    # Pin the label set: without it a window where only one class occurs in
    # both y_true and y_pred yields a 1x1 matrix and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
        "pr_auc": pr_auc,
        "brier": brier,
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "tp": int(tp),
    }

def tune_threshold(y_val, p_val, thresholds):
    """Return the cutoff from ``thresholds`` with the best validation accuracy.

    A probability at or above the cutoff is predicted as class 1. On ties the
    earliest candidate wins; 0.5 is returned when ``thresholds`` is empty.
    """
    # (accuracy, tau) of the best candidate so far; -1.0 is a sentinel that
    # any real accuracy beats.
    leader = (-1.0, 0.5)
    for candidate in thresholds:
        hard_labels = (p_val >= candidate).astype(int)
        score = accuracy_score(y_val, hard_labels)
        if score > leader[0]:
            leader = (score, candidate)
    return leader[1]

In [13]:
def _fit_and_score_svm_fold(X, rv5, dates, fold, svm, banner):
    """Fit ``StandardScaler`` + ``svm`` on one fold and score its test window.

    Shared by the linear and RBF entry points, which differ only in the
    estimator and the progress banner.

    Parameters
    ----------
    X, rv5, dates : aligned arrays of features, realized volatility and dates.
    fold : dict with ``name`` and ``<split>_start`` / ``<split>_end`` bounds
        (inclusive) for the train, val and test windows.
    svm : unfitted ``SVC`` built with ``probability=True``.
    banner : text printed just before fitting.

    Returns
    -------
    ``None`` when any of the three windows is empty, otherwise
    ``(result, fitted_svm)`` where ``result`` is the per-fold metrics dict.
    """

    def window(split):
        return (dates >= fold[f"{split}_start"]) & (dates <= fold[f"{split}_end"])

    train_mask, val_mask, test_mask = window("train"), window("val"), window("test")
    X_train, rv_train = X[train_mask], rv5[train_mask]
    X_val, rv_val = X[val_mask], rv5[val_mask]
    X_test, rv_test = X[test_mask], rv5[test_mask]

    if 0 in (X_train.shape[0], X_val.shape[0], X_test.shape[0]):
        return None

    # The label cutoff comes from the training window only and is then applied
    # unchanged to the validation and test windows.
    thr = np.quantile(rv_train, HIGH_VOL_Q)
    y_train = (rv_train >= thr).astype(int)
    y_val = (rv_val >= thr).astype(int)
    y_test = (rv_test >= thr).astype(int)

    # Baseline: always predict the majority class of the training window.
    majority_label = 1 if y_train.mean() >= 0.5 else 0
    baseline_acc_test = (y_test == majority_label).mean()

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("svm", svm),
    ])

    print(banner)
    model.fit(X_train, y_train)

    p_val = model.predict_proba(X_val)[:, 1]
    p_test = model.predict_proba(X_test)[:, 1]

    # The decision cutoff is chosen on validation data, then applied to test.
    best_tau = tune_threshold(y_val, p_val, THRESHOLDS)
    y_pred_test = (p_test >= best_tau).astype(int)

    fitted_svm = model.named_steps["svm"]
    result = {
        "fold": fold["name"],
        "accuracy": accuracy_score(y_test, y_pred_test),
        "baseline_acc": baseline_acc_test,
        "sv_count": int(fitted_svm.n_support_.sum()),
        **evaluate_metrics(y_test, y_pred_test, p_test),
    }
    return result, fitted_svm


def train_one_fold_linear_svm(X, rv5, dates, feature_cols, fold):
    """Train and evaluate a linear-kernel SVM on one fold.

    Returns ``(result, top_10_weights)``: the per-fold metrics dict and a
    DataFrame (columns ``Feature``, ``Weight``) with the ten coefficients of
    largest absolute value. Returns ``(None, None)`` when any of the fold's
    train / val / test windows is empty.
    """
    # A linear kernel exposes per-feature weights through ``coef_``.
    outcome = _fit_and_score_svm_fold(
        X, rv5, dates, fold,
        svm=SVC(C=SVM_C_LINEAR, kernel='linear', probability=True, random_state=42),
        banner=f"\n===== Training {fold['name']} (Linear SVM) =====",
    )
    if outcome is None:
        return None, None
    result, fitted_svm = outcome

    # Extract the weights and keep the 10 most important features.
    weight_df = pd.DataFrame({'Feature': feature_cols, 'Weight': fitted_svm.coef_[0]})
    by_magnitude = weight_df['Weight'].abs().sort_values(ascending=False).index
    top_10_weights = weight_df.reindex(by_magnitude).head(10)

    return result, top_10_weights


# RBF SVM
def train_one_fold_rbf_svm(X, rv5, dates, feature_cols, fold):
    """Train and evaluate an RBF-kernel SVM on one fold.

    ``feature_cols`` is accepted for signature parity with the linear variant
    and is not used. Returns the per-fold metrics dict, or ``None`` when any
    of the fold's train / val / test windows is empty.
    """
    outcome = _fit_and_score_svm_fold(
        X, rv5, dates, fold,
        svm=SVC(C=SVM_C_RBF, kernel='rbf', gamma=SVM_GAMMA_RBF, probability=True, random_state=42),
        banner=f"\n===== Training {fold['name']} (RBF SVM - High-Dimensional Separation Test) =====",
    )
    if outcome is None:
        return None
    result, _ = outcome
    return result

In [15]:
def run_svm_comparison():
    """Run the full linear-vs-RBF SVM comparison and print a report.

    Loads ``DATA_PATH``, builds the features and the realized-volatility
    target, trains one linear and one RBF SVM per fold and prints per-fold
    metrics, the fold-3 linear weights and an average comparison.

    Returns
    -------
    ``(linear_df, rbf_df)``: one row of metrics per fold that produced a
    result. A frame is empty when every fold was skipped.
    """
    df = load_and_basic_clean(DATA_PATH)
    df = ensure_log_return(df)
    df = add_market_regime(df)
    df = add_realized_vol_5(df, horizon=5)
    df = add_interaction_features(df)
    df = add_cross_sectional_features(df)
    df = df.dropna(subset=["realized_vol_5"]).copy()
    df, feature_cols = get_feature_columns(df)
    X, rv5, dates, tickers = build_single_day_data(df, feature_cols)
    print(f"Data built: {X.shape[0]} samples, features={X.shape[1]}")
    folds = define_folds()

    report_cols = ["fold", "accuracy", "roc_auc", "f1", "sv_count"]
    no_results_msg = "No fold produced results (a train/val/test window was empty for every fold)."

    def column_mean(frame, column):
        # A frame built from an empty result list has no columns at all, so a
        # plain lookup would raise KeyError; report NaN instead.
        return frame[column].mean() if column in frame.columns else np.nan

    linear_results = []
    all_weights = {}

    for fold in folds:
        res, weights_df = train_one_fold_linear_svm(X, rv5, dates, feature_cols, fold)
        if res is not None:
            linear_results.append(res)
            all_weights[fold['name']] = weights_df

    linear_df = pd.DataFrame(linear_results)

    print("\n\n=============== LINEAR SVM RESULTS (Kernel: Linear) ===============")
    print("Interpretation: Measures linear separability and provides feature weights.")
    if linear_df.empty:
        print(no_results_msg)
    else:
        print("Per-fold Metrics:\n", linear_df[report_cols])
    print("\nAverage ROC-AUC (Linear):", column_mean(linear_df, "roc_auc"))
    print("\nTop 10 Feature Weights (Fold 3 Example):")
    if 'fold3' in all_weights:
        print(all_weights['fold3'])

    rbf_results = []

    for fold in folds:
        res = train_one_fold_rbf_svm(X, rv5, dates, feature_cols, fold)
        if res is not None:
            rbf_results.append(res)

    rbf_df = pd.DataFrame(rbf_results)

    print("\n\n=============== RBF SVM RESULTS (Kernel: RBF) ===============")
    print("Interpretation: Measures robustness to non-linear separation (Project Goal).")
    if rbf_df.empty:
        print(no_results_msg)
    else:
        print("Per-fold Metrics:\n", rbf_df[report_cols])
    print("\nAverage ROC-AUC (RBF):", column_mean(rbf_df, "roc_auc"))
    print("\nAverage Performance Comparison:")
    comparison = {
        "Linear AUC": column_mean(linear_df, "roc_auc"),
        "RBF AUC": column_mean(rbf_df, "roc_auc"),
        "Linear SV Count Mean": column_mean(linear_df, "sv_count"),
        "RBF SV Count Mean": column_mean(rbf_df, "sv_count"),
    }
    print(pd.Series(comparison))

    return linear_df, rbf_df

# Execute the comparison. Being the last expression of the cell, the returned
# (linear_df, rbf_df) tuple is also echoed in the cell output.
run_svm_comparison()

Data built: 10806 samples, features=22

===== Training fold1 (Linear SVM) =====

===== Training fold2 (Linear SVM) =====

===== Training fold3 (Linear SVM) =====


Interpretation: Measures linear separability and provides feature weights.
Per-fold Metrics:
     fold  accuracy   roc_auc        f1  sv_count
0  fold1  0.572391  0.607145  0.015504      1737
1  fold2  0.713333  0.649006  0.472393      2176
2  fold3  0.878968  0.748043  0.666667      2685

Average ROC-AUC (Linear): 0.668064429080868

Top 10 Feature Weights (Fold 3 Example):
            Feature    Weight
16       vol_5_z_cs -1.281267
15        MA20_z_cs -0.915006
20    vol_5_rank_cs  0.399082
21   Volume_rank_cs -0.235659
6             vol_5  0.184144
9   regime_high_vol  0.123826
13  log_return_z_cs  0.107508
5              MA20 -0.092386
4               MA5 -0.091660
17      Volume_z_cs  0.076836

===== Training fold1 (RBF SVM - High-Dimensional Separation Test) =====

===== Training fold2 (RBF SVM - High-Dimensional Separa

(    fold  accuracy  baseline_acc  sv_count  precision    recall        f1  \
 0  fold1  0.572391      0.427609      1737   0.500000  0.007874  0.015504   
 1  fold2  0.713333      0.386667      2176   0.819149  0.331897  0.472393   
 2  fold3  0.878968      0.216270      2685   0.824324  0.559633  0.666667   
 
     roc_auc    pr_auc     brier   tn  fp   fn  tp  
 0  0.607145  0.514037  0.276594  338   2  252   2  
 1  0.649006  0.572791  0.211763  351  17  155  77  
 2  0.748043  0.616272  0.169786  382  13   48  61  ,
     fold  accuracy  baseline_acc  sv_count  precision    recall        f1  \
 0  fold1  0.646465      0.427609      1738   0.686441  0.318898  0.435484   
 1  fold2  0.710000      0.386667      2166   0.745763  0.379310  0.502857   
 2  fold3  0.886905      0.216270      2669   0.809524  0.623853  0.704663   
 
     roc_auc    pr_auc     brier   tn  fp   fn  tp  
 0  0.645357  0.566484  0.250184  303  37  173  81  
 1  0.700244  0.618851  0.202625  338  30  144  88  
