In [35]:
# %%
import os
import pandas as pd
import numpy as np

DATA_DIR = "../data"

# Read train/test feature files (keeping Date temporarily to filter)
X_train = pd.read_csv(os.path.join(DATA_DIR, "X_train.csv"))
X_test  = pd.read_csv(os.path.join(DATA_DIR, "X_test.csv"))
y_train = pd.read_csv(os.path.join(DATA_DIR, "y_train.csv"))["0"]
y_test  = pd.read_csv(os.path.join(DATA_DIR, "y_test.csv"))["0"]

# Now drop Date column (model shouldn't see raw dates)
X_train = X_train.drop(columns=["Date"])
X_test  = X_test.drop(columns=["Date"])

print("After shifting and dropping 1990 rows:")
print("Shapes:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("Example columns:", list(X_train.columns)[:10])
print("Unique y_train values:", np.unique(y_train))

After shifting and dropping 1990 rows:
Shapes: (4034, 77) (3018, 77) (4034,) (3018,)
Example columns: ['VIX_Close', 'ADS_Index', 'RECBARS', 'fft', '3mth', '10yr', '30yr', 'Aaa', 'Baa', 'term_spread']
Unique y_train values: [-1.  0.  1.]


In [39]:
# %%
# Drop the correlated predictors to match the paper's final logistic regression feature set
cols_to_drop = [
    "MACDsig",      # highly correlated with MACD
    "MA10",         # correlated with MOM10
    "RSI",          # correlated with MOM10 / StochK
    "StochD",       # correlated with StochK
    "Aaa_minus_10Y" # redundant spread measure
]

X_train = X_train.drop(columns=cols_to_drop, errors="ignore")
X_test  = X_test.drop(columns=cols_to_drop, errors="ignore")

print("After dropping correlated columns:")
print("X_train shape:", X_train.shape)
print("Remaining features:", list(X_train.columns))

# %% Use ONLY 1-day lags and drop the 5 correlated predictors (paper spec)
# 1) start from just the *_lag1 features
lag1_cols = [c for c in X_train.columns if c.endswith("_lag1")]

# 2) remove the correlated ones (with _lag1 suffix)
drop_lag1 = {
    "MACDsig_lag1",
    "MA10_lag1",
    "RSI_lag1",
    "StochD_lag1",
    "Aaa_minus_10Y_lag1",
}

use_cols = [c for c in lag1_cols if c not in drop_lag1]

# 3) restrict X to this final feature set
X_train = X_train[use_cols].copy()
X_test  = X_test[use_cols].copy()

print("After enforcing paper feature set (only *_lag1, correlated 5 dropped):")
print("X_train shape:", X_train.shape)
print("Example features:", use_cols[:12])

After dropping correlated columns:
X_train shape: (4034, 72)
Remaining features: ['VIX_Close', 'ADS_Index', 'RECBARS', 'fft', '3mth', '10yr', '30yr', 'Aaa', 'Baa', 'term_spread', 'long_spread', 'corp_spread', 'TED', 'majcurr_ret', 'DAX', 'rdax', 'FTSE', 'oil', 'copper_ret', 'MOM10', 'StochK', 'Rperc', 'MACD', 'gold_ret', 'silver_ret', 'VIX_Close_lag1', 'Close_lag1', 'High_lag1', 'Low_lag1', 'Open_lag1', 'Volume_lag1', 'ADS_Index_lag1', 'RECBARS_lag1', 'fft_lag1', '3mth_lag1', '10yr_lag1', '30yr_lag1', 'Aaa_lag1', 'Baa_lag1', 'term_spread_lag1', 'long_spread_lag1', 'corp_spread_lag1', 'Aaa_minus_10Y_lag1', 'TED_lag1', 'majcurr_ret_lag1', 'DAX_lag1', 'rdax_lag1', 'FTSE_lag1', 'oil_lag1', 'copper_ret_lag1', 'MA10_lag1', 'MOM10_lag1', 'StochK_lag1', 'StochD_lag1', 'RSI_lag1', 'Rperc_lag1', 'MACD_lag1', 'MACDsig_lag1', 'gold_ret_lag1', 'silver_ret_lag1', 'VIX_Close_lag2', 'VIX_Close_lag3', 'MACD_lag2', 'MACD_lag3', 'term_spread_lag2', 'term_spread_lag3', 'corp_spread_lag2', 'corp_spread_lag

In [40]:
# %% [markdown]
# Multinomial Logistic Regression (baseline) on current CSV splits
# Assumes: run from source/ ; data at ../data/
# Files: X_train.csv, X_test.csv, y_train.csv, y_test.csv

# %%
import os
import json
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels

RANDOM_STATE = 42
DATA_DIR = os.path.join("..", "data")
ARTIFACTS_DIR = os.path.join("..", "artifacts", "logreg")
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

def timestamp_dir(base):
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    path = os.path.join(base, ts)
    os.makedirs(path, exist_ok=True)
    return path

In [41]:
# %%
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import numpy as np

# Define the pipeline (same as before)
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        solver="lbfgs",
        penalty="l2",
        C=1.0,
        max_iter=2000,
        n_jobs=-1,
        random_state=RANDOM_STATE
    ))
])

# 10-fold cross-validation on the training window (1991–2006)
cv = StratifiedKFold(n_splits=10, shuffle=False)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="accuracy")

print(f"10-fold CV accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Refit on full training data after CV (like the paper)
pipe.fit(X_train, y_train)

# Evaluate on 2007–2018 test set
test_acc = pipe.score(X_test, y_test)
print(f"Out-of-sample test accuracy: {test_acc:.4f}")

10-fold CV accuracy: 0.4288 ± 0.1508
Out-of-sample test accuracy: 0.4400


In [34]:
# %%
y_pred = pipe.predict(X_test)

acc = accuracy_score(y_test, y_pred)
labels = unique_labels(y_test, y_pred)

print(f"Test accuracy: {acc:.4f}\n")
print("Classification report:")
#print(classification_report(y_test, y_pred, labels=labels, digits=4))

cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(cm, index=[f"true_{c}" for c in labels],
                        columns=[f"pred_{c}" for c in labels])
cm_df.head()

Test accuracy: 0.2505

Classification report:


Unnamed: 0,pred_-1.0,pred_0.0,pred_1.0
true_-1.0,602,11,93
true_0.0,1297,46,214
true_1.0,628,19,108


In [15]:
# %%
import joblib

run_dir = timestamp_dir(ARTIFACTS_DIR)

# Save the fitted pipeline (scaler + model)
joblib.dump(pipe, os.path.join(run_dir, "model_pipeline.joblib"))

# Save columns to ensure consistent feature order at inference
X_train.columns.to_series().to_csv(os.path.join(run_dir, "feature_columns.csv"), index=False)

# Save metrics
metrics = {
    "test_accuracy": float(acc),
    "labels": [int(x) if isinstance(x, (np.integer,)) else x for x in labels]
}
with open(os.path.join(run_dir, "metrics.json"), "w") as f:
    json.dump(metrics, f, indent=2)

# Confusion matrix + full report
cm_df.to_csv(os.path.join(run_dir, "confusion_matrix.csv"))
report_df = pd.DataFrame(classification_report(y_test, y_pred, labels=labels, output_dict=True)).T
report_df.to_csv(os.path.join(run_dir, "classification_report.csv"))

print(f"Artifacts saved to: {run_dir}")

Artifacts saved to: ../artifacts/logreg/20251016-134900


In [16]:
y_train.value_counts(normalize=True)

0
 0    0.499883
-1    0.250058
 1    0.250058
Name: proportion, dtype: float64

In [29]:
print(X_train.head(3))
print(y_train.head(3))

   VIX_Close  ADS_Index  RECBARS   fft  3mth  10yr  30yr   Aaa   Baa  \
0  26.620001   -1.31895        1  2.32 -0.02  0.03  0.06 -0.01  0.01   
1  27.930000   -1.32174        1 -0.67 -0.02 -0.04 -0.03 -0.01 -0.03   
2  27.190001   -1.32657        1 -0.66  0.09  0.09  0.09  0.01  0.04   

   term_spread  ...      rdax         FTSE       oil  copper_ret     MOM10  \
0         1.51  ... -0.021770  2128.300049  0.032111   -0.015199 -3.599976   
1         1.49  ...  0.007789  2117.800049 -0.035293    0.000000 -8.290009   
2         1.49  ...  0.014507  2126.100098 -0.028919    0.000000 -9.119995   

      StochK      Rperc      MACD  gold_ret  silver_ret  
0  10.416846  89.583154  2.504066  0.008843   -0.000732  
1   0.094700  99.905300  1.783934 -0.011582   -0.016221  
2  15.661794  84.338206  1.126806  0.002843    0.027208  

[3 rows x 25 columns]
0    0
1    0
2   -1
Name: 0, dtype: int64
