In [3]:
import pandas as pd
import numpy as np
from hmmlearn.hmm import GaussianHMM
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from sklearn.metrics import silhouette_score
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

os.environ["LOKY_MAX_CPU_COUNT"] = "4"

# === Feature Set Used ===
features = [
    "gn_distribution_balance_wbtc",
    "gn_addresses_supply_balance_more_100k",
    "gn_derivatives_futures_open_interest_crypto_margin_sum",
    "gn_supply_active_3y_5y",
    "gn_market_realized_volatility_3_months",
    "cq_reserve_usd",
    "gn_supply_active_more_1y_percent",
    "gn_supply_illiquid_sum",
    "gn_blockchain_utxo_loss_count",
    "gn_derivatives_futures_annualized_basis_3m"
]
# === Load and Prepare Data ===
print("Loading and preparing data...")
df = pd.read_csv("btc_features_output.csv", parse_dates=["start_time"])
df['start_time'] = pd.to_datetime(df['start_time'], unit='ms')
df = df.sort_values("start_time").reset_index(drop=True)
df.rename(columns={'unified_close': 'price'}, inplace=True)

# Check for missing values and handle them
print(f"Original data shape: {df.shape}")
missing_pct = df[features + ["price"]].isna().mean() * 100
print(f"Missing values percentage:\n{missing_pct}")

# Drop rows with missing values in the critical features
df = df.dropna(subset=features + ["price"])
print(f"Clean data shape after removing NAs: {df.shape}")

# === Feature Engineering ===
# Add price momentum features
df['price_1d_change'] = df['price'].pct_change(1)
df['price_7d_change'] = df['price'].pct_change(7)
df['price_volatility'] = df['price'].pct_change().rolling(7).std().fillna(0)

# Add volume-related features
if 'volume' in df.columns:
    df['volume_change'] = df['volume'].pct_change().fillna(0)
    df['volume_ma_ratio'] = df['volume'] / df['volume'].rolling(7).mean().fillna(df['volume'])
    features.extend(['volume_change', 'volume_ma_ratio'])

# Add price-related features to the feature set
features.extend(['price_1d_change', 'price_7d_change', 'price_volatility'])

# Remove rows after feature engineering that might have NaNs
df = df.dropna(subset=features).reset_index(drop=True)

# === Feature Selection and Scaling ===
print("Scaling features...")
scaler = StandardScaler()
X = scaler.fit_transform(df[features])

# === Train-Test Split ===
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
df_train = df.iloc[:train_size].copy()
df_test = df.iloc[train_size:].copy()

# === BIC Score for Model Selection ===
def compute_bic_score(X, model):
    """Compute the BIC score for the fitted model."""
    n_samples = X.shape[0]
    n_features = X.shape[1]
    n_components = model.n_components
    n_parameters = n_components * n_features + n_components * (n_components - 1)  # mean + transition matrix

    log_likelihood = model.score(X)
    bic = -2 * log_likelihood + n_parameters * np.log(n_samples)
    return bic

# === Find Optimal Number of Components ===
print("Finding optimal number of components...")
n_components_range = range(2, 10)
models = []
bic_scores = []
aic_scores = []

for n_components in n_components_range:
    model = GaussianHMM(
        n_components=n_components,
        covariance_type="diag",
        n_iter=1000,
        tol=1e-4,
        random_state=42
    )
    model.fit(X_train)
    models.append(model)

    # Compute BIC score
    bic = compute_bic_score(X_train, model)
    bic_scores.append(bic)

    # Compute AIC score0
    log_likelihood = model.score(X_train)
    n_features = X_train.shape[1]
    n_params = n_components * (n_features + n_features + n_components - 1)
    aic = -2 * log_likelihood + 2 * n_params
    aic_scores.append(aic)

    print(f"Components: {n_components}, BIC: {bic:.2f}, AIC: {aic:.2f}")

# Plot BIC and AIC scores
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(n_components_range, bic_scores, '-o')
plt.xlabel('Number of Components')
plt.ylabel('BIC Score')
plt.title('BIC Score vs Number of Components')

plt.subplot(1, 2, 2)
plt.plot(n_components_range, aic_scores, '-o')
plt.xlabel('Number of Components')
plt.ylabel('AIC Score')
plt.title('AIC Score vs Number of Components')
plt.tight_layout()
plt.savefig("model_selection_scores.png")

# Select optimal number of components based on BIC score (lower is better)
optimal_components = n_components_range[np.argmin(bic_scores)]
print(f"Optimal number of components based on BIC: {optimal_components}")

# === Train Gaussian HMM with Optimal Components (5 for this enhanced version) ===
print("Training HMM model with 5 components...")
model = GaussianHMM(
    n_components=5,  # Using 5 regimes for enhanced version
    covariance_type="diag",
    n_iter=2000,
    tol=1e-5,
    random_state=42
)
model.fit(X_train)

# Save the model
joblib.dump(model, 'Enhanced_HMM_Model.pkl')
joblib.dump(scaler, 'Feature_Scaler.pkl')

# === Regime Assignment ===
print("Assigning regimes...")
df_train["Regime"] = model.predict(X_train)
df_train["Return"] = df_train["price"].pct_change().fillna(0)

# Analyze regime characteristics for labeling
regime_stats = pd.DataFrame()
for regime in range(5):
    regime_data = df_train[df_train["Regime"] == regime]
    stats = {
        'avg_return': regime_data["Return"].mean() * 100,
        'volatility': regime_data["Return"].std() * 100,
        'median_return': regime_data["Return"].median() * 100,
        'count': len(regime_data),
        'pct_positive_days': (regime_data["Return"] > 0).mean() * 100
    }
    regime_stats = pd.concat([regime_stats, pd.DataFrame([stats])], ignore_index=True)

# Label regimes based on return and volatility characteristics
regime_stats.index = range(5)
print("\n=== Regime Statistics ===")
print(regime_stats)

# Sort regimes by average return for labeling
sorted_regimes = regime_stats.sort_values('avg_return')
regime_map = {}

# Assign labels to regimes
labels = ["Strong Bear", "Weak Bear", "Neutral", "Weak Bull", "Strong Bull"]
for i, idx in enumerate(sorted_regimes.index):
    regime_map[int(idx)] = labels[i]

print("\n=== Regime Labels ===")
for regime, label in regime_map.items():
    print(f"Regime {regime}: {label} (Avg Return: {regime_stats.loc[regime, 'avg_return']:.2f}%, Volatility: {regime_stats.loc[regime, 'volatility']:.2f}%)")

# === Predict Test Set Regimes ===
print("Predicting test set regimes...")
test_predictions = []
window_size = 30

for i in range(len(X_test)):
    sequence = np.vstack((X_train, X_test[:i+1]))[-window_size:]
    current_state = model.predict(sequence)[-1]
    test_predictions.append(current_state)

df_test["Regime"] = test_predictions
df_test["Market_Regime"] = df_test["Regime"].map(regime_map)

# === Combine Train & Test Sets ===
df_all = pd.concat([df_train, df_test]).reset_index(drop=True)
df_all["Market_Regime"] = df_all["Regime"].map(regime_map)

# === Regime Transition Analysis ===
print("\n=== Regime Transition Matrix ===")
print(pd.DataFrame(model.transmat_,
                   index=[regime_map[i] for i in range(5)],
                   columns=[regime_map[i] for i in range(5)]))

# === Feature Analysis ===
print("\n=== Feature Importance in Each Regime ===")
feature_importance = pd.DataFrame()

for i in range(5):
    means = model.means_[i]
    stds = np.sqrt(model.covars_[i])

    # Z-scores of feature means relative to overall distribution
    importance = pd.Series(means, index=features)
    feature_importance[regime_map[i]] = importance

print(feature_importance)

# Visualize feature importance by regime
plt.figure(figsize=(20, 10))
sns.heatmap(feature_importance, annot=True, cmap="coolwarm", center=0)
plt.title("Feature Importance by Market Regime")
plt.tight_layout()
plt.savefig("feature_importance.png")

# === Regime Duration Analysis ===
df_all['regime_change'] = df_all['Regime'].ne(df_all['Regime'].shift()).astype(int)
df_all['regime_group'] = df_all['regime_change'].cumsum()

regime_durations = df_all.groupby(['regime_group', 'Market_Regime']).size().reset_index(name='duration')
print("\n=== Regime Duration Analysis ===")
duration_stats = regime_durations.groupby('Market_Regime')['duration'].agg(['mean', 'median', 'min', 'max'])
print(duration_stats)

# Plot regime duration distribution
plt.figure(figsize=(12, 6))
sns.boxplot(x='Market_Regime', y='duration', data=regime_durations)
plt.title('Distribution of Regime Durations')
plt.ylabel('Duration (days)')
plt.tight_layout()
plt.savefig("regime_durations.png")

# === Visualization of Market Regimes ===
plt.figure(figsize=(14, 8))
colors = {"Strong Bull": "darkgreen", "Weak Bull": "lightgreen",
          "Neutral": "yellow", "Weak Bear": "salmon", "Strong Bear": "darkred"}

# Create background for regimes
for regime, color in colors.items():
    mask = df_all["Market_Regime"] == regime
    if mask.any():
        plt.scatter(df_all.loc[mask, "start_time"], df_all.loc[mask, "price"],
                    c=color, label=regime, alpha=0.7, s=10)

plt.title("Cryptocurrency Price with Market Regimes")
plt.xlabel("Date")
plt.ylabel("Price")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("./results/market_regimes_visualization.png")

# === Confusion Matrix for Regimes ===
# Compute next day regime predictions based on current day
df_all['pred_next_regime'] = df_all['Regime'].shift(1)
df_all['actual_regime'] = df_all['Regime']

# Create confusion matrix
confusion = pd.crosstab(df_all['actual_regime'].dropna(),
                         df_all['pred_next_regime'].dropna(),
                         rownames=['Actual'],
                         colnames=['Predicted'])

# Convert to actual regime names
confusion.index = [regime_map[i] for i in confusion.index]
confusion.columns = [regime_map[i] for i in confusion.columns]

print("\n=== Regime Prediction Confusion Matrix ===")
print(confusion)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues')
plt.title('Regime Prediction Confusion Matrix')
plt.tight_layout()
plt.savefig("regime_confusion_matrix.png")

# === Regime Prediction Function ===
def predict_regime(features_df, window_size=30):
    """
    Predict the current market regime based on recent feature data

    Parameters:
    -----------
    features_df : DataFrame with the same features used for training
    window_size : How many days of data to use for prediction

    Returns:
    --------
    regime : The predicted market regime
    probabilities : Probability distribution over all regimes
    """
    # Ensure we have the right features
    missing_features = set(features) - set(features_df.columns)
    if missing_features:
        raise ValueError(f"Missing features: {missing_features}")

    # Scale the features
    X = scaler.transform(features_df[features].tail(window_size))

    # Predict hidden states
    states = model.predict(X)
    current_state = states[-1]

    # Get transition probabilities from current state
    next_state_probs = model.transmat_[current_state]

    return regime_map[current_state], {regime_map[i]: prob for i, prob in enumerate(next_state_probs)}

# === Model Performance Metrics ===
print("\n=== Model Performance Analysis ===")

# 1. Predictive Accuracy
df_all['correct_prediction'] = (df_all['pred_next_regime'] == df_all['actual_regime'])
accuracy = df_all['correct_prediction'].mean()
print(f"Next-Day Regime Prediction Accuracy: {accuracy:.2%}")

# 2. Feature Importance Analysis
def feature_importance_analysis(model, feature_names):
    """Analyze which features are most important for distinguishing between regimes"""
    feature_importance = np.zeros(len(feature_names))

    for i in range(model.n_components):
        for j in range(model.n_components):
            if i != j:
                # Calculate the difference in means between regimes
                diff = np.abs(model.means_[i] - model.means_[j])
                feature_importance += diff

    # Normalize
    feature_importance = feature_importance / feature_importance.sum()
    return pd.Series(feature_importance, index=feature_names).sort_values(ascending=False)

importance = feature_importance_analysis(model, features)
print("\nFeature Importance for Regime Classification:")
print(importance)

# Plot feature importance
plt.figure(figsize=(12, 6))
importance.plot(kind='bar')
plt.title('Feature Importance for Regime Classification')
plt.ylabel('Importance Score')
plt.tight_layout()
plt.savefig("feature_importance_bar.png")

# 3. Model Stability Analysis
print("\n=== Model Stability Analysis ===")
# Check regime distribution over time
regime_counts = df_all.groupby(pd.Grouper(key='start_time', freq='M'))['Market_Regime'].value_counts().unstack().fillna(0)
regime_counts = regime_counts.div(regime_counts.sum(axis=1), axis=0)

plt.figure(figsize=(16, 6))
regime_counts.plot(kind='area', stacked=True, colormap='viridis')
plt.title('Regime Distribution Over Time')
plt.ylabel('Proportion')
plt.legend(title='Market Regime')
plt.tight_layout()
plt.savefig("regime_distribution_time.png")

# === Feature Correlation within Regimes ===
# Analyze how features correlate differently in different regimes
plt.figure(figsize=(20, 16))
for i, regime in enumerate(regime_map.values()):
    plt.subplot(3, 2, i+1)
    regime_data = df_all[df_all['Market_Regime'] == regime]

    if len(regime_data) > 10:  # Need enough data for correlation
        corr_matrix = regime_data[features].corr()
        sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, vmin=-1, vmax=1)
        plt.title(f'Feature Correlation in {regime} Regime')
    else:
        plt.text(0.5, 0.5, f"Insufficient data for {regime} regime",
                 horizontalalignment='center', verticalalignment='center')

plt.tight_layout()
plt.savefig("feature_correlations_by_regime.png")

# === Price Change Distribution by Regime ===
plt.figure(figsize=(12, 8))
for regime in regime_map.values():
    regime_returns = df_all[df_all['Market_Regime'] == regime]['Return'].dropna()
    if len(regime_returns) > 10:
        sns.kdeplot(regime_returns, label=regime)

plt.title('Return Distribution by Market Regime')
plt.xlabel('Daily Return')
plt.ylabel('Density')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("return_distribution_by_regime.png")

# === Save Labeled Data ===
output_cols = ["start_time", "price", "Regime", "Market_Regime", "Return"] + features
df_all[output_cols].to_csv("hmm_labeled_data.csv", index=False)

print("\n=== Final Regime Analysis ===")
print(f"Current Market Regime: {df_all['Market_Regime'].iloc[-1]}")

# Predict next regime
current_features = df_all[features].tail(30)
next_regime, probabilities = predict_regime(df_all[features], window_size=30)

print(f"Predicted Next Regime: {next_regime}")
print("Transition Probabilities:")
for regime, prob in probabilities.items():
    print(f"  {regime}: {prob:.2%}")

# Final regime summary
regime_summary = df_all.groupby('Market_Regime').agg({
    'Return': ['mean', 'std', 'median'],
    'price': ['count', 'min', 'max']
}).round(4)

print("\n=== Regime Summary Statistics ===")
print(regime_summary)

ModuleNotFoundError: No module named 'hmmlearn'

# Prediction

In [None]:
import pandas as pd
import joblib  # Ensure joblib is imported
from sklearn.preprocessing import StandardScaler

# === Load the trained model and scaler ===
with open("Enhanced_HMM_Model.pkl", "rb") as f:
    hmm_model = joblib.load(f)

with open("Feature_Scaler.pkl", "rb") as f:
    scaler = joblib.load(f)

# === Load new test dataset ===
data_path = "btc_features_output.csv"
df = pd.read_csv(data_path)

# === Define the same features used during training ===
features = [
    "gn_distribution_balance_wbtc",
    "gn_addresses_supply_balance_more_100k",
    "gn_derivatives_futures_open_interest_crypto_margin_sum",
    "gn_supply_active_3y_5y",
    "gn_market_realized_volatility_3_months",
    "cq_reserve_usd",
    "gn_supply_active_more_1y_percent",
    "gn_supply_illiquid_sum",
    "gn_blockchain_utxo_loss_count",
    "gn_derivatives_futures_annualized_basis_3m"
]

df.rename(columns={'unified_close': 'price'}, inplace=True)
df['start_time'] = pd.to_datetime(df['start_time'], unit='ms')
df = df.sort_values("start_time").reset_index(drop=True)

# Check for missing values and handle them
print(f"Original data shape: {df.shape}")
missing_pct = df[features + ["price"]].isna().mean() * 100
print(f"Missing values percentage:\n{missing_pct}")

# === Feature Engineering ===
df['price_1d_change'] = df['price'].pct_change(1)
df['price_7d_change'] = df['price'].pct_change(7)
df['price_volatility'] = df['price'].pct_change().rolling(7).std().fillna(0)

if 'volume' in df.columns:
    df['volume_change'] = df['volume'].pct_change().fillna(0)
    df['volume_ma_ratio'] = df['volume'] / df['volume'].rolling(7).mean().fillna(df['volume'])
    features.extend(['volume_change', 'volume_ma_ratio'])

features.extend(['price_1d_change', 'price_7d_change', 'price_volatility'])

# === Extract these features from the test data ===
X_test = df[features]

# Handle missing values in feature columns (mean imputation)
X_test = X_test.fillna(X_test.mean())

# === Apply the same scaler to the test features ===
X_test_scaled = scaler.transform(X_test)

# === Use the model to predict regimes ===
predicted_regimes = hmm_model.predict(X_test_scaled)

# === Attach the prediction result to the original dataframe ===
df["predicted_regime"] = predicted_regimes

# === Count occurrences of each regime ===
regime_counts = df["predicted_regime"].value_counts().sort_index()
print("\nNumber of data points in each regime:")
print(regime_counts)

# === Optionally save it to a new CSV ===
df.to_csv("btc_test_data_with_predictions.csv", index=False)

# === Preview last few predictions ===
print(df[["start_time", "predicted_regime"]].tail(10))


Original data shape: (8343, 61)
Missing values percentage:
gn_distribution_balance_wbtc                              0.023972
gn_addresses_supply_balance_more_100k                     0.011986
gn_derivatives_futures_open_interest_crypto_margin_sum    0.011986
gn_supply_active_3y_5y                                    0.023972
gn_market_realized_volatility_3_months                    0.023972
cq_reserve_usd                                            0.000000
gn_supply_active_more_1y_percent                          0.023972
gn_supply_illiquid_sum                                    0.023972
gn_blockchain_utxo_loss_count                             0.011986
gn_derivatives_futures_annualized_basis_3m                0.011986
price                                                     0.023972
dtype: float64

Number of data points in each regime:
predicted_regime
0    2647
1    2913
2    1417
3    1365
4       1
Name: count, dtype: int64
              start_time  predicted_regime
8333 2024-04-1

# Strategy

In [None]:
import pandas as pd
import numpy as np

# === Load your prediction dataset ===
df = pd.read_csv("btc_test_data_with_predictions.csv")
df['start_time'] = pd.to_datetime(df['start_time'], errors='coerce')

# === Compute price metrics ===
df['price_1d_change'] = df['price'].pct_change()
df['price_volatility'] = df['price'].pct_change().rolling(7).std()
df['price_mean'] = df['price'].rolling(20).mean()
df['price_std'] = df['price'].rolling(20).std()
df['price_zscore'] = (df['price'] - df['price_mean']) / df['price_std']

# === Define parameters ===
fee_rate = 0.0006  # 0.06%
df['position'] = 0
df['trade'] = 0
df['pnl'] = 0.0
df['cum_pnl'] = 0.0
df['trade_return'] = 0.0

# === Initialize position, PnL tracker ===
position = 0
cash_pnl = 0.0
trade_returns = []

# === Strategy logic ===
for i in range(1, len(df)):
    regime = df.loc[i, 'predicted_regime']
    momentum = df.loc[i, 'price_1d_change']
    vol = df.loc[i, 'price_volatility']
    zscore = df.loc[i, 'price_zscore']
    price_now = df.loc[i, 'price']

    new_position = position

    if regime == 4:  # Bull
        if momentum > 0.005 and 0.01 < vol < 0.03 and zscore < 2:
            new_position = 1
    elif regime == 0:  # Bear
        if momentum < -0.005 and vol > 0.03 and zscore > -2:
            new_position = -1
    else:
        new_position = 0

    if new_position != position:
        trade_value = price_now * abs(new_position - position)

        trade_pnl = 0

        if position == 1:
            trade_pnl += price_now * 1
            trade_pnl -= fee_rate * price_now * 1
        elif position == -1:
            trade_pnl -= price_now * 1
            trade_pnl -= fee_rate * price_now * 1

        if new_position == 1:
            trade_pnl -= price_now * 1
            trade_pnl -= fee_rate * price_now * 1
        elif new_position == -1:
            trade_pnl += price_now * 1
            trade_pnl -= fee_rate * price_now * 1

        cash_pnl += trade_pnl
        df.loc[i, 'trade'] = 1
        df.loc[i, 'trade_return'] = trade_pnl
        if trade_pnl != 0:
            trade_returns.append(trade_pnl)

        position = new_position

    df.loc[i, 'position'] = position
    df.loc[i, 'pnl'] = cash_pnl
    df.loc[i, 'cum_pnl'] = cash_pnl

# === Summary Stats ===
total_pnl = df['cum_pnl'].iloc[-1]
num_trades = df['trade'].sum()

# Trading period
period_days = (df['start_time'].max() - df['start_time'].min()).days
trades_per_day = num_trades / period_days if period_days else 0

# Win rate
wins = len([x for x in trade_returns if x > 0])
win_rate = wins / len(trade_returns) if trade_returns else 0

# Average return per trade
avg_trade_return = np.mean(trade_returns) if trade_returns else 0

# Max Drawdown
df['peak'] = df['cum_pnl'].cummax()
df['drawdown'] = df['cum_pnl'] - df['peak']
max_drawdown = df['drawdown'].min()

# Sharpe Ratio (daily returns, risk-free rate 0)
daily_pnl = df['cum_pnl'].diff()
sharpe_ratio = (daily_pnl.mean() / daily_pnl.std()) * np.sqrt(365) if daily_pnl.std() != 0 else 0

# === Print Summary ===
print("=== Strategy Performance Summary ===")
print(f"Total PnL: {total_pnl:.4f}")
print(f"Number of Trades: {int(num_trades)}")
print(f"Trading Period: {period_days} days")
print(f"Trade Frequency: {trades_per_day:.2f} trades/day")
print(f"Win Rate: {win_rate*100:.2f}%")
print(f"Average Trade Return: {avg_trade_return:.4f}")
print(f"Max Drawdown: {max_drawdown:.4f}")
print(f"Sharpe Ratio: {sharpe_ratio:.4f}")

# === Export result ===
df.to_csv("btc_strategy_pnl_output.csv", index=False)


=== Strategy Performance Summary ===
Total PnL: -2.8826
Number of Trades: 40
Trading Period: 19529 days
Trade Frequency: 0.00 trades/day
Win Rate: 50.00%
Average Trade Return: -0.0721
Max Drawdown: -2.8826
Sharpe Ratio: -0.3202


# Backtest

In [None]:
class Strategy(ABC):
    @abstractmethod
    def generate_signals(self, data: pd.DataFrame) -> pd.Series:
        """
        data: OHLCV (and extra) up to t‑1
        returns: Series indexed                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    like data of +1 (long), -1 (short), 0 (flat)
        """

class RegimeBasedStrategy(Strategy):
    def __init__(self, regime_column: str, long_regimes: list, short_regimes: list):
        self.regime_column = regime_column
        self.long_regimes = long_regimes
        self.short_regimes = short_regimes

    def generate_signals(self, data: pd.DataFrame) -> pd.Series:
        signals = pd.Series(index=data.index, dtype=int)
        signals[:] = 0  # Flat by default

        signals[data[self.regime_column].isin(self.long_regimes)] = 1
        signals[data[self.regime_column].isin(self.short_regimes)] = -1

        return signals

In [None]:
from typing import Any
import pandas as pd

class EasyPipeline:
    def __init__(
        self,
        data: pd.DataFrame,
        strategy: Any,
        trade_fee: float = 0.0005
    ):
        # Validate data type
        if not isinstance(data, pd.DataFrame):
            raise ValueError("`data` must be a pandas DataFrame.")
        # Ensure a 'close' column exists (or rename 'close_price')
        if 'close' not in data.columns:
            if 'close_price' in data.columns:
                data = data.rename(columns={'close_price': 'close'})
            else:
                raise ValueError("`data` must contain a 'close' or 'close_price' column.")
        self.data = data.copy()
        self.strategy = strategy
        self.trade_fee = trade_fee

        # Pre-generate and validate signals as positions
        positions = self.strategy.generate_signals(self.data)
        if not isinstance(positions, pd.Series):
            raise ValueError("Signals must be returned as a pandas Series.")
        invalid = set(positions.unique()) - {-1, 0, 1}
        if invalid:
            raise ValueError(f"Signals may only contain -1, 0, or 1; found {invalid}.")
        # Align index
        positions = positions.reindex(self.data.index).fillna(0).astype(int)
        self.positions = positions

    def run_pipeline(self) -> pd.DataFrame:
        # Run the backtest using pre-validated data & positions
        bt = Backtester(self.data, self.positions, trade_fee=self.trade_fee)
        return bt.run()

In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from strategies.regime_based_strategy import RegimeBasedStrategy
from backtest.backtester import Backtester

# Load model and scaler
model = joblib.load("models/Enhanced_HMM_Model.pkl")
scaler = joblib.load("models/Feature_Scaler.pkl")

# Load data
df = pd.read_csv("btc_features_output.csv")
df.rename(columns={'unified_close': 'price'}, inplace=True)
df['start_time'] = pd.to_datetime(df['start_time'], unit='ms')
df = df.sort_values("start_time").reset_index(drop=True)

# Define features
features = [
    "gn_distribution_balance_wbtc",
    "gn_addresses_supply_balance_more_100k",
    "gn_derivatives_futures_open_interest_crypto_margin_sum",
    "gn_supply_active_3y_5y",
    "gn_market_realized_volatility_3_months",
    "cq_reserve_usd",
    "gn_supply_active_more_1y_percent",
    "gn_supply_illiquid_sum",
    "gn_blockchain_utxo_loss_count",
    "gn_derivatives_futures_annualized_basis_3m"
]  # Same as before

# Feature engineering
df['price_1d_change'] = df['price'].pct_change(1)
df['price_7d_change'] = df['price'].pct_change(7)
df['price_volatility'] = df['price'].pct_change().rolling(7).std().fillna(0)

# Scaling features
X_test = df[features].fillna(df[features].mean())
X_scaled = scaler.transform(X_test)

# Predict regimes
df['predicted_regime'] = model.predict(X_scaled)

# Initialize strategy
strategy = RegimeBasedStrategy("predicted_regime", [0, 3], [1, 2])
df['signal'] = strategy.generate_signals(df)

# Backtest
backtester = Backtester(df['price'], df['signal'])
results = backtester.run()

# Plot performance
backtester.plot_performance(results, df['start_time'])
