<a href="https://colab.research.google.com/github/holguinmora123/OpenAi_Codex/blob/main/3.%20XAUUSD_MultyLabel_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set_Up

In [1]:
symbol     = 'BTCUSD'
strategy   = 'Kalman'
time_frame = 'M5'

root_data = f'/content/drive/MyDrive/Course Folder/Forex/XAUUSD/'
print(root_data)

rolling_window = 100

direction = 'Short'
direction_number = -1

/content/drive/MyDrive/Course Folder/Forex/XAUUSD/


# Import_Libraries

In [2]:
!pip install ta-lib
import talib as ta
print(ta.__version__)

Collecting ta-lib
  Downloading ta_lib-0.6.7-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (24 kB)
Downloading ta_lib-0.6.7-cp312-cp312-manylinux_2_28_x86_64.whl (4.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ta-lib
Successfully installed ta-lib-0.6.7
0.6.7


In [3]:
# ============================================================
# 1. __FUTURE__ IMPORTS & TYPE HINTS

from __future__ import annotations
from typing import List, Tuple, Optional, Dict, Any

# ============================================================
# 2. DATA MANIPULATION & UTILITIES

import os
import sys
import warnings
from datetime import timedelta

import numpy as np
import pandas as pd

# Ignore warnings
warnings.filterwarnings("ignore")

# Add custom path for local modules
sys.path.append("..")

# ============================================================
# 3. MACHINE LEARNING MODELS & PREPROCESSING

# --- XGBoost ---
from xgboost import XGBClassifier
import xgboost

# --- Scikit-learn Models ---
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    VotingClassifier,
    ExtraTreesClassifier)

from sklearn.pipeline import make_pipeline

from sklearn import svm
from sklearn.svm import SVC

# --- Scikit-learn Preprocessing ---
from sklearn.preprocessing import (
    StandardScaler,
    label_binarize
)

# --- Scikit-learn Model Selection ---
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV
)

# --- Scikit-learn Metrics ---
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score)

from imblearn.over_sampling import SMOTE

# ============================================================
# 4. DEEP LEARNING (TENSORFLOW / KERAS)

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense

# ============================================================
# 5. VISUALIZATION

import matplotlib.pyplot as plt
plt.style.use("seaborn-v0_8-whitegrid")

# ============================================================
# 6. MODEL PERSISTENCE

import joblib


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Support Functions

In [5]:
def results(data, pnl_column='PnL'):
    # Calculate the metrics
    time_difference = data.index.max() - data.index.min()
    days = time_difference.days
    total_trades = data[data['Open_Trade'].notna() & (data['Open_Trade'] != 0)].shape[0]
    profit_trades = data[data[pnl_column] > 0].shape[0]
    loss_trades = data[data[pnl_column] < 0].shape[0]
    profits = data[data[pnl_column] > 0][pnl_column].sum()
    losses = data[data[pnl_column] < 0][pnl_column].sum()

    # Create a dictionary with the results
    results_dict = {
        'days': days,
        'total_trades': total_trades,
        '': '',
        'income': profits,
        'losses': losses,
        'profits': profits + losses,
        ' ':' ',
        'profit_trades': profit_trades,
        'loss_trades': loss_trades,
        '  ':'  ',
        '% Win_Trades': profit_trades / (profit_trades + loss_trades) * 100 if (profit_trades + loss_trades) > 0 else 0,
        '% Loss_Trades': loss_trades / (profit_trades + loss_trades)*100 if (profit_trades + loss_trades) > 0 else 0
    }

    # Create a DataFrame from the dictionary and transpose it
    perf_metrics = pd.DataFrame([results_dict]).T

    # Rename the column
    perf_metrics.rename(columns={0: 'Results'}, inplace=True)

    # Format the DataFrame for display
    perf_metrics.loc[['days', 'total_trades', 'profit_trades', 'loss_trades','% Win_Trades','% Loss_Trades'], 'Results'] = perf_metrics.loc[['days', 'total_trades', 'profit_trades', 'loss_trades','% Win_Trades','% Loss_Trades'], 'Results'].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else '')
    perf_metrics.loc[['income', 'losses', 'profits'], 'Results'] = perf_metrics.loc[['income', 'losses', 'profits'], 'Results'].apply(lambda x: f"${x:,.2f}" if pd.notna(x) else '')

    return perf_metrics

In [6]:
# Function for creating features
def create_features(train_data, index):
    # Define short and long periods
    short_periods = [3, 5, 7, 10, 15, 17]
    long_periods = [20, 22, 66, 126, 252]

    # Combined list of lookbacks
    periods = short_periods + long_periods

    # Initialise an empty DataFrame to store the results
    features = pd.DataFrame(index=index)

    # Calculate technical indicators for each specified period
    for period in periods:
        # Relative Strength Index (RSI)
        features[f'RSI_{period}'] = ta.RSI(
            train_data['Close'], timeperiod=period)

        # Money Flow Index (MFI)
        features[f'MFI_{period}'] = ta.MFI(
            train_data['High'], train_data['Low'], train_data['Close'], train_data['Volume'], timeperiod=period)

        # Average Directional Index (ADX)
        features[f'ADX_{period}'] = ta.ADX(
            train_data['High'], train_data['Low'], train_data['Close'], timeperiod=period)

        # On-Balance Volume (OBV)
        features[f'OBV_{period}'] = ta.OBV(
            train_data['Close'], train_data['Volume'])

        # Accumulation/Distribution Line (AD)
        features[f'AD_{period}'] = ta.AD(
            train_data['High'], train_data['Low'], train_data['Close'], train_data['Volume'])

        # Rate of Change (ROCP)
        features[f'ROCP_{period}'] = ta.ROCP(
            train_data['Close'], timeperiod=period)

    # Calculate Simple Moving Average and Exponential Moving Average Crossovers
    for short_period in short_periods:
        for long_period in long_periods:
            # SMA Crossover
            features[f'SMA_Crossover_{short_period}_{long_period}'] = ta.SMA(
                train_data['Close'], timeperiod=short_period) - ta.SMA(train_data['Close'], timeperiod=long_period)

            # EMA Crossover
            features[f'EMA_Crossover_{short_period}_{long_period}'] = ta.EMA(
                train_data['Close'], timeperiod=short_period) - ta.EMA(train_data['Close'], timeperiod=long_period)


    # Drop the NaN values
    features.dropna(inplace=True)

    # Return the dataframe features_df
    return features

def scale_features_data(features):
    # Standardise the input data (X)
    scaler = StandardScaler()

    # Split the dataset into training and testing sets as 80:20
    train_data = features.iloc[:(int(len(features) * 0.8))]
    test_data  = features.iloc[(int(len(features) * 0.8)):]

    # Scale the training and testing sets
    X_train = pd.DataFrame(data=scaler.fit_transform(
        train_data), columns=features.columns, index=train_data.index)
    X_test = pd.DataFrame(data=scaler.transform(test_data),
                          columns=features.columns, index=test_data.index)

    return X_train, X_test

def strategy_returns_dynamic_different_thresholds(prices, threshold):
    curr_pos = 0
    hold_days = 0
    for dt in prices.index:
        if curr_pos==0 or hold_days==20:
            if prices.loc[dt, 'Rolling Returns'] >= threshold * prices.loc[dt, 'Yearly Stdev']:
                prices.loc[dt, 'Signal_'+str(threshold)] = 1

            elif prices.loc[dt, 'Rolling Returns'] < -threshold * prices.loc[dt, 'Yearly Stdev']:
                prices.loc[dt, 'Signal_'+str(threshold)] = -1

            else:
                prices.loc[dt, 'Signal_'+str(threshold)] = 0

            curr_pos = prices.loc[dt, 'Signal_'+str(threshold)]
            hold_days = 0

        elif curr_pos!=0:
            hold_days+=1

    prices['Signal_'+str(threshold)].ffill(inplace=True)
    prices['Strategy Returns_'+str(threshold)] = prices['Close'].pct_change() * prices['Signal_'+str(threshold)].shift(1)
    cumulative_strategy_returns = (prices['Strategy Returns_'+str(threshold)] +1).cumprod()

    return cumulative_strategy_returns

def strategy_returns_different_thresholds(prices, threshold):
    curr_pos = 0
    hold_days = 0
    for dt in prices.index:
        if curr_pos==0 or hold_days==20:
            if prices.loc[dt, 'Rolling Returns'] >= threshold:
                prices.loc[dt, 'Signal_'+str(threshold)] = 1
            elif prices.loc[dt, 'Rolling Returns'] < threshold:
                prices.loc[dt, 'Signal_'+str(threshold)] = -1

            curr_pos = prices.loc[dt, 'Signal_'+str(threshold)]
            hold_days = 0

        elif curr_pos!=0:
            hold_days+=1

    prices['Signal_'+str(threshold)].ffill(inplace=True)

    prices['Strategy Returns_'+str(threshold)] = prices['Close'].pct_change() * prices['Signal_'+str(threshold)].shift(1)

    cumulative_strategy_returns = (prices['Strategy Returns_'+str(threshold)] +1).cumprod()
    return cumulative_strategy_returns

def generate_trade_sheet(data):
    trade_list = []  # Use a list to store trade data
    current_position = 0
    entry_date = ''
    entry_price = ''
    exit_date = ''
    exit_price = ''
    data.reset_index(inplace=True)

    for i in data.index:

        if current_position == 0:
            entry_date = data.loc[i, 'Date']
            entry_price = data.loc[i, 'Close']
            current_position = data.loc[i, 'signal']

        elif np.abs(data.loc[i, 'signal'] - data.loc[i-1, 'signal']) != 0:
            exit_date = data.loc[i, 'Date']
            exit_price = data.loc[i, 'Close']
            trade_list.append(
                (current_position, entry_date, round(entry_price,2), exit_date, round(exit_price,2))) # Append to list
            current_position = 0

    trade_sheet = pd.DataFrame(trade_list, columns=['Position', 'Entry Date', # Convert list to DataFrame
                           'Entry Price', 'Exit Date', 'Exit Price'])
    trade_sheet['PnL'] = round((trade_sheet['Exit Price'] - trade_sheet['Entry Price']) * trade_sheet['Position'],2)
    return trade_sheet


def trade_analytics(trades):
    analytics = pd.DataFrame(index=['Strategy'])
    analytics['Total PnL'] = round(trades.PnL.sum(),2)
    analytics['Total Trades'] = len(trades.loc[trades.Position!=0])
    analytics['Number of Winners'] = len(trades.loc[trades.PnL>0])
    analytics['Number of Losers'] = len(trades.loc[trades.PnL<=0])
    analytics['Win (%)'] = round(100*analytics['Number of Winners']/analytics['Total Trades'],2)
    analytics['Loss (%)'] = round(100*analytics['Number of Losers']/analytics['Total Trades'],2)
    analytics['Average Profit of Winning Trade'] = round(trades.loc[trades.PnL>0].PnL.mean(),2)
    analytics['Average Loss of Losing Trade'] = round(np.abs(trades.loc[trades.PnL<=0].PnL.mean()),2)
    trades['Entry Date'] = pd.to_datetime(trades['Entry Date'])
    trades['Exit Date'] = pd.to_datetime(trades['Exit Date'])
    holding_period = trades['Exit Date'] - trades['Entry Date']
    analytics['Average Holding Time'] = holding_period.mean()
    analytics['Profit Factor'] = round((analytics['Win (%)']/100*analytics['Average Profit of Winning Trade'])/(analytics['Loss (%)']/100*analytics['Average Loss of Losing Trade']),2)
    return analytics.T

def performance_metrics(data):
    data.set_index('Date', inplace=True)
    performance_metrics = pd.DataFrame(index=['Strategy'])
    data['Strategy Returns'] = data.signal.shift(1) * data.Close.pct_change()
    data['Cumulative Returns'] = (data['Strategy Returns'] + 1.0).cumprod()
    data['Cumulative Benchmark Returns'] = (data['Close'].pct_change() +1).cumprod()
    data['Cumulative Returns'].plot(figsize=(15, 7), label='Strategy Returns')
    data['Cumulative Benchmark Returns'].plot(label='Benchmark Returns')
    plt.title('Equity Curve', fontsize=14)
    plt.ylabel('Cumulative Returns', fontsize = 12)
    plt.xlabel('Date', fontsize = 12)
    plt.legend()
    plt.show()
    days = len(data['Cumulative Returns'])
    performance_metrics['CAGR'] = "{0:.2f}%".format(
        (data['Cumulative Returns'].iloc[-1]**(252/days)-1)*100)
    performance_metrics['Annualised Volatility'] = "{0:.2f}%".format(
        data['Strategy Returns'].std()*np.sqrt(252) * 100)
    risk_free_rate = 0.02/252
    performance_metrics['Sharpe Ratio'] = round(np.sqrt(252)*(np.mean(data['Strategy Returns']) -
                                                        (risk_free_rate))/np.std(data['Strategy Returns']),2)
    data['Peak'] = data['Cumulative Returns'].cummax()
    data['Drawdown'] = ((data['Cumulative Returns'] - data['Peak'])/data['Peak'])
    performance_metrics['Maximum Drawdown'] =  "{0:.2f}%".format((data['Drawdown'].min())*100)
    plt.figure(figsize=(15, 7))
    plt.title('Drawdowns', fontsize=14)
    plt.ylabel('Drawdown', fontsize=12)
    plt.xlabel('Date', fontsize=12)
    plt.plot(data['Drawdown'], color='red')
    plt.fill_between(data['Drawdown'].index, data['Drawdown'].values, color='red')
    plt.show()
    print(performance_metrics.T)

# Function for creating target variable
def target_var(data, window_size=20):
    target = pd.DataFrame(index=data.index)

    # Calculate the change in closing prices i.e. momentum over the specified window size
    target['signal'] = data.Close.pct_change(window_size).shift(-window_size)

    # Drop the NaN values
    target.dropna(inplace=True)

    # Convert the change into binary signals: 1 for positive change, -1 for negative change
    target['signal'] = np.where(target['signal'] > 0, 1, -1)

    return target

# Function to split and scale the data
from sklearn.model_selection import train_test_split

def train_test_split_wrapper(features, target, test_size=0.2, random_state=None):
    # Split the features and target datasets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

def compile_encoder_decoder_model(X_train, optimizer='adam', loss='mean_squared_error'):
    # Define the architecture of the autoencoder model
    model = Sequential()  # Create a sequential model

    # Add a dense layer with 64 neurons and ReLU activation function as the input layer
    model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))

    # Add a dense layer with 32 neurons and ReLU activation function
    model.add(Dense(32, activation='relu'))

    # Add a dense layer with 8 neurons and ReLU activation function
    model.add(Dense(8, activation='relu'))

    # Add another dense layer with 32 neurons and ReLU activation function
    model.add(Dense(32, activation='relu'))

    # Add another dense layer with 64 neurons and ReLU activation function
    model.add(Dense(64, activation='relu'))

    # Add a dense layer with the same number of neurons as the input data and linear activation function
    model.add(Dense(X_train.shape[1], activation='linear'))

    # Compile the model
    model.compile(optimizer=optimizer, loss=loss)

    return model

def calculate_portfolio_returns_csmom(monthly_returns, portfolio='long-short', lookback_months=12):
    stock_monthly_returns = pd.DataFrame()

    # Loop through each month after the lookback period
    for i in range(lookback_months, len(monthly_returns)):

        returns = monthly_returns[i - lookback_months:i + 1]
        trailing_returns = returns[:lookback_months]

        # Extract the starting, ending, and holding months from the subset
        starting_month = str(returns.index[0])[:7]
        ending_month = str(returns.index[-2])[:7]
        holding_month = str(returns.index[-1])[:7]

        # Set returns data as the transposed scaled trailing returns
        returns_data = trailing_returns.T

        # Initialize the number of clusters and maximum number of stocks per cluster
        num_clusters = 1
        max_stocks_per_cluster = 10

        # Perform hierarchical clustering using 'ward' linkage method
        linkage_matrix = linkage(trailing_returns.T, method='ward')

        # Assign cluster labels to stocks, ensuring each cluster has at most 10 stocks
        clusters = fcluster(linkage_matrix, num_clusters, criterion='maxclust')

        # Assign the cluster labels to the original returns data
        returns_data['Cluster'] = clusters

        # Adjust clusters until each cluster meets the constraint
        while max(returns_data['Cluster'].value_counts()) > max_stocks_per_cluster:
            num_clusters += 1
            clusters = fcluster(linkage_matrix, num_clusters, criterion='maxclust')
            returns_data['Cluster'] = clusters

        # Define the minimum number of stocks in a cluster
        minimum_stocks_in_cluster = 2

        # Filter out clusters with fewer than the minimum number of stocks
        filtered_clusters = returns_data.groupby('Cluster').filter(
            lambda x: len(x) >= minimum_stocks_in_cluster)['Cluster'].unique()

        # Assign the filtered cluster labels to the original price data
        returns_data = returns_data[returns_data['Cluster'].isin(filtered_clusters)]

        # Calculate the returns for each cluster and sum across clusters
        cluster_returns = returns_data.groupby('Cluster').mean().sum(axis=1)

        if portfolio == 'long-short':
            # Identify stocks to go short and long based on cluster returns
            short = np.array(returns_data[returns_data.Cluster ==
                                          cluster_returns.idxmin()].index)

            long = np.array(returns_data[returns_data.Cluster ==
                                         cluster_returns.idxmax()].index)

            # Extract the returns for holding stocks in the current month
            hold_returns = returns.iloc[-1]

            # Calculate the average returns for the stocks to go long and short
            long_returns = hold_returns[long].mean()
            short_returns = -1 * hold_returns[short].mean()

            # Copy monthly returns data for further manipulation
            returns_monthly = monthly_returns.copy()

            # Select returns for stocks in the long and short portfolios for the holding month
            monthly_portfolio_returns = returns_monthly[list(
                long) + list(short)][holding_month]

            # Adjust returns for short positions
            monthly_portfolio_returns[short] *= -1

        elif portfolio == 'long':
            long = np.array(returns_data[returns_data.Cluster ==
                                         cluster_returns.idxmax()].index)

            # Extract the returns for holding stocks in the current month
            hold_returns = returns.iloc[-1]

            # Calculate the average returns for the stocks to go long
            long_returns = hold_returns[long].mean()

            # Copy monthly returns data for further manipulation
            returns_monthly = monthly_returns.copy()

            # Select returns for stocks in the long portfolio for the holding month
            monthly_portfolio_returns = returns_monthly[list(
                long)][holding_month]

        elif portfolio == 'short':
            short = np.array(returns_data[returns_data.Cluster ==
                                          cluster_returns.idxmin()].index)

            # Extract the returns for holding stocks in the current month
            hold_returns = returns.iloc[-1]

            # Calculate the average returns for the stocks to go short
            short_returns = -1 * hold_returns[short].mean()

            # Copy monthly returns data for further manipulation
            returns_monthly = monthly_returns.copy()

            # Select returns for stocks in the short portfolio for the holding month
            monthly_portfolio_returns = returns_monthly[list(
                short)][holding_month]

        # Append adjusted returns for the holding month to the stock_monthly_returns dataframe
        stock_monthly_returns = stock_monthly_returns.append(monthly_portfolio_returns)

    return stock_monthly_returns

def plot_and_display_metrics_csmom(stock_monthly_returns):
    portfolio_returns = stock_monthly_returns.mean(axis=1)
    # Plot the portfolio returns
    fig, ax = plt.subplots(figsize=(15, 7))
    portfolio_returns.plot(ax=ax)

    # Set the title and axis labels
    ax.set_title('Portfolio Returns Over Time')
    ax.set_xlabel('Time')
    ax.set_ylabel('Returns')
    ax.axhline(y=0, color='black', linestyle='-')

    # Fill area below 0 with red color
    ax.fill_between(portfolio_returns.index, portfolio_returns, 0,
                    where=portfolio_returns < 0, color='red', alpha=0.3)

    # Fill area above 0 with green color
    ax.fill_between(portfolio_returns.index, portfolio_returns, 0,
                    where=portfolio_returns >= 0, color='green', alpha=0.3)

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45, ha='right')

    # Calculate cumulative returns
    cumulative_returns = (portfolio_returns + 1).cumprod()

    # Convert index to datetime format
    cumulative_returns.index = pd.to_datetime(cumulative_returns.index)

    # Plot cumulative returns
    plt.figure(figsize=(15, 7))
    cumulative_returns.plot()

    # Labeling axes and title
    plt.ylabel('Cumulative Returns', fontsize=12)
    plt.title('Cross Sectional Momentum Strategy Returns', fontsize=14)

    # Show plot
    plt.show()

    # Calculate maximum cumulative returns up to each point
    max_cumulative_returns = cumulative_returns.cummax()

    # Calculate drawdown
    drawdown = (cumulative_returns - max_cumulative_returns) / max_cumulative_returns

    # Plot drawdown
    plt.figure(figsize=(15, 7))
    # Fill area under the drawdown curve with red color
    plt.fill_between(drawdown.index, drawdown, 0, color='red', alpha=0.3)
    plt.ylabel('Drawdown', fontsize=12)
    plt.title('Cross Sectional Momentum Strategy Drawdown', fontsize=14)
    plt.show()

    # Display the metrics
    # Calculate monthly Sharpe ratio
    monthly_sharpe = portfolio_returns.mean() / portfolio_returns.std()

    # Annualize Sharpe ratio for monthly data
    sharpe = round(monthly_sharpe * ((12) ** 0.5),2)

    # Calculate drawdown
    max_cumulative_returns = cumulative_returns.cummax()
    drawdown = (cumulative_returns - max_cumulative_returns) / max_cumulative_returns
    max_drawdown_index = drawdown.idxmin()
    max_drawdown_date = max_drawdown_index.strftime('%Y-%m-%d')
    max_drawdown_value = round(drawdown.min(),2)

    # Create a DataFrame to hold the metrics
    metrics = pd.DataFrame({
        'Metric': ['Sharpe Ratio', 'Maximum Drawdown Date', 'Maximum Drawdown Value'],
        'Value': [sharpe, max_drawdown_date, max_drawdown_value]
    })

    # Display metrics
    print("\nPerformance Metrics:")
    display(metrics.rename_axis(None, axis=1))

def predict_signals(X_test, aapl_test_prices_ts, model, scaler):
    # Initialise current position
    current_pos = 0

    # Initialise count of holding days
    hold_days = 0

    # Iterate through the rows of test data
    for dt, row in X_test.iterrows():
        # Check if there is no position or holding period reaches 20 days
        if current_pos == 0 or hold_days == 20:
            # Prepare test data for prediction
            test = pd.DataFrame(data=scaler.transform(
                row.values.reshape(1, -1)), columns=X_test.columns)

            # Generate signal based on test data
            signal = model.predict(test)[-1]

            # Update current position
            current_pos = signal

            # Update predicted and actual labels for the current date
            aapl_test_prices_ts.loc[dt, 'signal'] = current_pos

            # Reset holding days counter
            hold_days = 0
        elif current_pos != 0:
            # If there is an existing position, increment holding days counter
            hold_days += 1

    # Forward fill the last observed value for 'y_pred'
    aapl_test_prices_ts['signal'].ffill(inplace=True)

    return aapl_test_prices_ts

# Data

In [11]:
data_type = 'Scale'

In [12]:
### Open OHLC dataframe

ohlc = pd.read_csv(root_data + 'Data/'+symbol+'_M5.csv', index_col=0)
ohlc.index = pd.to_datetime(ohlc.index)
time_difference = ohlc.index.max() - ohlc.index.min()
number_of_days = time_difference.days

print(f"The train_data DataFrame covers a period of {number_of_days} days.")
ohlc.tail(3)

The train_data DataFrame covers a period of 937 days.


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Spread
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-07-26 23:45:00,118007.0,118100.13,117967.5,118023.88,757,100
2025-07-26 23:50:00,118023.63,118052.38,117951.88,118005.75,547,0
2025-07-26 23:55:00,118008.75,118049.38,118008.5,118042.63,485,587


In [13]:
### Features

features_5m = pd.read_csv(root_data + 'Results/'+symbol+'_'+direction+'_M5M10_'+data_type+'_Features.csv', index_col=0)
features_5m['Date'] = pd.to_datetime(features_5m['Date'])
features_5m.set_index('Date', inplace=True)

print(features_5m.shape)
features_5m.tail(3)

(14364, 358)


Unnamed: 0_level_0,10min_RSI_3,10min_ROCP_3,slope_angle_900_18,RSI_7,OBV_15,10min_RSI_5,slope_angle_100_3,slope_lin_reg_300_3,slope_angle_signal_900_15,10min_SMA_Crossover_5_22,...,MFI_17,MFI_126,10min_ROCP_10,EMA_Crossover_7_20,ADX_17,ADX_20,skew_4,slope_signal_500_18,ADX_7,slope_lin_reg_signal_700_6
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-07-26 20:30:00,-0.926057,-0.156305,-1.037497,-0.803935,-1.906249,-0.871041,-1.015568,1.915695,-0.995793,-0.508498,...,-0.124052,1.310115,-0.65576,-0.141638,-1.247766,-1.012565,-0.156181,-0.995065,-1.484394,-1.006785
2025-07-26 21:15:00,-1.507646,-0.409844,-1.033299,-0.951771,-1.905907,-1.438174,-0.359104,1.914857,-0.995793,-0.533613,...,0.19762,1.637437,-0.623155,-0.101397,-1.325697,-1.239763,-0.744348,1.00496,-1.081896,0.993261
2025-07-26 23:05:00,-0.843252,-0.177475,0.979639,-0.420833,-1.89761,-0.597205,1.006584,1.922193,-0.995793,-0.085704,...,1.374808,1.722243,-0.167033,0.238822,-0.971668,-1.052633,-0.368052,-0.995065,-0.590553,0.993261


In [14]:
### Labels

lab = pd.read_csv(root_data + 'Results/'+symbol+'_'+strategy+'_'+time_frame+'_Strategy_Gen_Labels.csv', index_col=0)
lab['Date'] = pd.to_datetime(lab['Date'])
lab.set_index('Date', inplace=True)

print(lab.columns,'\n')
print(lab.shape)
lab.loc[lab['Open_Trade']== 1,'Open_Trade'].sum()

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'kal_1',
       'kal_2', 'kal_3', 'kal_4', 'Open_Trade', 'Close_Trade', 'Entry_Date',
       'Type', 'Trade_Number', 'st_Exit_Date', 'trade type', 'st_Duration',
       'st_row_PnL_close', 'st_row_PnL_high', 'st_row_PnL_Low',
       'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'st_atr_PnL',
       'st_atr_max_PnL'],
      dtype='object') 

(267395, 28)


np.float64(19600.0)

In [15]:
# --- Parámetros / campos
result_field = 'st_atr_max_PnL'   # métrica a evaluar

# --- Filtro de filas válidas
valid = (
    (lab['Type'] == direction) &
    (lab['Open_Trade'].isin([1, -1])) &
    (lab[result_field].notna())
)

# --- Conteos por rango (st_max_4..6)
st_max_4 = (valid & (lab[result_field] <= 0.5)).sum()
st_max_5 = (valid & (lab[result_field] >= 0.5) & (lab[result_field] <= 1.0)).sum()
st_max_6 = (valid & (lab[result_field] > 1.0)).sum()

print(f'<= 0.5          = {st_max_4:,d}')
print(f'> 0.5 & <= 1.0  = {st_max_5:,d}')
print(f'> 1.0           = {st_max_6:,d}')

# --- Etiquetado en la columna "label" con valores 4/5/6
lab['label'] = np.nan
lab.loc[valid & (lab[result_field] <= 0.5), 'label'] = 0
lab.loc[valid & (lab[result_field] > 0.5) & (lab[result_field] <= 1.0), 'label'] = 1
lab.loc[valid & (lab[result_field] > 1.0), 'label'] = 2

# --- Mantener solo filas válidas y con label
lab = lab.loc[valid & lab['label'].notna()].copy()
lab['label'] = lab['label'].astype('int8')

# --- Ver distribución de labels 4/5/6
print('\nValue counts de label 4/5/6:')
print(lab['label'].value_counts(dropna=False).sort_index())


<= 0.5          = 5,650
> 0.5 & <= 1.0  = 3,887
> 1.0           = 10,054

Value counts de label 4/5/6:
label
0     5650
1     3887
2    10054
Name: count, dtype: int64


In [198]:
#lab.loc[lab['Open_Trade'].notna(),['Close','Open_Trade', 'Entry_Date', 'Type', 'Trade_Number', 'label']]

In [21]:
### Merge

feat_obj = lab.merge(features_5m, left_index=True, right_index=True, how='left')
#feat_obj = feat_obj.merge(features_10m, left_index=True, right_index=True, how='left')
feat_obj.fillna(method='ffill', inplace=True)
#feat_obj.dropna(inplace=True)
print('Shape = ',feat_obj.shape,'\n')
print(list(feat_obj.columns),'\n')
feat_obj.tail(3)

Shape =  (19591, 387) 

['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'kal_1', 'kal_2', 'kal_3', 'kal_4', 'Open_Trade', 'Close_Trade', 'Entry_Date', 'Type', 'Trade_Number', 'st_Exit_Date', 'trade type', 'st_Duration', 'st_row_PnL_close', 'st_row_PnL_high', 'st_row_PnL_Low', 'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'st_atr_PnL', 'st_atr_max_PnL', 'label', '10min_RSI_3', '10min_ROCP_3', 'slope_angle_900_18', 'RSI_7', 'OBV_15', '10min_RSI_5', 'slope_angle_100_3', 'slope_lin_reg_300_3', 'slope_angle_signal_900_15', '10min_SMA_Crossover_5_22', 'slope_angle_100_6', 'slope_div_900_18', 'slope_lin_reg_signal_700_21', 'Kal_change_100_3', 'RSI_15', 'slope_lin_reg_900_3', 'EMA_Crossover_5_22', 'slope_angle_signal_100_15', 'RSI_5', 'slope_lin_reg_signal_300_15', 'slope_lin_reg_signal_900_21', 'RSI_10', '10min_SMA_Crossover_3_22', 'slope_angle_900_15', 'AD_7', 'slope_angle_signal_700_15', 'EMA_Crossover_5_20', 'slope_div_500_18', 'slope_signal_900_15', '10min_ROCP_5', '10min_SM

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Spread,ATR,kal_1,kal_2,kal_3,...,MFI_17,MFI_126,10min_ROCP_10,EMA_Crossover_7_20,ADX_17,ADX_20,skew_4,slope_signal_500_18,ADX_7,slope_lin_reg_signal_700_6
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-07-26 20:30:00,118099.63,118104.88,118051.63,118063.0,553,1200,77.12338,118094.685132,118085.650245,118077.185723,...,-0.124052,1.310115,-0.65576,-0.141638,-1.247766,-1.012565,-0.156181,-0.995065,-1.484394,-1.006785
2025-07-26 21:15:00,118117.88,118124.63,117983.0,118032.63,790,875,70.277366,118078.346452,118081.234543,118089.919364,...,0.19762,1.637437,-0.623155,-0.101397,-1.325697,-1.239763,-0.744348,1.00496,-1.081896,0.993261
2025-07-26 23:05:00,118188.63,118207.38,118159.63,118174.38,498,412,64.619303,118260.485208,118265.006008,118267.04478,...,1.374808,1.722243,-0.167033,0.238822,-0.971668,-1.052633,-0.368052,-0.995065,-0.590553,0.993261


In [17]:
columns_to_drop = ['st_row_PnL_Low']
feat_obj = feat_obj.drop(columns=columns_to_drop)
feat_obj.dropna(inplace=True)

In [18]:
columns_to_use = ['Open', 'High', 'Low', 'Close', 'Volume', 'Spread',
                  'Open_Trade', 'st_Close_Trade', 'Entry_Date', 'Type',
                  'Trade_Number', 'st_Exit_Date', 'trade_type',
                  'atr_mult_low', 'atr_mult_close', 'atr_mult_high', 'atr_dyn',
                  'atr_PnL', 'atr_Exit_Date', 'atr_Duration',
                  'atr_PnL_dollar',
                  'label',
                  'Encoded_0', 'Encoded_1', 'Encoded_2', 'Encoded_3', 'Encoded_4',
                  'Encoded_5', 'Encoded_6', 'Encoded_7', '10min_Encoded_0', '10min_Encoded_1',
                  '10min_Encoded_2', '10min_Encoded_3', '10min_Encoded_4', '10min_Encoded_5',
                  '10min_Encoded_6', '10min_Encoded_7']

nan_counts = feat_obj.isnull().sum()
print("\nNaN counts in feat_obj sorted by highest to lowest:")
print(nan_counts.sort_values(ascending=False))

print("\nTotal NaN count in feat_obj:", nan_counts.sum())


NaN counts in feat_obj sorted by highest to lowest:
slope_lin_reg_signal_700_6    0
Open                          0
10min_MFI_5                   0
10min_ADX_15                  0
10min_ROCP_126                0
                             ..
ATR                           0
Spread                        0
Volume                        0
Close                         0
Low                           0
Length: 386, dtype: int64

Total NaN count in feat_obj: 0


In [19]:
# Split the data into 70% train and 30% test based on index
train_size = int(0.7 * len(feat_obj))
train = feat_obj.iloc[:train_size]
test  = feat_obj.iloc[train_size:]

print("Shape of train_data:", train.shape)
print("Shape of test_data:", test.shape)

Shape of train_data: (10054, 386)
Shape of test_data: (4310, 386)


In [20]:
def show_columns_as_table(df: pd.DataFrame):
    cols = pd.DataFrame(df.columns, columns=["Column Names"])
    print(cols.to_string(index=True))

# usage
#show_columns_as_table(train)

# Results


In [None]:
#### TRAIN data

results(train, pnl_column = result_field)

Unnamed: 0,Results
days,95
total_trades,1929
,
income,"$3,803.34"
losses,$0.00
profits,"$3,803.34"
,
profit_trades,1927
loss_trades,0
,


In [None]:
#### TEST data

results(test, pnl_column = result_field)

Unnamed: 0,Results
days,41
total_trades,828
,
income,"$1,664.90"
losses,$0.00
profits,"$1,664.90"
,
profit_trades,828
loss_trades,0
,



# ML


## Train

In [23]:
# Use all columns from 'kal_3' onwards except the label
train_features = [col for col in train.columns[train.columns.get_loc('10min_RSI_3'):] if col != 'label']


In [24]:
# -*- coding: utf-8 -*-
"""
Ensamble multiclase (0/1/2) con VotingClassifier usando EXACTAMENTE
las columnas Encoded_0..Encoded_7. Incluye SMOTE, escalado en Pipeline
(para LR y SVC), evaluación y aplicación sobre todo el DataFrame.

Requisitos:
    pip install scikit-learn xgboost imbalanced-learn pandas numpy
"""
# ---------- 1) Construir el ensamble multiclase ----------


def build_estimators_3class(n_classes: int = 3, random_state: int = 42) -> VotingClassifier:
    """
    Crea un VotingClassifier (voto 'soft') con:
      - XGBClassifier (multi:softprob)
      - LogisticRegression (multinomial) + StandardScaler
      - SVC (RBF, probability=True) + StandardScaler
      - AdaBoost (SAMME.R por defecto)
    """
    xgb = XGBClassifier(
        n_estimators=15,
        max_depth=3,
        random_state=random_state,
        tree_method="hist",
        objective="multi:softprob" if n_classes > 2 else "binary:logistic",
        num_class=n_classes if n_classes > 2 else None,
        eval_metric="mlogloss" if n_classes > 2 else "logloss",
        n_jobs=-1
    )

    lr = make_pipeline(
        StandardScaler(),
        LogisticRegression(
            multi_class="multinomial" if n_classes > 2 else "auto",
            solver="lbfgs",
            max_iter=1000,
            random_state=random_state
        )
    )

    svc = make_pipeline(
        StandardScaler(),
        SVC(kernel="rbf", probability=True, random_state=random_state)
    )

    ada = AdaBoostClassifier(n_estimators=15, random_state=random_state)

    estimators = [
        ("LR",  lr),
        ("XGB", xgb),
        ("ADA", ada),
        ("SVC", svc),
    ]
    model = VotingClassifier(estimators=estimators, voting="soft")
    return model


# ---------- 2) Entrenar + evaluar + aplicar al DataFrame ----------
def train_apply_ensemble_3class(
    train: pd.DataFrame,
    feature_cols: List[str],
    label_col: str = "label",
    test_size: float = 0.30,
    random_state: int = 42,
    use_smote: bool = True,
    verbose: bool = True
) -> Tuple[VotingClassifier, pd.DataFrame, Dict[str, Any]]:
    """
    - Split estratificado (train/test).
    - SMOTE sobre train (opcional).
    - Entrena VotingClassifier multiclase (0/1/2).
    - Evalúa en holdout.
    - Aplica al DataFrame completo (añade label_ml, prob_0, prob_1, prob_2).
    """
    # Asegurar tipos y columnas
    X = train.loc[:, feature_cols].copy()
    y = train[label_col].copy().to_numpy().ravel()

    # Split con estratificación
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # SMOTE sólo al train
    if use_smote:
        smote = SMOTE(random_state=random_state)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        X_train_res = pd.DataFrame(X_train_res, columns=feature_cols)
    else:
        X_train_res, y_train_res = X_train, y_test # Corrected

    # Construcción del modelo (3 clases por defecto)
    n_classes = len(np.unique(y_train_res))
    model = build_estimators_3class(n_classes=n_classes, random_state=random_state)

    # Entrenar
    model.fit(X_train_res, y_train_res)

    # Evaluación holdout
    y_pred_test = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_test)
    cr = classification_report(y_test, y_pred_test, digits=4)

    # Aplicar sobre TODO el DataFrame original
    y_pred_full = model.predict(X)
    proba_full = model.predict_proba(X)  # (n_samples, n_classes) en el orden model.classes_
    prob_cols = [f"prob_{int(c)}" for c in model.classes_]
    proba_df = pd.DataFrame(proba_full, columns=prob_cols, index=X.index)

    # Asegurar columnas prob_0, prob_1, prob_2 (por si el orden es distinto)
    for c in [0, 1, 2]:
        col = f"prob_{c}"
        if col not in proba_df.columns:
            proba_df[col] = 0.0
    proba_df = proba_df[["prob_0", "prob_1", "prob_2"]]

    # Devolver copia con columnas nuevas
    train_out = train.copy()
    train_out["label_ml"] = y_pred_full
    train_out[["prob_0", "prob_1", "prob_2"]] = proba_df

    if verbose:
        print("Shapes ->",
              "X_train:", X_train.shape,
              "X_test:",  X_test.shape)
        print("\nHoldout - Confusion Matrix:\n", cm)
        print("\nHoldout - Classification Report:\n", cr)

    metrics = {"confusion_matrix": cm, "classification_report": cr}
    return model, train_out, metrics

In [25]:
ml_model, train_with_preds, metrics = train_apply_ensemble_3class(
    train=train,
    feature_cols=train_features,
    label_col="label",     # cambia si tu columna objetivo se llama distinto
    test_size=0.30,
    random_state=42,
    use_smote=True,
    verbose=True
)


Shapes -> X_train: (7037, 358) X_test: (3017, 358)

Holdout - Confusion Matrix:
 [[413 135 326]
 [150 132 315]
 [302 268 976]]

Holdout - Classification Report:
               precision    recall  f1-score   support

           0     0.4775    0.4725    0.4750       874
           1     0.2467    0.2211    0.2332       597
           2     0.6036    0.6313    0.6171      1546

    accuracy                         0.5041      3017
   macro avg     0.4426    0.4417    0.4418      3017
weighted avg     0.4964    0.5041    0.5000      3017



In [26]:
train[["label_ml","prob_0","prob_1","prob_2"]] = \
    train_with_preds[["label_ml","prob_0","prob_1","prob_2"]]

In [27]:
### Save ML Model
model_path = root_data + 'Models/'+symbol+'_'+direction+'_ml_model.joblib'
joblib.dump(ml_model, model_path)
print(f"Model saved successfully at: {model_path}")

Model saved successfully at: /content/drive/MyDrive/Course Folder/Forex/XAUUSD/Models/BTCUSD_Short_ml_model.joblib


## Meta

In [28]:
### Import ML Model
model_path = root_data+'Models/'+symbol+'_'+direction+'_ml_model.joblib'
ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

Model loaded successfully from: /content/drive/MyDrive/Course Folder/Forex/XAUUSD/Models/BTCUSD_Short_ml_model.joblib


In [29]:
train[["label_ml","prob_0","prob_1","prob_2"]] = \
    train_with_preds[["label_ml","prob_0","prob_1","prob_2"]]

In [35]:
# ============ 1) Construir el ensamble multiclase para el META-modelo ============
def build_meta_estimators_3class(n_classes: int = 3, random_state: int = 42) -> VotingClassifier:
    """
    Ensamble (voto 'soft') con:
      - XGBClassifier (multi:softprob)
      - LogisticRegression (multinomial) + StandardScaler
      - SVC (RBF, probability=True) + StandardScaler
      - AdaBoost
    """
    xgb = XGBClassifier(
        n_estimators=15,
        max_depth=3,
        random_state=random_state,
        tree_method="hist",
        objective="multi:softprob" if n_classes > 2 else "binary:logistic",
        num_class=n_classes if n_classes > 2 else None,
        eval_metric="mlogloss" if n_classes > 2 else "logloss",
        n_jobs=-1
    )
    lr = make_pipeline(
        StandardScaler(),
        LogisticRegression(
            multi_class="multinomial" if n_classes > 2 else "auto",
            solver="lbfgs",
            max_iter=1000,
            random_state=random_state
        )
    )
    svc = make_pipeline(
        StandardScaler(),
        SVC(kernel="rbf", probability=True, random_state=random_state)
    )
    ada = AdaBoostClassifier(n_estimators=15, random_state=random_state)

    estimators = [("LR", lr), ("XGB", xgb), ("ADA", ada), ("SVC", svc)]
    model = VotingClassifier(estimators=estimators, voting="soft")
    return model


# ============ 2) Entrenar + evaluar + aplicar el META-modelo =====================
def train_meta_ensemble_3class(
    train: pd.DataFrame,
    base_feature_cols: List[str],
    proba_cols: List[str] = ["prob_0", "prob_1", "prob_2"],
    label_col: str = "label",
    test_size: float = 0.30,
    random_state: int = 42,
    use_smote: bool = True,
    verbose: bool = True
) -> Tuple[VotingClassifier, pd.DataFrame, Dict[str, Any]]:
    """
    Entrena el meta-modelo con:
       X_meta = base_feature_cols + proba_cols
       y      = train[label_col]
    Split estratificado, SMOTE sobre train, evalúa en holdout, y aplica sobre todo 'train'.

    Devuelve:
      - model: VotingClassifier entrenado
      - train_out: copia de 'train' con columnas del meta-modelo:
            'meta_label_ml', 'meta_prob_0', 'meta_prob_1', 'meta_prob_2'
      - metrics: dict con matriz de confusión, classification report, accuracy, f1_macro, auc_macro_ovr
    """
    # Verificar que existan las columnas requeridas
    # Corrected: Removed 'label_ml' from the required columns for the meta model training
    required_cols = base_feature_cols + proba_cols + [label_col]
    missing = [c for c in required_cols if c not in train.columns]
    if missing:
        raise ValueError(f"Faltan columnas en 'train': {missing}")

    # Construir X, y
    meta_cols = base_feature_cols + proba_cols # Corrected: X_meta only includes base features and probabilities
    X = train.loc[:, meta_cols].copy()
    y = train[label_col].copy().to_numpy().ravel()

    # Split estratificado
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # SMOTE (opcional) sólo en train
    if use_smote:
        smote = SMOTE(random_state=random_state)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        # Convert to numpy array after SMOTE to avoid dtype issues with XGBoost
        X_train_res = X_train_res.to_numpy()
    else:
        X_train_res, y_train_res = X_train.to_numpy(), y_train # Convert to numpy array

    # Construir y entrenar el meta-modelo
    n_classes = len(np.unique(y_train_res))
    meta_model = build_meta_estimators_3class(n_classes=n_classes, random_state=random_state)
    meta_model.fit(X_train_res, y_train_res)

    # ---- Evaluación holdout ----
    y_pred = meta_model.predict(X_test)
    proba_test = meta_model.predict_proba(X_test)

    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred, digits=4)
    acc = (y_pred == y_test).mean()
    try:
        # AUC macro OVR (si hay predict_proba y 3 clases)
        classes_sorted = np.unique(np.concatenate([y_train_res, y_test]))
        y_test_bin = label_binarize(y_test, classes=classes_sorted)
        auc_macro_ovr = roc_auc_score(y_test_bin, proba_test, average="macro", multi_class="ovr")
    except Exception:
        auc_macro_ovr = None

    # ---- Aplicar sobre TODO 'train' ----
    full_pred = meta_model.predict(X.to_numpy()) # Convert to numpy array for prediction
    full_proba = meta_model.predict_proba(X.to_numpy()) # Convert to numpy array for prediction
    prob_cols_out = [f"meta_prob_{int(c)}" for c in meta_model.classes_]
    proba_full_df = pd.DataFrame(full_proba, columns=prob_cols_out, index=X.index)

    # Asegurar columnas meta_prob_0/1/2
    for c in [0, 1, 2]:
        col = f"meta_prob_{c}"
        if col not in proba_full_df.columns:
            proba_full_df[col] = 0.0
    proba_full_df = proba_full_df[["meta_prob_0", "meta_prob_1", "meta_prob_2"]]

    train_out = train.copy()
    train_out["meta_label_ml"] = full_pred
    train_out[["meta_prob_0", "meta_prob_1", "meta_prob_2"]] = proba_full_df[["meta_prob_0", "meta_prob_1", "meta_prob_2"]]

    if verbose:
        print("META — Shapes -> X_train:", X_train_res.shape, "X_test:", X_test.shape)
        print("\nMETA — Confusion Matrix (holdout):\n", cm)
        print("\nMETA — Classification Report (holdout):\n", cr)
        print(f"META — Accuracy: {acc:.4f}")
        if auc_macro_ovr is not None:
            print(f"META — AUC Macro OVR: {auc_macro_ovr:.4f}")

    metrics = {
        "confusion_matrix": cm,
        "classification_report": cr,
        "accuracy": acc,
        "auc_macro_ovr": auc_macro_ovr
    }
    return meta_model, train_out, metrics

In [36]:

# Use all columns from 'kal_3' onwards except the label
train_features = [col for col in train.columns[train.columns.get_loc('10min_RSI_3'):] if col != 'label']
proba_cols = ['label_ml','prob_0','prob_1','prob_2']  # vienen del primer modelo

# 2) Entrena el META-modelo y aplica sobre todo 'train'
meta_ml_model, train_with_meta, meta_metrics = train_meta_ensemble_3class(
    train=train,
    base_feature_cols=train_features,
    proba_cols=proba_cols,
    label_col="label",       # tu y real (0/1/2)
    test_size=0.30,
    random_state=42,
    use_smote=True,          # o False si no quieres SMOTE en el meta
    verbose=True
)

# 3) (Opcional) guardar salida
train_with_meta.to_csv(root_data + 'Results/' + symbol + 'Meta_Prob_M5+M10_train_l.csv', index=False)

# 4) Si prefieres mantener todo en el DataFrame original:
train[['meta_label_ml','meta_prob_0','meta_prob_1','meta_prob_2']] = \
    train_with_meta[['meta_label_ml','meta_prob_0','meta_prob_1','meta_prob_2']]


AttributeError: 'DataFrame' object has no attribute 'dtype'

In [None]:
### Save ML Model
model_path = root_data + 'Models/'+symbol+'_'+direction+'_Meta_ml_model.joblib'
joblib.dump(meta_ml_model, model_path)
print(f"Model saved successfully at: {model_path}")

In [None]:
train.columns

## PnL Train

In [None]:
train['ml_results'] = np.where(train['label_ml'] == 1, 50,
                               np.where(train['label_ml'] == 2, train[result_field],
                                        np.where(train['label_ml'] == 0, 0, np.nan)))

results(train, pnl_column='ml_results')

Unnamed: 0,Results
days,95
total_trades,1929
,
income,"$33,346.09"
losses,$0.00
profits,"$33,346.09"
,
profit_trades,1305
loss_trades,0
,


In [None]:
train['meta_ml_results'] = np.where(train['meta_label_ml'] == 1, 50,
                               np.where(train['meta_label_ml'] == 2, train[result_field],
                                        np.where(train['meta_label_ml'] == 0, 0, np.nan)))
results(train, pnl_column='meta_ml_results')

Unnamed: 0,Results
days,95
total_trades,1929
,
income,"$27,625.53"
losses,$0.00
profits,"$27,625.53"
,
profit_trades,1339
loss_trades,0
,



# Test


## Results_ML

In [None]:
### Import ML Model

model_path = root_data+'Models/'+symbol+'_'+direction+'_ml_model.joblib'
ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

In [None]:
### Import Meta Model

model_path = root_data+'Models/'+symbol+'_'+direction+'_Meta_ml_model.joblib'
meta_ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

In [None]:
# -*- coding: utf-8 -*-
"""
Apply base (level-1) model and meta (level-2) model to a new DataFrame `test`.

Requisitos:
    pip install scikit-learn xgboost pandas numpy joblib
"""

# ---------- Helpers ----------
def _ensure_columns(df: pd.DataFrame, cols: List[str], name: str) -> None:
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"Columnas faltantes en '{name}': {missing}")

def _proba_df(model, X: pd.DataFrame, prefix: str, classes_expected=(0,1,2)) -> pd.DataFrame:
    """
    Convierte predict_proba -> DataFrame con columnas 'prefix{class}' siguiendo model.classes_.
    Asegura columnas para todas las clases esperadas (0,1,2).
    """
    proba = model.predict_proba(X)
    cls = list(model.classes_)  # orden de clases internas del modelo
    out = pd.DataFrame(proba, columns=[f"{prefix}{int(c)}" for c in cls], index=X.index)
    for c in classes_expected:
        col = f"{prefix}{c}"
        if col not in out.columns:
            out[col] = 0.0
    # ordenar 0,1,2
    out = out[[f"{prefix}{c}" for c in classes_expected]]
    return out

def _print_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_proba: np.ndarray | None, header: str) -> Dict[str, object]:
    print(f"\n=== {header} ===")
    labels_sorted = np.sort(np.unique(np.concatenate([np.unique(y_true), np.unique(y_pred)])))
    cm = confusion_matrix(y_true, y_pred, labels=labels_sorted)
    print("Confusion Matrix (labels ordered):", labels_sorted)
    print(cm)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4))
    auc_macro = None
    if y_proba is not None and len(labels_sorted) > 2:
        try:
            y_true_bin = label_binarize(y_true, classes=labels_sorted)
            auc_macro = roc_auc_score(y_true_bin, y_proba, average="macro", multi_class="ovr")
            print(f"AUC Macro OVR: {auc_macro:.4f}")
        except Exception:
            pass
    return {"confusion_matrix": cm, "auc_macro_ovr": auc_macro}


# ---------- 1) Apply base model on test ----------
def apply_base_model_to_test(
    test: pd.DataFrame,
    ml_model,
    train_features: List[str],
    label_col: str = "label"
) -> Tuple[pd.DataFrame, Dict[str, object]]:
    """
    Aplica el modelo base sobre test[train_features]:
      - Añade: 'label_ml', 'prob_0', 'prob_1', 'prob_2'
      - Imprime métricas vs test[label]
    """
    _ensure_columns(test, train_features + [label_col], name="test (base)")
    Xb = test.loc[:, train_features]
    y_true = test[label_col].to_numpy().ravel()

    # Predicciones base
    y_pred = ml_model.predict(Xb)
    proba_df = _proba_df(ml_model, Xb, prefix="prob_")

    out = test.copy()
    out["label_ml"] = y_pred
    out[["prob_0","prob_1","prob_2"]] = proba_df[["prob_0","prob_1","prob_2"]]

    # Métricas (probabilidades en el orden 0/1/2)
    metrics = _print_metrics(y_true, y_pred, proba_df.values, header="BASE MODEL")
    return out, metrics


# ---------- 2) Apply meta model on test (using base probs) ----------
def apply_meta_model_to_test(
    test_with_base: pd.DataFrame,
    meta_ml_model,
    train_features: List[str],
    label_col: str = "label"
) -> Tuple[pd.DataFrame, Dict[str, object]]:
    """
    Aplica el meta-modelo sobre test[meta].
    Detecta los features esperados por 'meta_ml_model' y selecciona las columnas
    en el mismo ORDEN de entrenamiento para evitar errores de StandardScaler/Pipeline.

    Añade: 'meta_label_ml', 'meta_prob_0', 'meta_prob_1', 'meta_prob_2'
    e imprime métricas vs test[label].
    """
    # 1) Descubrir features esperados por el modelo (si están disponibles)
    expected_meta_cols = None
    if hasattr(meta_ml_model, "feature_names_in_"):
        expected_meta_cols = list(meta_ml_model.feature_names_in_)
    else:
        # Fallback: base features + probs; si existe 'label_ml' en test, la incluimos
        base = list(train_features)
        probs = ["prob_0", "prob_1", "prob_2"]
        expected_meta_cols = base + (["label_ml"] if "label_ml" in test_with_base.columns else []) + probs

    # 2) Validar presencia de columnas (y de la etiqueta para métricas)
    _ensure_columns(test_with_base, expected_meta_cols + [label_col], name="test (meta)")

    # 3) Seleccionar EXACTAMENTE en el orden esperado por el modelo
    Xm = test_with_base.loc[:, expected_meta_cols]
    print("Columns being passed to meta_ml_model.predict in order:", list(Xm.columns))

    # 4) Predicción y probabilidades
    y_true = test_with_base[label_col].to_numpy().ravel()
    y_pred = meta_ml_model.predict(Xm)
    proba_df = _proba_df(meta_ml_model, Xm, prefix="meta_prob_")

    # 5) Salida: usar nombres consistentes con el flujo de 'train' (meta_label_ml)
    out = test_with_base.copy()
    out["meta_label_ml"] = y_pred
    out[["meta_prob_0", "meta_prob_1", "meta_prob_2"]] = proba_df[["meta_prob_0", "meta_prob_1", "meta_prob_2"]]

    # 6) Métricas meta (probabilidades en orden 0/1/2)
    metrics = _print_metrics(y_true, y_pred, proba_df.values, header="META MODEL")
    return out, metrics

##########################################################
# ---------- 3) Driver: load models, apply both ----------
def run_inference_on_test(
    test: pd.DataFrame,
    train_features: List[str],
    root_data: str,
    symbol: str,
    direction: str,
    label_col: str = "label",
    save_csv: bool = True
) -> Tuple[pd.DataFrame, Dict[str, object], Dict[str, object]]:
    """
    Carga modelos, aplica modelo base y meta sobre 'test', imprime métricas y (opcional) guarda CSV.
    """
    # Load base model
    base_path = root_data + 'Models/' + f'{symbol}_{direction}_ml_model.joblib'
    ml_model = joblib.load(base_path)
    print(f"Base model loaded: {base_path}")

    # Load meta model
    meta_path = root_data + 'Models/' + f'{symbol}_{direction}_Meta_ml_model.joblib'
    meta_ml_model = joblib.load(meta_path)
    print(f"Meta model loaded: {meta_path}")

    # Apply base
    test_after_base, base_metrics = apply_base_model_to_test(
        test=test,
        ml_model=ml_model,
        train_features=train_features,
        label_col=label_col
    )

    # Apply meta
    # Corrected: Pass the DataFrame with base model predictions and probabilities
    test_after_meta, meta_metrics = apply_meta_model_to_test(
        test_with_base=test_after_base,
        meta_ml_model=meta_ml_model,
        train_features=train_features,
        label_col=label_col)


    if save_csv:
        out_path = root_data + 'Results/' + f'{symbol}_{direction}_test_with_base_and_meta.csv'
        test_after_meta.to_csv(out_path, index=False)
        print(f"\nSaved predictions to: {out_path}")

    return test_after_meta, base_metrics, meta_metrics

In [None]:
# Use all columns from 'kal_3' onwards except the label
train_features = [col for col in test.columns[test.columns.get_loc('kal_3'):] if col != 'label']


## Results_PnL

In [None]:
test_with_preds['ml_results'] = np.where(test_with_preds[result_field]<-100, -100,
         np.where(test_with_preds['label_ml']==0, 0,
             np.where(test_with_preds['label_ml']==1, 100,#test_with_preds[result_field],
                 np.where(test_with_preds['label_ml']==2, test_with_preds[result_field],0))))

results(test_with_preds, pnl_column='ml_results')

In [None]:
print(test_with_preds.loc[test_with_preds['label_ml']==2, 'ml_results'].describe())

In [None]:
test_with_preds['meta_ml_results'] = np.where(test_with_preds[result_field]<-100, -100,
         np.where(test_with_preds['meta_label_ml']==0, 0,
             np.where(test_with_preds['meta_label_ml']==1, test_with_preds[result_field],
                 np.where(test_with_preds['meta_label_ml']==2, test_with_preds[result_field],0))))


results(test_with_preds, pnl_column='meta_ml_results')

# Pendientes
