# Gpu

In [None]:
!nvidia-smi

In [None]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')

# Set_Up

In [None]:
strategy   = 'Kalman'
process    = 'Train'
symbol     = 'XAUUSD'
direction  = 'Short'
time_frame = 'M5'

root_data = f'/content/drive/MyDrive/Course Folder/Forex/XAUUSD/'
print(root_data)

rolling_window = 100

# Import_Libraries

In [None]:
!pip install ta-lib
import talib as ta
print(ta.__version__)

In [None]:
# Import libraries for data manipulation
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from datetime import timedelta

# For machine learning models
import xgboost
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split # Import train_test_split

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from tensorflow.keras.models import load_model

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

# Import matplotlib as an alias plt and set the style
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("seaborn-v0_8-whitegrid")

# Import sys to append the path for custom function file
import sys
sys.path.append("..")

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

import joblib

In [None]:
def confusion_probability_summary(y_true, y_pred, positive_class_probabilities, positive_class=1):
    """
    Calculate summary statistics of predicted probabilities for each outcome in a confusion matrix.
    """
    data = pd.DataFrame({
        'actual': np.asarray(y_true),
        'predicted': np.asarray(y_pred),
        'prob_positive': np.asarray(positive_class_probabilities),
    })

    outcomes = {
        'True Positive': (data['actual'] == positive_class) & (data['predicted'] == positive_class),
        'False Positive': (data['actual'] != positive_class) & (data['predicted'] == positive_class),
        'True Negative': (data['actual'] != positive_class) & (data['predicted'] != positive_class),
        'False Negative': (data['actual'] == positive_class) & (data['predicted'] != positive_class),
    }

    summary_rows = []
    for outcome, mask in outcomes.items():
        probabilities = data.loc[mask, 'prob_positive']
        summary_rows.append({
            'Outcome': outcome,
            'Count': int(probabilities.count()),
            'Mean Probability': probabilities.mean() if not probabilities.empty else np.nan,
            'Std Probability': probabilities.std(ddof=0) if probabilities.count() > 1 else np.nan,
        })

    summary = pd.DataFrame(summary_rows).set_index('Outcome')
    return summary


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Support Functions

In [None]:
def results(data, pnl_column='PnL'):
    # Calculate the metrics
    time_difference = data.index.max() - data.index.min()
    days = time_difference.days
    total_trades = data[data['Open_Trade'].notna() & (data['Open_Trade'] != 0)].shape[0]
    profit_trades = data[data[pnl_column] > 0].shape[0]
    loss_trades = data[data[pnl_column] < 0].shape[0]
    profits = data[data[pnl_column] > 0][pnl_column].sum()
    losses = data[data[pnl_column] < 0][pnl_column].sum()

    # Create a dictionary with the results
    results_dict = {
        'days': days,
        'total_trades': total_trades,
        '': '',
        'income': profits,
        'losses': losses,
        'profits': profits + losses,
        ' ':' ',
        'profit_trades': profit_trades,
        'loss_trades': loss_trades,
        '  ':'  ',
        '% Win_Trades': profit_trades / (profit_trades + loss_trades) * 100 if (profit_trades + loss_trades) > 0 else 0,
        '% Loss_Trades': loss_trades / (profit_trades + loss_trades)*100 if (profit_trades + loss_trades) > 0 else 0
    }

    # Create a DataFrame from the dictionary and transpose it
    perf_metrics = pd.DataFrame([results_dict]).T

    # Rename the column
    perf_metrics.rename(columns={0: 'Results'}, inplace=True)

    # Format the DataFrame for display
    perf_metrics.loc[['days', 'total_trades', 'profit_trades', 'loss_trades','% Win_Trades','% Loss_Trades'], 'Results'] = perf_metrics.loc[['days', 'total_trades', 'profit_trades', 'loss_trades','% Win_Trades','% Loss_Trades'], 'Results'].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else '')
    perf_metrics.loc[['income', 'losses', 'profits'], 'Results'] = perf_metrics.loc[['income', 'losses', 'profits'], 'Results'].apply(lambda x: f"${x:,.2f}" if pd.notna(x) else '')

    return perf_metrics

In [None]:
def create_features(train_data, index):
    short_periods = [3, 5, 7, 10, 15, 17]
    long_periods = [20, 22, 66, 126, 252]
    periods = short_periods + long_periods

    features = pd.DataFrame(index=index)

    # Indicators that do not depend on the lookback period
    features['OBV'] = ta.OBV(train_data['Close'], train_data['Volume'])
    features['AD'] = ta.AD(train_data['High'], train_data['Low'],
                           train_data['Close'], train_data['Volume'])

    # Pre-compute moving averages to avoid repeated calculations
    sma = {p: ta.SMA(train_data['Close'], timeperiod=p) for p in periods}
    ema = {p: ta.EMA(train_data['Close'], timeperiod=p) for p in periods}

    for period in periods:
        features[f'RSI_{period}'] = ta.RSI(train_data['Close'], timeperiod=period)
        features[f'MFI_{period}'] = ta.MFI(train_data['High'], train_data['Low'],
                                           train_data['Close'], train_data['Volume'],
                                           timeperiod=period)
        features[f'ADX_{period}'] = ta.ADX(train_data['High'], train_data['Low'],
                                           train_data['Close'], timeperiod=period)
        features[f'ROCP_{period}'] = ta.ROCP(train_data['Close'], timeperiod=period)

    for s in short_periods:
        for l in long_periods:
            features[f'SMA_Crossover_{s}_{l}'] = sma[s] - sma[l]
            features[f'EMA_Crossover_{s}_{l}'] = ema[s] - ema[l]

    features.dropna(inplace=True)
    return features


In [None]:
def strategy_returns_dynamic_different_thresholds(prices, threshold):
    rolling = prices['Rolling Returns'].to_numpy()
    yearly_std = prices['Yearly Stdev'].to_numpy()
    pct_change = prices['Close'].pct_change().to_numpy()
    signals = np.zeros(len(prices), dtype=np.int8)

    curr_pos = 0
    hold_days = 0
    for i in range(len(prices)):
        if curr_pos == 0 or hold_days == 20:
            thresh_val = threshold * yearly_std[i]
            if rolling[i] >= thresh_val:
                curr_pos = 1
                hold_days = 0
            elif rolling[i] < -thresh_val:
                curr_pos = -1
                hold_days = 0
            else:
                curr_pos = 0
                hold_days = 0
        else:
            hold_days += 1
        signals[i] = curr_pos

    prices[f'Signal_{threshold}'] = signals
    strategy_returns = pct_change * np.roll(signals, 1)
    prices[f'Strategy Returns_{threshold}'] = strategy_returns
    return np.cumprod(strategy_returns + 1)


def strategy_returns_different_thresholds(prices, threshold):
    rolling = prices['Rolling Returns'].to_numpy()
    pct_change = prices['Close'].pct_change().to_numpy()
    signals = np.zeros(len(prices), dtype=np.int8)

    curr_pos = 0
    hold_days = 0
    for i in range(len(prices)):
        if curr_pos == 0 or hold_days == 20:
            if rolling[i] >= threshold:
                curr_pos = 1
                hold_days = 0
            elif rolling[i] < threshold:
                curr_pos = -1
                hold_days = 0
        else:
            hold_days += 1
        signals[i] = curr_pos

    prices[f'Signal_{threshold}'] = signals
    strategy_returns = pct_change * np.roll(signals, 1)
    prices[f'Strategy Returns_{threshold}'] = strategy_returns
    return np.cumprod(strategy_returns + 1)


# Data

In [214]:
data_type = 'Scale'

In [215]:
### Open OHLC dataframe

ohlc = pd.read_csv(root_data + 'Data/'+symbol+'_M5.csv', index_col=0)
ohlc.index = pd.to_datetime(ohlc.index)
time_difference = ohlc.index.max() - ohlc.index.min()
number_of_days = time_difference.days

print(f"The train_data DataFrame covers a period of {number_of_days} days.")

The train_data DataFrame covers a period of 937 days.


In [216]:
### Features

features_5m = pd.read_csv(root_data + 'Results/'+symbol+'_'+direction+'_M5M10_'+data_type+'_Features.csv', index_col=0)
features_5m['Date'] = features_5m.index
features_5m['Date'] = pd.to_datetime(features_5m['Date'])
features_5m.set_index("Date", inplace=True)

print(list(features_5m.columns),'\n')
print('Shape = ',features_5m.shape)

features_5m.tail(3)

['label', '10min_RSI_3_diff', '10min_RSI_3 - RSI_7', '10min_slope_angle_900_3_diff', '10min_slope_div_600_3_diff', '10min_slope_angle_600_3_diff', '10min_slope_lin_reg_signal_600_3 - slope_lin_reg_signal_600_9', '10min_slope_lin_reg_300_3_diff', '10min_slope_signal_900_9_diff', '10min_slope_div_300_3_diff', '10min_OBV_diff', '10min_slope_signal_300_3_diff', 'slope_angle_300_6_diff', '10min_slope_lin_reg_signal_300_3 - slope_lin_reg_signal_900_9', '10min_slope_angle_300_9_diff', '10min_slope_signal_300_3 - slope_signal_600_3', '10min_slope_div_300_3 - slope_div_900_6', '10min_slope_lin_reg_signal_300_3_diff', 'RSI_7 - RSI_14', 'slope_div_300_3_diff', 'slope_lin_reg_signal_300_3_diff', '10min_slope_angle_300_6 - slope_angle_600_6', 'slope_div_300_6_diff', 'slope_div_300_9_diff', '10min_slope_signal_300_6_diff', 'slope_lin_reg_signal_600_3_diff', 'Open_Trade', '10min_slope_angle_300_9 - slope_angle_600_3', '10min_slope_signal_900_6_diff', '10min_slope_lin_reg_signal_300_3 - slope_lin_reg_

Unnamed: 0_level_0,label,10min_RSI_3_diff,10min_RSI_3 - RSI_7,10min_slope_angle_900_3_diff,10min_slope_div_600_3_diff,10min_slope_angle_600_3_diff,10min_slope_lin_reg_signal_600_3 - slope_lin_reg_signal_600_9,10min_slope_lin_reg_300_3_diff,10min_slope_signal_900_9_diff,10min_slope_div_300_3_diff,...,10min_slope_angle_300_6 - slope_angle_900_3,10min_slope_signal_300_3 - slope_signal_300_6,10min_slope_lin_reg_signal_600_6_diff,slope_signal_900_6 - slope_signal_900_9,slope_angle_300_3 - slope_angle_900_6,slope_lin_reg_signal_600_9 - slope_lin_reg_signal_900_6,slope_signal_600_6 - slope_signal_900_6,slope_angle_300_3 - slope_angle_300_9,10min_slope_lin_reg_signal_300_9_diff,slope_signal_600_3 - slope_signal_600_6
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-07-26 21:25:00,1,0.378237,1.637643,-0.000604,-0.252278,-0.000309,-0.107928,1.412983,0.0,0.261028,...,-0.055921,0.03088,0.0,-1.967378,-3.260804,2.449032,3.438999,-1.888883,0.0,-1.751248
2025-07-26 22:25:00,1,0.378237,1.637643,-0.000604,-0.252278,-0.000309,-0.107928,1.412983,0.0,0.261028,...,-0.055921,0.03088,0.0,-1.857623,2.844258,0.07734,0.017109,-0.224794,0.0,1.758542
2025-07-26 23:05:00,1,-0.378237,-1.637643,0.000604,0.252278,0.000309,0.107928,-1.412983,-0.0,-0.261028,...,0.055921,-0.03088,-0.0,-0.089245,0.032381,-0.115181,-0.0,0.049432,-0.0,-0.008887


In [217]:
### Labels

lab = pd.read_csv(root_data + 'Results/'+symbol+'_'+strategy+'_'+time_frame+'_Strategy_Gen_Labels.csv', index_col=0)
lab['Date'] = pd.to_datetime(lab['Date'])
lab.set_index('Date', inplace=True)

columns_to_drop = ['st_row_PnL_Low','Close_Trade']
lab = lab.drop(columns=columns_to_drop)

print(list(lab.columns),'\n')
print('Shape : ',lab.shape,'\n')

nan_counts = lab.isnull().sum()
total_missing_counts = nan_counts

print("Missing values (NaN and Inf) in lab sorted by highest to lowest:")
print(total_missing_counts.sort_values(ascending=False))
print("Total missing value count in lab:", total_missing_counts.sum())


['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'kal_1', 'kal_2', 'kal_3', 'kal_4', 'Open_Trade', 'Entry_Date', 'Type', 'Trade_Number', 'st_Exit_Date', 'trade type', 'st_Duration', 'st_row_PnL_close', 'st_row_PnL_high', 'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'st_atr_PnL', 'st_atr_max_PnL'] 

Shape :  (267395, 26) 

Missing values (NaN and Inf) in lab sorted by highest to lowest:
st_Exit_Date        228204
st_Duration         228204
st_Max              228204
st_Min              228204
st_PnL              228204
Type                228203
Entry_Date          228203
Open_Trade          228203
st_atr_PnL              15
st_atr_max_PnL          15
ATR                     14
st_row_PnL_high         11
st_row_PnL_close        11
st_row_PnL_low          11
trade type               1
Trade_Number             1
kal_3                    0
High                     0
Open                     0
kal_2                    0
kal_1                    0
Spread                   0
Clo

In [218]:
### Merge

feat_obj = features_5m.copy()

print(list(feat_obj.columns),'\n')
print('Shape = ',feat_obj.shape,'\n')
print('Label_Counts = ', feat_obj['label'].value_counts(),'\n')

feat_obj.dropna(inplace=True)

nan_counts = feat_obj.isnull().sum()

print("Missing values (NaN and Inf) in feat_obj sorted by highest to lowest AFTER DROPPING COLUMNS:")

nan_counts_after = feat_obj.isnull().sum()
inf_counts_after = np.isinf(feat_obj.select_dtypes(include=np.number)).sum()
total_missing_counts_after = nan_counts_after + inf_counts_after
print(total_missing_counts_after.sort_values(ascending=False))




['label', '10min_RSI_3_diff', '10min_RSI_3 - RSI_7', '10min_slope_angle_900_3_diff', '10min_slope_div_600_3_diff', '10min_slope_angle_600_3_diff', '10min_slope_lin_reg_signal_600_3 - slope_lin_reg_signal_600_9', '10min_slope_lin_reg_300_3_diff', '10min_slope_signal_900_9_diff', '10min_slope_div_300_3_diff', '10min_OBV_diff', '10min_slope_signal_300_3_diff', 'slope_angle_300_6_diff', '10min_slope_lin_reg_signal_300_3 - slope_lin_reg_signal_900_9', '10min_slope_angle_300_9_diff', '10min_slope_signal_300_3 - slope_signal_600_3', '10min_slope_div_300_3 - slope_div_900_6', '10min_slope_lin_reg_signal_300_3_diff', 'RSI_7 - RSI_14', 'slope_div_300_3_diff', 'slope_lin_reg_signal_300_3_diff', '10min_slope_angle_300_6 - slope_angle_600_6', 'slope_div_300_6_diff', 'slope_div_300_9_diff', '10min_slope_signal_300_6_diff', 'slope_lin_reg_signal_600_3_diff', 'Open_Trade', '10min_slope_angle_300_9 - slope_angle_600_3', '10min_slope_signal_900_6_diff', '10min_slope_lin_reg_signal_300_3 - slope_lin_reg_

In [219]:
# Split the data into 70% train and 30% test based on index
train_size = int(0.7 * len(feat_obj))
train = feat_obj.iloc[:train_size]
test  = feat_obj.iloc[train_size:]

print("Shape of train_data:", train.shape)
print("Shape of test_data:", test.shape)

Shape of train_data: (20246, 225)
Shape of test_data: (8678, 225)


In [220]:
### Define Train dataframe

print('Train_Min_Date', train.index.min())
print('Train_Max_Date', train.index.max(),'\n')

print('Test_Min_Date', test.index.min())
print('Test_Max_Date', test.index.max(),'\n')

print('Train_Columns : ',list(train.columns), '\n')
print('Test_Columns : ',list(test.columns))

Train_Min_Date 2023-08-29 19:10:00
Train_Max_Date 2024-12-25 17:40:00 

Test_Min_Date 2024-12-25 18:00:00
Test_Max_Date 2025-07-26 23:05:00 

Train_Columns :  ['label', '10min_RSI_3_diff', '10min_RSI_3 - RSI_7', '10min_slope_angle_900_3_diff', '10min_slope_div_600_3_diff', '10min_slope_angle_600_3_diff', '10min_slope_lin_reg_signal_600_3 - slope_lin_reg_signal_600_9', '10min_slope_lin_reg_300_3_diff', '10min_slope_signal_900_9_diff', '10min_slope_div_300_3_diff', '10min_OBV_diff', '10min_slope_signal_300_3_diff', 'slope_angle_300_6_diff', '10min_slope_lin_reg_signal_300_3 - slope_lin_reg_signal_900_9', '10min_slope_angle_300_9_diff', '10min_slope_signal_300_3 - slope_signal_600_3', '10min_slope_div_300_3 - slope_div_900_6', '10min_slope_lin_reg_signal_300_3_diff', 'RSI_7 - RSI_14', 'slope_div_300_3_diff', 'slope_lin_reg_signal_300_3_diff', '10min_slope_angle_300_6 - slope_angle_600_6', 'slope_div_300_6_diff', 'slope_div_300_9_diff', '10min_slope_signal_300_6_diff', 'slope_lin_reg_signa

# Results


In [None]:
results(train, pnl_column='st_PnL')

In [None]:
results(test, pnl_column= 'st_PnL')


# ML


## Train

In [221]:
### When using train_test_split it applies the scaler to X_train only
start_feature = train.columns.get_loc('10min_RSI_3_diff')
train_features = [col for col in train.columns[start_feature:] if col != 'label']

X = train.loc[:, train_features]
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:",  X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:",  y_test.shape)

Shape of X_train: (14172, 224)
Shape of X_test: (6074, 224)
Shape of y_train: (14172,)
Shape of y_test: (6074,)


In [222]:
print(y_train.value_counts())
print(y_test.value_counts())

label
0    7113
1    7059
Name: count, dtype: int64
label
1    3117
0    2957
Name: count, dtype: int64


In [223]:
### Rebalance Data frame due to the low number of class 1
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Now train your model on the resampled data
print(X_train_resampled.value_counts(),'\n')
print(y_train_resampled.value_counts())

10min_RSI_3_diff  10min_RSI_3 - RSI_7  10min_slope_angle_900_3_diff  10min_slope_div_600_3_diff  10min_slope_angle_600_3_diff  10min_slope_lin_reg_signal_600_3 - slope_lin_reg_signal_600_9  10min_slope_lin_reg_300_3_diff  10min_slope_signal_900_9_diff  10min_slope_div_300_3_diff  10min_OBV_diff  10min_slope_signal_300_3_diff  slope_angle_300_6_diff  10min_slope_lin_reg_signal_300_3 - slope_lin_reg_signal_900_9  10min_slope_angle_300_9_diff  10min_slope_signal_300_3 - slope_signal_600_3  10min_slope_div_300_3 - slope_div_900_6  10min_slope_lin_reg_signal_300_3_diff  RSI_7 - RSI_14  slope_div_300_3_diff  slope_lin_reg_signal_300_3_diff  10min_slope_angle_300_6 - slope_angle_600_6  slope_div_300_6_diff  slope_div_300_9_diff  10min_slope_signal_300_6_diff  slope_lin_reg_signal_600_3_diff  Open_Trade  10min_slope_angle_300_9 - slope_angle_600_3  10min_slope_signal_900_6_diff  10min_slope_lin_reg_signal_300_3 - slope_lin_reg_signal_600_9  slope_lin_reg_300_3 - slope_lin_reg_600_3  slope_lin_

In [224]:
# --------------- Classifier 1: XGBoost ------------------------------------------
xgb = XGBClassifier(n_estimators=15, max_depth=3,
                    random_state=42, eval_metric='logloss')

# --------------- Classifier 2: Logistic Regression Classifier------------------------------------------
lr = LogisticRegression(random_state=42)

# --------------- Classifier 3: AdaBoost Classifier ------------------------------------------
ada = AdaBoostClassifier(n_estimators=15, random_state=42)

# --------------- Classifier 4: SVM------------------------------------------
svc = svm.SVC(kernel='rbf', probability=True, random_state=42)

# Define a list to store the different models
estimator = []
estimator.append(('LR', lr))
estimator.append(('XGB', xgb))
estimator.append(('ada', ada))
estimator.append(('SVC', svc))

In [225]:
# --------------- Classifier 1: XGBoost ------------------------------------------
xgb = XGBClassifier(n_estimators=15, max_depth=3,
                    random_state=42, eval_metric='logloss')

# --------------- Classifier 2: Logistic Regression Classifier------------------------------------------
lr = LogisticRegression(random_state=42)

# --------------- Classifier 3: AdaBoost Classifier ------------------------------------------
ada = AdaBoostClassifier(n_estimators=15, random_state=42)

# --------------- Classifier 4: SVM------------------------------------------
svc = svm.SVC(kernel='rbf', probability=True, random_state=42)

# Define a list to store the different models
estimator = []
estimator.append(('LR', lr))
estimator.append(('XGB', xgb))
estimator.append(('ada', ada))
estimator.append(('SVC', svc))

In [226]:
### Implement voting classifier with hard voting

ml_model = VotingClassifier(estimators=estimator, voting='soft')
ml_model.fit(X_train_resampled, y_train_resampled)

In [227]:
y_true = y_test
y_pred = ml_model.predict(X_test)
y_proba_default = ml_model.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_true, y_pred)
print("\nClassification Report:")
print(class_report)

probability_summary = confusion_probability_summary(y_true, y_pred, y_proba_default)
print("\nProbability Summary by Confusion Outcome:")
print(probability_summary)


Confusion Matrix:
[[1720 1237]
 [1274 1843]]

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.58      0.58      2957
           1       0.60      0.59      0.59      3117

    accuracy                           0.59      6074
   macro avg       0.59      0.59      0.59      6074
weighted avg       0.59      0.59      0.59      6074


Probability Summary by Confusion Outcome:
                Count  Mean Probability  Std Probability
Outcome                                                 
True Positive    1843          0.591992         0.062095
False Positive   1237          0.567730         0.049548
True Negative    1720          0.402470         0.082788
False Negative   1274          0.438327         0.055391


In [228]:
# Evaluate ML model using a 0.7 probability threshold for class 1
y_proba = ml_model.predict_proba(X_test)[:, 1]
y_pred_threshold = (y_proba >= 0.7).astype(int)

conf_matrix_threshold = confusion_matrix(y_true, y_pred_threshold)
print("Confusion Matrix (Threshold 0.7):")
print(conf_matrix_threshold)

class_report_threshold = classification_report(y_true, y_pred_threshold)
print("\nClassification Report (Threshold 0.7):")
print(class_report_threshold)

probability_summary_threshold = confusion_probability_summary(y_true, y_pred_threshold, y_proba)
print("\nProbability Summary by Confusion Outcome (Threshold 0.7):")
print(probability_summary_threshold)


Confusion Matrix (Threshold 0.7):
[[2942   15]
 [3015  102]]

Classification Report (Threshold 0.7):
              precision    recall  f1-score   support

           0       0.49      0.99      0.66      2957
           1       0.87      0.03      0.06      3117

    accuracy                           0.50      6074
   macro avg       0.68      0.51      0.36      6074
weighted avg       0.69      0.50      0.35      6074


Probability Summary by Confusion Outcome (Threshold 0.7):
                Count  Mean Probability  Std Probability
Outcome                                                 
True Positive     102          0.730132         0.021852
False Positive     15          0.720027         0.016302
True Negative    2942          0.470336         0.106776
False Negative   3015          0.522387         0.090136


In [229]:
### Save ML Model
model_path = root_data + 'Models/'+symbol+'_'+direction+'_ml_model.joblib'
joblib.dump(ml_model, model_path)
print(f"Model saved successfully at: {model_path}")

Model saved successfully at: /content/drive/MyDrive/Course Folder/Forex/XAUUSD/Models/BTCUSD_Short_ml_model.joblib


## Meta

In [230]:
### Import ML Model
model_path = root_data+'Models/'+symbol+'_'+direction+'_ml_model.joblib'
ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

Model loaded successfully from: /content/drive/MyDrive/Course Folder/Forex/XAUUSD/Models/BTCUSD_Short_ml_model.joblib


In [231]:
#train.head(5)

In [None]:
### Import Data
start_feature = train.columns.get_loc('10min_RSI_3_diff')
train_features = [col for col in train.columns[start_feature:] if col != 'label']

# Select the feature columns from the 'train' DataFrame
X_train_features = train[train_features]

train['label_ml'] = ml_model.predict(X_train_features)

prediction_probabilities = ml_model.predict_proba(X_train_features)
train['prob_0'] = prediction_probabilities[:, 0]
train['prob_1'] = prediction_probabilities[:, 1]

# Additional outputs of the initial model to feed into the meta model.

#meta_manual_features = ['label_ml', 'prob_1', 'prob_0']
meta_manual_features = ['prob_1']
meta_features = train_features + meta_manual_features
X_meta_features = train[meta_features]

#train.head()

In [None]:
train.to_csv(root_data + 'Results/'+symbol+'Meta_Prob_M5+M10_train_l.csv')

In [None]:
# Adjust the manual meta features in 'meta_manual_features' if different inputs are required.
meta = meta_features.copy()
X_meta_features = train[meta]

X = train[meta]
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
print(y_train.value_counts())
print(y_test.value_counts())

In [None]:
### Rebalance Data frame due to the low number of class 1

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# --------------- Classifier 1: XGBoost ------------------------------------------
xgb = XGBClassifier(n_estimators=15, max_depth=3,
                    random_state=42, eval_metric='logloss')

# --------------- Classifier 2: Logistic Regression Classifier------------------------------------------
lr = LogisticRegression(random_state=42)

# --------------- Classifier 3: AdaBoost Classifier ------------------------------------------
ada = AdaBoostClassifier(n_estimators=15, random_state=42)

# --------------- Classifier 4: SVM------------------------------------------
svc = svm.SVC(kernel='rbf', probability=True, random_state=42)

# Define a list to store the different models
estimator = []
estimator.append(('LR', lr))
estimator.append(('XGB', xgb))
estimator.append(('ada', ada))
estimator.append(('SVC', svc))

In [None]:
### Implement voting classifier with hard voting

meta_ml_model = VotingClassifier(estimators=estimator, voting='soft')
meta_ml_model.fit(X_train_resampled, y_train_resampled)

In [None]:
y_true = y_test
y_pred = meta_ml_model.predict(X_test)
meta_y_proba_default = meta_ml_model.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_true, y_pred)
print("\nClassification Report:")
print(class_report)

probability_summary = confusion_probability_summary(y_true, y_pred, meta_y_proba_default)
print("\nProbability Summary by Confusion Outcome:")
print(probability_summary)


In [None]:
# Evaluate Meta ML model using a 0.7 probability threshold for class 1
meta_y_proba = meta_ml_model.predict_proba(X_test)[:, 1]
meta_y_pred_threshold = (meta_y_proba >= 0.7).astype(int)

meta_conf_matrix_threshold = confusion_matrix(y_true, meta_y_pred_threshold)
print("Confusion Matrix (Threshold 0.7):")
print(meta_conf_matrix_threshold)

meta_class_report_threshold = classification_report(y_true, meta_y_pred_threshold)
print("\nClassification Report (Threshold 0.7):")
print(meta_class_report_threshold)

meta_probability_summary_threshold = confusion_probability_summary(y_true, meta_y_pred_threshold, meta_y_proba)
print("\nProbability Summary by Confusion Outcome (Threshold 0.7):")
print(meta_probability_summary_threshold)


In [None]:
train['meta_results'] = meta_ml_model.predict(X_meta_features)

In [None]:
train.to_csv(root_data + 'Results/'+symbol+'train_l_Signals & Meta_Signals.csv')

In [None]:
### Save ML Model
model_path = root_data + 'Models/'+symbol+'_'+direction+'_Meta_ml_model.joblib'
joblib.dump(meta_ml_model, model_path)
print(f"Model saved successfully at: {model_path}")

## PnL Train

In [None]:
train['ml_results'] = np.where(train['label_ml']==1, train['st_PnL'],0)
results(train, pnl_column='ml_results')

In [None]:
train['meta_ml_results'] = np.where(train['meta_results']==1, train['st_PnL'],0)
results(train, pnl_column='meta_ml_results')


# Test


## Results_ML

In [None]:
### Import ML Model

model_path = root_data+'Models/'+symbol+'_'+direction+'_ml_model.joblib'
ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

In [None]:
### Import Meta Model

model_path = root_data+'Models/'+symbol+'_'+direction+'_Meta_ml_model.joblib'
meta_ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

In [None]:
# Calculate and display Confusion Matrix
conf_matrix = confusion_matrix(test['label'], test['label_ml'])
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and display Classification Report
class_report = classification_report(test['label'], test['label_ml'])
print("\nClassification Report:")
print(class_report)

probability_summary_test = confusion_probability_summary(test['label'], test['label_ml'], test['prob_1'])
print("\nProbability Summary by Confusion Outcome:")
print(probability_summary_test)


In [None]:
print(test.columns)

In [None]:
# Calculate and display Confusion Matrix
conf_matrix = confusion_matrix(test['label'], test['label_ml'])
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and display Classification Report
class_report = classification_report(test['label'], test['label_ml'])
print("\nClassification Report:")
print(class_report)

probability_summary_test = confusion_probability_summary(test['label'], test['label_ml'], test['prob_1'])
print("\nProbability Summary by Confusion Outcome:")
print(probability_summary_test)


In [None]:
# Evaluate ML model on the test set using a 0.7 probability threshold for class 1
test['label_ml_prob_70'] = np.where(test['prob_1'] >= 0.7, 1, 0)

conf_matrix_prob_70 = confusion_matrix(test['label'], test['label_ml_prob_70'])
print("Confusion Matrix (Threshold 0.7):")
print(conf_matrix_prob_70)

class_report_prob_70 = classification_report(test['label'], test['label_ml_prob_70'])
print("\nClassification Report (Threshold 0.7):")
print(class_report_prob_70)

probability_summary_prob_70 = confusion_probability_summary(test['label'], test['label_ml_prob_70'], test['prob_1'])
print("\nProbability Summary by Confusion Outcome (Threshold 0.7):")
print(probability_summary_prob_70)


In [None]:
# Calculate and display Confusion Matrix
conf_matrix = confusion_matrix(test['label'], test['meta_label'])
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and display Classification Report
class_report = classification_report(test['label'], test['meta_label'])
print("\nClassification Report:")
print(class_report)

meta_probability_summary_test = confusion_probability_summary(test['label'], test['meta_label'], test['meta_prob_1'])
print("\nProbability Summary by Confusion Outcome:")
print(meta_probability_summary_test)


In [None]:
# Calculate and display Confusion Matrix
conf_matrix = confusion_matrix(test['label'], test['meta_label'])
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and display Classification Report
class_report = classification_report(test['label'], test['meta_label'])
print("\nClassification Report:")
print(class_report)

meta_probability_summary_test = confusion_probability_summary(test['label'], test['meta_label'], test['meta_prob_1'])
print("\nProbability Summary by Confusion Outcome:")
print(meta_probability_summary_test)


In [None]:
# Evaluate Meta ML model on the test set using a 0.7 probability threshold for class 1
test['meta_label_prob_70'] = np.where(test['meta_prob_1'] >= 0.7, 1, 0)

meta_conf_matrix_prob_70 = confusion_matrix(test['label'], test['meta_label_prob_70'])
print("Confusion Matrix (Threshold 0.7):")
print(meta_conf_matrix_prob_70)

meta_class_report_prob_70 = classification_report(test['label'], test['meta_label_prob_70'])
print("\nClassification Report (Threshold 0.7):")
print(meta_class_report_prob_70)

meta_probability_summary_prob_70 = confusion_probability_summary(test['label'], test['meta_label_prob_70'], test['meta_prob_1'])
print("\nProbability Summary by Confusion Outcome (Threshold 0.7):")
print(meta_probability_summary_prob_70)


## Results_PnL

In [None]:
# Calculate and display Confusion Matrix
conf_matrix = confusion_matrix(test['label'], test['label_ml'])
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and display Classification Report
class_report = classification_report(test['label'], test['label_ml'])
print("\nClassification Report:")
print(class_report)

probability_summary_test = confusion_probability_summary(test['label'], test['label_ml'], test['prob_1'])
print("\nProbability Summary by Confusion Outcome:")
print(probability_summary_test)


In [None]:
# Calculate and display Confusion Matrix
conf_matrix = confusion_matrix(test['label'], test['meta_label'])
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and display Classification Report
class_report = classification_report(test['label'], test['meta_label'])
print("\nClassification Report:")
print(class_report)

meta_probability_summary_test = confusion_probability_summary(test['label'], test['meta_label'], test['meta_prob_1'])
print("\nProbability Summary by Confusion Outcome:")
print(meta_probability_summary_test)


# Pendientes
