# Set_Up

In [19]:
strategy   = 'Kalman'
process    = 'Train'
symbol     = 'BTCUSD'
direction  = 'Long'
time_frame = 'M5'

root_data = f'/content/drive/MyDrive/Course Folder/Forex/XAUUSD/'
print(root_data)

rolling_window = 100

/content/drive/MyDrive/Course Folder/Forex/XAUUSD/


# Import_Libraries

In [3]:
!pip install ta-lib
import talib as ta
print(ta.__version__)

Collecting ta-lib
  Downloading ta_lib-0.6.7-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (24 kB)
Downloading ta_lib-0.6.7-cp312-cp312-manylinux_2_28_x86_64.whl (4.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ta-lib
Successfully installed ta-lib-0.6.7
0.6.7


In [4]:
# Import libraries for data manipulation
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from datetime import timedelta

# For machine learning models
import xgboost
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from tensorflow.keras.models import load_model

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

# Import matplotlib as an alias plt and set the style
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("seaborn-v0_8-whitegrid")

# Import sys to append the path for custom function file
import sys
sys.path.append("..")

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

import joblib

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Support Functions

In [6]:
def results(data, pnl_column='PnL'):
    # Calculate the metrics
    time_difference = data.index.max() - data.index.min()
    days = time_difference.days
    total_trades = data[data['Open_Trade'].notna() & (data['Open_Trade'] != 0)].shape[0]
    profit_trades = data[data[pnl_column] > 0].shape[0]
    loss_trades = data[data[pnl_column] < 0].shape[0]
    profits = data[data[pnl_column] > 0][pnl_column].sum()
    losses = data[data[pnl_column] < 0][pnl_column].sum()

    # Create a dictionary with the results
    results_dict = {
        'days': days,
        'total_trades': total_trades,
        '': '',
        'income': profits,
        'losses': losses,
        'profits': profits + losses,
        ' ':' ',
        'profit_trades': profit_trades,
        'loss_trades': loss_trades,
        '  ':'  ',
        '% Win_Trades': profit_trades / (profit_trades + loss_trades) * 100 if (profit_trades + loss_trades) > 0 else 0,
        '% Loss_Trades': loss_trades / (profit_trades + loss_trades)*100 if (profit_trades + loss_trades) > 0 else 0
    }

    # Create a DataFrame from the dictionary and transpose it
    perf_metrics = pd.DataFrame([results_dict]).T

    # Rename the column
    perf_metrics.rename(columns={0: 'Results'}, inplace=True)

    # Format the DataFrame for display
    perf_metrics.loc[['days', 'total_trades', 'profit_trades', 'loss_trades','% Win_Trades','% Loss_Trades'], 'Results'] = perf_metrics.loc[['days', 'total_trades', 'profit_trades', 'loss_trades','% Win_Trades','% Loss_Trades'], 'Results'].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else '')
    perf_metrics.loc[['income', 'losses', 'profits'], 'Results'] = perf_metrics.loc[['income', 'losses', 'profits'], 'Results'].apply(lambda x: f"${x:,.2f}" if pd.notna(x) else '')

    return perf_metrics

In [7]:
def create_features(train_data, index):
    short_periods = [3, 5, 7, 10, 15, 17]
    long_periods = [20, 22, 66, 126, 252]
    periods = short_periods + long_periods

    features = pd.DataFrame(index=index)

    # Indicators that do not depend on the lookback period
    features['OBV'] = ta.OBV(train_data['Close'], train_data['Volume'])
    features['AD'] = ta.AD(train_data['High'], train_data['Low'],
                           train_data['Close'], train_data['Volume'])

    # Pre-compute moving averages to avoid repeated calculations
    sma = {p: ta.SMA(train_data['Close'], timeperiod=p) for p in periods}
    ema = {p: ta.EMA(train_data['Close'], timeperiod=p) for p in periods}

    for period in periods:
        features[f'RSI_{period}'] = ta.RSI(train_data['Close'], timeperiod=period)
        features[f'MFI_{period}'] = ta.MFI(train_data['High'], train_data['Low'],
                                           train_data['Close'], train_data['Volume'],
                                           timeperiod=period)
        features[f'ADX_{period}'] = ta.ADX(train_data['High'], train_data['Low'],
                                           train_data['Close'], timeperiod=period)
        features[f'ROCP_{period}'] = ta.ROCP(train_data['Close'], timeperiod=period)

    for s in short_periods:
        for l in long_periods:
            features[f'SMA_Crossover_{s}_{l}'] = sma[s] - sma[l]
            features[f'EMA_Crossover_{s}_{l}'] = ema[s] - ema[l]

    features.dropna(inplace=True)
    return features


In [None]:
def strategy_returns_dynamic_different_thresholds(prices, threshold):
    rolling = prices['Rolling Returns'].to_numpy()
    yearly_std = prices['Yearly Stdev'].to_numpy()
    pct_change = prices['Close'].pct_change().to_numpy()
    signals = np.zeros(len(prices), dtype=np.int8)

    curr_pos = 0
    hold_days = 0
    for i in range(len(prices)):
        if curr_pos == 0 or hold_days == 20:
            thresh_val = threshold * yearly_std[i]
            if rolling[i] >= thresh_val:
                curr_pos = 1
                hold_days = 0
            elif rolling[i] < -thresh_val:
                curr_pos = -1
                hold_days = 0
            else:
                curr_pos = 0
                hold_days = 0
        else:
            hold_days += 1
        signals[i] = curr_pos

    prices[f'Signal_{threshold}'] = signals
    strategy_returns = pct_change * np.roll(signals, 1)
    prices[f'Strategy Returns_{threshold}'] = strategy_returns
    return np.cumprod(strategy_returns + 1)


def strategy_returns_different_thresholds(prices, threshold):
    rolling = prices['Rolling Returns'].to_numpy()
    pct_change = prices['Close'].pct_change().to_numpy()
    signals = np.zeros(len(prices), dtype=np.int8)

    curr_pos = 0
    hold_days = 0
    for i in range(len(prices)):
        if curr_pos == 0 or hold_days == 20:
            if rolling[i] >= threshold:
                curr_pos = 1
                hold_days = 0
            elif rolling[i] < threshold:
                curr_pos = -1
                hold_days = 0
        else:
            hold_days += 1
        signals[i] = curr_pos

    prices[f'Signal_{threshold}'] = signals
    strategy_returns = pct_change * np.roll(signals, 1)
    prices[f'Strategy Returns_{threshold}'] = strategy_returns
    return np.cumprod(strategy_returns + 1)


# Data

In [None]:
### Open OHLC dataframe

ohlc = pd.read_csv(root_data + 'Data/'+symbol+'_M5.csv', index_col=0)
ohlc.index = pd.to_datetime(ohlc.index)
time_difference = ohlc.index.max() - ohlc.index.min()
number_of_days = time_difference.days

print(f"The train_data DataFrame covers a period of {number_of_days} days.")
ohlc.tail(3)

In [None]:
### Features

features_5m = pd.read_csv(root_data + 'Results/'+symbol+'_'+direction+'_M5M10_Enc_Features.csv', index_col=0)
features_5m['Date'] = pd.to_datetime(features_5m['Date'])
features_5m.set_index('Date', inplace=True)

print(features_5m.shape)
features_5m.tail(3)

In [21]:
### Labels

lab = pd.read_csv(root_data + 'Results/'+symbol+'_'+strategy+'_'+time_frame+'_Strategy_Gen_Labels.csv', index_col=0)
lab['Date'] = pd.to_datetime(lab['Date'])
lab.set_index('Date', inplace=True)

print(lab.columns,'\n')
print(lab.shape)
lab.loc[lab['Open_Trade']== 1,'Open_Trade'].sum()

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'kal_1',
       'kal_2', 'kal_3', 'kal_4', 'Open_Trade', 'Close_Trade', 'Entry_Date',
       'Type', 'Trade_Number', 'st_Exit_Date', 'trade type', 'st_Duration',
       'st_row_PnL_close', 'st_row_PnL_high', 'st_row_PnL_Low',
       'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'st_atr_PnL',
       'st_atr_max_PnL'],
      dtype='object') 

(267395, 28)


np.float64(19600.0)

In [22]:
# --- Parámetros / campos
result_field = 'st_atr_max_PnL'   # métrica a evaluar

# --- Filtro de filas válidas
valid = (
    (lab['Type'] == direction) &
    (lab['Open_Trade'].isin([1, -1])) &
    (lab[result_field].notna())
)

# --- Conteos por rango (st_max_4..6)
st_max_4 = (valid & (lab[result_field] <= 0.5)).sum()
st_max_5 = (valid & (lab[result_field] >= 0.5) & (lab[result_field] <= 1.0)).sum()
st_max_6 = (valid & (lab[result_field] > 1.0)).sum()

print(f'<= 0.5          = {st_max_4:,d}')
print(f'> 0.5 & <= 1.0  = {st_max_5:,d}')
print(f'> 1.0           = {st_max_6:,d}')

# --- Etiquetado en la columna "label" con valores 4/5/6
#lab['label'] = np.nan
#lab.loc[valid & (lab[result_field] <= 0.5), 'label'] = 0
#lab.loc[valid & (lab[result_field] > 0.5) & (lab[result_field] <= 1.0), 'label'] = 1
#lab.loc[valid & (lab[result_field] > 1.0), 'label'] = 2

lab.loc[valid & (lab[result_field] <= 1), 'label'] = 0
lab.loc[valid & (lab[result_field] >= 1), 'label'] = 1


# --- Mantener solo filas válidas y con label
lab = lab.loc[valid & lab['label'].notna()].copy()
lab['label'] = lab['label'].astype('int8')

# --- Ver distribución de labels 4/5/6
print('\nValue counts de label 4/5/6:')
print(lab['label'].value_counts(dropna=False).sort_index())


<= 0.5          = 6,031
> 0.5 & <= 1.0  = 3,854
> 1.0           = 9,714

Value counts de label 4/5/6:
label
0    9885
1    9714
Name: count, dtype: int64


In [24]:
### Merge

feat_obj = lab.merge(features_5m, left_index=True, right_index=True, how='left')
#feat_obj = feat_obj.merge(features_10m, left_index=True, right_index=True, how='left')
feat_obj.fillna(method='ffill', inplace=True)
#feat_obj.dropna(inplace=True)
print('Shape = ',feat_obj.shape,'\n')
print(feat_obj.columns,'\n')
feat_obj.tail(3)

Shape =  (19599, 37) 

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'kal_1',
       'kal_2', 'kal_3', 'kal_4', 'Open_Trade', 'Close_Trade', 'Entry_Date',
       'Type', 'Trade_Number', 'st_Exit_Date', 'trade type', 'st_Duration',
       'st_row_PnL_close', 'st_row_PnL_high', 'st_row_PnL_Low',
       'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'st_atr_PnL',
       'st_atr_max_PnL', 'label', 'Encoded_0', 'Encoded_1', 'Encoded_2',
       'Encoded_3', 'Encoded_4', 'Encoded_5', 'Encoded_6', 'Encoded_7'],
      dtype='object') 



Unnamed: 0_level_0,Open,High,Low,Close,Volume,Spread,ATR,kal_1,kal_2,kal_3,...,st_atr_max_PnL,label,Encoded_0,Encoded_1,Encoded_2,Encoded_3,Encoded_4,Encoded_5,Encoded_6,Encoded_7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-07-26 21:00:00,118087.13,118129.88,118078.88,118126.63,515,1200,67.6924,118069.417073,118068.861623,118076.997886,...,0.048011,0,16.971224,6.789091,7.373481,7.686589,2.716153,6.574689,9.243799,0.0
2025-07-26 21:25:00,118082.63,118110.88,118076.13,118108.38,354,775,67.00829,118079.131188,118082.370877,118088.465601,...,3.581646,1,16.971224,6.789091,7.373481,7.686589,2.716153,6.574689,9.243799,0.0
2025-07-26 22:25:00,118132.88,118163.25,118129.0,118151.75,417,1200,55.87521,118144.560587,118145.58871,118142.464087,...,3.519092,1,16.971224,6.789091,7.373481,7.686589,2.716153,6.574689,9.243799,0.0


In [26]:
columns_to_drop = ['st_row_PnL_Low']
feat_obj = feat_obj.drop(columns=columns_to_drop)
feat_obj.dropna(inplace=True)

In [27]:
columns_to_use = ['Open', 'High', 'Low', 'Close', 'Volume', 'Spread',
                  'Open_Trade', 'st_Close_Trade', 'Entry_Date', 'Type',
                  'Trade_Number', 'st_Exit_Date', 'trade_type',
                  'atr_mult_low', 'atr_mult_close', 'atr_mult_high', 'atr_dyn',
                  'atr_PnL', 'atr_Exit_Date', 'atr_Duration',
                  'atr_PnL_dollar',
                  'label',
                  'Encoded_0', 'Encoded_1', 'Encoded_2', 'Encoded_3', 'Encoded_4',
                  'Encoded_5', 'Encoded_6', 'Encoded_7', '10min_Encoded_0', '10min_Encoded_1',
                  '10min_Encoded_2', '10min_Encoded_3', '10min_Encoded_4', '10min_Encoded_5',
                  '10min_Encoded_6', '10min_Encoded_7']

nan_counts = feat_obj.isnull().sum()
print("\nNaN counts in feat_obj sorted by highest to lowest:")
print(nan_counts.sort_values(ascending=False))

print("\nTotal NaN count in feat_obj:", nan_counts.sum())


NaN counts in feat_obj sorted by highest to lowest:
Open                0
High                0
Low                 0
Close               0
Volume              0
Spread              0
ATR                 0
kal_1               0
kal_2               0
kal_3               0
kal_4               0
Open_Trade          0
Close_Trade         0
Entry_Date          0
Type                0
Trade_Number        0
st_Exit_Date        0
trade type          0
st_Duration         0
st_row_PnL_close    0
st_row_PnL_high     0
st_row_PnL_low      0
st_Max              0
st_Min              0
st_PnL              0
st_atr_PnL          0
st_atr_max_PnL      0
label               0
Encoded_0           0
Encoded_1           0
Encoded_2           0
Encoded_3           0
Encoded_4           0
Encoded_5           0
Encoded_6           0
Encoded_7           0
dtype: int64

Total NaN count in feat_obj: 0


In [None]:
# Split the data into 70% train and 30% test based on index
train_size = int(0.7 * len(feat_obj))
train = feat_obj.iloc[:train_size]
test  = feat_obj.iloc[train_size:]

print("Shape of train_data:", train.shape)
print("Shape of test_data:", test.shape)

Shape of train_data: (3423, 57)
Shape of test_data: (1468, 57)


In [None]:
### Define Train dataframe

print('Train_Min_Date', train.index.min())
print('Train_Max_Date', train.index.max(),'\n')

print('Test_Min_Date', test.index.min())
print('Test_Max_Date', test.index.max(),'\n')

print('Train_Columns : ',train.columns, '\n')
print('Test_Columns : ',test.columns)

Train_Min_Date 2024-02-07 06:15:00
Train_Max_Date 2025-01-29 15:55:00 

Test_Min_Date 2025-01-29 16:40:00
Test_Max_Date 2025-07-04 16:45:00 

Train_Columns :  Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'hl2',
       'lag_1', 'it_1', 'lag_2', 'it_2', 'lag_3', 'it_3', 'Open_Trade',
       'st_Close_Trade', 'Entry_Date', 'Type', 'Trade_Number', 'st_Exit_Date',
       'trade type', 'st_Duration', 'st_row_PnL_close', 'st_row_PnL_high',
       'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'dyn_stoploss',
       'SL_PnL', 'SL_Exit_Date', 'SL_Duration', 'SL_PnL_-100_100_50',
       'atr_mult_low', 'atr_mult_close', 'atr_mult_high', 'atr_dyn', 'atr_PnL',
       'atr_Exit_Date', 'atr_Duration', 'atr_PnL_dollar',
       'atr_PnL_dollar_-1.0_0.5_2.0', 'atr_H_dyn', 'atr_H_PnL',
       'atr_H_Exit_Date', 'atr_H_Duration', 'atr_H_PnL_dollar',
       'atr_H_PnL_dollar_-3.0_0.1', 'label', 'Encoded_0', 'Encoded_1',
       'Encoded_2', 'Encoded_3', 'Encoded_4', 'Encoded_5', 'Enco

# Results


In [None]:
results(train, pnl_column='atr_PnL_dollar')

Unnamed: 0,Results
days,357
total_trades,3423
,
income,"$540,468.30"
losses,"$-60,867.52"
profits,"$479,600.78"
,
profit_trades,3026
loss_trades,397
,


In [None]:
results(test, pnl_column='atr_PnL_dollar')

Unnamed: 0,Results
days,156
total_trades,1468
,
income,"$394,980.76"
losses,"$-47,452.27"
profits,"$347,528.49"
,
profit_trades,1298
loss_trades,170
,



# ML


## Train

In [None]:
### When using train_test_split it applies the scaler to X_train only
train_features = ['Encoded_0', 'Encoded_1', 'Encoded_2', 'Encoded_3', 'Encoded_4',
                  'Encoded_5', 'Encoded_6', 'Encoded_7']
                  #,'10min_Encoded_0', '10min_Encoded_1',
                  #'10min_Encoded_2', '10min_Encoded_3', '10min_Encoded_4', '10min_Encoded_5',
                  # '10min_Encoded_6', '10min_Encoded_7']


X = train.loc[:,train_features]
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:" , X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:" , y_test.shape)

Shape of X_train: (2396, 8)
Shape of X_test: (1027, 8)
Shape of y_train: (2396,)
Shape of y_test: (1027,)


In [None]:
print(y_train.value_counts())
print(y_test.value_counts())

label
1.0    2109
0.0     287
Name: count, dtype: int64
label
1.0    917
0.0    110
Name: count, dtype: int64


In [None]:
### Rebalance Data frame due to the low number of class 1
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Now train your model on the resampled data
print(X_train_resampled.value_counts(),'\n')
print(y_train_resampled.value_counts())

Encoded_0  Encoded_1  Encoded_2  Encoded_3  Encoded_4  Encoded_5  Encoded_6  Encoded_7
13.416625  4.401815   7.739627   0.0        17.141453  5.795266   11.534381  10.484607    1
0.000000   2.166997   0.231695   0.0        3.329105   2.816230   2.642849   3.321607     1
           3.725765   2.642064   0.0        2.718543   5.091058   2.545305   3.121900     1
           5.091003   3.900258   0.0        5.062963   4.850760   2.186991   3.599518     1
           5.691338   3.633762   0.0        3.692160   4.981380   4.430609   4.650314     1
                                                                                         ..
0.179648   5.693472   2.372745   0.0        2.736366   3.846178   3.431102   2.288439     1
0.133495   8.850371   8.400334   0.0        10.042542  11.154220  9.551336   5.225968     1
0.045813   5.663464   2.347816   0.0        5.802249   4.218128   2.602168   3.027248     1
0.000000   7.785099   7.071434   0.0        7.214082   4.199494   2.597097   1.773494

In [None]:
# --------------- Classifier 1: XGBoost ------------------------------------------
xgb = XGBClassifier(n_estimators=15, max_depth=3,
                    random_state=42, eval_metric='logloss')

# --------------- Classifier 2: Logistic Regression Classifier------------------------------------------
lr = LogisticRegression(random_state=42)

# --------------- Classifier 3: AdaBoost Classifier ------------------------------------------
ada = AdaBoostClassifier(n_estimators=15, random_state=42)

# --------------- Classifier 4: SVM------------------------------------------
svc = svm.SVC(kernel='rbf', probability=True, random_state=42)

# Define a list to store the different models
estimator = []
estimator.append(('LR', lr))
estimator.append(('XGB', xgb))
estimator.append(('ada', ada))
estimator.append(('SVC', svc))

In [None]:
# --------------- Classifier 1: XGBoost ------------------------------------------
xgb = XGBClassifier(n_estimators=15, max_depth=3,
                    random_state=42, eval_metric='logloss')

# --------------- Classifier 2: Logistic Regression Classifier------------------------------------------
lr = LogisticRegression(random_state=42)

# --------------- Classifier 3: AdaBoost Classifier ------------------------------------------
ada = AdaBoostClassifier(n_estimators=15, random_state=42)

# --------------- Classifier 4: SVM------------------------------------------
svc = svm.SVC(kernel='rbf', probability=True, random_state=42)

# Define a list to store the different models
estimator = []
estimator.append(('LR', lr))
estimator.append(('XGB', xgb))
estimator.append(('ada', ada))
estimator.append(('SVC', svc))

In [None]:
### Implement voting classifier with hard voting

ml_model = VotingClassifier(estimators=estimator, voting='soft')
ml_model.fit(X_train_resampled, y_train_resampled)
#ml_model.fit(X_train, y_train)

In [None]:
y_true = y_test
y_pred = ml_model.predict(X_test)

conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_true, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[ 63  47]
 [414 503]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.13      0.57      0.21       110
         1.0       0.91      0.55      0.69       917

    accuracy                           0.55      1027
   macro avg       0.52      0.56      0.45      1027
weighted avg       0.83      0.55      0.64      1027



In [None]:
### Save ML Model
model_path = root_data + 'Models/'+symbol+'_'+direction+'_ml_model.joblib'
joblib.dump(ml_model, model_path)
print(f"Model saved successfully at: {model_path}")

Model saved successfully at: /content/drive/MyDrive/Course Folder/Forex/XAUUSD/Models/XAUUSD_Long_ml_model.joblib


## Meta

In [None]:
### Import ML Model
model_path = root_data+'Models/'+symbol+'_'+direction+'_ml_model.joblib'
ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

Model loaded successfully from: /content/drive/MyDrive/Course Folder/Forex/XAUUSD/Models/XAUUSD_Long_ml_model.joblib


In [None]:
#train.head(5)

In [None]:
### Import Data
train_features = ['Encoded_0', 'Encoded_1', 'Encoded_2', 'Encoded_3', 'Encoded_4',
                  'Encoded_5', 'Encoded_6', 'Encoded_7']
                  #'10min_Encoded_0', '10min_Encoded_1',
                  #'10min_Encoded_2', '10min_Encoded_3', '10min_Encoded_4', '10min_Encoded_5',
                  #'10min_Encoded_6', '10min_Encoded_7']

# Select the feature columns from the 'train' DataFrame
X_train_features = train[train_features]

train['label_ml'] = ml_model.predict(X_train_features)

meta_features = train_features + ['label_ml']
X_meta_features = train[meta_features]

prediction_probabilities = ml_model.predict_proba(X_train_features)
train['prob_0'] = prediction_probabilities[:, 0]
train['prob_1'] = prediction_probabilities[:, 1]

#train.head()

In [None]:
train.to_csv(root_data + 'Results/'+symbol+'Meta_Prob_M5+M10_train_l.csv')

In [None]:
meta = train_features + ['label_ml', 'prob_1', 'prob_0']
#meta = train_features + ['prob_1']
X_meta_features = train[meta]

X = train[meta]
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (2396, 11)
Shape of X_test: (1027, 11)
Shape of y_train: (2396,)
Shape of y_test: (1027,)


In [None]:
print(y_train.value_counts())
print(y_test.value_counts())

label
1.0    2109
0.0     287
Name: count, dtype: int64
label
1.0    917
0.0    110
Name: count, dtype: int64


In [None]:
### Rebalance Data frame due to the low number of class 1

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# --------------- Classifier 1: XGBoost ------------------------------------------
xgb = XGBClassifier(n_estimators=15, max_depth=3,
                    random_state=42, eval_metric='logloss')

# --------------- Classifier 2: Logistic Regression Classifier------------------------------------------
lr = LogisticRegression(random_state=42)

# --------------- Classifier 3: AdaBoost Classifier ------------------------------------------
ada = AdaBoostClassifier(n_estimators=15, random_state=42)

# --------------- Classifier 4: SVM------------------------------------------
svc = svm.SVC(kernel='rbf', probability=True, random_state=42)

# Define a list to store the different models
estimator = []
estimator.append(('LR', lr))
estimator.append(('XGB', xgb))
estimator.append(('ada', ada))
estimator.append(('SVC', svc))

In [None]:
### Implement voting classifier with hard voting

meta_ml_model = VotingClassifier(estimators=estimator, voting='soft')
meta_ml_model.fit(X_train_resampled, y_train_resampled)

In [None]:
y_true = y_test
y_pred = meta_ml_model.predict(X_test)

conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_true, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[ 61  49]
 [403 514]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.13      0.55      0.21       110
         1.0       0.91      0.56      0.69       917

    accuracy                           0.56      1027
   macro avg       0.52      0.56      0.45      1027
weighted avg       0.83      0.56      0.64      1027



In [None]:
train['meta_results'] = meta_ml_model.predict(X_meta_features)

In [None]:
train.to_csv(root_data + 'Results/'+symbol+'train_l_Signals & Meta_Signals.csv')

In [None]:
### Save ML Model
model_path = root_data + 'Models/'+symbol+'_'+direction+'_Meta_ml_model.joblib'
joblib.dump(meta_ml_model, model_path)
print(f"Model saved successfully at: {model_path}")

Model saved successfully at: /content/drive/MyDrive/Course Folder/Forex/XAUUSD/Models/XAUUSD_Long_Meta_ml_model.joblib


## PnL Train

In [None]:
train['ml_results'] = np.where(train['label_ml']==1, train['atr_PnL_dollar'],0)
results(train, pnl_column='ml_results')

Unnamed: 0,Results
days,357
total_trades,3423
,
income,"$351,441.28"
losses,"$-22,614.37"
profits,"$328,826.91"
,
profit_trades,1787
loss_trades,127
,


In [None]:
train['meta_ml_results'] = np.where(train['meta_results']==1, train['atr_PnL_dollar'],0)
results(train, pnl_column='meta_ml_results')

Unnamed: 0,Results
days,357
total_trades,3423
,
income,"$360,329.45"
losses,"$-23,070.33"
profits,"$337,259.12"
,
profit_trades,1839
loss_trades,131
,



# Test


## Results_ML

In [None]:
### Import ML Model

model_path = root_data+'Models/'+symbol+'_'+direction+'_ml_model.joblib'
ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

Model loaded successfully from: /content/drive/MyDrive/Course Folder/Forex/XAUUSD/Models/XAUUSD_Long_ml_model.joblib


In [None]:
### Import Meta Model

model_path = root_data+'Models/'+symbol+'_'+direction+'_Meta_ml_model.joblib'
meta_ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

Model loaded successfully from: /content/drive/MyDrive/Course Folder/Forex/XAUUSD/Models/XAUUSD_Long_Meta_ml_model.joblib


In [None]:

test['label_ml'] = ml_model.predict(test.loc[:,train_features])

prediction_probabilities = ml_model.predict_proba(test.loc[:,train_features])
test['prob_0'] = prediction_probabilities[:, 0]
test['prob_1'] = prediction_probabilities[:, 1]

print(test.columns)

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'hl2',
       'lag_1', 'it_1', 'lag_2', 'it_2', 'lag_3', 'it_3', 'Open_Trade',
       'st_Close_Trade', 'Entry_Date', 'Type', 'Trade_Number', 'st_Exit_Date',
       'trade type', 'st_Duration', 'st_row_PnL_close', 'st_row_PnL_high',
       'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'dyn_stoploss',
       'SL_PnL', 'SL_Exit_Date', 'SL_Duration', 'SL_PnL_-100_100_50',
       'atr_mult_low', 'atr_mult_close', 'atr_mult_high', 'atr_dyn', 'atr_PnL',
       'atr_Exit_Date', 'atr_Duration', 'atr_PnL_dollar',
       'atr_PnL_dollar_-1.0_0.5_2.0', 'atr_H_dyn', 'atr_H_PnL',
       'atr_H_Exit_Date', 'atr_H_Duration', 'atr_H_PnL_dollar',
       'atr_H_PnL_dollar_-3.0_0.1', 'label', 'Encoded_0', 'Encoded_1',
       'Encoded_2', 'Encoded_3', 'Encoded_4', 'Encoded_5', 'Encoded_6',
       'Encoded_7', 'label_ml', 'prob_0', 'prob_1'],
      dtype='object')


In [None]:
print(test.columns)

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'hl2',
       'lag_1', 'it_1', 'lag_2', 'it_2', 'lag_3', 'it_3', 'Open_Trade',
       'st_Close_Trade', 'Entry_Date', 'Type', 'Trade_Number', 'st_Exit_Date',
       'trade type', 'st_Duration', 'st_row_PnL_close', 'st_row_PnL_high',
       'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'dyn_stoploss',
       'SL_PnL', 'SL_Exit_Date', 'SL_Duration', 'SL_PnL_-100_100_50',
       'atr_mult_low', 'atr_mult_close', 'atr_mult_high', 'atr_dyn', 'atr_PnL',
       'atr_Exit_Date', 'atr_Duration', 'atr_PnL_dollar',
       'atr_PnL_dollar_-1.0_0.5_2.0', 'atr_H_dyn', 'atr_H_PnL',
       'atr_H_Exit_Date', 'atr_H_Duration', 'atr_H_PnL_dollar',
       'atr_H_PnL_dollar_-3.0_0.1', 'label', 'Encoded_0', 'Encoded_1',
       'Encoded_2', 'Encoded_3', 'Encoded_4', 'Encoded_5', 'Encoded_6',
       'Encoded_7', 'label_ml', 'prob_0', 'prob_1'],
      dtype='object')


In [None]:
# Calculate and display Confusion Matrix
conf_matrix = confusion_matrix(test['label'], test['label_ml'])
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and display Classification Report
class_report = classification_report(test['label'], test['label_ml'])
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[  33  137]
 [ 203 1095]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.14      0.19      0.16       170
         1.0       0.89      0.84      0.87      1298

    accuracy                           0.77      1468
   macro avg       0.51      0.52      0.51      1468
weighted avg       0.80      0.77      0.78      1468



In [None]:
test['meta_label'] = meta_ml_model.predict(test.loc[:,meta])

prediction_probabilities = meta_ml_model.predict_proba(test.loc[:,meta])
test['meta_prob_0'] = prediction_probabilities[:, 0]
test['meta_prob_1'] = prediction_probabilities[:, 1]

print(test.columns)

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'hl2',
       'lag_1', 'it_1', 'lag_2', 'it_2', 'lag_3', 'it_3', 'Open_Trade',
       'st_Close_Trade', 'Entry_Date', 'Type', 'Trade_Number', 'st_Exit_Date',
       'trade type', 'st_Duration', 'st_row_PnL_close', 'st_row_PnL_high',
       'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'dyn_stoploss',
       'SL_PnL', 'SL_Exit_Date', 'SL_Duration', 'SL_PnL_-100_100_50',
       'atr_mult_low', 'atr_mult_close', 'atr_mult_high', 'atr_dyn', 'atr_PnL',
       'atr_Exit_Date', 'atr_Duration', 'atr_PnL_dollar',
       'atr_PnL_dollar_-1.0_0.5_2.0', 'atr_H_dyn', 'atr_H_PnL',
       'atr_H_Exit_Date', 'atr_H_Duration', 'atr_H_PnL_dollar',
       'atr_H_PnL_dollar_-3.0_0.1', 'label', 'Encoded_0', 'Encoded_1',
       'Encoded_2', 'Encoded_3', 'Encoded_4', 'Encoded_5', 'Encoded_6',
       'Encoded_7', 'label_ml', 'prob_0', 'prob_1', 'meta_label',
       'meta_prob_0', 'meta_prob_1'],
      dtype='object')


In [None]:
# Calculate and display Confusion Matrix
conf_matrix = confusion_matrix(test['label'], test['meta_label'])
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and display Classification Report
class_report = classification_report(test['label'], test['meta_label'])
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[  30  140]
 [ 184 1114]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.14      0.18      0.16       170
         1.0       0.89      0.86      0.87      1298

    accuracy                           0.78      1468
   macro avg       0.51      0.52      0.51      1468
weighted avg       0.80      0.78      0.79      1468



## Results_PnL

In [None]:
test['ml_results'] = np.where(test['label_ml']==1, test['atr_PnL_dollar'],0)
results(test, pnl_column='ml_results')

Unnamed: 0,Results
days,156
total_trades,1468
,
income,"$353,733.88"
losses,"$-41,584.91"
profits,"$312,148.97"
,
profit_trades,1095
loss_trades,137
,


In [None]:
train['meta_ml_results'] = np.where(train['meta_results']==1, train['atr_PnL_dollar'],0)
results(train, pnl_column='meta_ml_results')

Unnamed: 0,Results
days,357
total_trades,3423
,
income,"$360,329.45"
losses,"$-23,070.33"
profits,"$337,259.12"
,
profit_trades,1839
loss_trades,131
,


# Pendientes


In [None]:


# Define a more comprehensive parameter grid
#param_grid = {
#    'weights': [(0.5, 0.5, 0.5, 0.5), (0.25, 0.25, 0.25, 0.25), (0.3, 0.2, 0.3, 0.2)], # Example weights
#    'voting': ['soft'],
#    'XGB__n_estimators': [50, 100, 150],
#    'XGB__learning_rate': [0.01, 0.1, 0.2],
#    'XGB__max_depth': [3, 5, 7],
#    'LR__C': [0.1, 1.0, 10.0], # Inverse of regularization strength
#    'ada__n_estimators': [50, 100, 150],
#    'SVC__C': [0.1, 1.0, 10.0], # Regularization parameter
#    'SVC__gamma': ['scale', 'auto'] # Kernel coefficient
#}

# Initialize GridSearchCV
# You might want to use cross-validation (cv) and specify scoring metrics
#grid_search = GridSearchCV(estimator=ml_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1) # Added n_jobs for faster execution

# Fit GridSearchCV to the resampled testing data
# This step can take a significant amount of time depending on the grid size and data
#grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best hyperparameters found
#print("Best hyperparameters found:")
#print(grid_search.best_params_)

# Get the best model
#best_ml_model = grid_search.best_estimator_

# Evaluate the best model on the test data
#y_true = y_test
#y_pred = best_ml_model.predict(X_test)

#conf_matrix = confusion_matrix(y_true, y_pred)
#print("\nConfusion Matrix (Tuned Model):")
#print(conf_matrix)

#class_report = classification_report(y_true, y_pred)
#print("\nClassification Report (Tuned Model):")
#print(class_report)

**Reasoning**:
Implement k-fold cross-validation as instructed.



In [None]:
# Define the number of folds for cross-validation
n_splits = 5

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Create empty lists to store evaluation results
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Iterate through the folds
for fold, (train_index, val_index) in enumerate(skf.split(X_scaled_combined, y_combined)):
    print(f"Fold {fold+1}/{n_splits}")

    # Split data into training and validation sets for the current fold
    X_train_fold, X_val_fold = X_scaled_combined.iloc[train_index], X_scaled_combined.iloc[val_index]
    y_train_fold, y_val_fold = y_combined.iloc[train_index], y_combined.iloc[val_index]

    # Train the model on the training fold
    vot_soft.fit(X_train_fold, y_train_fold)

    # Predict on the validation fold
    y_pred_fold = vot_soft.predict(X_val_fold)

    # Calculate performance metrics for the current fold
    accuracy = accuracy_score(y_val_fold, y_pred_fold)
    precision = precision_score(y_val_fold, y_pred_fold)
    recall = recall_score(y_val_fold, y_pred_fold)
    f1 = f1_score(y_val_fold, y_pred_fold)

    # Append metrics to the lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    print(f"  Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")


# Varios

In [None]:
train['meta_results'] = meta_ml_model.predict(X_meta_features)
