# Gpu

In [1]:
!nvidia-smi

Tue Sep 16 18:42:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   63C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# Set_Up

In [3]:
strategy   = 'Kalman'
process    = 'Train'
symbol     = 'BTCUSD'
direction  = 'Short'
time_frame = 'M5'

root_data = f'/content/drive/MyDrive/Course Folder/Forex/XAUUSD/'
print(root_data)

rolling_window = 100

/content/drive/MyDrive/Course Folder/Forex/XAUUSD/


# Import_Libraries

In [4]:
!pip install ta-lib
import talib as ta
print(ta.__version__)

0.6.7


In [5]:
# Import libraries for data manipulation
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from datetime import timedelta

# For machine learning models
import xgboost
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split # Import train_test_split

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from tensorflow.keras.models import load_model

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

# Import matplotlib as an alias plt and set the style
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("seaborn-v0_8-whitegrid")

# Import sys to append the path for custom function file
import sys
sys.path.append("..")

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

import joblib

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Support Functions

In [7]:
def results(data, pnl_column='PnL'):
    # Calculate the metrics
    time_difference = data.index.max() - data.index.min()
    days = time_difference.days
    total_trades = data[data['Open_Trade'].notna() & (data['Open_Trade'] != 0)].shape[0]
    profit_trades = data[data[pnl_column] > 0].shape[0]
    loss_trades = data[data[pnl_column] < 0].shape[0]
    profits = data[data[pnl_column] > 0][pnl_column].sum()
    losses = data[data[pnl_column] < 0][pnl_column].sum()

    # Create a dictionary with the results
    results_dict = {
        'days': days,
        'total_trades': total_trades,
        '': '',
        'income': profits,
        'losses': losses,
        'profits': profits + losses,
        ' ':' ',
        'profit_trades': profit_trades,
        'loss_trades': loss_trades,
        '  ':'  ',
        '% Win_Trades': profit_trades / (profit_trades + loss_trades) * 100 if (profit_trades + loss_trades) > 0 else 0,
        '% Loss_Trades': loss_trades / (profit_trades + loss_trades)*100 if (profit_trades + loss_trades) > 0 else 0
    }

    # Create a DataFrame from the dictionary and transpose it
    perf_metrics = pd.DataFrame([results_dict]).T

    # Rename the column
    perf_metrics.rename(columns={0: 'Results'}, inplace=True)

    # Format the DataFrame for display
    perf_metrics.loc[['days', 'total_trades', 'profit_trades', 'loss_trades','% Win_Trades','% Loss_Trades'], 'Results'] = perf_metrics.loc[['days', 'total_trades', 'profit_trades', 'loss_trades','% Win_Trades','% Loss_Trades'], 'Results'].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else '')
    perf_metrics.loc[['income', 'losses', 'profits'], 'Results'] = perf_metrics.loc[['income', 'losses', 'profits'], 'Results'].apply(lambda x: f"${x:,.2f}" if pd.notna(x) else '')

    return perf_metrics

In [8]:
def create_features(train_data, index):
    short_periods = [3, 5, 7, 10, 15, 17]
    long_periods = [20, 22, 66, 126, 252]
    periods = short_periods + long_periods

    features = pd.DataFrame(index=index)

    # Indicators that do not depend on the lookback period
    features['OBV'] = ta.OBV(train_data['Close'], train_data['Volume'])
    features['AD'] = ta.AD(train_data['High'], train_data['Low'],
                           train_data['Close'], train_data['Volume'])

    # Pre-compute moving averages to avoid repeated calculations
    sma = {p: ta.SMA(train_data['Close'], timeperiod=p) for p in periods}
    ema = {p: ta.EMA(train_data['Close'], timeperiod=p) for p in periods}

    for period in periods:
        features[f'RSI_{period}'] = ta.RSI(train_data['Close'], timeperiod=period)
        features[f'MFI_{period}'] = ta.MFI(train_data['High'], train_data['Low'],
                                           train_data['Close'], train_data['Volume'],
                                           timeperiod=period)
        features[f'ADX_{period}'] = ta.ADX(train_data['High'], train_data['Low'],
                                           train_data['Close'], timeperiod=period)
        features[f'ROCP_{period}'] = ta.ROCP(train_data['Close'], timeperiod=period)

    for s in short_periods:
        for l in long_periods:
            features[f'SMA_Crossover_{s}_{l}'] = sma[s] - sma[l]
            features[f'EMA_Crossover_{s}_{l}'] = ema[s] - ema[l]

    features.dropna(inplace=True)
    return features


In [9]:
def strategy_returns_dynamic_different_thresholds(prices, threshold):
    rolling = prices['Rolling Returns'].to_numpy()
    yearly_std = prices['Yearly Stdev'].to_numpy()
    pct_change = prices['Close'].pct_change().to_numpy()
    signals = np.zeros(len(prices), dtype=np.int8)

    curr_pos = 0
    hold_days = 0
    for i in range(len(prices)):
        if curr_pos == 0 or hold_days == 20:
            thresh_val = threshold * yearly_std[i]
            if rolling[i] >= thresh_val:
                curr_pos = 1
                hold_days = 0
            elif rolling[i] < -thresh_val:
                curr_pos = -1
                hold_days = 0
            else:
                curr_pos = 0
                hold_days = 0
        else:
            hold_days += 1
        signals[i] = curr_pos

    prices[f'Signal_{threshold}'] = signals
    strategy_returns = pct_change * np.roll(signals, 1)
    prices[f'Strategy Returns_{threshold}'] = strategy_returns
    return np.cumprod(strategy_returns + 1)


def strategy_returns_different_thresholds(prices, threshold):
    rolling = prices['Rolling Returns'].to_numpy()
    pct_change = prices['Close'].pct_change().to_numpy()
    signals = np.zeros(len(prices), dtype=np.int8)

    curr_pos = 0
    hold_days = 0
    for i in range(len(prices)):
        if curr_pos == 0 or hold_days == 20:
            if rolling[i] >= threshold:
                curr_pos = 1
                hold_days = 0
            elif rolling[i] < threshold:
                curr_pos = -1
                hold_days = 0
        else:
            hold_days += 1
        signals[i] = curr_pos

    prices[f'Signal_{threshold}'] = signals
    strategy_returns = pct_change * np.roll(signals, 1)
    prices[f'Strategy Returns_{threshold}'] = strategy_returns
    return np.cumprod(strategy_returns + 1)


# Data

In [209]:
data_type = 'Scale'

In [220]:
### Open OHLC dataframe

ohlc = pd.read_csv(root_data + 'Data/'+symbol+'_M5.csv', index_col=0)
ohlc.index = pd.to_datetime(ohlc.index)
time_difference = ohlc.index.max() - ohlc.index.min()
number_of_days = time_difference.days

print(f"The train_data DataFrame covers a period of {number_of_days} days.")
#ohlc.tail(3)

The train_data DataFrame covers a period of 937 days.


In [221]:
### Features

features_5m = pd.read_csv(root_data + 'Results/'+symbol+'_'+direction+'_M5M10_'+data_type+'_Features.csv', index_col=0)
features_5m['Date'] = features_5m.index
features_5m['Date'] = pd.to_datetime(features_5m['Date'])
features_5m.set_index("Date", inplace=True)

#features_5m = features_5m.drop('label', axis=1)

print(list(features_5m.columns),'\n')
print('Shape = ',features_5m.shape)

print('IsNull = ',features_5m['slope_angle_600_9'].isnull().sum())
print('IsNull = ',features_5m['10min_RSI_3_diff'].isnull().sum())

features_5m.tail(3)

['label', '10min_RSI_3_diff', '10min_RSI_3', '10min_slope_signal_600_3', '10min_slope_signal_300_3', '10min_slope_div_300_3_diff', '10min_slope_lin_reg_signal_900_6', 'RSI_7', '10min_slope_signal_900_3', '10min_slope_angle_900_3_diff', '10min_slope_signal_900_9_diff', '10min_slope_signal_600_3_diff', '10min_slope_lin_reg_signal_600_3', '10min_slope_div_300_6 - slope_div_600_9', '10min_slope_div_300_9', '10min_slope_signal_300_6 - slope_signal_600_3', '10min_slope_signal_300_6', '10min_slope_signal_300_6 - slope_signal_600_6', '10min_slope_angle_600_3', '10min_slope_angle_600_3_diff', '10min_slope_lin_reg_signal_600_6', '10min_slope_lin_reg_signal_300_6 - slope_lin_reg_signal_900_9', 'Kal_600_minus_Kal_900', '10min_slope_signal_300_9 - slope_signal_900_9', '10min_slope_div_300_3 - slope_div_300_6', '10min_slope_lin_reg_signal_600_9', '10min_slope_signal_600_6', '10min_slope_signal_300_3 - slope_signal_600_3', '10min_slope_signal_300_6 - slope_signal_900_6', '10min_slope_lin_reg_600_3 - 

Unnamed: 0_level_0,label,10min_RSI_3_diff,10min_RSI_3,10min_slope_signal_600_3,10min_slope_signal_300_3,10min_slope_div_300_3_diff,10min_slope_lin_reg_signal_900_6,RSI_7,10min_slope_signal_900_3,10min_slope_angle_900_3_diff,...,10min_slope_angle_600_9 - slope_angle_900_6,slope_signal_600_3 - slope_signal_600_6,slope_lin_reg_signal_600_9,slope_angle_300_3 - slope_angle_900_6,slope_angle_900_6,10min_slope_angle_300_9 - slope_angle_600_6,10min_slope_angle_300_6 - slope_angle_600_3,slope_signal_300_9,10min_slope_lin_reg_signal_300_9 - slope_lin_reg_signal_900_9,slope_angle_300_3 - slope_angle_900_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-07-26 20:30:00,0,0.378237,1.588212,-0.977742,1.059255,0.261028,1.102774,-0.982505,-0.987571,-0.000604,...,-0.006334,-0.008887,-0.920648,0.019566,-0.935725,0.009094,-0.008663,1.069946,0.0,-0.577616
2025-07-26 21:15:00,0,0.378237,1.588212,-0.977742,1.059255,0.261028,1.102774,-1.136307,-0.987571,-0.000604,...,-0.006334,0.017918,1.091702,-0.130668,1.073741,0.009094,-0.008663,0.987571,0.0,0.059524
2025-07-26 23:05:00,1,0.378237,1.588212,-0.977742,1.059255,0.261028,1.102774,-0.655377,-0.987571,-0.000604,...,-0.006334,0.008887,0.987571,-0.032381,0.988749,0.009094,-0.008663,-1.059255,0.0,2.504577


In [222]:
### Labels

lab = pd.read_csv(root_data + 'Results/'+symbol+'_'+strategy+'_'+time_frame+'_Strategy_Gen_Labels.csv', index_col=0)
lab['Date'] = pd.to_datetime(lab['Date'])
lab.set_index('Date', inplace=True)

columns_to_drop = ['st_row_PnL_Low','Close_Trade']
lab = lab.drop(columns=columns_to_drop)

#lab.dropna(inplace=True)

print(list(lab.columns),'\n')
print('Shape : ',lab.shape,'\n')

nan_counts = lab.isnull().sum()
total_missing_counts = nan_counts

print("Missing values (NaN and Inf) in lab sorted by highest to lowest:")
print(total_missing_counts.sort_values(ascending=False))
print("Total missing value count in lab:", total_missing_counts.sum())


['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'kal_1', 'kal_2', 'kal_3', 'kal_4', 'Open_Trade', 'Entry_Date', 'Type', 'Trade_Number', 'st_Exit_Date', 'trade type', 'st_Duration', 'st_row_PnL_close', 'st_row_PnL_high', 'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'st_atr_PnL', 'st_atr_max_PnL'] 

Shape :  (267395, 26) 

Missing values (NaN and Inf) in lab sorted by highest to lowest:
st_Exit_Date        228204
st_Duration         228204
st_Max              228204
st_Min              228204
st_PnL              228204
Type                228203
Entry_Date          228203
Open_Trade          228203
st_atr_PnL              15
st_atr_max_PnL          15
ATR                     14
st_row_PnL_high         11
st_row_PnL_close        11
st_row_PnL_low          11
trade type               1
Trade_Number             1
kal_3                    0
High                     0
Open                     0
kal_2                    0
kal_1                    0
Spread                   0
Clo

In [223]:
# --- Parámetros / campos
result_field = 'st_atr_max_PnL'   # métrica a evaluar

# --- Filtro de filas válidas
valid = (
    (lab['Type'] == direction) &
    (lab['Open_Trade'].isin([1, -1])) &
    (lab[result_field].notna()))


lab['label'] = np.nan
lab.loc[valid & (lab[result_field] <= 1), 'label'] = 0
lab.loc[valid & (lab[result_field] >= 1), 'label'] = 1

# --- Mantener solo filas válidas y con label
lab = lab.loc[valid & lab['label'].notna()].copy()
lab['label'] = lab['label'].astype('int8')

# Check for NaN and infinite values

print('\nLabel counts = ')
print(lab['label'].value_counts())
print(lab['Open_Trade'].value_counts())


Label counts = 
label
1    10054
0     9537
Name: count, dtype: int64
Open_Trade
-1.0    19591
Name: count, dtype: int64


In [224]:
### Merge

#feat_obj = lab.merge(features_5m, left_index=True, right_index=True, how='left')
feat_obj = features_5m.merge(lab, left_index=True, right_index=True, how='left')
feat_obj = feat_obj.loc[feat_obj['Open_Trade']==-1,:] # Commented out filtering


print(list(feat_obj.columns),'\n')
print('Shape = ',feat_obj.shape,'\n')
print('Open_Trade = ', feat_obj['Open_Trade'].value_counts(),'\n')
print('Label_Counts = ', feat_obj['label'].value_counts(),'\n')

feat_obj.dropna(inplace=True)

nan_counts = feat_obj.isnull().sum()
print("Missing values (NaN and Inf) in feat_obj sorted by highest to lowest AFTER DROPPING COLUMNS:")
nan_counts_after = feat_obj.isnull().sum()
inf_counts_after = np.isinf(feat_obj.select_dtypes(include=np.number)).sum()
total_missing_counts_after = nan_counts_after + inf_counts_after
print(total_missing_counts_after.sort_values(ascending=False))
print("Total missing value count in feat_obj AFTER DROPPING COLUMNS:", total_missing_counts_after.sum())



['label_x', '10min_RSI_3_diff', '10min_RSI_3', '10min_slope_signal_600_3', '10min_slope_signal_300_3', '10min_slope_div_300_3_diff', '10min_slope_lin_reg_signal_900_6', 'RSI_7', '10min_slope_signal_900_3', '10min_slope_angle_900_3_diff', '10min_slope_signal_900_9_diff', '10min_slope_signal_600_3_diff', '10min_slope_lin_reg_signal_600_3', '10min_slope_div_300_6 - slope_div_600_9', '10min_slope_div_300_9', '10min_slope_signal_300_6 - slope_signal_600_3', '10min_slope_signal_300_6', '10min_slope_signal_300_6 - slope_signal_600_6', '10min_slope_angle_600_3', '10min_slope_angle_600_3_diff', '10min_slope_lin_reg_signal_600_6', '10min_slope_lin_reg_signal_300_6 - slope_lin_reg_signal_900_9', 'Kal_600_minus_Kal_900', '10min_slope_signal_300_9 - slope_signal_900_9', '10min_slope_div_300_3 - slope_div_300_6', '10min_slope_lin_reg_signal_600_9', '10min_slope_signal_600_6', '10min_slope_signal_300_3 - slope_signal_600_3', '10min_slope_signal_300_6 - slope_signal_900_6', '10min_slope_lin_reg_600_3 

KeyError: 'label'

In [196]:
# Split the data into 70% train and 30% test based on index
train_size = int(0.7 * len(feat_obj))
train = feat_obj.iloc[:train_size]
test  = feat_obj.iloc[train_size:]

print("Shape of train_data:", train.shape)
print("Shape of test_data:", test.shape)

Shape of train_data: (13713, 27)
Shape of test_data: (5878, 27)


In [200]:
### Define Train dataframe

print('Train_Min_Date', train.index.min())
print('Train_Max_Date', train.index.max(),'\n')

print('Test_Min_Date', test.index.min())
print('Test_Max_Date', test.index.max(),'\n')

print('Train_Columns : ',list(train.columns), '\n')
print('Test_Columns : ',list(test.columns))

Train_Min_Date 2023-01-01 02:05:00
Train_Max_Date 2024-10-10 18:20:00 

Test_Min_Date 2024-10-10 19:25:00
Test_Max_Date 2025-07-26 23:05:00 

Train_Columns :  ['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'kal_1', 'kal_2', 'kal_3', 'kal_4', 'Open_Trade', 'Entry_Date', 'Type', 'Trade_Number', 'st_Exit_Date', 'trade type', 'st_Duration', 'st_row_PnL_close', 'st_row_PnL_high', 'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'st_atr_PnL', 'st_atr_max_PnL', 'label'] 

Test_Columns :  ['Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'ATR', 'kal_1', 'kal_2', 'kal_3', 'kal_4', 'Open_Trade', 'Entry_Date', 'Type', 'Trade_Number', 'st_Exit_Date', 'trade type', 'st_Duration', 'st_row_PnL_close', 'st_row_PnL_high', 'st_row_PnL_low', 'st_Max', 'st_Min', 'st_PnL', 'st_atr_PnL', 'st_atr_max_PnL', 'label']


# Results


In [182]:
results(train, pnl_column='st_PnL')

Unnamed: 0,Results
days,648
total_trades,13713
,
income,"$592,962.50"
losses,"$-264,915.05"
profits,"$328,047.45"
,
profit_trades,5315
loss_trades,6821
,


In [183]:
results(test, pnl_column= 'st_PnL')

Unnamed: 0,Results
days,289
total_trades,5878
,
income,"$615,151.93"
losses,"$-260,051.00"
profits,"$355,100.93"
,
profit_trades,2419
loss_trades,2857
,



# ML


## Train

In [199]:
### When using train_test_split it applies the scaler to X_train only
start_feature = train.columns.get_loc('10min_RSI_3_diff')
train_features = [col for col in train.columns[start_feature:] if col != 'label']

X = train.loc[:, train_features]
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:",  X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:",  y_test.shape)

KeyError: '10min_RSI_3_diff'

In [None]:
print(y_train.value_counts())
print(y_test.value_counts())

In [None]:
### Rebalance Data frame due to the low number of class 1
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Now train your model on the resampled data
print(X_train_resampled.value_counts(),'\n')
print(y_train_resampled.value_counts())

In [None]:
# --------------- Classifier 1: XGBoost ------------------------------------------
xgb = XGBClassifier(n_estimators=15, max_depth=3,
                    random_state=42, eval_metric='logloss')

# --------------- Classifier 2: Logistic Regression Classifier------------------------------------------
lr = LogisticRegression(random_state=42)

# --------------- Classifier 3: AdaBoost Classifier ------------------------------------------
ada = AdaBoostClassifier(n_estimators=15, random_state=42)

# --------------- Classifier 4: SVM------------------------------------------
svc = svm.SVC(kernel='rbf', probability=True, random_state=42)

# Define a list to store the different models
estimator = []
estimator.append(('LR', lr))
estimator.append(('XGB', xgb))
estimator.append(('ada', ada))
estimator.append(('SVC', svc))

In [None]:
# --------------- Classifier 1: XGBoost ------------------------------------------
xgb = XGBClassifier(n_estimators=15, max_depth=3,
                    random_state=42, eval_metric='logloss')

# --------------- Classifier 2: Logistic Regression Classifier------------------------------------------
lr = LogisticRegression(random_state=42)

# --------------- Classifier 3: AdaBoost Classifier ------------------------------------------
ada = AdaBoostClassifier(n_estimators=15, random_state=42)

# --------------- Classifier 4: SVM------------------------------------------
svc = svm.SVC(kernel='rbf', probability=True, random_state=42)

# Define a list to store the different models
estimator = []
estimator.append(('LR', lr))
estimator.append(('XGB', xgb))
estimator.append(('ada', ada))
estimator.append(('SVC', svc))

In [None]:
### Implement voting classifier with hard voting

ml_model = VotingClassifier(estimators=estimator, voting='soft')
ml_model.fit(X_train_resampled, y_train_resampled)
#ml_model.fit(X_train, y_train)

In [None]:
y_true = y_test
y_pred = ml_model.predict(X_test)

conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_true, y_pred)
print("\nClassification Report:")
print(class_report)

In [None]:
### Save ML Model
model_path = root_data + 'Models/'+symbol+'_'+direction+'_ml_model.joblib'
joblib.dump(ml_model, model_path)
print(f"Model saved successfully at: {model_path}")

## Meta

In [157]:
### Import ML Model
model_path = root_data+'Models/'+symbol+'_'+direction+'_ml_model.joblib'
ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

Model loaded successfully from: /content/drive/MyDrive/Course Folder/Forex/XAUUSD/Models/BTCUSD_Short_ml_model.joblib


In [158]:
#train.head(5)

In [159]:
### Import Data
start_feature = train.columns.get_loc('10min_RSI_3')
train_features = [col for col in train.columns[start_feature:] if col != 'label']

# Select the feature columns from the 'train' DataFrame
X_train_features = train[train_features]

train['label_ml'] = ml_model.predict(X_train_features)

meta_features = train_features + ['label_ml']
X_meta_features = train[meta_features]

prediction_probabilities = ml_model.predict_proba(X_train_features)
train['prob_0'] = prediction_probabilities[:, 0]
train['prob_1'] = prediction_probabilities[:, 1]

#train.head()

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- 10min_Kal_300
- 10min_MFI_14_diff
- 10min_MFI_3_diff
- 10min_MFI_7 - MFI_14
- 10min_MFI_7_diff
- ...
Feature names seen at fit time, yet now missing:
- 10min_ADX_10
- 10min_ADX_126
- 10min_ADX_15
- 10min_ADX_17
- 10min_ADX_20
- ...


In [None]:
train.to_csv(root_data + 'Results/'+symbol+'Meta_Prob_M5+M10_train_l.csv')

In [None]:
meta = train_features + ['label_ml', 'prob_1', 'prob_0']
#meta = train_features + ['prob_1']
X_meta_features = train[meta]

X = train[meta]
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
print(y_train.value_counts())
print(y_test.value_counts())

In [None]:
### Rebalance Data frame due to the low number of class 1

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# --------------- Classifier 1: XGBoost ------------------------------------------
xgb = XGBClassifier(n_estimators=15, max_depth=3,
                    random_state=42, eval_metric='logloss')

# --------------- Classifier 2: Logistic Regression Classifier------------------------------------------
lr = LogisticRegression(random_state=42)

# --------------- Classifier 3: AdaBoost Classifier ------------------------------------------
ada = AdaBoostClassifier(n_estimators=15, random_state=42)

# --------------- Classifier 4: SVM------------------------------------------
svc = svm.SVC(kernel='rbf', probability=True, random_state=42)

# Define a list to store the different models
estimator = []
estimator.append(('LR', lr))
estimator.append(('XGB', xgb))
estimator.append(('ada', ada))
estimator.append(('SVC', svc))

In [None]:
### Implement voting classifier with hard voting

meta_ml_model = VotingClassifier(estimators=estimator, voting='soft')
meta_ml_model.fit(X_train_resampled, y_train_resampled)

In [None]:
y_true = y_test
y_pred = meta_ml_model.predict(X_test)

conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_true, y_pred)
print("\nClassification Report:")
print(class_report)

In [None]:
train['meta_results'] = meta_ml_model.predict(X_meta_features)

In [None]:
train.to_csv(root_data + 'Results/'+symbol+'train_l_Signals & Meta_Signals.csv')

In [None]:
### Save ML Model
model_path = root_data + 'Models/'+symbol+'_'+direction+'_Meta_ml_model.joblib'
joblib.dump(meta_ml_model, model_path)
print(f"Model saved successfully at: {model_path}")

## PnL Train

In [None]:
train['ml_results'] = np.where(train['label_ml']==1, train['st_PnL'],0)
results(train, pnl_column='ml_results')

In [None]:
train['meta_ml_results'] = np.where(train['meta_results']==1, train['st_PnL'],0)
results(train, pnl_column='meta_ml_results')


# Test


## Results_ML

In [None]:
### Import ML Model

model_path = root_data+'Models/'+symbol+'_'+direction+'_ml_model.joblib'
ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

In [None]:
### Import Meta Model

model_path = root_data+'Models/'+symbol+'_'+direction+'_Meta_ml_model.joblib'
meta_ml_model = joblib.load(model_path)
print(f"Model loaded successfully from: {model_path}")

In [None]:

test['label_ml'] = ml_model.predict(test.loc[:,train_features])

prediction_probabilities = ml_model.predict_proba(test.loc[:,train_features])
test['prob_0'] = prediction_probabilities[:, 0]
test['prob_1'] = prediction_probabilities[:, 1]

print(test.columns)

In [None]:
print(test.columns)

In [None]:
# Calculate and display Confusion Matrix
conf_matrix = confusion_matrix(test['label'], test['label_ml'])
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and display Classification Report
class_report = classification_report(test['label'], test['label_ml'])
print("\nClassification Report:")
print(class_report)

In [None]:
test['meta_label'] = meta_ml_model.predict(test.loc[:,meta])

prediction_probabilities = meta_ml_model.predict_proba(test.loc[:,meta])
test['meta_prob_0'] = prediction_probabilities[:, 0]
test['meta_prob_1'] = prediction_probabilities[:, 1]

print(test.columns)

In [None]:
# Calculate and display Confusion Matrix
conf_matrix = confusion_matrix(test['label'], test['meta_label'])
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and display Classification Report
class_report = classification_report(test['label'], test['meta_label'])
print("\nClassification Report:")
print(class_report)

## Results_PnL

In [None]:
test['ml_results'] = np.where(test['label_ml']==1, test['st_PnL'],0)
results(test, pnl_column='ml_results')

In [74]:
test['meta_ml_results'] = np.where(test['meta_label']==1, test['st_PnL'],0)
results(test, pnl_column='meta_ml_results')

KeyError: 'meta_label'

# Pendientes
