In [5]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd

### Import Data Set: Baseline + EMA Features + RSI + Volatility + MACD

In [6]:
X_train_scaled = pd.read_csv('../data/processed/X_train_scaled.csv')
X_test_scaled = pd.read_csv('../data/processed/X_test_scaled.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')
y_test = pd.read_csv('../data/processed/y_test.csv')

In [7]:
model = xgb.XGBClassifier(eval_metric=['logloss','auc'])

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Set up the Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=model, 
                           param_grid=param_grid, 
                           scoring='f1', 
                           cv=3, 
                           verbose=2, 
                           n_jobs=-1)

# Fit the model
grid_search.fit(X_train_scaled, y_train)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation F1 Score: {:.2f}".format(grid_search.best_score_))

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_d

### Import Data Set: Baseline - Correlation + EMA Features + RSI + Volatility + MACD

In [8]:
X_train_scaled = pd.read_csv('../data/processed/X_train_scaled.csv')
X_test_scaled = pd.read_csv('../data/processed/X_test_scaled.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')
y_test = pd.read_csv('../data/processed/y_test.csv')

Features obtained during XGBoost recursive feature elimination that have an F1 score of 0.718. See EDA notebook. To see if hyperparameter tuning will improve the model.

In [10]:
features = ['WTI Price', 'Natural Gas Price', 'Gold Price', 'Platinum Price', 'Silver Price', 'Lumber Price', 'Interest_Rate', 'GDP', 'Unemployment', 'DAX', 'FTSE 100', 'TSX', 'CADJPY', 'EURCAD', 'GBPCAD', 'USDCAD', '10 YR Bond Yield', 'Month', 'Day', 'DayofWeek', 'EMA Slope', 'EMA/Close', 'RSI', 'MACD Histogram', 'Daily Volatility']

In [13]:
X_train_filtered = X_train_scaled.loc[:,features]
X_train_filtered.head()

Unnamed: 0,WTI Price,Natural Gas Price,Gold Price,Platinum Price,Silver Price,Lumber Price,Interest_Rate,GDP,Unemployment,DAX,...,USDCAD,10 YR Bond Yield,Month,Day,DayofWeek,EMA Slope,EMA/Close,RSI,MACD Histogram,Daily Volatility
0,2.277739,1.759921,-0.657392,1.943472,0.558896,-0.342755,0.233348,-1.522394,0.38397,-1.427562,...,-2.651614,2.140465,-1.560902,-1.672346,-0.346322,-0.099733,0.062948,-2.824339,-0.002606,-1.016012
1,2.097361,1.878502,-0.657392,2.079949,0.558896,-0.342755,0.233348,-1.522394,0.38397,-1.427562,...,-2.618108,2.140465,-1.560902,-1.559306,0.160695,-0.099733,0.062948,-2.824339,-0.002606,-1.016012
2,2.015472,1.855852,-0.600696,2.134809,0.582134,-0.326452,0.233348,-1.522394,0.38397,-1.405888,...,-2.662782,2.159278,-1.560902,-1.446265,0.667712,-0.250572,0.216611,-2.824339,-0.067105,-1.016012
3,2.015472,1.855852,-0.600696,2.134809,0.582134,-0.326452,0.233348,-1.522394,0.38397,-1.405888,...,-2.662782,2.159278,-1.560902,-1.333225,1.17473,-0.236206,0.201977,-2.824339,-0.104004,-1.016012
4,2.015472,1.855852,-0.600696,2.106974,0.582134,-0.326452,0.233348,-1.522394,0.38397,-1.405888,...,-2.662782,2.159278,-1.560902,-1.220184,1.681747,-0.223209,0.188736,-2.824339,-0.121438,-1.016012


In [14]:
X_test_filtered = X_test_scaled.loc[:,features]
X_test_filtered.head()

Unnamed: 0,WTI Price,Natural Gas Price,Gold Price,Platinum Price,Silver Price,Lumber Price,Interest_Rate,GDP,Unemployment,DAX,...,USDCAD,10 YR Bond Yield,Month,Day,DayofWeek,EMA Slope,EMA/Close,RSI,MACD Histogram,Daily Volatility
0,0.689196,2.304863,1.823246,-0.111237,1.619997,0.419271,-1.201119,1.496503,0.625872,2.548286,...,-0.285162,-0.850786,0.782024,-1.559306,0.160695,0.875878,-0.584602,0.805962,0.477535,-0.346775
1,0.650465,2.399462,1.917174,0.037941,1.867494,0.560404,-1.201119,1.496503,0.625872,2.511624,...,-0.316186,-0.775535,0.782024,-1.446265,0.667712,0.870511,-0.580226,1.072433,0.580639,-0.381172
2,0.620586,2.415451,1.917174,0.011997,1.867494,0.560404,-1.201119,1.496503,0.625872,2.511624,...,-0.316186,-0.775535,0.782024,-1.220184,1.681747,0.778107,-0.518971,1.449796,0.59789,-0.440963
3,0.623353,2.374147,1.768243,0.00443,1.747385,0.560404,-1.201119,1.496503,0.625872,2.604789,...,-0.302535,-0.775535,0.782024,-1.107144,-1.360357,0.694503,-0.46355,1.317743,0.559057,-0.466997
4,0.598454,2.2076,1.763589,-0.100427,1.747385,0.424395,-1.201119,1.496503,0.625872,2.549829,...,-0.166033,-0.700283,0.782024,-0.994103,-0.853339,0.569613,-0.381077,1.239843,0.464769,-0.46165


In [18]:
model = xgb.XGBClassifier(eval_metric=['logloss','auc'])

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0.5, 1.0, 1.5]
    
}

# Set up the Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=model, 
                           param_grid=param_grid, 
                           scoring='f1', 
                           cv=5, 
                           verbose=2, 
                           n_jobs=-1)

# Fit the model
grid_search.fit(X_train_filtered, y_train)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation F1 Score: {:.2f}".format(grid_search.best_score_))

Fitting 5 folds for each of 6561 candidates, totalling 32805 fits
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=0.5, subsample=0.7; total time=   0.0s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=0.5, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=0.5, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=0.5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=0.5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_

Originally, I had a cross validation (CV) of 3 and no regularization, which resulted in an F1 Score of 0.70, which is slightly below the highest score of 0.718 seen in XGBoost's RFE. I increased the CV to 5 and adding regularization to the GridSearch to address any overfitting, which resulted in the same F1 Score of 0.70.

Many machine learning algorithms, including XGBoost, have elements of randomness (e.g., random subsampling, random seed for tree construction). Different random initializations can lead to slight variations in model performance. The difference between an F1 score of 0.718 and 0.7 might be within the range of normal variability due to random factors.