In [9]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold
import pandas as pd
from sklearn.metrics import classification_report, make_scorer, f1_score

## Import Data Set: Baseline - Correlation + EMA Features + RSI + Volatility + MACD

In [3]:
X_train_scaled = pd.read_csv('../data/final/X_train_scaled.csv')
X_test_scaled = pd.read_csv('../data/final/X_test_scaled.csv')
y_train = pd.read_csv('../data/final/y_train.csv')
y_test = pd.read_csv('../data/final/y_test.csv')

## Hyperparameter Tuning Using XGBoost Model

In [7]:
model = xgb.XGBClassifier(eval_metric=['logloss','auc'])

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Set up the Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=model, 
                           param_grid=param_grid, 
                           scoring='f1', 
                           cv=3, 
                           verbose=2, 
                           n_jobs=-1)

# Fit the model
grid_search.fit(X_train_scaled, y_train)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation F1 Score: {:.2f}".format(grid_search.best_score_))

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_d

Features obtained during XGBoost recursive feature elimination that have an F1 score of 0.718. See EDA notebook. To see if hyperparameter tuning will improve the model.

In [4]:
features = ['WTI Price', 'Natural Gas Price', 'Aluminum Price', 'Gold Price',
       'Platinum Price', 'Silver Price', 'FTSE 100', 'TSX', 'EURCAD', 'USDCAD',
       '10 YR Bond Yield', 'Day', 'EMA Slope', 'EMA/Close', 'RSI',
       'MACD Histogram', 'Daily Volatility']

In [5]:
X_train_filtered = X_train_scaled.loc[:,features]
X_train_filtered.head()

Unnamed: 0,WTI Price,Natural Gas Price,Aluminum Price,Gold Price,Platinum Price,Silver Price,FTSE 100,TSX,EURCAD,USDCAD,10 YR Bond Yield,Day,EMA Slope,EMA/Close,RSI,MACD Histogram,Daily Volatility
0,2.293442,1.807859,-0.135133,-0.648474,1.953043,0.573334,-0.276789,-1.268132,-0.338321,-2.662849,2.139209,-1.668557,-0.106611,0.066605,-3.110036,-0.003025,-1.03415
1,2.11272,1.928437,-0.135133,-0.648474,2.089783,0.573334,-0.276789,-1.268132,-0.436343,-2.629305,2.139209,-1.554955,-0.106611,0.066605,-3.110036,-0.003025,-1.03415
2,2.030675,1.905406,-0.135133,-0.59144,2.144749,0.596629,-0.252619,-1.297522,-0.699657,-2.67403,2.158031,-1.441354,-0.242715,0.204441,-3.110036,-0.060008,-1.03415
3,2.001294,1.908115,-0.135133,-0.593994,2.161808,0.566317,-0.252505,-1.332094,-0.565117,-2.649183,2.101564,-1.10055,-0.389848,0.354579,-3.110036,-0.159632,-1.03415
4,2.014598,1.898631,-0.135133,-0.629746,2.151248,0.477627,-0.205679,-1.266355,-0.303725,-2.51128,2.007453,-0.986948,-0.058447,0.018,-0.253767,-0.085926,-1.03415


In [6]:
X_test_filtered = X_test_scaled.loc[:,features]
X_test_filtered.head()

Unnamed: 0,WTI Price,Natural Gas Price,Aluminum Price,Gold Price,Platinum Price,Silver Price,FTSE 100,TSX,EURCAD,USDCAD,10 YR Bond Yield,Day,EMA Slope,EMA/Close,RSI,MACD Histogram,Daily Volatility
0,0.651976,1.402769,2.973872,1.592864,0.020006,1.411964,0.623594,3.222765,-0.240299,-0.325946,-0.665322,-0.418941,0.525413,-0.356003,1.165204,0.50437,-0.668428
1,0.615943,1.305222,3.047173,1.705229,0.063871,1.598046,0.67186,3.221125,-0.119212,-0.340855,-0.778256,-0.30534,0.457624,-0.310722,1.309321,0.426604,-0.677631
2,0.552191,1.420381,3.05555,1.754601,0.036253,1.601414,0.549244,3.198659,-0.019268,-0.267555,-0.834723,0.035465,0.29985,-0.205672,1.05705,0.295953,-0.65966
3,0.513386,1.272706,3.045078,1.746089,-0.109422,1.564366,0.600636,3.120964,-0.053864,-0.195497,-0.853546,0.149066,-0.098653,0.061243,0.236039,0.031247,-0.576874
4,0.450743,1.293028,2.858685,1.731617,-0.096425,1.497848,0.578302,3.081102,0.005718,-0.164438,-0.834723,0.262668,-0.284006,0.186498,-0.331831,-0.230336,-0.584396


In [7]:
model = xgb.XGBClassifier(eval_metric=['logloss','auc'])

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0.5, 1.0, 1.5]
    
}

# Set up the Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=model, 
                           param_grid=param_grid, 
                           scoring='f1', 
                           cv=3, 
                           verbose=2, 
                           n_jobs=-1)

# Fit the model
grid_search.fit(X_train_filtered, y_train)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation F1 Score: {:.2f}".format(grid_search.best_score_))

Fitting 3 folds for each of 6561 candidates, totalling 19683 fits
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=0.5, subsample=0.7; total time=   0.0s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=1.0, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=1.0, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=0.5, subsample=0.7; total time=   0.0s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=0.5, subsample=0.7; total time=   0.0s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_

Originally, I had a cross validation (CV) of 3 and no regularization, which resulted in an F1 Score of 0.70, which is slightly below the highest score of 0.718 seen in XGBoost's RFE. I increased the CV to 5 and adding regularization to the GridSearch to address any overfitting, which resulted in the same F1 Score of 0.70.

Many machine learning algorithms, including XGBoost, have elements of randomness (e.g., random subsampling, random seed for tree construction). Different random initializations can lead to slight variations in model performance. The difference between an F1 score of 0.718 and 0.7 might be within the range of normal variability due to random factors.

## Hyperparameter Tuning Using Logistic Regression

In [12]:
log_reg = LogisticRegression(max_iter=1000)

# Define the refined parameter grid for Logistic Regression
param_grid_logreg = {
    'C': [0.01, 0.1, 1, 10],  # Regularization strength
    'penalty': ['l2'],  # Start with 'l2' for compatibility with solvers
    'solver': ['lbfgs', 'liblinear'],  # Compatible solvers with 'l2'
}

# Use StratifiedKFold to maintain class balance in cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Use 'f1_weighted' scoring for multiclass problems
scorer = make_scorer(f1_score, average='weighted')

# Initialize GridSearchCV
grid_search_logreg = GridSearchCV(estimator=log_reg, param_grid=param_grid_logreg, 
                                  scoring=scorer, cv=cv, verbose=1)

# Fit the model with GridSearchCV
grid_search_logreg.fit(X_train_filtered, y_train)

# Get the best model and parameters
best_model_logreg = grid_search_logreg.best_estimator_
best_params_logreg = grid_search_logreg.best_params_


# Make predictions on the test set using the best model
y_pred = best_model_logreg.predict(X_test_filtered)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Best Parameters for Logistic Regression:", best_params_logreg)

# Print the F1 score
print("F1 Score of the best Logistic Regression model: {:.2f}".format(f1))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best Parameters for Logistic Regression: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
F1 Score of the best Logistic Regression model: 0.68


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
