## Hyperparameter Tuning

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from src.config import PROCESSED_DATA_DIR
from sklearn.metrics import mean_absolute_error

# Paths of the train/test CSV files under PROCESSED_DATA_DIR.
input_train_path: Path = PROCESSED_DATA_DIR / "trainset.csv"
input_test_path: Path = PROCESSED_DATA_DIR / "testset.csv"

# Read both splits, using the `id` column as the row index.
df_train, df_test = (
    pd.read_csv(str(csv_path), index_col='id')
    for csv_path in (input_train_path, input_test_path)
)

# Separate the regression target (`r_spread`) from the remaining feature columns.
X_train = df_train.drop(columns=['r_spread'])
y_train = df_train['r_spread']
X_test = df_test.drop(columns=['r_spread'])
y_test = df_test['r_spread']

[32m2024-10-19 11:57:17.887[0m | [1mINFO    [0m | [36msrc.config[0m:[36m<module>[0m:[36m14[0m - [1mPROJ_ROOT path is: G:\Work\DS\dont-bet-on-sports[0m


In [2]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Base estimator; the fixed seed makes tree construction reproducible.
rf = RandomForestRegressor(random_state=42)

# Discrete search space: 18 * 6 * 4 * 4 * 2 * 5 * 10 = 172,800 combinations.
# NOTE(review): with bootstrap=False and the estimator's default max_features,
# every tree may be fit on identical data — confirm that candidate is useful.
param_distributions = {
    'n_estimators': np.arange(100, 1000, 50),  # Number of trees: 100, 150, ..., 950
    'max_depth': [None, 10, 20, 30, 50, 100],  # None = no explicit depth limit
    'min_samples_split': [2, 5, 10, 20],       # Larger values allow fewer splits (more regularization)
    'min_samples_leaf': [1, 2, 4, 8],          # Minimum samples per leaf
    'bootstrap': [True, False],                # Whether to use bootstrapping
    'min_weight_fraction_leaf': np.linspace(0, 0.5, 5),  # Minimum weighted fraction of samples per leaf
    'ccp_alpha': np.linspace(0, 0.1, 10)       # Complexity parameter for cost-complexity pruning
}

# RandomizedSearchCV setup: evaluate a random sample of 30 candidates from the
# space above instead of exhaustively searching it.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=30,  # Sample only 30 combinations
    cv=3,  # Use 3-fold cross-validation for faster results
    verbose=2,
    random_state=42,  # Fixes which candidates are sampled
    n_jobs=-1,  # Utilize all available CPU cores
    scoring='neg_mean_absolute_error'  # Candidates are ranked by (negated) MAE
)

# Fit RandomizedSearchCV on the training data (same split objects the other
# model cells use, rather than re-deriving them from df_train).
random_search.fit(X_train, y_train)

# Get the best parameters and the estimator refit with them
print("Best Parameters:", random_search.best_params_)
rfr_best_model = random_search.best_estimator_

# Predict on the held-out test features with the tuned model
y_pred = rfr_best_model.predict(X_test)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Parameters: {'n_estimators': 950, 'min_weight_fraction_leaf': 0.125, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30, 'ccp_alpha': 0.07777777777777778, 'bootstrap': True}


In [3]:
# Score the tuned random forest on the held-out test set. Predictions are
# recomputed from `rfr_best_model` instead of being read from the shared
# `y_pred` variable, which the XGBoost and SVR cells overwrite; this keeps the
# reported value correct even if the cell is re-run after those cells.
rf_mae = mean_absolute_error(y_test, rfr_best_model.predict(X_test))
print("RFR Test MAE:", rf_mae)

RFR Test MAE: 9.971237437377846


In [4]:
# Import necessary libraries
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np

# Initialize the XGBoost Regressor, trained directly on absolute error.
# The estimator is kept single-threaded (n_jobs=1) because RandomizedSearchCV
# below already runs fits in parallel across all cores; letting every fit also
# spawn one thread per core oversubscribes the CPU and slows the search down.
xgb = XGBRegressor(objective='reg:absoluteerror', random_state=42, n_jobs=1)

# Define the hyperparameters to search over
param_distributions = {
    'n_estimators': np.arange(50, 1000, 50),  # Number of boosting rounds
    'learning_rate': np.linspace(0.01, 0.4, 16),  # Shrinkage rate
    'subsample': np.linspace(0.5, 1.0, 7),   # Fraction of samples per tree
    'colsample_bytree': np.linspace(0.5, 1.0, 7),  # Fraction of features per tree
    'gamma': np.linspace(0, 0.5, 5),         # Minimum loss reduction for further splits
    'reg_alpha': np.logspace(-3, 1, 5),      # L1 regularization
    'reg_lambda': np.logspace(-3, 1, 5)      # L2 regularization
}

# Setup RandomizedSearchCV to use MAE as the evaluation metric
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=30,  # Number of random parameter combinations to try
    scoring='neg_mean_absolute_error',  # Use MAE as the evaluation metric
    cv=3,  # 3-fold cross-validation for faster performance
    verbose=2,
    random_state=42,  # Fixes which candidates are sampled
    n_jobs=-1  # Parallelism lives here: one candidate/fold fit per core
)

# Fit the RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters found and keep the estimator refit with them
print("Best Parameters:", random_search.best_params_)
gbt_best_model = random_search.best_estimator_

# Predict on the test data and evaluate using MAE
y_pred = gbt_best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"GBT Test  MAE: {mae}")


Fitting 3 folds for each of 30 candidates, totalling 90 fits




Best Parameters: {'subsample': 0.8333333333333333, 'reg_lambda': 0.1, 'reg_alpha': 0.001, 'n_estimators': 200, 'learning_rate': 0.036000000000000004, 'gamma': 0.25, 'colsample_bytree': 0.5}
GBT Test  MAE: 10.052160716606405


In [5]:
# Import necessary libraries
from sklearn.svm import SVR

# Support-vector regressor with default settings; the tuned parameters are
# supplied by the search below.
svr = SVR()

# Candidate values for each tuned hyperparameter.
param_distributions = {
    # Only the RBF kernel is searched.
    'kernel': ['rbf'],
    # Regularization strength: 10 log-spaced values from 1e-3 to 1e2.
    'C': np.logspace(-3, 2, 10),
    # Width of the epsilon-insensitive tube: 10 evenly spaced values.
    'epsilon': np.linspace(0.01, 1.0, 10),
    # Kernel coefficient: the two built-in heuristics plus 5 log-spaced values.
    'gamma': ['scale', 'auto', *np.logspace(-3, 1, 5)],
}

# Randomized search ranked by (negated) mean absolute error.
random_search = RandomizedSearchCV(
    svr,
    param_distributions,
    n_iter=30,                          # candidates sampled from the space
    cv=3,                               # 3-fold cross-validation
    scoring='neg_mean_absolute_error',
    n_jobs=-1,                          # use every available CPU core
    random_state=42,
    verbose=2,
)

# Run the search on the training split.
random_search.fit(X_train, y_train)

# Report the winning configuration and keep the estimator refit with it.
print("Best Parameters:", random_search.best_params_)
svr_best_model = random_search.best_estimator_

# Score the tuned model on the held-out test split.
y_pred = svr_best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"SVR Test MAE: {mae}")


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Parameters: {'kernel': 'rbf', 'gamma': 0.001, 'epsilon': 0.12, 'C': 7.742636826811277}
SVR Test MAE: 10.066851060626746
