In [12]:
import pandas as pd 
import numpy as np 

import cupy as cp
import xgboost

import optuna
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer


In [2]:
model_dat = pd.read_csv(r'C:\Users\jcmar\my_files\SportsBetting\data\ufc_train_data.csv')


model_cols = ['control_pr_red', 'control_pr_blue',
                'td_pr_red','td_pr_blue', 
                'red_td_accuracy','blue_td_accuracy',
                'red_td_defense','blue_td_defense',
                'sub_att_pr_red','sub_att_pr_blue',

                'red_kd_total', 'blue_kd_total',
                'sigstrikes_pm_red','sigstrikes_pm_blue',
                'red_sigstrike_accuracy','blue_sigstrike_accuracy',
                'red_sigstrike_defense','blue_sigstrike_defense',
                
                'sigstrikes_pm_red', 'sigstrikes_pm_blue',
                'leg_strikes_pm_red','leg_strikes_pm_blue',
                'body_strikes_pm_red', 'body_strikes_pm_blue',
                'head_strikes_pm_red','head_strikes_pm_blue',
                'clinch_strikes_pm_red', 'clinch_strikes_pm_blue',
                'ground_strikes_pm_red', 'ground_strikes_pm_blue',
                'sigstrikes_absorbed_pm_red','sigstrikes_absorbed_pm_blue',

                'reach_diff', 'age_diff',
                'reach_red','reach_blue','weight_class',
                'height_red', 'height_blue', 
                'red_age', 'blue_age', 
                
                'red_elo','blue_elo', 'elo_diff',
                'math_red','math_blue',

                'red_win_streak','blue_win_streak',
                'red_lose_streak','blue_lose_streak',
                'ko_wins_red','ko_wins_blue',
                'sub_wins_red','sub_wins_blue',
                'decision_wins_blue','decision_wins_red',

                'winner', 'red_fighter','blue_fighter',
                'event_country']

xgb_dat = model_dat[model_cols]

le = LabelEncoder()
xgb_dat['event_country'] = le.fit_transform(xgb_dat['event_country'])
xgb_dat['weight_class'] = le.fit_transform(xgb_dat['weight_class'])

mm_scaler = MinMaxScaler()
cat_cols = xgb_dat.select_dtypes(include=['object', 'category']).columns.tolist()
num_categorical = xgb_dat.select_dtypes(include=['object']).shape[1]
print(f"Number of categorical columns: {num_categorical}")
numerical_cols = xgb_dat.columns.difference(cat_cols)
mm_scaler = MinMaxScaler()
xgb_dat[numerical_cols] = mm_scaler.fit_transform(xgb_dat[numerical_cols])

train_len = int(model_dat.shape[0] * .95)
train_dat = xgb_dat[:train_len]
test_dat = xgb_dat[train_len:]

X_train = train_dat.drop(columns=['winner','red_fighter','blue_fighter'])
y_train = train_dat['winner']

X_test = test_dat.drop(columns=['winner','red_fighter','blue_fighter'])
y_test= test_dat['winner']

print(X_train.shape, X_test.shape)

Number of categorical columns: 2
(5293, 57) (279, 57)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xgb_dat['event_country'] = le.fit_transform(xgb_dat['event_country'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xgb_dat['weight_class'] = le.fit_transform(xgb_dat['weight_class'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xgb_dat[numerical_cols] = mm_scaler.fit_transform(xgb_dat[numerical

In [14]:
X_train = cp.array(X_train)
X_test = np.array(X_test)
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

def objective(trial):
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'reg_lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),  # fixed name
        'reg_alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),    # fixed name
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),  # ‚Üê pruning regularization
        'device': 'cuda',  # Enables GPU usage
        'scale_pos_weight': scale_pos_weight
    }


    model = xgboost.XGBClassifier(**param)

    # 5-fold CV AUC
    cv = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)

    f1_scorer = make_scorer(f1_score, average='macro')  # Use macro-average to consider both classes equally
    f1_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=f1_scorer)

    # Return the negative F1 score (since Optuna minimizes the objective function)
    return 1 - f1_scores.mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("Best trial:")
trial = study.best_trial
print("  F1: ", 1 - trial.value)
print("  Params: ", trial.params)

[I 2025-04-13 16:01:21,354] A new study created in memory with name: no-name-cd32dc9e-cf41-4e31-93b2-cceab71b9ba4
[I 2025-04-13 16:01:26,474] Trial 0 finished with value: 0.3926615081608261 and parameters: {'lambda': 5.94241286646472e-07, 'alpha': 9.318203144846235, 'colsample_bytree': 0.5554805360753654, 'subsample': 0.7794680264447966, 'learning_rate': 0.23269786553993302, 'max_depth': 6, 'n_estimators': 516, 'min_child_weight': 8, 'gamma': 2.9811535434667604}. Best is trial 0 with value: 0.3926615081608261.
[I 2025-04-13 16:01:34,940] Trial 1 finished with value: 0.40865085088460573 and parameters: {'lambda': 0.02366786334011244, 'alpha': 3.45182509942453e-08, 'colsample_bytree': 0.5676702530340847, 'subsample': 0.5513344353539894, 'learning_rate': 0.21977756335238263, 'max_depth': 10, 'n_estimators': 757, 'min_child_weight': 2, 'gamma': 4.25948727121033}. Best is trial 0 with value: 0.3926615081608261.
[I 2025-04-13 16:01:39,178] Trial 2 finished with value: 0.39137458256040714 and

Best trial:
  F1:  0.6167254543016097
  Params:  {'lambda': 1.5262358086850977e-05, 'alpha': 0.00034206783207601854, 'colsample_bytree': 0.7665222121861451, 'subsample': 0.8031603263616118, 'learning_rate': 0.027262028250113212, 'max_depth': 7, 'n_estimators': 145, 'min_child_weight': 4, 'gamma': 2.842434862624574}
