In [1]:
import warnings

import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

In [2]:
# from hyperopt import hp, pyll
# 
# uniform_vals = [pyll.stochastic.sample(hp.uniform('max_ctr_complexity', 0, 8))
#                 for _ in range(10_000)]
# fig, ax = plt.subplots(figsize=(8, 4))
# ax.hist(uniform_vals)

In [3]:
def evaluate_model(X_train, X_val, y_train, y_val):
    """Fit a baseline CatBoost classifier and return its validation F1 score.

    Every non-numeric column of ``X_train`` is passed to CatBoost as a
    categorical feature. The validation split is also used as the
    early-stopping ``eval_set``, so the returned score is measured on the
    same data that selected the best iteration.

    Parameters
    ----------
    X_train, X_val : pd.DataFrame
        Training and validation features.
    y_train, y_val : array-like
        Training and validation targets.

    Returns
    -------
    The value of ``f1_score(y_val, predictions)``.
    """
    baseline_params = {
        'random_seed': 42,
        'iterations': 500,
        'early_stopping_rounds': 75,
        'eval_metric': 'F1',
    }
    categorical_features = X_train.select_dtypes(exclude=['number']).columns.to_list()

    classifier = CatBoostClassifier(**baseline_params)
    classifier.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        use_best_model=True,
        cat_features=categorical_features,
        verbose=0,
    )

    return f1_score(y_val, classifier.predict(X_val))


def _fill_missing_label(frame, columns, label='Missing'):
    """Return a copy of ``frame[columns]`` with NaN replaced by ``label``."""
    filled = frame[columns].copy()
    for column in columns:
        values = filled[column]
        if isinstance(values.dtype, pd.CategoricalDtype):
            # fillna on a categorical needs the label to be a known category;
            # add it only once so running this twice does not raise.
            if label not in values.cat.categories:
                values = values.cat.add_categories(label)
        filled[column] = values.fillna(label)
    return filled


def handle_nan(X_train, X_val):
    """Impute missing values in the training and validation features.

    Numeric columns are imputed with an ``IterativeImputer`` that is fitted
    on ``X_train`` only and then applied to ``X_val``. In non-numeric
    columns NaN is replaced by the literal label ``'Missing'``; categorical
    columns always gain a ``'Missing'`` category, even when nothing was
    missing.

    The inputs are not modified: all work happens on copies, so the cell
    calling this function can safely be re-run.

    Parameters
    ----------
    X_train, X_val : pd.DataFrame
        Feature frames sharing the same columns. Column dtypes are read
        from ``X_train``.

    Returns
    -------
    tuple of pd.DataFrame
        ``(X_train_processed, X_val_processed)`` with the numeric columns
        first, followed by the non-numeric columns, on the original indexes.
    """
    cat_cols = X_train.select_dtypes(exclude=['number']).columns.to_list()
    num_cols = X_train.select_dtypes(include='number').columns

    if len(num_cols) > 0:
        # random_state pins the random choice of neighbour features that
        # n_nearest_features triggers, so repeated runs impute identically.
        imputer = IterativeImputer(initial_strategy='most_frequent', max_iter=5,
                                   n_nearest_features=4, random_state=42)
        X_train_num = pd.DataFrame(imputer.fit_transform(X_train[num_cols]),
                                   columns=num_cols, index=X_train.index)
        X_val_num = pd.DataFrame(imputer.transform(X_val[num_cols]),
                                 columns=num_cols, index=X_val.index)
    else:
        # Nothing to impute; the imputer cannot be fitted on zero columns.
        X_train_num = pd.DataFrame(index=X_train.index)
        X_val_num = pd.DataFrame(index=X_val.index)

    X_train_cat = _fill_missing_label(X_train, cat_cols)
    X_val_cat = _fill_missing_label(X_val, cat_cols)

    X_train_processed = pd.concat([X_train_num, X_train_cat], axis=1)
    X_val_processed = pd.concat([X_val_num, X_val_cat], axis=1)

    return X_train_processed, X_val_processed

In [4]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from typing import Any, Dict, Union


def evaluate_model_opt(X_train, X_val, y_train, y_val, space):
    """Train a CatBoost classifier with the given hyperparameters and score it.

    Parameters
    ----------
    X_train, X_val : pd.DataFrame
        Training and validation features. Every non-numeric column of
        ``X_train`` is passed to CatBoost as a categorical feature.
    y_train, y_val : array-like
        Training and validation targets.
    space : dict
        Keyword arguments for ``CatBoostClassifier``. Integer-valued
        hyperparameters may be given as floats (hyperopt's ``quniform`` and
        the dict returned by ``fmin`` report e.g. ``depth`` as ``12.0``);
        they are cast to ``int`` here. The dict itself is not modified.

    Returns
    -------
    tuple
        ``(f1, model)``: the F1 score on the validation split as returned by
        ``f1_score(y_val, predictions)``, and the fitted model.
    """
    # CatBoost expects these options as integers, so normalise them up front.
    integer_params = ('depth', 'iterations', 'max_ctr_complexity')
    params = {name: (int(value) if name in integer_params else value)
              for name, value in space.items()}

    cat_cols = X_train.select_dtypes(exclude=['number']).columns.to_list()
    model = CatBoostClassifier(**params, early_stopping_rounds=75)

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=0,
        cat_features=cat_cols,
        use_best_model=True
    )

    # Make predictions on the validation data
    y_pred = model.predict(X_val)

    # Return the F1 score (default averaging of f1_score) and the model
    return f1_score(y_val, y_pred), model


def hyperparameter_tuning(space: Dict[str, Union[float, int]],
                          X_train: pd.DataFrame, y_train: pd.Series,
                          X_val: pd.DataFrame, y_val: pd.Series) -> Dict[str, Any]:
    """
    Hyperopt objective: train one CatBoost classifier and report its loss.

    The sampled hyperparameters are passed to ``evaluate_model_opt``, which
    trains on the training split and scores on the validation split. The
    loss is the negative of the F1 score returned by that function, because
    hyperopt minimizes.

    Parameters
    ----------
    space : Dict[str, Union[float, int]]
        Hyperparameters sampled by hyperopt. Integer-valued entries
        (``depth``, ``iterations``, ``max_ctr_complexity``) are cast to
        ``int`` because hyperopt may sample them as floats or NumPy integers.
    X_train : pd.DataFrame
        The training data.
    y_train : pd.Series
        The training target.
    X_val : pd.DataFrame
        The validation data.
    y_val : pd.Series
        The validation target.

    Returns
    -------
    Dict[str, Any]
        A dictionary with keys ``'loss'`` (negative F1 score), ``'status'``
        (``STATUS_OK``) and ``'model'`` (the fitted model).
    """

    int_vals = ('depth', 'iterations', 'max_ctr_complexity')
    space = {k: (int(val) if k in int_vals else val) for k, val in space.items()}
    score, model = evaluate_model_opt(X_train, X_val, y_train, y_val, space)
    return {'loss': -score, 'status': STATUS_OK, 'model': model}


In [5]:
from typing import Any, Dict, Sequence


def trial2df(trial: Sequence[Dict[str, Any]]) -> pd.DataFrame:
    """
    Convert a Trial object (sequence of trial dictionaries)
    to a Pandas DataFrame.

    Parameters
    ----------
    trial : Sequence[Dict[str, Any]]
        Trial dictionaries. Each one must provide ``'result'``,
        ``'misc'`` (with a ``'vals'`` mapping) and ``'tid'``.

    Returns
    -------
    pd.DataFrame
        One row per trial: a column for every entry of ``misc['vals']``,
        followed by ``loss`` and ``tid``. A parameter whose value list is
        empty, or a result without a ``'loss'`` entry, yields NaN instead
        of raising.
    """
    missing = float('nan')
    vals = []
    for t in trial:
        result = t['result']
        misc = t['misc']
        val = {}
        for k, v in misc['vals'].items():
            if isinstance(v, list):
                # Values arrive wrapped in a list; an empty list means the
                # parameter has no value for this trial.
                val[k] = v[0] if v else missing
            else:
                val[k] = v
        val['loss'] = result.get('loss', missing)
        val['tid'] = t['tid']
        vals.append(val)
    return pd.DataFrame(vals)

In [6]:
def jitter(df: pd.DataFrame, col: str, amount: float = 1, rng=None) -> pd.Series:
    """
    Add uniform random noise to the values of a DataFrame column.

    The noise is drawn uniformly from ``[-amount / 2, amount / 2)``, one
    draw per row of ``df``, and added to ``df[col]``. ``df`` itself is not
    modified.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.
    col : str
        The name of the column to jitter.
    amount : float, optional
        The total width of the noise interval. The default value is 1.
    rng : optional
        Object with a NumPy-style ``uniform(low, high, size)`` method, e.g.
        ``np.random.default_rng(seed)``, for reproducible jitter. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    pd.Series
        A Pandas Series with the jittered values.
    """
    sampler = np.random if rng is None else rng
    noise = sampler.uniform(low=-amount / 2, high=amount / 2,
                            size=df.shape[0])
    return df[col] + noise



In [7]:
# Alternative input files, kept for reference:
# X = pd.read_csv('../data/binned/df.csv')
# y = pd.read_csv('../data/binned/y.csv')

# Load the feature matrix and the target.
X = pd.read_csv('../data/new_features/df_.csv')
y = pd.read_csv('../data/new_features/y.csv')

# Columns whose name contains "_binned" are converted to the pandas
# categorical dtype in a single astype call.
binned_cols = X.filter(like='_binned').columns
X = X.astype({name: 'category' for name in binned_cols})

X[binned_cols].info()

KeyError: "['stage'] not found in axis"

In [None]:
# Hold out a stratified 20% validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# From here on X_train / X_val refer to the frames returned by handle_nan.
X_train, X_val = handle_nan(X_train, X_val)

In [None]:
# Inspect dtypes and non-null counts of the training features after handle_nan.
X_train.info()

In [None]:
# Baseline: validation F1 of CatBoost with the fixed settings inside evaluate_model.
evaluate_model(X_train, X_val, y_train, y_val)

In [None]:
def _log_range(label, low, high):
    """Log-uniform hyperopt prior over the interval [low, high]."""
    return hp.loguniform(label, np.log(low), np.log(high))


# Search space handed to hyperopt; 'iterations' is a fixed value, not sampled.
# NOTE(review): iterations=50 is below the early_stopping_rounds=75 used in
# evaluate_model_opt, so early stopping cannot trigger during tuning.
options = {
    'depth': hp.quniform('depth', 8, 15, 1),
    'bagging_temperature': _log_range('bagging_temperature', 0.75, 0.95),
    'rsm': _log_range('rsm', 0.75, 0.95),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
    'learning_rate': _log_range('learning_rate', 0.01, 0.2),
    'iterations': 50,
    'max_ctr_complexity': hp.randint('max_ctr_complexity', 0, 8),
    # 'boosting_type': hp.choice('boosting_type', ['Ordered', 'Plain']),
}


def objective(space):
    """Bind the current train/validation split to the tuning objective."""
    return hyperparameter_tuning(space, X_train, y_train, X_val, y_val)


# `trials` records every evaluation so the search can be analysed afterwards.
trials = Trials()

tuned_params = fmin(
    fn=objective,
    space=options,
    algo=tpe.suggest,
    max_evals=200,
    trials=trials,
)

print("Best Hyperparameters:", tuned_params)

In [None]:
# Hard-coded hyperparameter set; running this cell replaces whatever
# `tuned_params` held before (e.g. the dict returned by fmin).
tuned_params = {
    'bagging_temperature': 0.8782622322892651,
    'depth': 12.0,
    'l2_leaf_reg': 3.449275895229593,
    'learning_rate': 0.054678015156771904,
    'max_ctr_complexity': 5,
    'rsm': 0.8534565882197943,
}

In [None]:
# Flatten the recorded hyperopt trials into one row per trial
# (sampled values plus `loss` and `tid`).
hyper2hr = trial2df(trials)

In [None]:
# Display the per-trial table built by trial2df.
hyper2hr

In [None]:
import seaborn as sns

# Spearman rank correlation between every pair of columns in the trial table
# (sampled hyperparameters, loss and trial id).
spearman_corr = hyper2hr.corr(method='spearman')

fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(
    spearman_corr,
    annot=True,
    fmt='.2f',
    vmin=-1,
    vmax=1,
    ax=ax,
)

In [None]:
# Loss against learning rate, one translucent point per trial.
fig, ax = plt.subplots(figsize=(8, 4))
hyper2hr.plot.scatter(
    x='learning_rate',
    y='loss',
    alpha=.1,
    color='purple',
    ax=ax,
)

In [None]:
# Jittering the integer depth values spreads out overlapping points, so it is
# easier to see at which depths the search spent most of its evaluations.
# Points are coloured by trial id (`tid`) through the viridis colormap, so
# later attempts appear towards the yellow end of the scale.
fig, ax = plt.subplots(figsize=(8, 4))
(hyper2hr
 .assign(max_depth=lambda df: jitter(df, 'depth', amount=.8))
 .plot.scatter(x='max_depth', y='loss', alpha=.5,
               c='tid', cmap='viridis', ax=ax)
 )

In [None]:
# Refit CatBoost with the tuned hyperparameters; keep the fitted model for the
# reports below.
f1, cat_tune = evaluate_model_opt(
    X_train, X_val, y_train, y_val,
    tuned_params,
)
print(f'result: {f1}')

In [None]:
# Names of all non-numeric columns; passed to CatBoost as `cat_features` below.
cat_cols = X_train.select_dtypes(exclude='number').columns.to_list()

In [None]:
# Reference model: CatBoost with library defaults apart from early stopping,
# trained and scored on the same split as the tuned model.
fit_options = dict(
    eval_set=[(X_val, y_val)],
    verbose=0,
    cat_features=cat_cols,
    use_best_model=True,
)

cat_clf = CatBoostClassifier(early_stopping_rounds=75)
cat_clf.fit(X_train, y_train, **fit_options)

y_pred_clf = cat_clf.predict(X_val)

# Validation F1 of the default model (shown as the cell output).
f1_score(y_val, y_pred_clf)

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 of the tuned model on the validation split.
# NOTE(review): target_names are presumably applied to the class labels in
# sorted order -- confirm that the first label really means "Home Win".
tuned_report = metrics.classification_report(
    y_val,
    y_pred=cat_tune.predict(X_val),
    target_names=['Home Win', 'Home not win'],
)
print(tuned_report)

In [None]:
# Overlay the ROC curves of the tuned and the default model on the same axes.
plot_roc = metrics.RocCurveDisplay.from_estimator

fig, ax = plt.subplots(figsize=(8, 8))
plot_roc(cat_tune, X_val, y_val, ax=ax)

plot_roc(cat_clf, X_val, y_val, ax=ax, label='default')