In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
import category_encoders as ce

import warnings

warnings.filterwarnings("ignore")

def evaluate_categorical_encoders(X, y, encoders, cv_folds=5):
    """
    Compare categorical encoders by cross-validated weighted F1 and return the best one's name.

    Every encoder is chained in front of an XGBClassifier with make_pipeline and the
    resulting pipeline is scored with cross_val_score using a shuffled StratifiedKFold
    (random_state=42) and the 'f1_weighted' scorer. The mean and standard deviation
    of the fold scores are printed per encoder, followed by the winning encoder.

    :param X: DataFrame with features, including categorical ones
    :param y: Target variable
    :param encoders: Dictionary of encoder names and their corresponding objects from category_encoders
    :param cv_folds: Number of folds for the stratified cross-validation
    :return: Name (key of ``encoders``) with the highest mean weighted F1; on an exact
             tie the entry that appears first in ``encoders`` wins
    :raises ValueError: If ``encoders`` is empty
    """
    if not encoders:
        # max() over an empty dict would raise an opaque "empty sequence" error.
        raise ValueError("encoders must contain at least one encoder to evaluate")

    scoring = 'f1_weighted'
    results = {}

    # Honour the caller's fold count; the splitter is seeded so every encoder
    # is evaluated on the same folds.
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    for encoder_name, encoder in encoders.items():
        # Encoder and classifier live in one pipeline so that cross_val_score
        # evaluates them together as a single estimator.
        pipeline = make_pipeline(encoder, XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))

        scores = cross_val_score(pipeline, X, y, cv=cv, scoring=scoring)

        # Keep the mean for ranking; report mean and spread under the metric's real name.
        results[encoder_name] = scores.mean()
        print(f"{encoder_name}: Mean CV weighted F1 = {scores.mean():.4f} ± {scores.std():.4f}")

    best_encoder_name = max(results, key=results.get)
    print(f"\nBest encoder: {best_encoder_name} with weighted F1 = {results[best_encoder_name]:.4f}")

    return best_encoder_name

def evaluate_model(X, y):
    """
    Fit an XGBClassifier on a stratified hold-out split and return its weighted F1.

    The data is split 80/20 (random_state=42, stratified on ``y``). The classifier
    is created with enable_categorical=True and trained with early stopping
    (25 rounds), with both the training and the validation split supplied as
    evaluation sets.

    NOTE(review): the validation split is both an early-stopping evaluation set
    and the data the returned F1 is computed on, so the score is likely
    optimistic compared with a cross-validated estimate — confirm before
    comparing the two.

    :param X: Feature matrix
    :param y: Target variable, also used to stratify the split
    :return: Weighted F1 score of the predictions on the validation split
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # early_stopping_rounds is an estimator parameter; passing it to fit() is
    # deprecated in the XGBoost scikit-learn interface.
    model = XGBClassifier(random_state=42, enable_categorical=True, early_stopping_rounds=25)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0)

    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred, average='weighted')


In [2]:
# Load the feature table and the target table from the binned-data directory.
X = pd.read_csv('../../data/binned/df.csv')
y = pd.read_csv('../../data/binned/y.csv')

# Cast every column whose name contains '_binned' to the pandas category dtype
# in a single astype call.
X = X.astype(dict.fromkeys(X.filter(like='_binned').columns, 'category'))

# Show dtype and non-null counts for the converted columns.
X.filter(like='_binned').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 5 columns):
 #   Column                                         Non-Null Count  Dtype   
---  ------                                         --------------  -----   
 0   avg_home_team_rating_x_ewm_shoton_away_binned  3022 non-null   category
 1   average_rating_away_binned                     3040 non-null   category
 2   average_rating_home_binned                     3040 non-null   category
 3   ewm_possession_home_binned                     3024 non-null   category
 4   avg_away_team_rating_binned                    3040 non-null   category
dtypes: category(5)
memory usage: 16.0 KB


In [3]:
# Baseline: XGBClassifier with enable_categorical=True on the category-typed frame,
# scored by weighted F1 on a single stratified hold-out split.
evaluate_model(X, y)

0.6772014777016293

In [7]:
# Candidate encoders, each instantiated with drop_invariant=True and keyed by
# the name used in the printed report.
# NOTE(review): BaseNEncoder is used with its default base and yields the same
# CV scores as BinaryEncoder in the recorded output — the two look redundant;
# confirm and consider setting an explicit base or dropping one.
encoders_to_evaluate = {
    label: encoder_cls(drop_invariant=True)
    for label, encoder_cls in (
        ('OneHotEncoder', ce.OneHotEncoder),
        ('OrdinalEncoder', ce.OrdinalEncoder),
        ('TargetEncoder', ce.TargetEncoder),
        ('BinaryEncoder', ce.BinaryEncoder),
        ('BaseNEncoder', ce.BaseNEncoder),
        ('CatBoostEncoder', ce.CatBoostEncoder),
    )
}

# Rank the candidates by cross-validated score and keep the winner's name.
best_encoder = evaluate_categorical_encoders(X, y, encoders_to_evaluate)

OneHotEncoder: Mean CV accuracy = 0.6237 ± 0.0107
OrdinalEncoder: Mean CV accuracy = 0.6213 ± 0.0234
TargetEncoder: Mean CV accuracy = 0.6259 ± 0.0149
BinaryEncoder: Mean CV accuracy = 0.6277 ± 0.0144
BaseNEncoder: Mean CV accuracy = 0.6277 ± 0.0144
CatBoostEncoder: Mean CV accuracy = 0.6277 ± 0.0124

Best encoder: CatBoostEncoder with accuracy = 0.6277
