In [6]:
import pandas as pd

from itertools import combinations

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import category_encoders as ce

import warnings
warnings.filterwarnings('ignore')

In [7]:
# Locations of the feature matrix and of the target, relative to this notebook.
feature_csv = '../../data/new_features/df_.csv'
target_csv = '../../data/new_features/y.csv'

# X holds the model inputs, y the labels to predict.
X = pd.read_csv(feature_csv)
y = pd.read_csv(target_csv)

In [8]:
# Print the column names, non-null counts and dtypes of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 46 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   stage                                   3040 non-null   int64  
 1   player_rating_home_player_1             3040 non-null   int64  
 2   player_rating_home_player_2             3040 non-null   int64  
 3   player_rating_home_player_3             3040 non-null   int64  
 4   player_rating_home_player_4             3040 non-null   int64  
 5   player_rating_home_player_5             3040 non-null   int64  
 6   player_rating_home_player_6             3040 non-null   int64  
 7   player_rating_home_player_7             3040 non-null   int64  
 8   player_rating_home_player_8             3040 non-null   int64  
 9   player_rating_home_player_9             3040 non-null   int64  
 10  player_rating_home_player_10            3040 non-null   int6

In [9]:
def apply_binning(df, feature, bins, labels):
    """Cut one column of ``df`` into labelled bins.

    Duplicate bin edges are collapsed (and sorted) before cutting. If that
    collapse leaves a number of intervals that no longer matches ``labels``,
    the column is returned unchanged instead of being binned.

    Args:
        df: DataFrame that contains ``feature``.
        feature: Name of the column to bin.
        bins: Sequence of bin edges passed to ``pd.cut``.
        labels: One label per interval.

    Returns:
        The categorical result of ``pd.cut`` (lowest edge included), or the
        original column when binning is skipped.
    """
    deduped_edges = sorted(set(bins))
    had_duplicates = len(bins) > len(deduped_edges)

    if had_duplicates:
        print(f"Non-unique bins for {feature}, adjusted bins: {deduped_edges}")
        # n edges describe n - 1 intervals; without one label per interval
        # pd.cut cannot be called, so hand the raw column back.
        if len(labels) != len(deduped_edges) - 1:
            print(f"Skipping binning for {feature} due to mismatch in bins and labels.")
            return df[feature]

    # Edges are only replaced when duplicates were actually found.
    edges = deduped_edges if had_duplicates else bins
    return pd.cut(df[feature], bins=edges, labels=labels, include_lowest=True)


In [10]:
import time

# Columns that are candidates for being replaced by a binned version.
features = [
    'avg_home_team_rating_x_ewm_shoton_away',
    'average_rating_away',
    'average_rating_home',
    'ewm_possession_home',
    'avg_home_team_rating',
    'avg_away_team_rating',
    'points_home',
    'points_away',
]

# Each entry is one set of interior quantile cut points to try; the outer
# edges (-inf / +inf) are added when the bins are built.
quantile_options = [
    [0.2, 0.4, 0.6, 0.8],                           # five equal-mass bins
    [0.1, 0.3, 0.7, 0.9],                           # wide middle bin
    [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],  # deciles
    [0.15, 0.5, 0.85],                              # four bins split at the median
    [0.05, 0.95],                                   # tails versus the rest
]

def evaluate_model(features, target):
    """Fit an XGBoost classifier on a stratified split and score it.

    The data is split 80/20 (stratified on ``target``, fixed seed), the
    classifier is trained with early stopping, and the weighted F1 score on
    the held-out 20% is returned.

    Args:
        features: Feature frame to train on (categorical columns are allowed
            because the model is created with ``enable_categorical=True``).
        target: Labels aligned with ``features``.

    Returns:
        Weighted F1 score of the predictions on the validation split.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    # early_stopping_rounds is an estimator parameter: passing it to fit() is
    # deprecated in the XGBoost sklearn API and rejected by newer releases.
    model = XGBClassifier(
        random_state=42,
        enable_categorical=True,
        early_stopping_rounds=25,
    )
    # NOTE(review): the validation split is used both for early stopping and
    # for the returned score, so the score is likely optimistic.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=0,
    )

    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred, average='weighted')

# A candidate is only reported and kept when its F1 exceeds this floor.
MIN_F1_TO_KEEP = 0.67

best_df = None
best_f1_in_iteration = 0.0
# NOTE(review): nothing in this search fills this list; it stays empty.
best_interaction_constraints = []
best_feature_combo = []
best_quantiles = []

# Try every non-empty subset of `features`, binned with each quantile option,
# and keep the frame that gives the highest validation F1.
for quantiles in quantile_options:
    # The binned version of a feature depends only on the quantile set, not on
    # which other features are in the combination, so build each one once here
    # instead of once per combination.
    binned_columns = {}
    for feature in features:
        bins = [-float('inf')] + X[feature].quantile(quantiles).tolist() + [float('inf')]
        labels = [f'Bin{i}' for i in range(1, len(bins))]  # one label per interval
        binned_columns[feature] = apply_binning(X, feature, bins, labels)

    for r in range(1, len(features) + 1):
        for feature_combo in combinations(features, r):
            # Replace each selected raw column by its binned counterpart.
            df_temp = X.drop(columns=list(feature_combo))
            for feature in feature_combo:
                df_temp[f'{feature}_binned'] = binned_columns[feature]

            f1_result = evaluate_model(df_temp, y)

            if (f1_result > MIN_F1_TO_KEEP) and (f1_result > best_f1_in_iteration):
                best_f1_in_iteration = f1_result
                print(f'The best result: Quantiles: {quantiles}, Feature Combination: {feature_combo}, F1 Score: {f1_result}')
                best_df = df_temp.copy()
                best_feature_combo = feature_combo
                # Remember which quantile set produced the winner.
                best_quantiles = quantiles

The best result: Quantiles: [0.2, 0.4, 0.6, 0.8], Feature Combination: ('avg_home_team_rating_x_ewm_shoton_away', 'average_rating_away', 'average_rating_home', 'ewm_possession_home', 'avg_away_team_rating'), F1 Score: 0.6772014777016293


In [16]:
# Re-score the frame kept by the search; the value displayed below matches the
# F1 that the search printed for its best candidate.
evaluate_model(best_df, y)

0.6772014777016293

In [17]:
import os
import json

output_dir = "../../data/binned/"

# Fail before touching the disk if the search kept no candidate; otherwise the
# JSON file would be written and the CSV export would then crash on None.
if best_df is None:
    raise ValueError("No binned dataframe was selected; nothing to export.")

# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, 'interaction_constraints.json'), 'w') as file:
    json.dump(best_interaction_constraints, file)

# Persist the selected features and the target side by side.
best_df.to_csv(os.path.join(output_dir, 'df.csv'), index=False)
y.to_csv(os.path.join(output_dir, 'y.csv'), index=False)

In [14]:
# Print the column names, non-null counts and dtypes of the selected frame.
best_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 46 columns):
 #   Column                                         Non-Null Count  Dtype   
---  ------                                         --------------  -----   
 0   stage                                          3040 non-null   int64   
 1   player_rating_home_player_1                    3040 non-null   int64   
 2   player_rating_home_player_2                    3040 non-null   int64   
 3   player_rating_home_player_3                    3040 non-null   int64   
 4   player_rating_home_player_4                    3040 non-null   int64   
 5   player_rating_home_player_5                    3040 non-null   int64   
 6   player_rating_home_player_6                    3040 non-null   int64   
 7   player_rating_home_player_7                    3040 non-null   int64   
 8   player_rating_home_player_8                    3040 non-null   int64   
 9   player_rating_home_player_9              

In [15]:
# Display the constraints list; it is initialised to [] and the code shown in
# this notebook never adds to it, hence the empty output.
best_interaction_constraints

[]