In [1]:
import pandas as pd

from itertools import combinations

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import category_encoders as ce

import warnings
warnings.filterwarnings('ignore')

In [2]:
X = pd.read_csv('../../data/new_features/df_.csv')
y = pd.read_csv('../../data/new_features/y.csv')

In [3]:
def apply_binning(df, feature, bins, labels):
    """Cut ``df[feature]`` into labelled intervals, tolerating repeated bin edges.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to discretise.
    feature : str
        Name of the column in ``df`` to bin.
    bins : list
        Bin edges. Repeated edges are collapsed (and sorted) before cutting.
    labels : list
        One label per interval.

    Returns
    -------
    pandas.Series
        The categorical result of ``pd.cut``. If repeated edges had to be
        collapsed and the number of remaining intervals no longer equals
        ``len(labels)``, the original, un-binned column is returned instead.
    """
    deduped_edges = sorted(set(bins))
    had_duplicates = len(deduped_edges) < len(bins)

    if had_duplicates:
        print(f"Non-unique bins for {feature}, adjusted bins: {deduped_edges}")
        # Collapsing edges removes intervals, so the labels may no longer fit.
        if len(deduped_edges) - 1 != len(labels):
            print(f"Skipping binning for {feature} due to mismatch in bins and labels.")
            return df[feature]

    # Edges that were already unique are handed to pd.cut exactly as supplied.
    edges = deduped_edges if had_duplicates else bins
    return pd.cut(df[feature], bins=edges, labels=labels, include_lowest=True)


In [4]:
import time

# Columns that are candidates for quantile binning.
features = [
    'avg_home_rating_attack',
    'avg_away_rating_attack',
    'avg_away_rating_defence',
    'avg_home_rating_defence',
    'points_home',
    'points_away',
    'home_weighted_wins',
    'away_weighted_wins',
    'ewm_home_team_goals',
    'ewm_away_team_goals',
    'ewm_away_team_goals_conceded',
    'ewm_possession_home',
]

# Quantile cut points to try. Each inner list is one binning scheme.
# Currently disabled alternatives: [0.2, 0.4, 0.6, 0.8], [0.1, 0.3, 0.7, 0.9], [0.05, 0.95]
quantile_options = [
    [step / 10 for step in range(1, 10)],  # deciles: 0.1, 0.2, ..., 0.9
    [0.15, 0.5, 0.85],
]

def evaluate_model(features, target, interaction_constraints):
    """Score an XGBoost classifier on a hold-out split, once per candidate encoder.

    The data is split 80/20 (stratified on ``target``, ``random_state=42``).
    For every encoder, the ``object``/``category`` columns are encoded, an
    ``XGBClassifier`` is fitted with early stopping, and the weighted F1 score
    on the validation part is computed.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix; its ``object``/``category`` columns are the ones encoded.
    target
        Labels; passed to ``train_test_split`` both as the target and as the
        stratification key.
    interaction_constraints : list
        Accepted so callers can pass their constraint groups, but currently
        NOT forwarded to the model.
        NOTE(review): forwarding them is not a drop-in change, because the
        encoders may rename or expand the binned columns the groups refer to.

    Returns
    -------
    tuple
        ``(best_f1, best_encoder_name)``. If no encoder scores above zero the
        result is ``(0.0, "")``.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    # Resolve the columns to encode once instead of once per encoder.
    categorical_cols = features.select_dtypes(include=['object', 'category']).columns

    encoders = {
        'one_hot': ce.OneHotEncoder(cols=categorical_cols, use_cat_names=True, drop_invariant=True, return_df=True),
        'ordinal': ce.OrdinalEncoder(cols=categorical_cols),
        # 'binary': ce.BinaryEncoder(cols=categorical_cols),
        # 'target': ce.TargetEncoder(cols=categorical_cols)
    }

    best_f1 = 0.0
    best_encoder_name = ""

    for encoder_name, encoder in encoders.items():
        # Fit the encoder on the training part only, then reuse it for validation.
        X_train_encoded = encoder.fit_transform(X_train, y_train)
        X_val_encoded = encoder.transform(X_val)

        # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
        # estimator constructor rather than on fit(); confirm against the installed version.
        model = XGBClassifier(random_state=42, enable_categorical=True)
        model.fit(
            X_train_encoded,
            y_train,
            eval_set=[(X_train_encoded, y_train), (X_val_encoded, y_val)],
            verbose=0,
            early_stopping_rounds=25,
        )

        y_pred = model.predict(X_val_encoded)
        f1 = f1_score(y_val, y_pred, average='weighted')

        # Strict comparison: on a tie the earlier encoder keeps the title.
        if f1 > best_f1:
            best_f1 = f1
            best_encoder_name = encoder_name

    return best_f1, best_encoder_name

# State of the best configuration found so far by the search below.
best_df = None
best_f1_in_iteration = 0.0
best_interaction_constraints = []
best_feature_combo = []
best_quantiles = []

# A configuration is only considered when its weighted F1 exceeds this floor.
min_f1_to_consider = 0.51

# Column-name patterns used to build the interaction-constraint groups.
team_metric_prefixes = ('ewm_', 'points_', 'home_weighted_wins', 'away_weighted_wins')
rating_range_columns = ['rating_range_home', 'rating_range_away', 'average_rating_diff']

# Exhaustive search: every quantile scheme x every non-empty subset of `features`.
for quantiles in quantile_options:
    for r in range(1, len(features) + 1):
        for feature_combo in combinations(features, r):
            df_temp = X.copy()  # work on a copy so X itself is never modified
            for feature in feature_combo:
                # Open-ended outer edges so every finite value lands in some bin.
                bins = [-float('inf')] + X[feature].quantile(quantiles).tolist() + [float('inf')]
                labels = [f'Bin{i}' for i in range(1, len(bins))]  # one label per interval
                # Replace the raw column with its binned counterpart.
                df_temp[f'{feature}_binned'] = apply_binning(df_temp, feature, bins, labels)
                df_temp = df_temp.drop([feature], axis=1)

            # Group the columns present in this candidate frame (binned names included).
            away_player_ratings = [col for col in df_temp.columns if col.startswith('player_rating_away_player_')]
            team_performance_metrics = [col for col in df_temp.columns if col.startswith(team_metric_prefixes)]
            rating_range_avg_diff = [col for col in df_temp.columns if col in rating_range_columns]
            top_players = [col for col in df_temp.columns if col.startswith('num_top')]

            interaction_constraints = [
                away_player_ratings,
                team_performance_metrics,
                top_players,
                rating_range_avg_diff,
            ]

            f1_result, encoder = evaluate_model(df_temp, y, interaction_constraints)

            if f1_result > min_f1_to_consider and f1_result > best_f1_in_iteration:
                best_f1_in_iteration = f1_result
                print(f'The best result: Quantiles: {quantiles}, Feature Combination: {feature_combo}, encoder: {encoder}, F1 Score: {f1_result}')
                best_df = df_temp.copy()
                best_interaction_constraints = interaction_constraints
                best_feature_combo = feature_combo
                # Previously never recorded, so best_quantiles stayed [] after the search.
                best_quantiles = quantiles

The best result: Quantiles: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], Feature Combination: ('points_away', 'ewm_home_team_goals'), encoder: one_hot, F1 Score: 0.5122708355017405
The best result: Quantiles: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], Feature Combination: ('avg_away_rating_defence', 'avg_home_rating_defence', 'points_away', 'home_weighted_wins', 'ewm_away_team_goals', 'ewm_away_team_goals_conceded'), encoder: one_hot, F1 Score: 0.5203036300278563
The best result: Quantiles: [0.15, 0.5, 0.85], Feature Combination: ('avg_home_rating_attack', 'avg_away_rating_attack', 'points_home', 'ewm_home_team_goals', 'ewm_away_team_goals', 'ewm_away_team_goals_conceded', 'ewm_possession_home'), encoder: one_hot, F1 Score: 0.5203077996403164
The best result: Quantiles: [0.15, 0.5, 0.85], Feature Combination: ('avg_home_rating_attack', 'avg_away_rating_attack', 'avg_home_rating_defence', 'points_home', 'home_weighted_wins', 'away_weighted_wins', 'ewm_home_team_goals', 'ewm_away_t

In [5]:
# df_binned = best_df.copy()
# 
# for feature in best_feature_combo:
#     bins = [-float('inf')] + df_[feature].quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]).tolist() + [float('inf')]
#     labels = [f'Bin{i}' for i in range(1, len(bins))]
#     df_binned[f'{feature}_binned'] = apply_binning(df_binned, feature, bins, labels)
#     df_binned = df_binned.drop([feature], axis=1)

In [6]:
# Re-run the encoder comparison on the best frame kept by the search;
# displays (best weighted F1, encoder name).
# Note: best_df is still None if no combination exceeded the 0.51 threshold.
evaluate_model(best_df, y, best_interaction_constraints)

(0.5322813298157089, 'one_hot')

In [7]:
# Summarise only the columns whose name contains 'binned' (dtype and non-null counts).
best_df.filter(like='binned').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 9 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   avg_home_rating_attack_binned        3040 non-null   category
 1   avg_away_rating_attack_binned        3040 non-null   category
 2   avg_away_rating_defence_binned       3040 non-null   category
 3   avg_home_rating_defence_binned       3040 non-null   category
 4   points_home_binned                   3040 non-null   category
 5   home_weighted_wins_binned            3040 non-null   category
 6   away_weighted_wins_binned            3040 non-null   category
 7   ewm_home_team_goals_binned           3024 non-null   category
 8   ewm_away_team_goals_conceded_binned  3022 non-null   category
dtypes: category(9)
memory usage: 28.6 KB


In [8]:
import os
import json

# Destination folder for the binned dataset and its metadata.
output_dir = "../../data/binned/"

# exist_ok=True creates the folder when missing and is a no-op otherwise,
# without the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Persist the constraint groups that belong to the best frame.
with open(os.path.join(output_dir, 'interaction_constraints.json'), 'w') as file:
    json.dump(best_interaction_constraints, file)

# Persist the best feature frame and the target without the index column.
best_df.to_csv(os.path.join(output_dir, 'df.csv'), index=False)
y.to_csv(os.path.join(output_dir, 'y.csv'), index=False)

In [9]:
# Column-by-column overview (dtype, non-null count) of the full frame written to disk above.
best_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 60 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   stage                                3040 non-null   int64   
 1   player_rating_home_player_1          3040 non-null   int64   
 2   player_rating_home_player_2          3040 non-null   int64   
 3   player_rating_home_player_3          3040 non-null   int64   
 4   player_rating_home_player_4          3040 non-null   int64   
 5   player_rating_home_player_5          3040 non-null   int64   
 6   player_rating_home_player_6          3040 non-null   int64   
 7   player_rating_home_player_7          3040 non-null   int64   
 8   player_rating_home_player_8          3040 non-null   int64   
 9   player_rating_home_player_9          3040 non-null   int64   
 10  player_rating_home_player_10         3040 non-null   int64   
 11  player_rating_hom

In [10]:
# Display the column groups recorded together with the best frame.
best_interaction_constraints

[['player_rating_away_player_1',
  'player_rating_away_player_2',
  'player_rating_away_player_3',
  'player_rating_away_player_4',
  'player_rating_away_player_5',
  'player_rating_away_player_6',
  'player_rating_away_player_7',
  'player_rating_away_player_8',
  'player_rating_away_player_9',
  'player_rating_away_player_10',
  'player_rating_away_player_11'],
 ['ewm_away_team_goals',
  'ewm_home_team_goals_conceded',
  'points_away',
  'ewm_shoton_home',
  'ewm_shoton_away',
  'ewm_possession_home',
  'ewm_possession_away',
  'points_home_binned',
  'home_weighted_wins_binned',
  'away_weighted_wins_binned',
  'ewm_home_team_goals_binned',
  'ewm_away_team_goals_conceded_binned'],
 ['num_top_players_home', 'num_top_players_away'],
 ['rating_range_home', 'rating_range_away']]