In [1]:
import pandas as pd
from itertools import combinations

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

import category_encoders as ce

import warnings

# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [2]:
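# TODO: disabled candidate features (difference and ratio of the ewm_shoton columns).
# NOTE: `df_` is not defined in the visible cells; confirm the target frame before re-enabling.
# df_['ewm_shoton_diff'] = df_['ewm_shoton_home'] - df_['ewm_shoton_away']
# df_['ewm_shoton_ratio'] = df_['ewm_shoton_home'] / df_['ewm_shoton_away']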

In [3]:
# Read the feature matrix and the target from the binned data directory.
X = pd.read_csv('../../data/binned/df.csv')
y = pd.read_csv('../../data/binned/y.csv')

# Cast every column whose name contains '_binned' to the category dtype
# in a single astype call instead of assigning column by column.
binned_cols = X.filter(like='_binned').columns
X = X.astype({name: 'category' for name in binned_cols})

# Summarise dtype and non-null counts for the binned columns only.
X.filter(like='_binned').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 9 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   avg_home_rating_attack_binned        3040 non-null   category
 1   avg_away_rating_attack_binned        3040 non-null   category
 2   avg_away_rating_defence_binned       3040 non-null   category
 3   avg_home_rating_defence_binned       3040 non-null   category
 4   points_home_binned                   3040 non-null   category
 5   home_weighted_wins_binned            3040 non-null   category
 6   away_weighted_wins_binned            3040 non-null   category
 7   ewm_home_team_goals_binned           3024 non-null   category
 8   ewm_away_team_goals_conceded_binned  3022 non-null   category
dtypes: category(9)
memory usage: 28.6 KB


In [4]:
# Print the dtype and non-null count of every column in X.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 60 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   stage                                3040 non-null   int64   
 1   player_rating_home_player_1          3040 non-null   int64   
 2   player_rating_home_player_2          3040 non-null   int64   
 3   player_rating_home_player_3          3040 non-null   int64   
 4   player_rating_home_player_4          3040 non-null   int64   
 5   player_rating_home_player_5          3040 non-null   int64   
 6   player_rating_home_player_6          3040 non-null   int64   
 7   player_rating_home_player_7          3040 non-null   int64   
 8   player_rating_home_player_8          3040 non-null   int64   
 9   player_rating_home_player_9          3040 non-null   int64   
 10  player_rating_home_player_10         3040 non-null   int64   
 11  player_rating_hom

In [5]:
# Hold out 20% of the rows for validation, stratified on y, with a fixed seed
# so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [41]:
def _expand_constraint_names(interaction_constraints, categorical_cols, original_cols, encoded_cols):
    """Rewrite constraint groups so they name columns of the one-hot-encoded frame.

    A name that refers to a one-hot-encoded source column is replaced by the
    indicator columns generated from it (``"<name>_<category>"``). Every other
    name is kept as given, so a genuinely unknown name still reaches XGBoost
    and is reported there.

    Limitation: indicator columns are matched by the ``"<name>_"`` prefix, so
    two encoded source columns where one name plus ``"_"`` is a prefix of the
    other cannot be told apart.
    """
    categorical = set(categorical_cols)
    original = set(original_cols)
    encoded_cols = list(encoded_cols)

    def expand(name):
        if name in categorical:
            prefix = f"{name}_"
            derived = [
                col for col in encoded_cols
                if col not in original and str(col).startswith(prefix)
            ]
            if derived:
                return derived
        return [name]

    return [
        [col for name in group for col in expand(name)]
        for group in interaction_constraints
    ]


def evaluate_model(X_train, X_val, y_train, y_val, interaction_constraints):
    """Fit an XGBoost classifier on one-hot-encoded features and score it on the validation split.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Feature frames. Every ``object``/``category`` column is one-hot encoded
        by an encoder fitted on ``X_train`` only.
    y_train, y_val
        Targets for the two splits.
    interaction_constraints : sequence of sequences of str, str, or None
        Groups of ``X_train`` column names that are allowed to interact.
        Names of one-hot-encoded columns are translated to the indicator
        columns generated from them; strings and ``None`` are forwarded to
        XGBoost untouched. An empty sequence means no constraints.

    Returns
    -------
    float
        Weighted F1 score of the predictions on ``X_val``.

    Notes
    -----
    The validation split is the last ``eval_set`` entry, which is the one
    XGBoost monitors for early stopping, and it is also the split the score is
    computed on. The returned value is therefore measured on the same rows
    that selected the stopping point.
    """
    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
    encoder = ce.OneHotEncoder(cols=categorical_cols, use_cat_names=True, drop_invariant=True, return_df=True)

    X_train_encoded = encoder.fit_transform(X_train, y_train)
    X_val_encoded = encoder.transform(X_val)

    # Constraint groups are written against the raw column names, but encoded
    # columns no longer exist under those names after one-hot encoding.
    if interaction_constraints is None or isinstance(interaction_constraints, str):
        encoded_constraints = interaction_constraints
    else:
        encoded_constraints = _expand_constraint_names(
            interaction_constraints,
            categorical_cols,
            X_train.columns,
            X_train_encoded.columns,
        )

    # enable_categorical is kept from the original call; the object/category
    # columns selected above have already been one-hot encoded at this point.
    model = XGBClassifier(random_state=42, enable_categorical=True, early_stopping_rounds=100, interaction_constraints=encoded_constraints)
    model.fit(X_train_encoded, y_train, eval_set=[(X_train_encoded, y_train), (X_val_encoded, y_val)], verbose=0)
    y_pred = model.predict(X_val_encoded)
    return f1_score(y_val, y_pred, average='weighted')

In [42]:
# Baseline score: same model with no interaction constraints.
evaluate_model(X_train, X_val, y_train, y_val, interaction_constraints=[])

0.5322813298157089

In [8]:
# Columns named player_rating_home_player_* / player_rating_away_player_*.
home_player_ratings = [name for name in X.columns if name.startswith('player_rating_home_player_')]
away_player_ratings = [name for name in X.columns if name.startswith('player_rating_away_player_')]

# Columns whose name starts with any of the listed prefixes.
team_performance_metrics = [
    name for name in X.columns
    if name.startswith(('ewm_', 'points_', 'home_weighted_wins', 'away_weighted_wins'))
]

# Exact-name selections; a name that is absent from X is simply not included.
team_avg_variance_ratings = [
    name for name in X.columns
    if name in ('average_rating_home', 'average_rating_away')
]

rating_range_avg_diff = [
    name for name in X.columns
    if name in ('rating_range_home', 'rating_range_away', 'average_rating_diff')
]

attack_defense_strength = [
    name for name in X.columns
    if name in ('diff_avg_rating_attack', 'diff_avg_rating_defence')
]

# Every column whose name mentions either side.
home_away_team_strength = [
    name for name in X.columns
    if any(side in name for side in ('home', 'away'))
]

# Every column whose name contains 'binned'.
binned_group = [name for name in X.columns if 'binned' in name]


In [9]:
# Hand-picked feature names, kept in their original order.
features_interaction_depth_0 = [
    'ewm_shoton_away',
    'ewm_shoton_home',
    'ewm_home_team_goals_conceded',
    'ewm_possession_home',
    'diff_player_8',
    'home_streak_wins',
    'ewm_away_team_goals',
    'away_streak_wins',
    'diff_player_9',
    'ewm_possession_away',
    'diff_player_11',
    'player_rating_home_player_8',
    'stage',
    'avg_home_team_rating',
    'avg_away_team_rating',
    'points_away',
    'player_rating_away_player_7',
    'diff_player_10',
    'rating_range_away',
    'player_rating_home_player_11',
]

# Every name below also appears in features_interaction_depth_0.
features_interaction_depth_1 = [
    'diff_player_8',
    'avg_home_team_rating',
    'away_streak_wins',
    'player_rating_away_player_7',
    'player_rating_home_player_8',
    'diff_player_9',
    'ewm_possession_away',
    'diff_player_11',
    'home_streak_wins',
    'ewm_shoton_away',
    'player_rating_home_player_11',
    'ewm_away_team_goals',
    'ewm_possession_home',
    'ewm_shoton_home',
    'ewm_home_team_goals_conceded',
]

In [47]:
# Constraint groups fed to the combination search; each inner list is one
# group of feature names handed to XGBClassifier(interaction_constraints=...).
#
# Candidates currently switched off (add to the list below to enable):
#   team_performance_metrics
#   team_avg_variance_ratings
#   features_interaction_depth_0
#   features_interaction_depth_1
#   binned_group
#   home_away_team_strength
interaction_constraints = [home_player_ratings, away_player_ratings]

In [48]:
# Fresh 80/20 stratified split used for every evaluation in this cell.
X_train_full, X_val_full, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# List of (constraint_combo, weighted F1) pairs, one per evaluated combination.
results = []

# Try every non-empty subset of the candidate constraint groups.
for i in range(1, len(interaction_constraints) + 1):
    for combo in combinations(interaction_constraints, i):
        # Score on the split created above rather than on variables left over
        # from an earlier cell, so this cell does not rely on hidden state.
        f1 = evaluate_model(X_train_full, X_val_full, y_train, y_val, combo)

        results.append((combo, f1))

# Best score first.
results.sort(key=lambda x: x[1], reverse=True)

# Keep the five best combinations.
top_5_results = results[:5]

# Report each retained combination with its score.
for combo, score in top_5_results:
    print(f"Combo: {combo}, F1 Score: {score}")


Combo: (['player_rating_home_player_1', 'player_rating_home_player_2', 'player_rating_home_player_3', 'player_rating_home_player_4', 'player_rating_home_player_5', 'player_rating_home_player_6', 'player_rating_home_player_7', 'player_rating_home_player_8', 'player_rating_home_player_9', 'player_rating_home_player_10', 'player_rating_home_player_11'],), F1 Score: 0.5322813298157089
Combo: (['player_rating_away_player_1', 'player_rating_away_player_2', 'player_rating_away_player_3', 'player_rating_away_player_4', 'player_rating_away_player_5', 'player_rating_away_player_6', 'player_rating_away_player_7', 'player_rating_away_player_8', 'player_rating_away_player_9', 'player_rating_away_player_10', 'player_rating_away_player_11'],), F1 Score: 0.5322813298157089
Combo: (['player_rating_home_player_1', 'player_rating_home_player_2', 'player_rating_home_player_3', 'player_rating_home_player_4', 'player_rating_home_player_5', 'player_rating_home_player_6', 'player_rating_home_player_7', 'playe