In [1]:
import os
import json

import pandas as pd

from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

import warnings

warnings.filterwarnings("ignore")

In [16]:
def evaluate_model(features, target, interaction_constraints):
    X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42, stratify=target)
    model = XGBClassifier(random_state=42, enable_categorical=True, interaction_constraints=interaction_constraints, early_stopping_raounds=25)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred, average='weighted')

In [2]:
X = pd.read_csv('../../data/binned/df.csv')
y = pd.read_csv('../../data/binned/y.csv')

data_dir = "../../data/binned/"

with open(data_dir + 'interaction_constraints.json', 'r') as file:
    interaction_constraints = json.load(file)

print(interaction_constraints)

for col in X.filter(like='_binned').columns:
    X[col] = X[col].astype('category')

X.filter(like='_binned').info()

[['player_rating_away_player_1', 'player_rating_away_player_2', 'player_rating_away_player_3', 'player_rating_away_player_4', 'player_rating_away_player_5', 'player_rating_away_player_6', 'player_rating_away_player_7', 'player_rating_away_player_8', 'player_rating_away_player_9', 'player_rating_away_player_10', 'player_rating_away_player_11'], ['ewm_home_team_goals', 'ewm_away_team_goals', 'ewm_home_team_goals_conceded', 'points_home', 'ewm_shoton_home', 'ewm_shoton_away', 'ewm_possession_home', 'ewm_possession_away', 'points_away_binned', 'home_weighted_wins_binned', 'away_weighted_wins_binned', 'ewm_away_team_goals_conceded_binned'], ['num_top_players_home', 'num_top_players_away'], ['rating_range_home', 'rating_range_away']]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 5 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   avg_home_rating_a

In [18]:
X

Unnamed: 0,stage,player_rating_home_player_1,player_rating_home_player_2,player_rating_home_player_3,player_rating_home_player_4,player_rating_home_player_5,player_rating_home_player_6,player_rating_home_player_7,player_rating_home_player_8,player_rating_home_player_9,...,avg_home_rating_defence,average_rating_home,average_rating_away,num_top_players_home,num_top_players_away,avg_home_rating_attack_binned,points_away_binned,home_weighted_wins_binned,away_weighted_wins_binned,ewm_away_team_goals_conceded_binned
0,1.0,72.0,72.0,72.0,72.0,72.0,72.0,79.0,75.0,85.0,...,72.0,75.545455,71.272727,2,0,Bin1,Bin1,Bin4,Bin1,Bin5
1,1.0,79.0,79.0,79.0,79.0,79.0,79.0,78.0,75.0,74.0,...,79.0,77.272727,83.090909,0,9,Bin9,Bin1,Bin4,Bin1,Bin6
2,1.0,77.0,77.0,77.0,77.0,77.0,77.0,72.0,82.0,76.0,...,77.0,76.636364,74.636364,1,0,Bin3,Bin1,Bin4,Bin1,Bin9
3,1.0,82.0,82.0,82.0,82.0,82.0,82.0,72.0,73.0,77.0,...,82.0,78.909091,75.454545,6,1,Bin7,Bin1,Bin4,Bin1,Bin6
4,1.0,77.0,77.0,77.0,77.0,77.0,77.0,67.0,75.0,82.0,...,77.0,77.000000,81.454545,2,8,Bin8,Bin1,Bin4,Bin1,Bin6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,38.0,80.0,80.0,80.0,80.0,80.0,80.0,79.0,75.0,80.0,...,80.0,77.000000,73.636364,1,0,Bin3,Bin8,Bin3,Bin6,Bin7
3036,38.0,86.0,86.0,86.0,86.0,86.0,86.0,83.0,83.0,83.0,...,86.0,84.272727,75.727273,10,0,Bin5,Bin10,Bin1,Bin6,Bin2
3037,38.0,79.0,79.0,79.0,79.0,79.0,79.0,81.0,86.0,88.0,...,79.0,81.454545,75.272727,4,0,Bin4,Bin4,Bin2,Bin9,Bin9
3038,38.0,78.0,78.0,78.0,78.0,78.0,78.0,76.0,80.0,79.0,...,78.0,77.636364,77.545455,0,2,Bin8,Bin10,Bin7,Bin6,Bin6


In [17]:
evaluate_model(X, y, interaction_constraints)

0.4939901764471665

In [3]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

xgb = XGBClassifier(random_state=42, enable_categorical=True, interaction_constraints=interaction_constraints, early_stopping_raounds=25)

xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0)

y_pred = xgb.predict(X_val)
f1 = f1_score(y_val, y_pred, average='weighted')
print(f"F1: {f1}")

F1: 0.4939901764471665


In [6]:
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 608 entries, 1030 to 2009
Data columns (total 49 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   stage                                608 non-null    float64 
 1   player_rating_home_player_1          608 non-null    float64 
 2   player_rating_home_player_2          608 non-null    float64 
 3   player_rating_home_player_3          608 non-null    float64 
 4   player_rating_home_player_4          608 non-null    float64 
 5   player_rating_home_player_5          608 non-null    float64 
 6   player_rating_home_player_6          608 non-null    float64 
 7   player_rating_home_player_7          608 non-null    float64 
 8   player_rating_home_player_8          608 non-null    float64 
 9   player_rating_home_player_9          608 non-null    float64 
 10  player_rating_home_player_10         608 non-null    float64 
 11  player_rating_home_p

In [11]:
select_X_train

array([[3.933, 'Bin10'],
       [4.878, 'Bin1'],
       [5.556, 'Bin6'],
       ...,
       [5.623, 'Bin1'],
       [7.536, 'Bin9'],
       [4.301, 'Bin4']], dtype=object)

In [23]:
feature_importances = xgb.feature_importances_

results = []

thresholds = sorted(feature_importances, reverse=True)


for thresh in thresholds:
    # Apply feature selection
    selection = SelectFromModel(xgb, threshold=thresh, prefit=True)
    
    # Transform training and validation data
    select_X_train = selection.transform(X_train)
    select_X_val = selection.transform(X_val)
    
    selected_features = X_train.columns[selection.get_support()]

    # Convert to DataFrame and keep feature names
    select_X_train_df = pd.DataFrame(select_X_train, columns=selected_features)
    
    for col in selected_features:
        select_X_train_df[col] = select_X_train_df[col].astype(X_train[col].dtype)

    select_X_val_df = pd.DataFrame(select_X_val, columns=selected_features)
    
    for col in selected_features:
        select_X_val_df[col] = select_X_val_df[col].astype(X_val[col].dtype)

    # Train a new model on the selected features
    selection_model = XGBClassifier(random_state=42, enable_categorical=True)
    selection_model.fit(select_X_train_df, y_train, verbose=0)

    # Predict and evaluate
    y_pred = selection_model.predict(select_X_val_df)
    f1 = f1_score(y_val, y_pred, average='weighted')

    results.append({'threshold': thresh, 'n_features': select_X_train_df.shape[1], 'f1': f1})

# Sorting and printing results
sorted_results = sorted(results, key=lambda x: x['f1'], reverse=True)

print("Top 5 feature selection results:")
for result in sorted_results[:5]:
    print(f"Threshold: {result['threshold']}, Number of Features: {result['n_features']}, F1: {result['f1']}")

# Selecting the best result
best_result = max(results, key=lambda x: x['f1'])

# Determining the best features
selection = SelectFromModel(xgb, threshold=best_result['threshold'], prefit=True)
best_features = X_train.columns[selection.get_support()]

print(f"Best Feature Set: {list(best_features)}")

Top 5 feature selection results:
Threshold: 0.006575244013220072, Number of Features: 32, F1: 0.5021541929422327
Threshold: 0.02836219221353531, Number of Features: 14, F1: 0.4978476514690788
Threshold: 0.005790190305560827, Number of Features: 34, F1: 0.49556700942902737
Threshold: 0.010979455895721912, Number of Features: 27, F1: 0.49249078343784514
Threshold: 0.022858232259750366, Number of Features: 21, F1: 0.49050796366327276
Best Feature Set: ['stage', 'player_rating_home_player_1', 'player_rating_home_player_8', 'player_rating_home_player_9', 'player_rating_home_player_10', 'player_rating_home_player_11', 'player_rating_away_player_1', 'player_rating_away_player_7', 'player_rating_away_player_8', 'player_rating_away_player_9', 'player_rating_away_player_10', 'player_rating_away_player_11', 'ewm_home_team_goals', 'ewm_away_team_goals', 'ewm_home_team_goals_conceded', 'points_home', 'avg_home_team_rating', 'avg_away_team_rating', 'home_streak_wins', 'away_streak_wins', 'ewm_shoton