In [1]:
import pandas as pd
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import  f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import json

X = pd.read_csv('../data/new_features/df_.csv')
y = pd.read_csv('../data/new_features/y.csv')

with open('../data/binned/interaction_constraints.json', 'r') as file:
    interaction_constraints = json.load(file)
    
print(interaction_constraints)

[['player_rating_away_player_1', 'player_rating_away_player_2', 'player_rating_away_player_3', 'player_rating_away_player_4', 'player_rating_away_player_5', 'player_rating_away_player_6', 'player_rating_away_player_7', 'player_rating_away_player_8', 'player_rating_away_player_9', 'player_rating_away_player_10', 'player_rating_away_player_11'], ['ewm_away_team_goals', 'ewm_home_team_goals_conceded', 'ewm_away_team_goals_conceded', 'points_home', 'points_away', 'away_weighted_wins', 'ewm_shoton_home', 'ewm_shoton_away', 'ewm_possession_home', 'ewm_possession_away', 'home_weighted_wins_binned', 'ewm_home_team_goals_binned'], ['num_top_players_home', 'num_top_players_away'], ['rating_range_home', 'rating_range_away']]


In [3]:
def evaluate_model(features, target, interaction_constraints):
    X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42, stratify=target)
    model = XGBClassifier(random_state=42, enable_categorical=True, interaction_constraints=interaction_constraints, early_stopping_raounds=25)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred, average='weighted')

In [4]:
def apply_binning(df, feature, bins, labels):
    # Ensure bins are unique
    unique_bins = sorted(set(bins))
    if len(unique_bins) < len(bins):
        print(f"Non-unique bins for {feature}, adjusted bins: {unique_bins}")
        bins = unique_bins
        if len(bins) - 1 != len(labels):
            print(f"Skipping binning for {feature} due to mismatch in bins and labels.")
            return df[feature]
    return pd.cut(df[feature], bins=bins, labels=labels, include_lowest=True)

In [5]:
df_binned = X.copy()

for feature in ('avg_home_rating_attack', 'points_away', 'home_weighted_wins', 'away_weighted_wins', 'ewm_away_team_goals_conceded'):
    bins = [-float('inf')] + df_binned[feature].quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]).tolist() + [float('inf')]
    labels = [f'Bin{i}' for i in range(1, len(bins))]
    df_binned[f'{feature}_binned'] = apply_binning(df_binned, feature, bins, labels)
    df_binned = df_binned.drop([feature], axis=1)

In [6]:
# for col in X.filter(like='_binned').columns:
#     X[col] = X[col].astype('category')
# 
# df_binned[interaction_constraints[3]]

Unnamed: 0,rating_range_home,rating_range_away
0,15,6
1,13,19
2,10,12
3,10,12
4,17,14
...,...,...
3035,31,6
3036,14,13
3037,9,4
3038,5,8


In [8]:
evaluate_model(df_binned, y, interaction_constraints)

ValueError: Constrained features are not a subset of training data feature names

In [None]:
X.info()

In [None]:
interaction_constraints