In [5]:
import os
import json

import pandas as pd

from xgboost import XGBClassifier

from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

import category_encoders as ce

import warnings

warnings.filterwarnings("ignore")

In [14]:
def evaluate_model(X_train, X_val, y_train, y_val):
    encoder = ce.OneHotEncoder(cols=X_train.select_dtypes(include=['object', 'category']).columns, use_cat_names=True, drop_invariant=True, return_df=True)

    X_train_encoded = encoder.fit_transform(X_train, y_train)
    X_val_encoded = encoder.transform(X_val)
    
    model = XGBClassifier(random_state=42, enable_categorical=True, early_stopping_rounds=100)
    model.fit(X_train_encoded, y_train, eval_set=[(X_train_encoded, y_train), (X_val_encoded, y_val)], verbose=0)
    y_pred = model.predict(X_val_encoded)
    return f1_score(y_val, y_pred, average='weighted'), model

In [7]:
X = pd.read_csv('../../data/binned/df.csv')
y = pd.read_csv('../../data/binned/y.csv')

data_dir = "../../data/binned/"

with open(data_dir + 'interaction_constraints.json', 'r') as file:
    interaction_constraints = json.load(file)

print(interaction_constraints)

for col in X.filter(like='_binned').columns:
    X[col] = X[col].astype('category')

X.filter(like='_binned').info()

[['player_rating_away_player_1', 'player_rating_away_player_2', 'player_rating_away_player_3', 'player_rating_away_player_4', 'player_rating_away_player_5', 'player_rating_away_player_6', 'player_rating_away_player_7', 'player_rating_away_player_8', 'player_rating_away_player_9', 'player_rating_away_player_10', 'player_rating_away_player_11'], ['ewm_away_team_goals', 'ewm_home_team_goals_conceded', 'points_away', 'ewm_shoton_home', 'ewm_shoton_away', 'ewm_possession_home', 'ewm_possession_away', 'points_home_binned', 'home_weighted_wins_binned', 'away_weighted_wins_binned', 'ewm_home_team_goals_binned', 'ewm_away_team_goals_conceded_binned'], ['num_top_players_home', 'num_top_players_away'], ['rating_range_home', 'rating_range_away']]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 9 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   avg_home_r

In [8]:
# X['ewm_shoton_diff'] = X['ewm_shoton_home'] - X['ewm_shoton_away']
# X['ewm_shoton_ratio'] = X['ewm_shoton_home'] / X['ewm_shoton_away']

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

f1, xgb = evaluate_model(X_train, X_val, y_train, y_val)

In [24]:
from sklearn.feature_selection import SelectFromModel

def evaluate_model(X_train, X_val, y_train, y_val, threshold):
    encoder = ce.OneHotEncoder(cols=X_train.select_dtypes(include=['object', 'category']).columns, use_cat_names=True, drop_invariant=True, return_df=True)

    X_train_encoded = encoder.fit_transform(X_train, y_train)
    X_val_encoded = encoder.transform(X_val)

    # Apply feature selection based on the threshold
    selection = SelectFromModel(xgb, threshold=threshold, prefit=True)
    select_X_train = selection.transform(X_train_encoded)
    select_X_val = selection.transform(X_val_encoded)

    model = XGBClassifier(random_state=42, enable_categorical=True, early_stopping_rounds=100)
    model.fit(select_X_train, y_train, eval_set=[(select_X_train, y_train), (select_X_val, y_val)], verbose=0)
    
    y_pred = model.predict(select_X_val)
    return f1_score(y_val, y_pred, average='weighted')

# Your existing code for getting feature importances
feature_importances = xgb.feature_importances_
thresholds = sorted(feature_importances, reverse=True)

results = []

# Iterate over thresholds
for thresh in thresholds:
    f1 = evaluate_model(X_train, X_val, y_train, y_val, thresh)
    results.append({'threshold': thresh, 'f1': f1})

# Sorting and printing results
sorted_results = sorted(results, key=lambda x: x['f1'], reverse=True)

print("Top 5 feature selection results:")
for result in sorted_results[:5]:
    print(f"Threshold: {result['threshold']}, F1: {result['f1']}")

Top 5 feature selection results:
Threshold: 0.009775876998901367, F1: 0.5322813298157089
Threshold: 0.008295511826872826, F1: 0.5322813298157089
Threshold: 0.0, F1: 0.5322813298157089
Threshold: 0.0, F1: 0.5322813298157089
Threshold: 0.0, F1: 0.5322813298157089


In [25]:
best_threshold = max(results, key=lambda x: x['f1'])['threshold']

# Perform feature selection with the best threshold
encoder = ce.OneHotEncoder(cols=X_train.select_dtypes(include=['object', 'category']).columns, use_cat_names=True, drop_invariant=True, return_df=True)
X_train_encoded = encoder.fit_transform(X_train, y_train)

selection = SelectFromModel(xgb, threshold=best_threshold, prefit=True)
selection.fit(X_train_encoded, y_train)

# Get the names of the selected features
selected_features = X_train_encoded.columns[selection.get_support()]

print("Selected features for the best threshold:")
print(list(selected_features))

Selected features for the best threshold:
['stage', 'player_rating_home_player_1', 'player_rating_home_player_7', 'player_rating_home_player_8', 'player_rating_home_player_9', 'player_rating_home_player_10', 'player_rating_home_player_11', 'player_rating_away_player_1', 'player_rating_away_player_7', 'player_rating_away_player_8', 'player_rating_away_player_9', 'player_rating_away_player_10', 'player_rating_away_player_11', 'ewm_away_team_goals', 'ewm_home_team_goals_conceded', 'points_away', 'avg_home_team_rating', 'avg_away_team_rating', 'home_streak_wins', 'away_streak_wins', 'ewm_shoton_home', 'ewm_shoton_away', 'ewm_possession_home', 'ewm_possession_away', 'diff_player_1', 'diff_player_7', 'diff_player_8', 'diff_player_9', 'diff_player_10', 'diff_player_11', 'rating_range_home', 'rating_range_away', 'num_top_players_home', 'num_top_players_away', 'avg_home_rating_attack_binned_Bin2', 'avg_home_rating_attack_binned_Bin3', 'avg_home_rating_attack_binned_Bin4', 'avg_home_rating_attac