In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import warnings

warnings.filterwarnings('ignore')

In [2]:
def train_and_evaluate(X_train, y_train, X_val, y_val, random_state=42, enable_categorical=True):
    """
    Fit a fresh XGBClassifier on the training split and score it on the validation split.

    Parameters:
    X_train (DataFrame): Features used to fit the model
    y_train (Series): Targets used to fit the model
    X_val (DataFrame): Features the fitted model predicts on
    y_val (Series): True targets the predictions are compared against
    random_state (int): Seed forwarded to XGBClassifier
    enable_categorical (bool): Forwarded to XGBClassifier's ``enable_categorical`` option

    Returns:
    float: Weighted-average F1 score of the predictions on the validation data
    """
    # A new estimator is built on every call, so repeated calls are independent.
    classifier = XGBClassifier(random_state=random_state, enable_categorical=enable_categorical)
    classifier.fit(X_train, y_train)

    # Score the hold-out predictions; 'weighted' averages per-class F1 by class support.
    val_predictions = classifier.predict(X_val)
    return f1_score(y_val, val_predictions, average='weighted')

In [3]:
# Alternative input (files under data/imputed), currently disabled:
# X = pd.read_csv('../../data/imputed/df.csv')
# y = pd.read_csv('../../data/imputed/y.csv')

# Load the starting feature table and the target table.
X = pd.read_csv('../../data/start_dataset.csv')
y = pd.read_csv('../../data/y.csv')

# Stratified 80/20 hold-out split with a fixed seed so the score is reproducible.
X_train_full, X_val_full, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Reference score on the unmodified features, for comparison with later feature sets.
f1 = train_and_evaluate(X_train_full, y_train, X_val_full, y_val)
print(f'Base line f1: {f1}')

Base line f1: 0.47483131364816905


In [4]:
# Work on a copy so the feature-engineering cells below add columns to df_
# without modifying X, which was used for the baseline score.
df_ = X.copy()

In [5]:
# Line-up slots 1..11; each slot has one home and one away rating column.
player_positions = range(1, 12)

# For every slot, add the home-minus-away rating gap as a new feature.
# The source rating columns are kept because later cells still read them.
for position in player_positions:
    home_player_col = f'player_rating_home_player_{position}'
    away_player_col = f'player_rating_away_player_{position}'

    df_[f'diff_player_{position}'] = df_[home_player_col].sub(df_[away_player_col])

# Show the names of the newly created difference columns.
df_.filter(like='diff_player').columns

Index(['diff_player_1', 'diff_player_2', 'diff_player_3', 'diff_player_4',
       'diff_player_5', 'diff_player_6', 'diff_player_7', 'diff_player_8',
       'diff_player_9', 'diff_player_10', 'diff_player_11'],
      dtype='object')

In [6]:
# Collect the per-player rating columns of each side by name.
home_player_rating_columns = [name for name in df_.columns if 'player_rating_home_player' in name]
away_player_rating_columns = [name for name in df_.columns if 'player_rating_away_player' in name]

# Rating spread per row: best player rating minus worst player rating, per side.
home_ratings = df_[home_player_rating_columns]
df_['rating_range_home'] = home_ratings.max(axis=1) - home_ratings.min(axis=1)

away_ratings = df_[away_player_rating_columns]
df_['rating_range_away'] = away_ratings.max(axis=1) - away_ratings.min(axis=1)

In [7]:
def _rating_columns(columns, side, positions):
    """Return the ``player_rating_<side>...`` column names whose trailing number is in ``positions``.

    Parameters:
    columns (iterable of str): Candidate column names
    side (str): 'home' or 'away'
    positions (range): Player slot numbers to keep

    Returns:
    list: Matching column names, in their original order
    """
    prefix = f'player_rating_{side}'
    # The slot number is the last '_'-separated token; it is only parsed for
    # columns that already match the prefix.
    return [
        col for col in columns
        if col.startswith(prefix) and int(col.split('_')[-1]) in positions
    ]


# Slot grouping used for the features below.
# NOTE(review): assumes slots 1-5 are the defensive players and 6-11 the
# attacking ones -- confirm against how the line-up slots are ordered.
DEFENCE_POSITIONS = range(1, 6)
ATTACK_POSITIONS = range(6, 12)

away_players_6_11 = _rating_columns(df_.columns, 'away', ATTACK_POSITIONS)
home_players_6_11 = _rating_columns(df_.columns, 'home', ATTACK_POSITIONS)

away_players_1_5 = _rating_columns(df_.columns, 'away', DEFENCE_POSITIONS)
home_players_1_5 = _rating_columns(df_.columns, 'home', DEFENCE_POSITIONS)

# Mean rating per side and group, rounded to 2 decimals.
# Fix: the attack averages were previously swapped (the home feature was
# computed from the away columns and vice versa); each feature now reads the
# columns of its own side, like the defence features already did.
df_['avg_home_rating_attack'] = df_[home_players_6_11].mean(axis=1).round(2)
df_['avg_away_rating_attack'] = df_[away_players_6_11].mean(axis=1).round(2)

df_['avg_away_rating_defence'] = df_[away_players_1_5].mean(axis=1).round(2)
df_['avg_home_rating_defence'] = df_[home_players_1_5].mean(axis=1).round(2)

In [8]:
# All per-player rating columns of each side (slots are not filtered here).
home_player_columns = [name for name in df_.columns if 'player_rating_home_player' in name]
away_player_columns = [name for name in df_.columns if 'player_rating_away_player' in name]

# Row-wise mean rating of each side's players; unlike the attack/defence
# averages, these values are not rounded.
df_['average_rating_home'] = df_.loc[:, home_player_columns].mean(axis=1)
df_['average_rating_away'] = df_.loc[:, away_player_columns].mean(axis=1)

In [9]:
# Top-player impact: count, per side, how many players are rated above the
# 90th percentile of all individual player ratings (i.e. the top 10%).
TOP_PLAYER_QUANTILE = 0.9

# Pool every home and away player rating into one long Series to get a single
# cut-off. NOTE(review): the cut-off is computed over all rows of df_, including
# the rows later held out for validation -- confirm this is acceptable.
all_player_columns = [col for col in df_.columns if 'player_rating_' in col]
top_player_threshold = df_[all_player_columns].stack().quantile(TOP_PLAYER_QUANTILE)

# Legacy name (misleading: the cut-off is the top 10%, not 5%); kept as an
# alias so any existing reference to it keeps working.
top_5_percent_threshold = top_player_threshold

home_top_columns = [col for col in df_.columns if 'player_rating_home_player' in col]
away_top_columns = [col for col in df_.columns if 'player_rating_away_player' in col]

# Vectorized count of ratings strictly above the cut-off in each row; this
# replaces a row-wise apply(lambda ...) and yields the same counts.
df_['num_top_players_home'] = (df_[home_top_columns] > top_player_threshold).sum(axis=1)
df_['num_top_players_away'] = (df_[away_top_columns] > top_player_threshold).sum(axis=1)

In [10]:
# Split the feature-engineered frame (df_) with the same settings as the
# baseline split: 80/20, stratified on y, fixed seed.
X_train_full, X_val_full, y_train, y_val = train_test_split(
    df_,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Score the model on the extended feature set.
# Note: the printed label still reads "Base line" although this is the score
# with the engineered features included.
f1 = train_and_evaluate(X_train_full, y_train, X_val_full, y_val)
print(f'Base line f1: {f1}')

Base line f1: 0.4869535539215686


In [11]:
import os
import json

# Destination folder for the engineered feature table and its targets.
output_dir = "../../data/new_features/"

# exist_ok=True creates the folder (and parents) if missing and is a no-op if
# it already exists, avoiding the separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)

# Write both tables without the index column; the resulting paths are the same
# as with plain string concatenation.
df_.to_csv(os.path.join(output_dir, 'df_.csv'), index=False)
y.to_csv(os.path.join(output_dir, 'y.csv'), index=False)