In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
def visualize_data_with_target(df, col_1, col_2, diff_col, target):
    """
    Draw two diagnostic figures relating a pair of features, and a difference feature, to the match result.

    The first figure is a scatter plot of ``col_1`` against ``col_2`` coloured by the
    result label; the second is a box plot of ``diff_col`` grouped by the raw result
    value. The input frame is copied, so ``df`` itself is not modified.

    Parameters:
    - df: DataFrame containing the data.
    - col_1: Column name shown on the x-axis of the scatter plot.
    - col_2: Column name shown on the y-axis of the scatter plot.
    - diff_col: Column name shown on the y-axis of the box plot
      (meant to hold the difference between two features).
    - target: Match results stored on the copy as ``result_match``; a value of 1 is
      labelled 'home_win', any other value 'home_not_win'.
    """
    def _label(outcome):
        # Anything that is not exactly 1 counts as "home did not win".
        if outcome == 1:
            return 'home_win'
        return 'home_not_win'

    plot_frame = df.copy()
    plot_frame['result_match'] = target
    plot_frame['result_match_c'] = plot_frame['result_match'].apply(_label)

    # Figure 1: scatter of the two features, coloured by the textual result label.
    _, scatter_ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        data=plot_frame,
        x=col_1,
        y=col_2,
        hue='result_match_c',
        palette='viridis',
        ax=scatter_ax,
    )
    scatter_ax.set_title(f'{col_1} vs {col_2}')
    scatter_ax.set_xlabel(f'{col_1}')
    scatter_ax.set_ylabel(f'{col_2}')
    scatter_ax.legend(title='Match Result')
    plt.show()

    # Figure 2: distribution of the difference feature per raw result value.
    _, box_ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=plot_frame, x='result_match', y=diff_col, palette='viridis', ax=box_ax)
    box_ax.set_title(f'{diff_col} Across Match Results')
    box_ax.set_xlabel('Match Result')
    box_ax.set_ylabel(f'{diff_col}')
    plt.show()


In [3]:
def evaluate_model(X, y, test_size=0.2, random_state=42, early_stopping_rounds=25):
    """
    Fit an XGBoost classifier on a stratified hold-out split and return its weighted F1.

    Parameters:
    - X: Feature table passed to ``train_test_split`` and to the classifier.
    - y: Target aligned with ``X``; also used to stratify the split.
    - test_size: Fraction of rows held out for validation (default 0.2).
    - random_state: Seed shared by the split and the classifier (default 42).
    - early_stopping_rounds: Number of boosting rounds without improvement on the
      validation set after which training stops (default 25).

    Returns:
    - Weighted F1 score of the predictions on the validation split.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # early_stopping_rounds is given to the constructor: passing it to fit() was
    # deprecated in xgboost 1.6 and removed in 2.0.
    model = XGBClassifier(
        random_state=random_state,
        enable_categorical=True,
        early_stopping_rounds=early_stopping_rounds,
    )

    # The validation pair is listed last because early stopping monitors the last
    # entry of eval_set.
    # NOTE(review): the same validation split drives early stopping and the reported
    # score, so the returned F1 is likely somewhat optimistic.
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0)

    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred, average='weighted')

In [4]:
# Alternative inputs (imputed dataset), kept for reference:
# X = pd.read_csv('../../data/imputed/df.csv')
# y = pd.read_csv('../../data/imputed/y.csv')

# Load the starting feature table and the target; the target is read as int8.
X = pd.read_csv('../../data/start_dataset.csv')
y = pd.read_csv('../../data/y.csv', dtype='int8')

# Score the untouched feature table to get the reference F1.
f1 = evaluate_model(X=X, y=y)
print('Base line f1: {}'.format(f1))

Base line f1: 0.6079095412522771


In [5]:
# Work on an independent (deep) copy so the loaded feature table X stays unchanged.
df_ = X.copy(deep=True)

In [6]:
# away_players_6_11 = [col for col in df_.columns if col.startswith('player_rating_away') and int(col.split('_')[-2]) in range(6, 12)]
# home_players_6_11 = [col for col in df_.columns if col.startswith('player_rating_home') and int(col.split('_')[-2]) in range(6, 12)]
# 
# away_players_1_5 = [col for col in df_.columns if col.startswith('player_rating_away') and int(col.split('_')[-2]) in range(1, 6)]
# home_players_1_5 = [col for col in df_.columns if col.startswith('player_rating_home') and int(col.split('_')[-2]) in range(1, 6)]
# 
# df_['avg_home_rating_attack'] = df_[home_players_6_11].mean(axis=1).round(2)
# df_['avg_away_rating_attack'] = df_[away_players_6_11].mean(axis=1).round(2)
# 
# df_['avg_away_rating_defence'] = df_[away_players_1_5].mean(axis=1).round(2)
# df_['avg_home_rating_defence'] = df_[home_players_1_5].mean(axis=1).round(2)

In [7]:
# Select each side's per-player rating columns by literal substring match on the column name.
home_player_columns = df_.columns[
    df_.columns.str.contains('player_rating_home_player', regex=False)
].tolist()
away_player_columns = df_.columns[
    df_.columns.str.contains('player_rating_away_player', regex=False)
].tolist()

# Row-wise mean across each side's player rating columns.
df_['average_rating_home'] = df_.loc[:, home_player_columns].mean(axis='columns')
df_['average_rating_away'] = df_.loc[:, away_player_columns].mean(axis='columns')

In [8]:
# Top Players Impact: identify if the team has exceptionally high-rated players, threshold: 5% best players
all_player_columns = [col for col in df_.columns if 'player_rating_' in col]
top_5_percent_threshold = df_[all_player_columns].stack().quantile(0.95)

# Count the number of top players in each team
df_['num_top_players_home'] = df_[[col for col in df_.columns if 'player_rating_home_player' in col]].apply(lambda x: (x > top_5_percent_threshold).sum(), axis=1)
df_['num_top_players_away'] = df_[[col for col in df_.columns if 'player_rating_away_player' in col]].apply(lambda x: (x > top_5_percent_threshold).sum(), axis=1)

In [9]:
# Interaction term: avg_home_team_rating multiplied by ewm_shoton_away, divided by 10.
df_['avg_home_team_rating_x_ewm_shoton_away'] = (
    df_['avg_home_team_rating'].mul(df_['ewm_shoton_away']).div(10)
)

In [10]:
# df_['ewm_home_team_goals_conceded_x_ewm_shoton_home'] = df_['ewm_home_team_goals_conceded'] * df_['ewm_shoton_home']
# # 
# bins = [-float('inf')] + df_['ewm_home_team_goals_conceded_x_ewm_shoton_home'].quantile([0.3, 0.5, 0.7, 0.9]).tolist() + [float('inf')]
# labels = [f'Bin{i}' for i in range(1, len(bins))]
# 
# df_[f'ewm_home_team_goals_conceded_x_ewm_shoton_home_home_binned'] = pd.cut(df_['ewm_home_team_goals_conceded_x_ewm_shoton_home'], bins, labels=labels, include_lowest=True)
# df_.drop('ewm_home_team_goals_conceded_x_ewm_shoton_home', axis=1, inplace=True)
# df_[['ewm_home_team_goals_conceded_x_ewm_shoton_home']].describe().round(2)

In [11]:
# Defensive weakness per side: the goals-conceded column divided by the side's average
# player rating, scaled by 100. Dividing by the rating makes the value larger for
# lower-rated sides that concede the same amount.
# Only the home-minus-away difference is stored as a feature; the per-side values are
# kept as plain Series so no temporary columns have to be added and dropped again.
# NOTE(review): no constant is added to the denominator, so a zero average rating
# would yield inf/NaN.
defensive_weakness_home = df_['ewm_home_team_goals_conceded'] / df_['average_rating_home'] * 100
defensive_weakness_away = df_['ewm_away_team_goals_conceded'] / df_['average_rating_away'] * 100

df_['defensive_weakness_diff'] = defensive_weakness_home - defensive_weakness_away

In [12]:
# correlation = df_['ewm_home_team_goals'].corr(df_['average_rating_home'])
# print(f"Correlation coefficient: {correlation}")

In [13]:
# Example usage
# Assuming 'df_' is your DataFrame, 'y' is your target array, and you have specified column names
# visualize_data_with_target(df=df_, 
#                            col_1='average_rating_home', 
#                            col_2='ewm_home_team_goals', 
#                            diff_col='defensive_weakness_diff', 
#                            target=y)

In [14]:
# Interaction term: ewm_possession_home multiplied by ewm_shoton_home, divided by 100.
df_['ewm_possession_home_x_ewm_shoton_home'] = (
    df_['ewm_possession_home'].mul(df_['ewm_shoton_home']).div(100)
)

In [15]:
# Re-score using the frame with the added features, for comparison with the run on X.
# NOTE(review): the printed label still reads 'Base line' although this run uses df_.
f1 = evaluate_model(X=df_, y=y)
print('Base line f1: {}'.format(f1))

Base line f1: 0.6420969905089159


In [16]:
# NOTE(review): these imports sit mid-notebook and json is not used in the visible cells.
import os
import json

output_dir = "../../data/new_features/"

# exist_ok=True creates the directory only when it is missing, replacing the separate
# exists() check (which could race with another process creating the same path).
os.makedirs(output_dir, exist_ok=True)

# Persist the engineered features and the target without writing the index column.
df_.to_csv(os.path.join(output_dir, 'df_.csv'), index=False)
y.to_csv(os.path.join(output_dir, 'y.csv'), index=False)

In [17]:
# Print the column names, non-null counts and dtypes of the engineered frame.
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 46 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   stage                                   3040 non-null   int64  
 1   player_rating_home_player_1             3040 non-null   int64  
 2   player_rating_home_player_2             3040 non-null   int64  
 3   player_rating_home_player_3             3040 non-null   int64  
 4   player_rating_home_player_4             3040 non-null   int64  
 5   player_rating_home_player_5             3040 non-null   int64  
 6   player_rating_home_player_6             3040 non-null   int64  
 7   player_rating_home_player_7             3040 non-null   int64  
 8   player_rating_home_player_8             3040 non-null   int64  
 9   player_rating_home_player_9             3040 non-null   int64  
 10  player_rating_home_player_10            3040 non-null   int6

In [18]:
# Descriptive statistics for every non-categorical column, rounded to two decimals.
non_categorical = df_.select_dtypes(exclude=['category'])
non_categorical.describe().round(2)

Unnamed: 0,stage,player_rating_home_player_1,player_rating_home_player_2,player_rating_home_player_3,player_rating_home_player_4,player_rating_home_player_5,player_rating_home_player_6,player_rating_home_player_7,player_rating_home_player_8,player_rating_home_player_9,...,ewm_shoton_away,ewm_possession_home,ewm_possession_away,average_rating_home,average_rating_away,num_top_players_home,num_top_players_away,avg_home_team_rating_x_ewm_shoton_away,defensive_weakness_diff,ewm_possession_home_x_ewm_shoton_home
count,3040.0,3040.0,3040.0,3040.0,3040.0,3040.0,3040.0,3040.0,3040.0,3040.0,...,3022.0,3024.0,3022.0,3040.0,3040.0,3040.0,3040.0,3022.0,3016.0,3024.0
mean,19.5,76.05,76.05,76.05,76.05,76.05,76.05,75.88,75.92,76.79,...,5.88,48.88,48.94,76.34,76.16,0.42,0.38,44.84,0.1,2.86
std,10.97,4.83,4.83,4.83,4.83,4.83,4.83,4.87,5.13,5.37,...,1.33,3.91,3.73,4.11,4.1,1.22,1.06,10.35,1.12,0.61
min,1.0,46.0,46.0,46.0,46.0,46.0,46.0,45.0,45.0,51.0,...,0.51,17.26,21.83,56.55,56.55,0.0,0.0,3.78,-4.97,0.23
25%,10.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,72.75,73.0,...,5.0,47.08,47.15,73.55,73.36,0.0,0.0,37.98,-0.63,2.52
50%,19.5,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.5,...,5.68,49.17,49.2,75.73,75.64,0.0,0.0,43.46,0.08,2.83
75%,29.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,80.0,...,6.82,51.2,51.27,79.36,79.27,0.0,0.0,51.74,0.82,3.28
max,38.0,91.0,91.0,91.0,91.0,91.0,91.0,89.0,88.0,91.0,...,12.0,69.0,64.0,88.0,87.45,9.0,9.0,92.51,5.03,5.2


In [19]:
# NOTE(review): bare tuple of column names; it is not assigned to anything, so running the
# cell only echoes it. The recorded output lists a different set of names, so it looks
# stale. Confirm whether this cell is still needed.
'avg_home_team_rating_x_ewm_shoton_away', 'average_rating_away', 'average_rating_home', 'ewm_possession_home', 'avg_home_team_rating', 'avg_away_team_rating', 'points_home', 'points_away'

('points_home',
 'points_away',
 'avg_home_team_rating',
 'avg_away_team_rating',
 'home_streak_wins',
 'away_streak_wins')