In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from scipy.stats import chi2_contingency

from tqdm import tqdm
import warnings
from pprint import pprint

In [34]:
# Location of the augmented shots dataset, relative to the notebook
DATA_PATH = "./data/augmented_data.csv"
data = pd.read_csv(DATA_PATH)

In [35]:
# Count the distinct values in each column of the freshly loaded frame
data.nunique()

period                     5
minute                   139
possession               290
duration               78058
play_pattern               9
position                  25
location_x               635
location_x_distance      635
location_y               702
location_y_distance      472
technique                  7
body_part                  4
type                       5
is_penalty                 2
first_time                 2
open_goal                  2
one_on_one                 2
aerial_won                 2
follows_dribble            2
under_pressure             2
pass_duration          11912
pass_angle             10519
pass_type                  8
pass_height                4
pass_length             9930
num_passes                13
defenders_3m_radius       10
defenders_triangle        13
goalkeeper_x             258
goalkeeper_y             355
distance_to_goalie     45564
statsbomb_xg           82551
end_location_x           376
end_location_y           705
is_goal       

In [36]:
# Remove every unwanted column in one non-inplace drop, so `data` is either
# fully reduced or (on a KeyError for a missing label) left untouched.
data = data.drop(
    columns=[
        # redundant features
        # (presumably superseded by location_x_distance / location_y_distance -- TODO confirm)
        "location_x",
        "location_y",
        # features that leak target information
        "shot_angle",
        "duration",
        # low information features
        "aerial_won",
        "follows_dribble",
        "possession",
        "minute",
        "period",
        "num_passes",
        "pass_duration",
        "pass_height",
        "pass_type",
        "type",
    ]
)
# NOTE: of the categorical columns "position", "body_part", "type", "pass_height"
# and "pass_type", only "position" and "body_part" survive this drop.

In [37]:
# One-hot encode the categorical columns: each listed column is removed and
# replaced by "<column>_<level>" indicator columns appended after the
# remaining columns, in the order the columns are listed here.
encoded_data = pd.get_dummies(
    data,
    columns=["play_pattern", "technique", "position", "body_part"],
)
data = encoded_data

In [38]:
# Re-check the distinct-value counts now that the categorical columns are one-hot encoded
data.nunique()

location_x_distance           635
location_y_distance           472
is_penalty                      2
first_time                      2
open_goal                       2
                             ... 
position_Secondary Striker      2
body_part_Head                  2
body_part_Left Foot             2
body_part_Other                 2
body_part_Right Foot            2
Length: 64, dtype: int64

In [39]:
# Discard rows containing missing values, then substitute 0 for any +/- infinity
data = data.dropna().replace([np.inf, -np.inf], 0)

In [40]:
# Targets: the `statsbomb_xg` column (y1) and the `is_goal` column (y2)
y1, y2 = data["statsbomb_xg"], data["is_goal"]
# Features: every remaining column except the two targets and the shot end-location columns
X = data.drop(
    columns=["statsbomb_xg", "is_goal", "end_location_x", "end_location_y"]
)

In [41]:
# Summarize the feature matrix: index range, per-column non-null counts and dtypes
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84065 entries, 0 to 84064
Data columns (total 60 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   location_x_distance                 84065 non-null  float64
 1   location_y_distance                 84065 non-null  float64
 2   is_penalty                          84065 non-null  bool   
 3   first_time                          84065 non-null  bool   
 4   open_goal                           84065 non-null  bool   
 5   one_on_one                          84065 non-null  bool   
 6   under_pressure                      84065 non-null  bool   
 7   pass_angle                          84065 non-null  float64
 8   pass_length                         84065 non-null  float64
 9   defenders_3m_radius                 84065 non-null  int64  
 10  defenders_triangle                  84065 non-null  int64  
 11  goalkeeper_x                        84065

In [57]:
# Chi-squared test of independence between the `good_foot` feature and the
# goal label `y2`, computed from their cross-tabulated counts.
contingency_table = pd.crosstab(X['good_foot'], y2)
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Output results
for label, value in (
    ("Chi-squared statistic:", chi2),
    ("P-value:", p),
    ("Degrees of freedom:", dof),
):
    print(label, value)
print("Expected frequencies table:", expected, sep="\n")

Chi-squared statistic: 3.7915104186016064
P-value: 0.05151314423328219
Degrees of freedom: 1
Expected frequencies table:
[[44628.49837626  5575.50162374]
 [30100.50162374  3760.49837626]]
