In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

import statsmodels.api as sm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
import dtreeviz

from tqdm import tqdm
import warnings
from pprint import pprint

warnings.filterwarnings("ignore")

In [39]:
data = pd.read_csv("./data/augmented_data.csv")

In [40]:
data.nunique()

period                     5
minute                   139
second                    60
possession               290
duration               77925
competition_id            17
season_id                 41
match_id                3312
timestamp              82726
team                     288
player                  5638
freeze_frame           82866
play_pattern               9
position                  25
player_type                5
location_x               635
location_x_distance      635
location_y               701
location_y_distance      471
technique                  7
body_part                  4
type                       5
is_penalty                 2
is_header                  2
first_time                 2
open_goal                  2
one_on_one                 2
aerial_won                 2
follows_dribble            2
under_pressure             2
pass_duration          11897
pass_angle             10504
pass_type                  8
pass_height                4
pass_length   

In [41]:
# Use mutual_info classification to check which features yield the most information about the target variable
data_copy = data.copy()
X = data_copy.drop(["statsbomb_xg", "is_goal", "end_location_x", "end_location_y", "shot_angle", "freeze_frame", "duration", "competition_id", "season_id", "match_id", "player", "timestamp", "team", "xg_so_far", "location_x", "location_y", "game_state", "body_part", "defenders_3m_radius", "under_pressure", "goal_distance", "play_pattern", "technique", "position", "body_part", "type", "pass_type", "pass_height"], axis=1)
mutual_info = mutual_info_classif(X, data["is_goal"])
mutual_info = pd.Series(mutual_info)
mutual_info.index = X.columns
mutual_info.sort_values(ascending=False)

distance_to_goalie     0.059322
shooting_range         0.052242
best_distance          0.049532
defenders_triangle     0.040078
open_range             0.038554
location_x_distance    0.032878
goalkeeper_x           0.027654
location_y_distance    0.022337
goalkeeper_y           0.021333
is_penalty             0.015125
own_past_minute        0.013455
open_goal              0.012256
pass_duration          0.011296
pass_length            0.010907
num_passes             0.008559
period                 0.007791
good_foot              0.007328
past_minute            0.006612
player_type            0.006551
first_time             0.005403
was_leading            0.004891
own_past_15            0.004184
minute                 0.004090
one_on_one             0.003403
pass_angle             0.003048
shots_so_far           0.002741
past_15                0.002523
possession             0.002064
aerial_won             0.001701
is_header              0.000534
second                 0.000142
follows_

In [42]:
# drop redundant features
data.drop(["location_x", "location_y", "game_state", "body_part", "defenders_3m_radius", "under_pressure", "player_type", "best_distance"], axis=1, inplace=True)
# drop features that leak target information
data.drop(["end_location_x", "end_location_y", "shot_angle", "duration", "competition_id", "season_id", "match_id", "timestamp", "team", "player", "freeze_frame", "xg_so_far"], axis=1, inplace=True)
# drop low information features
data.drop(["period", "minute", "second", "possession", "aerial_won", "follows_dribble", "num_passes", "pass_length", "pass_duration", "pass_height", "pass_type", "pass_angle", "shots_so_far", "past_minute", "past_15", "own_past_minute", "own_past_15"], axis=1, inplace=True)
# drop penalties
data = data[data["is_penalty"] == False]

In [43]:
data.nunique()

play_pattern               9
position                  25
location_x_distance      635
location_y_distance      471
technique                  7
type                       4
is_penalty                 1
is_header                  2
first_time                 2
open_goal                  2
one_on_one                 2
defenders_triangle        12
goalkeeper_x             258
goalkeeper_y             355
distance_to_goalie     45441
shooting_range         43788
open_range                12
goal_distance          36856
statsbomb_xg           82416
is_goal                    2
good_foot                  2
was_leading                3
is_extra_time              2
dtype: int64

In [44]:
# one-hot encode the categorical data
encoded_data = data
for column_name in ["play_pattern", "technique", "type", "position"]:
    one_hot_encoded = pd.get_dummies(encoded_data[column_name], prefix=column_name)
    encoded_data = encoded_data.drop(column_name, axis=1)
    encoded_data = pd.concat([encoded_data, one_hot_encoded], axis=1)
data = encoded_data

In [45]:
data = data.dropna()
data.replace([np.inf, -np.inf], 0, inplace=True)

In [46]:
y1 = data["statsbomb_xg"]
y2 = data["is_goal"]
X = data.drop(["statsbomb_xg", "is_goal"], axis=1)
X = X[["location_x", "location_y_distance"]]

KeyError: "['location_x'] not in index"

In [None]:
def preprocess_continuous(X, y):

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

    return X_train, X_test, y_train, y_test

In [None]:
def preprocess_classifier(X, y):

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

    X_train, y_train = SMOTE(sampling_strategy=0.2).fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test

In [None]:
def calculate_errors(results_test, results_pred, name):
    mae = mean_absolute_error(results_test, results_pred)
    rmse = root_mean_squared_error(results_test, results_pred)
    r2 = r2_score(results_test, results_pred)
    print(f"\n{name}")
    print("Mean Absolute Error:", mae)
    print("Root Mean Squared Error:", rmse)
    print("R-squared (R2 Score):", r2)

In [None]:
def calculate_stats(results_test, results_pred, name):
    cm = confusion_matrix(results_test, results_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Not a goal", "Goal"])
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix ({name})')
    plt.show()

    print(f"\n{name}")
    accuracy = accuracy_score(results_test, results_pred)
    precision = precision_score(results_test, results_pred)
    recall = recall_score(results_test, results_pred)
    f1 = f1_score(results_test, results_pred)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1)

In [None]:
def linear_regression():
    linear_r = LinearRegression()
    linear_r.fit(X_train, y_train)
    y_pred = linear_r.predict(X_test)

    calculate_errors(y_test, y_pred, "Linear Regression")

    return y_pred

In [None]:
X_train, X_test, y_train, y_test = preprocess_continuous(X, y1)
lr_xg = linear_regression()

In [None]:
gam_model = sm.GLM(y_train, X_train, family=sm.families.Gaussian())  # Assuming Gaussian family for continuous outcome
gam_result = gam_model.fit()
print(gam_result.summary())

In [None]:
y_pred = gam_result.predict(X_test)
calculate_errors(y_test, y_pred, "GAM")

In [None]:
feature_values = np.linspace(X_test[25].min(), X_test[25].max(), num=50)

# Function to compute partial dependence
def compute_partial_dependence(model, feature_index, feature_values, other_features):
    X_pred = other_features.copy()
    results = []
    for value in feature_values:
        X_pred[feature_index] = value
        prediction = model.predict(X_pred)
        results.append(prediction.mean())  # Use mean prediction if multiple samples are used
    return results

# Compute partial dependence for the feature of interest
pdp_results = compute_partial_dependence(gam_result, 25, feature_values, X_test)

# Plot the partial dependence curve
plt.figure(figsize=(8, 6))
plt.plot(feature_values, pdp_results, label="0")
plt.xlabel("0")
plt.ylabel('Partial Dependence')
plt.title(f'Partial Dependence Plot for {"0"}')
plt.legend()
plt.grid(True)
plt.show()