In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
import dtreeviz

from tqdm import tqdm
import warnings
from pprint import pprint

warnings.filterwarnings("ignore")

In [12]:
all_shots = pd.read_csv("./data/all_shots.csv")
laliga_shots = pd.read_csv("./data/laliga_shots.csv")
bundesliga_shots = pd.read_csv("./data/bundesliga_shots.csv")
pl_shots = pd.read_csv("./data/PL_shots.csv")

In [13]:
all_leagues = [all_shots, laliga_shots, bundesliga_shots, pl_shots]

for league in all_leagues:
    league.drop(['period', 'minute', 'possession', 'duration', 'season_id', 'match_id', 'timestamp', 'team'], axis=1, inplace=True)
    # drop redundant features
    if("location_x" in league.columns):
        league.drop(["location_x", "location_y"], axis=1, inplace=True)
    # drop features that leak target information
    if("shot_angle" in league.columns):
        league.drop(["shot_angle"], axis=1, inplace=True)
    # drop low information features
    league.drop(["aerial_won", "follows_dribble", "num_passes", "pass_duration", "pass_height", "pass_type", "type"], axis=1, inplace=True)
    # "position", "body_part", "type", "pass_height", "pass_type"

In [14]:
encoded_data = all_shots
for column_name in ["technique", "body_part", "play_pattern", "position"]:
    one_hot_encoded = pd.get_dummies(encoded_data[column_name], prefix=column_name)
    encoded_data = encoded_data.drop(column_name, axis=1)
    encoded_data = pd.concat([encoded_data, one_hot_encoded], axis=1)
all_shots = encoded_data

all_shots = all_shots.dropna()
all_shots.replace([np.inf, -np.inf], 0, inplace=True)

encoded_data = laliga_shots
for column_name in ["technique", "body_part", "play_pattern", "position"]:
    one_hot_encoded = pd.get_dummies(encoded_data[column_name], prefix=column_name)
    encoded_data = encoded_data.drop(column_name, axis=1)
    encoded_data = pd.concat([encoded_data, one_hot_encoded], axis=1)
laliga_shots = encoded_data

laliga_shots = laliga_shots.dropna()
laliga_shots.replace([np.inf, -np.inf], 0, inplace=True)

encoded_data = bundesliga_shots
for column_name in ["technique", "body_part", "play_pattern", "position"]:
    one_hot_encoded = pd.get_dummies(encoded_data[column_name], prefix=column_name)
    encoded_data = encoded_data.drop(column_name, axis=1)
    encoded_data = pd.concat([encoded_data, one_hot_encoded], axis=1)
bundesliga_shots = encoded_data

bundesliga_shots = bundesliga_shots.dropna()
bundesliga_shots.replace([np.inf, -np.inf], 0, inplace=True)

encoded_data = pl_shots
for column_name in ["technique", "body_part", "play_pattern", "position"]:
    one_hot_encoded = pd.get_dummies(encoded_data[column_name], prefix=column_name)
    encoded_data = encoded_data.drop(column_name, axis=1)
    encoded_data = pd.concat([encoded_data, one_hot_encoded], axis=1)
pl_shots = encoded_data

pl_shots = pl_shots.dropna()
pl_shots.replace([np.inf, -np.inf], 0, inplace=True)


In [15]:
# all_shots_y1 = all_shots["statsbomb_xg"]
# all_shots_y2 = all_shots["is_goal"]
# all_shots_X = all_shots.drop(["statsbomb_xg", "is_goal", "end_location_x", "end_location_y"], axis=1)

# laliga_y1 =laliga_shots["statsbomb_xg"]
# laliga_y2 =laliga_shots["is_goal"]
# laliga_X = laliga_shots.drop(["statsbomb_xg", "is_goal", "end_location_x", "end_location_y"], axis=1)

# bundesliga_y1 = bundesliga_shots["statsbomb_xg"]
# bundesliga_y2 = bundesliga_shots["is_goal"]
# bundesliga_X =  bundesliga_shots.drop(["statsbomb_xg", "is_goal", "end_location_x", "end_location_y"], axis=1)

# pl_y1 = pl_shots["statsbomb_xg"]
# pl_y2 = pl_shots["is_goal"]
# pl_X =  pl_shots.drop(["statsbomb_xg", "is_goal", "end_location_x", "end_location_y"], axis=1)

In [16]:
def calculate_stats(results_test, results_pred, name):
    # cm = confusion_matrix(results_test, results_pred)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Not a goal", "Goal"])
    # disp.plot(cmap=plt.cm.Blues)
    # plt.title(f'Confusion Matrix ({name})')
    # plt.show()

    # print(f"\n{name}")
    # accuracy = accuracy_score(results_test, results_pred)
    # precision = precision_score(results_test, results_pred)
    # recall = recall_score(results_test, results_pred)
    f1 = f1_score(results_test, results_pred)
    # print("Accuracy:", accuracy)
    # print("Precision:", precision)
    # print("Recall:", recall)
    # print("F1:", f1)
    
    return f1

In [17]:
def decision_tree(X_train, y_train, X_test, y_test):
    dt = DecisionTreeClassifier(criterion="gini", max_depth=12, max_leaf_nodes=50, min_samples_split=100)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)

    f1 = calculate_stats(y_test, y_pred, "Decision Tree")

    # viz_model = dtreeviz.model(dt,
    #                        X_train=X_train, y_train=y_train,
    #                        feature_names=X.columns, target_name="is_goal", class_names=["Not a goal", "Goal"])

    return dt.predict_proba(X_test), f1

In [46]:
def preprocess_classifier(X, y):

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

   # X_train, y_train = SMOTE(sampling_strategy=0.30).fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test

In [19]:
def preprocess_classifier_2(X, y):

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    #X, y = SMOTE(sampling_strategy=0.3).fit_resample(X, y)

    return X, y

In [20]:
# remove columns that are not present in all datasets

all_columns = all_shots.columns
laliga_columns = laliga_shots.columns
bundesliga_columns = bundesliga_shots.columns
pl_columns = pl_shots.columns

common_columns = list(set(all_columns) & set(laliga_columns) & set(bundesliga_columns) & set(pl_columns))

all_shots = all_shots[common_columns]
laliga_shots = laliga_shots[common_columns]
bundesliga_shots= bundesliga_shots[common_columns]
pl_shots = pl_shots[common_columns]

In [27]:
# function which takes in two league datasets, trains on first and tests on second, repeats 100 times, returns all f1 scores. if size of training set is > than test, randomly sample the train set to have 7831 points every time. if the train and test leagues are the same, just use preprocess_classifier instead of preprocess_classifier_2. 

def train_test_leagues(league1, league2, all = False):
    f1_scores = []
    X_test, y_test = preprocess_classifier_2(league2.drop(["statsbomb_xg", "is_goal", "end_location_x", "end_location_y"], axis=1), league2["is_goal"])
    for i in tqdm(range(100)):
        if all == False:
            league1_reduced = league1.sample(n=7831)
        else:
            league1_reduced = league1
        league1_X = league1_reduced.drop(["statsbomb_xg", "is_goal", "end_location_x", "end_location_y"], axis=1)
        league1_y2 = league1_reduced["is_goal"]
        
        X_train, y_train = preprocess_classifier_2(league1_X, league1_y2)
        
        _, f1 = decision_tree(X_train, y_train, X_test, y_test)
        f1_scores.append(f1)

    return f1_scores

In [55]:
#use the train_test_leagues function to train on bundesliga and test on laliga

f1_scores_bundesliga_laliga = train_test_leagues(bundesliga_shots, laliga_shots)
f1_scores_laliga_bundesliga = train_test_leagues(laliga_shots, bundesliga_shots)
f1_scores_all_laliga = train_test_leagues(all_shots, laliga_shots, all = True)
f1_scores_all_bundesliga = train_test_leagues(all_shots, bundesliga_shots, all = True)

100%|██████████| 100/100 [00:11<00:00,  8.65it/s]
100%|██████████| 100/100 [00:11<00:00,  8.40it/s]
100%|██████████| 100/100 [01:38<00:00,  1.02it/s]
100%|██████████| 100/100 [01:41<00:00,  1.02s/it]


In [56]:
# t test on the f1 scores of the two leagues

from scipy.stats import ttest_ind

t, p = ttest_ind(f1_scores_bundesliga_laliga, f1_scores_laliga_bundesliga)

print("t:", t)
print("p:", p)


t: -9.165824200023996
p: 6.373189289484359e-17


In [60]:
# perform permutation importance on the features of the decision tree

from sklearn.inspection import permutation_importance

X_train, X_test, y_train, y_test = preprocess_classifier(all_shots.drop(["statsbomb_xg", "is_goal", "end_location_x", "end_location_y"], axis=1), all_shots["is_goal"])

dt = DecisionTreeClassifier(criterion="gini", max_depth=12, max_leaf_nodes=50, min_samples_split=100)
dt.fit(X_train, y_train)

result = permutation_importance(dt, X_test, y_test, n_repeats=50, random_state=42, n_jobs=2)

sorted_idx = result.importances_mean.argsort()



In [37]:
# perform permutation importance on laliga shots and bundesliga shots, compare the results

X_train, X_test, y_train, y_test = preprocess_classifier(laliga_X, laliga_y2)

dt = DecisionTreeClassifier(criterion="gini", max_depth=12, max_leaf_nodes=50, min_samples_split=100)
dt.fit(X_train, y_train)

result_laliga = permutation_importance(dt, X_test, y_test, n_repeats=50, random_state=42, n_jobs=2)


X_train, X_test, y_train, y_test = preprocess_classifier(bundesliga_X, bundesliga_y2)
dt.fit(X_train, y_train)

result_bundesliga = permutation_importance(dt, X_test, y_test, n_repeats=50, random_state=42, n_jobs=2)



In [38]:
# list the top 10 most important features for each dataset

sorted_idx = result.importances_mean.argsort()
sorted_idx_bundesliga = result_bundesliga.importances_mean.argsort()
sorted_idx_laliga = result_laliga.importances_mean.argsort()

print("All shots:")
for i in sorted_idx[-15:]:
    print(f"{all_shots_X.columns[i]}: {result.importances_mean[i]}")

print("\nLaliga:")
for i in sorted_idx_laliga[-15:]:
    print(f"{laliga_X.columns[i]}: {result_laliga.importances_mean[i]}")
    
print("\nBundesliga:")
for i in sorted_idx_bundesliga[-15:]:
    print(f"{bundesliga_X.columns[i]}: {result_bundesliga.importances_mean[i]}")

All shots:
position_Right Center Back: 0.0
play_pattern_From Throw In: 0.0
technique_Half Volley: 0.0
position_Left Defensive Midfield: 0.0
play_pattern_From Corner: 0.0
position_Left Center Back: 0.0
position_Left Center Midfield: 0.0
position_Left Wing Back: 0.0013617021276596008
technique_Normal: 0.0022723404255319333
is_header: 0.00814468085106386
body_part_Head: 0.0105617021276596
shooting_range: 0.011038297872340447
defenders_triangle: 0.012127659574468102
distance_to_goalie: 0.021761702127659602
goal_distance: 0.05464680851063831

Laliga:
distance_to_goalie: 0.00024680851063829445
position_Left Wing Back: 0.000297872340425529
position_Right Wing Back: 0.0005446808510638302
position_Center Attacking Midfield: 0.0005531914893617085
defenders_3m_radius: 0.0008340425531914986
position_Right Back: 0.0009106382978723504
play_pattern_From Goal Kick: 0.0010042553191489457
position_Left Defensive Midfield: 0.0018042553191489551
pass_angle: 0.0019148936170212893
goalkeeper_x: 0.0027574468

In [39]:
# out of the top 10 features for each dataset, which aren't common?

all_shots_features = [all_shots_X.columns[i] for i in sorted_idx[-6:]]
laliga_features = [laliga_X.columns[i] for i in sorted_idx_laliga[-10:]]
bundesliga_features = [bundesliga_X.columns[i] for i in sorted_idx_bundesliga[-9:]]

common_features = list(set(all_shots_features) & set(laliga_features) & set(bundesliga_features))

print("Common features:", common_features)

print("\nin laliga but not in bundesliga:", list(set(laliga_features) - set(bundesliga_features)))

print("\nin bundesliga but not in laliga:", list(set(bundesliga_features) - set(laliga_features)))

print("\nin all shots but not in laliga or bundesliga:", list(set(all_shots_features) - set(laliga_features) - set(bundesliga_features)))

Common features: ['goal_distance', 'defenders_triangle', 'shooting_range']

in laliga but not in bundesliga: ['position_Right Back', 'play_pattern_From Goal Kick', 'goalkeeper_x', 'position_Left Defensive Midfield', 'pass_angle', 'is_header']

in bundesliga but not in laliga: ['defenders_3m_radius', 'pass_length', 'location_x_distance', 'position_Center Defensive Midfield', 'distance_to_goalie']

in all shots but not in laliga or bundesliga: ['body_part_Head']
