In [43]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from tqdm import tqdm
import warnings

from statsbombpy import sb

In [None]:
# Download every event of every match in StatsBomb's open data and store them
# in a single dataframe called all_events.
# NOTE: this silences *all* warnings for the rest of the kernel session.
warnings.filterwarnings("ignore")

# Collect the per-match frames in a list and concatenate once at the end;
# calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration (quadratic in the number of matches).
match_event_frames = []
for _, season in sb.competitions().iterrows():
    season_matches = sb.matches(
        competition_id=season["competition_id"],
        season_id=season["season_id"],
    )
    # Iterating the match_id column gives tqdm a known total for its progress bar.
    for match_id in tqdm(season_matches["match_id"]):
        match_event_frames.append(
            sb.events(match_id=match_id, split=False, flatten_attrs=True)
        )
    # Remove this line to scrape all seasons instead of just one
    break

# pd.concat raises on an empty list, so fall back to an empty frame when
# nothing was downloaded.
all_events = (
    pd.concat(match_event_frames, ignore_index=True)
    if match_event_frames
    else pd.DataFrame()
)

4it [00:03,  1.13it/s]

In [None]:
# List every column of the flattened events frame, one name per line.
# Iterating a DataFrame directly yields its column labels.
for column_label in all_events:
    print(column_label)

In [None]:
# Select the shot events: keep only the rows whose "shot_type" value is present.
all_shots = all_events.dropna(subset=["shot_type"])

In [None]:
# Build the modelling table from all_shots: one row per shot, holding the
# attributes used as features plus "statsbomb_xg", which later cells use as the
# target. The frame is created in one call and keeps the index of all_shots.
data = pd.DataFrame(
    {
        "period": all_shots["period"],
        "minute": all_shots["minute"],
        "possession": all_shots["possession"],
        "play_pattern": all_shots["play_pattern"],
        "position": all_shots["position"],
        # "location" holds a sequence per shot; elements 0 and 1 become the
        # x and y features.
        "location_x": all_shots["location"].apply(lambda xy: xy[0]),
        "location_y": all_shots["location"].apply(lambda xy: xy[1]),
        "duration": all_shots["duration"],
        "technique": all_shots["shot_technique"],
        "body_part": all_shots["shot_body_part"],
        "type": all_shots["shot_type"],
        # The flag columns are missing (NaN) wherever the flag is not set.
        # Comparing with True maps NaN to False and always yields a genuine
        # bool column; fillna(False) on an object column depends on silent
        # downcasting, which pandas 2.2 deprecates.
        "first_time": all_shots["shot_first_time"].eq(True),
        "open_goal": all_shots["shot_open_goal"].eq(True),
        "one_on_one": all_shots["shot_one_on_one"].eq(True),
        "statsbomb_xg": all_shots["shot_statsbomb_xg"],
    }
)

In [None]:
# Display the shot table assembled from all_shots (notebook rich output).
data

In [None]:
# One-hot encode the categorical and boolean features in a single call.
# Each listed column is removed and replaced by indicator columns named
# "<column>_<value>", appended after the untouched columns in the order the
# columns are listed here.
categorical_columns = [
    "play_pattern",
    "position",
    "technique",
    "body_part",
    "type",
    "first_time",
    "open_goal",
    "one_on_one",
]
data = pd.get_dummies(data, columns=categorical_columns)

In [None]:
# Display the table again to check the result of the one-hot encoding.
data

In [None]:
# (number of rows, number of columns) of the feature table.
data.shape

In [None]:
# Per-column dtype and non-null count, to spot missing values before modelling.
data.info()

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Distribution of the StatsBomb xG value over all shots in the table.
# Title and axis labels are set so the figure can be read on its own.
ax = data["statsbomb_xg"].hist()
ax.set(
    title="Distribution of StatsBomb xG",
    xlabel="statsbomb_xg",
    ylabel="Number of shots",
)
plt.show()

In [None]:
# Train and evaluate one decision tree per xG cutoff (0.1, 0.2, ..., 0.9).
# For every cutoff the continuous "statsbomb_xg" column is binarised into
# "xG above the cutoff" (True) / "xG not above the cutoff" (False) and a
# classifier is fitted to predict that label from the remaining columns.

# One seed for both splits and the tree so a re-run reproduces the numbers.
RANDOM_STATE = 20

# The feature matrix does not depend on the cutoff, so build it once.
X = data.drop("statsbomb_xg", axis=1)

for threshold in range(1, 10, 1):
    cutoff = threshold / 10

    # Binary target: True where the shot's xG exceeds the cutoff.
    Y = data["statsbomb_xg"] > cutoff

    # 60 / 20 / 20 split into train / validation / test. The validation part
    # is produced but not used in this cell.
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        X, Y, test_size=0.4, random_state=RANDOM_STATE
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_val_test, y_val_test, test_size=0.5, random_state=RANDOM_STATE
    )

    # Fit the scaler on the training rows only and reuse its statistics for
    # the other splits; fitting on all rows before splitting would leak
    # information from the validation / test rows into preprocessing.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Pin both classes so the matrix is always 2x2, even when a high cutoff
    # leaves only one class in the test split (a 1x1 matrix would not match
    # the two display labels).
    cm = confusion_matrix(y_test, y_pred, labels=[False, True])
    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        # The classes are "xG above / not above the cutoff", not actual goals.
        display_labels=[f"xG <= {cutoff}", f"xG > {cutoff}"],
    )
    disp.plot(cmap=plt.cm.Blues)
    # Include the cutoff so the nine figures can be told apart.
    plt.title(f"Confusion Matrix DT (xG > {cutoff})")
    plt.show()

    print(f"\nDecision Tree {cutoff}")
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 makes the "no positive predictions / no positive
    # samples" case an explicit 0 instead of an undefined-metric warning.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1)
