# Action-Based Expected Threat

# Setup

In [None]:
import json
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from mplsoccer import Pitch
from itertools import combinations_with_replacement
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Switch the working directory to the part of the current path that precedes
# the "Notebooks" folder; the relative "data/..." paths used below are
# presumably resolved from there.
notebook_directory = os.getcwd()
path_separator = "/" if os.name == "posix" else "\\"
course_directory = notebook_directory.rsplit(path_separator + "Notebooks")[0]
os.chdir(course_directory)

# Data

In [None]:
# Load the Wyscout possession-chain events and renumber the rows 0..n-1.
df = pd.read_json("data/wyscout/poseesion_chains.json").reset_index(drop=True)
df

In [None]:
# Show every event in possession chain 4 (the chain that is plotted further down).
df[df["possession_chain"] == 4]

# Prep the variables for models
- The models use a non-linear combination of the start and end x coordinates and of c (the distance from the middle of the pitch).
- The non-linear terms are obtained by taking every combination with replacement of 1 to 3 of these variables and multiplying the chosen variables together.

In [None]:
# Base features: start/end x coordinate and start/end c.
var = ["x0", "x1", "c0", "c1"]

# Every multiset of one, two or three base features.
inputs = [
    combo
    for size in (1, 2, 3)
    for combo in combinations_with_replacement(var, size)
]

# Add one product column per multi-variable combination and register it as a
# model feature; the column name is the concatenation of its factors.
for combo in inputs:
    if len(combo) == 1:
        continue  # single variables are already columns of df
    column = "".join(combo)
    product = df[combo[0]]
    for factor in combo[1:]:
        product = product * df[factor]
    df[column] = product
    var.append(column)

df

In [None]:
# Check the shot_end flag of the events in possession chain 4.
df[df["possession_chain"] == 4]["shot_end"]

# Calculate action based xT for passes
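1. Predict, with XGBoost, the probability that the possession chain containing a pass ends in a shot.
2. Predict the xG of that shot with a linear regression fitted on the passes whose chain did end in a shot.
3. Multiply the two predictions to get the action-based xT of the pass.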

In [None]:
# Model passes only; the target is the shot_end flag of each pass.
is_pass = df["eventName"] == "Pass"
passes = df.loc[is_pass].copy()

X = passes[var].values
y = passes["shot_end"].values

# Hold out 10% of the passes, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=123, stratify=y
)

xgb_classifier = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=4,
    min_child_weight=10,
    random_state=123,
)
# 10-fold cross-validation on the training split, then a fit on all of it.
scores = cross_val_score(
    estimator=xgb_classifier, X=X_train, y=y_train, cv=10, n_jobs=-1
)
xgb_classifier.fit(X_train, y_train)

print(scores.mean(), scores.std())
print(xgb_classifier.score(X_train, y_train))

In [None]:
# Class probabilities for the held-out passes, then the classifier's score on
# the held-out split. `y_pred_proba` is reassigned in the next cell, so this
# value is not used further.
y_pred_proba = xgb_classifier.predict_proba(X_test)
print(xgb_classifier.score(X_test, y_test))

In [None]:
# Column 1 of predict_proba: probability of the shot_end == 1 class for every pass.
y_pred_proba = xgb_classifier.predict_proba(X)[:, 1]
passes["shot_prob"] = y_pred_proba

# Fit the xG regression only on passes whose chain ended in a shot ...
shot_ended = passes.loc[passes["shot_end"] == 1]
X2 = shot_ended[var].values
y2 = shot_ended["xG"].values
lr = LinearRegression().fit(X2, y2)

# ... but predict it for every pass.
y_pred = lr.predict(X)
passes["xG_pred"] = y_pred

# Action-based xT = P(chain ends in a shot) * predicted xG of that shot.
passes["xT"] = passes["xG_pred"] * passes["shot_prob"]
passes[["xG_pred", "shot_prob", "xT"]].head(5)

# Plot Pass Values

In [None]:
# Plot possession chain 4: passes as blue arrows scaled and labelled by xT,
# other non-final events as dotted grey lines, and the final event (taken to
# be the shot) as a red arrow.
example_chain = df.loc[df["possession_chain"] == 4]
# Build the mask from `passes` itself rather than from `df`, so the boolean
# indexer always shares the index of the frame it filters.
passes_in = passes.loc[passes["possession_chain"] == 4]
max_value = passes_in["xT"].max()
not_passes = example_chain.loc[example_chain["eventName"] != "Pass"].iloc[:-1]
shot = example_chain.iloc[-1]

pitch = Pitch(line_color='black', pitch_type='custom', pitch_length=105, pitch_width=68, line_zorder=2)
fig, ax = pitch.grid(grid_height=0.9, title_height=0.06, axis=False, endnote_height=0.04, title_space=0, endnote_space=0)

for _, row in passes_in.iterrows():
    value = row["xT"]
    # Arrow width is proportional to the pass's xT relative to the chain maximum.
    line_width = value / max_value * 10

    # Rotate the label along the pass. A pass with no change in x has no
    # finite slope, so handle it explicitly instead of dividing by zero.
    dx = row.x1 - row.x0
    dy = row.y1 - row.y0
    if dx == 0:
        angle = 90.0 * np.sign(dy)
    else:
        angle = np.arctan(dy / dx) * 180 / np.pi

    pitch.arrows(row.x0, row.y0, row.x1, row.y1, alpha=0.6, width=line_width, zorder=2, color="blue", ax=ax["pitch"])
    ax["pitch"].text((row.x0+row.x1-8) / 2, (row.y0+row.y1-4) / 2, str(value)[:5], fontweight="bold", color="blue", zorder=4, fontsize=20, rotation=int(angle))

# NOTE: the shot arrow reuses the width computed for the last pass in the loop.
pitch.arrows(shot.x0, shot.y0, shot.x1, shot.y1, width=line_width, color="red", ax=ax["pitch"], zorder=3)
pitch.lines(not_passes.x0, not_passes.y0, not_passes.x1, not_passes.y1, color="grey", lw=1.5, ls="dotted", ax=ax["pitch"])

fig.suptitle("Passes Leading to a Shot", fontweight="bold", fontsize=20);

# Players with the highest action based xT

In [None]:
# Load the Wyscout player list, align its id column with the event data and
# reduce the nested role record to its "name" entry.
players_df = pd.read_json("data/wyscout/players.json", encoding="unicode_escape")
players_df = players_df.rename(columns={"wyId": "playerId"})
players_df["role"] = players_df["role"].apply(lambda role: role["name"])

# Columns attached to the per-player xT summary below.
to_merge = players_df[["playerId", "shortName", "role"]]
to_merge

In [None]:
# Total pass xT per player, with the player's name and role attached.
xt_by_player = passes[["playerId", "xT"]].groupby("playerId").sum()
summary = xt_by_player.reset_index().merge(to_merge, on="playerId", how="left")
summary

In [None]:
# Total minutes per player, left-joined with the xT summary so every player
# with recorded minutes is kept; missing values after the join become 0.
minutes_per_game_df = pd.read_json("data/wyscout/minutes_played_per_game_England.json")
minutes = minutes_per_game_df.groupby("playerId")[["minutesPlayed"]].sum().reset_index()
summary = pd.merge(minutes, summary, how="left", on="playerId").fillna(0)
summary

In [None]:
# Keep players with more than 400 minutes and express their xT per 90 minutes.
played_enough = summary["minutesPlayed"] > 400
summary_over_400_minutes = summary.loc[played_enough].copy()
summary_over_400_minutes["xT_per_90"] = (
    summary_over_400_minutes["xT"] * 90 / summary_over_400_minutes["minutesPlayed"]
)
summary_over_400_minutes

In [None]:
# Attach each player's "possesion" value and divide xT by it before
# converting to a per-90-minutes rate.
possesion_df = pd.read_json("data/wyscout/player_possesion_England.json")
summary_over_400_minutes = pd.merge(
    summary_over_400_minutes, possesion_df, how="left", on="playerId"
)
xt_per_possession = summary_over_400_minutes["xT"] / summary_over_400_minutes["possesion"]
summary_over_400_minutes["xT_adjusted_per_90"] = (
    xt_per_possession * 90 / summary_over_400_minutes["minutesPlayed"]
)
summary_over_400_minutes

In [None]:
# Five players with the highest possession-adjusted xT per 90 minutes.
ranked = summary_over_400_minutes.sort_values(by='xT_adjusted_per_90', ascending=False)
ranked[['shortName', 'xT_adjusted_per_90']].head(5)

# Challenge
1. Do the same analysis but for the Indian Super League 2021/22 season.

## Install the StatsBomb API Python Package
- Simplest way to get the data needed.
- The Indian Super League 2021/22 has a competition ID of 1238 and a season ID of 108.

In [None]:
# !pip install statsbombpy

In [None]:
from statsbombpy import sb

In [None]:
# Fetch every event of the selected competition and season through statsbombpy
# (presumably needs network access to the StatsBomb data — TODO confirm).
events = sb.competition_events(country="India", division="Indian Super league", season="2021/2022")

In [None]:
# Keep only the events that carry a possession number.
# `series != None` compares element-wise and is True for NaN as well, so it
# never removed anything; `notna()` is the pandas missing-value test.
events_with_possesion = events[events["possession"].notna()].copy()
events_with_possesion

In [None]:
# Event types that make up a possession chain.
event_types = ["Pass", "Carry", "Shot"]

# Columns needed to locate each action and tie it to a match, player and possession.
cols = [
    "carry_end_location",
    "location",
    "match_id",
    "pass_end_location",
    "player",
    "possession",
    "possession_team",
    "shot_end_location",
    "shot_outcome",
    "type",
    "shot_statsbomb_xg",
    "minute",
    "second",
]

is_chain_event = events_with_possesion["type"].isin(event_types)
possession_chain_events = events_with_possesion.loc[is_chain_event, cols].copy()
possession_chain_events

In [None]:
# Convert StatsBomb locations to the 105 x 68 metre pitch used by the model.
#
# StatsBomb locations are [x, y] on a 120 x 80 grid, so each axis is rescaled
# with its own factor. The y axis is flipped *before* scaling, and c is the
# absolute distance from the pitch's central axis. The previous
# `100 - y * 0.68` / `50 - y * 0.68` forms scaled first, assumed a 100 x 100
# grid and left c signed.
STATSBOMB_LENGTH, STATSBOMB_WIDTH = 120, 80
PITCH_LENGTH, PITCH_WIDTH = 105, 68


def _to_x(location):
    """Return the location's x coordinate in metres along the pitch."""
    return location[0] * PITCH_LENGTH / STATSBOMB_LENGTH


def _to_y(location):
    """Return the location's flipped y coordinate in metres across the pitch."""
    return (STATSBOMB_WIDTH - location[1]) * PITCH_WIDTH / STATSBOMB_WIDTH


def _to_c(location):
    """Return the location's distance in metres from the central axis."""
    return abs(STATSBOMB_WIDTH / 2 - location[1]) * PITCH_WIDTH / STATSBOMB_WIDTH


# Start point of every action.
possession_chain_events["x0"] = possession_chain_events["location"].apply(_to_x)
possession_chain_events["y0"] = possession_chain_events["location"].apply(_to_y)
possession_chain_events["c0"] = possession_chain_events["location"].apply(_to_c)

# Pass: end points exist only on pass rows; index alignment leaves NaN elsewhere.
pass_end_locations = possession_chain_events["pass_end_location"].dropna()
possession_chain_events["x1_pass"] = pass_end_locations.apply(_to_x)
possession_chain_events["y1_pass"] = pass_end_locations.apply(_to_y)
possession_chain_events["c1_pass"] = pass_end_locations.apply(_to_c)

# Shots: every shot is modelled as ending at the centre of the goal line.
# Only shot rows receive these values. Writing them to every row would make
# the later pass -> shot -> carry `combine_first` chain stop at the shot
# columns, and carries would never get their real end point.
shot_end_locations = possession_chain_events["shot_end_location"].dropna()
is_shot = possession_chain_events["type"] == "Shot"
possession_chain_events.loc[is_shot, "x1_shot"] = PITCH_LENGTH
possession_chain_events.loc[is_shot, "y1_shot"] = PITCH_WIDTH / 2
possession_chain_events.loc[is_shot, "c1_shot"] = 0

# Carry: same treatment as passes.
carry_end_locations = possession_chain_events["carry_end_location"].dropna()
possession_chain_events["x1_carry"] = carry_end_locations.apply(_to_x)
possession_chain_events["y1_carry"] = carry_end_locations.apply(_to_y)
possession_chain_events["c1_carry"] = carry_end_locations.apply(_to_c)

In [None]:
# Inspect the frame after the per-event-type end coordinates were added.
possession_chain_events

In [None]:
# Collapse the per-event-type end coordinates into a single x1 / y1 / c1,
# taking the pass value first, then the shot value, then the carry value.
for end_col in ("x1", "y1", "c1"):
    possession_chain_events[end_col] = (
        possession_chain_events[f"{end_col}_pass"]
        .combine_first(possession_chain_events[f"{end_col}_shot"])
        .combine_first(possession_chain_events[f"{end_col}_carry"])
    )
possession_chain_events

In [None]:
# Keep only the identifying columns, the start/end coordinates and the shot xG.
key_columns = [
    "match_id", "minute", "second", "player", "possession", "possession_team",
    "type", "x0", "y0", "c0", "x1", "y1", "c1", "shot_statsbomb_xg",
]
possession_chain_key_cols = possession_chain_events[key_columns].copy()
possession_chain_key_cols

In [None]:
# Events without an xG value get 0.
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on an attribute-accessed column is chained assignment, which warns on recent
# pandas and leaves the frame untouched once Copy-on-Write is enabled.
possession_chain_key_cols["shot_statsbomb_xg"] = possession_chain_key_cols["shot_statsbomb_xg"].fillna(0)
possession_chain_key_cols

In [None]:
# Use the short "xG" column name that the modelling cells expect.
possession_chain_key_cols = possession_chain_key_cols.rename(
    {"shot_statsbomb_xg": "xG"}, axis="columns"
)
possession_chain_key_cols

In [None]:
# One row per shot: the match and possession it belongs to, and its xG.
is_shot_event = possession_chain_key_cols["type"] == "Shot"
shots = possession_chain_key_cols.loc[is_shot_event, ["match_id", "possession", "xG"]].copy()
shots

In [None]:
# Mark every event whose (match, possession) contains a shot and give it that
# shot's xG.
#
# This is a single keyed left merge instead of a Python loop that rebuilt two
# full-frame boolean masks per shot. The result is the same, including
# "last shot wins" when a possession holds several shots.
possession_chains = possession_chain_key_cols.copy()

chain_keys = ["match_id", "possession"]
last_shot_xg = (
    shots[chain_keys + ["xG"]]
    .drop_duplicates(chain_keys, keep="last")
    .rename(columns={"xG": "shot_xG"})
)

# The right side is unique on the keys, so the left merge returns exactly one
# row per event, in the original event order.
lookup = possession_chains[chain_keys].merge(
    last_shot_xg, on=chain_keys, how="left", indicator=True
)
ends_in_shot = (lookup["_merge"] == "both").to_numpy()

possession_chains["shot_end"] = ends_in_shot.astype("int64")
possession_chains.loc[ends_in_shot, "xG"] = lookup.loc[ends_in_shot, "shot_xG"].to_numpy()

possession_chains

In [None]:
# Work on a copy of the possession chains and start again from the four base features.
df = possession_chains.copy()
var = ["x0", "x1", "c0", "c1"]

# All combinations with replacement of one, two and three base features.
inputs = []
for size in range(1, 4):
    inputs.extend(combinations_with_replacement(var, size))

# For each combination of two or more variables, store their product in a
# column named after the concatenated factors and register it as a feature.
for factors in inputs:
    if len(factors) < 2:
        continue
    column = "".join(factors)
    first, *rest = factors
    product = df[first]
    for name_of_factor in rest:
        product = product * df[name_of_factor]
    df[column] = product
    var.append(column)

df

In [None]:
# Model passes only; the target is the shot_end flag of each pass.
is_pass = df["type"] == "Pass"
passes = df.loc[is_pass].copy()

X = passes[var].values
y = passes["shot_end"].values

# Hold out 10% of the passes, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=123, stratify=y
)

# Default XGBoost hyper-parameters, fixed seed.
xgb_classifier = xgb.XGBClassifier(random_state=123)
scores = cross_val_score(
    estimator=xgb_classifier, X=X_train, y=y_train, cv=10, n_jobs=-1
)
xgb_classifier.fit(X_train, y_train)

print(scores.mean(), scores.std())
print(xgb_classifier.score(X_train, y_train))

In [None]:
# Score the classifier on the held-out passes.
y_pred_proba = xgb_classifier.predict_proba(X_test)
print(xgb_classifier.score(X_test, y_test))

# Column 1 of predict_proba: probability of the shot_end == 1 class for every pass.
y_pred_proba = xgb_classifier.predict_proba(X)[:, 1]
passes["shot_prob"] = y_pred_proba

# Fit the xG regression on passes whose chain ended in a shot, predict for all passes.
shot_ended = passes.loc[passes["shot_end"] == 1]
X2 = shot_ended[var].values
y2 = shot_ended["xG"].values
lr = LinearRegression().fit(X2, y2)
y_pred = lr.predict(X)
passes["xG_pred"] = y_pred

# Action-based xT = P(chain ends in a shot) * predicted xG of that shot.
passes["xT"] = passes["xG_pred"] * passes["shot_prob"]
passes[["xG_pred", "shot_prob", "xT"]].head(100)

In [None]:
# Total pass xT per player, highest first.
# The column is selected explicitly: `sum("xT")` passed the string
# positionally as `numeric_only`, it did not choose the column to sum.
players_Total_xT = passes.groupby("player", as_index=False)["xT"].sum()
players_Total_xT.sort_values("xT", ascending=False)

In [None]:
# "Starting XI" events, without the columns that are empty for this event type.
is_starting_xi = events["type"] == "Starting XI"
starting_eleven = events.loc[is_starting_xi].dropna(axis="columns", how="all")

# Pull the lineup list out of each tactics record.
starting_eleven["lineup"] = starting_eleven["tactics"].map(lambda tactics: tactics["lineup"])
starting_eleven

In [None]:
# One record per starting player per match.
# Built as a comprehension so no loop variable named `id` is left behind to
# shadow the builtin for the rest of the notebook.
# player_in_min is 0 for every starter; player_out_min is initialised to -1
# (presumably a "not taken off" placeholder — TODO confirm).
player_dicts = [
    {
        "match_id": match_id,
        "player_id": starter["player"]["id"],
        "player": starter["player"]["name"],
        "player_in_min": 0,
        "player_out_min": -1,
    }
    for match_id, lineup in zip(starting_eleven["match_id"], starting_eleven["lineup"])
    for starter in lineup
]

In [None]:
# Tabulate the starting-player records with a fixed column order.
player_columns = ["match_id", "player_id", "player", "player_in_min", "player_out_min"]
players_df = pd.DataFrame(player_dicts, columns=player_columns)
players_df

In [None]:
# "Player On" events with their time in seconds and a running number per
# (match, player), starting at 0 for that player's first such event.
on_columns = ["match_id", "player_id", "player", "minute", "second"]
player_on_df = events.loc[events["type"] == "Player On", on_columns].copy()

player_on_df["total_second_on"] = player_on_df["minute"] * 60 + player_on_df["second"]
event_index = player_on_df.groupby(["match_id", "player_id", "player"]).cumcount()
player_on_df["event_index"] = event_index
player_on_df

In [None]:
# "Player On" events that are not the first one for that player in that match.
player_on_df[player_on_df["event_index"] != 0]

In [None]:
# Spot check: "Player On" events of player 166561 in match 3817891.
player_on_df[(player_on_df.match_id == 3817891) & (player_on_df.player_id == 166561)]

In [None]:
# "Player Off" events with their time in seconds and a running number per
# (match, player), starting at 0 for that player's first such event.
# `.copy()` makes this an independent frame (as is done for player_on_df), so
# adding columns below does not write into a slice of `events`.
player_off_df = events[events["type"] == "Player Off"][["match_id", "player_id", "player", "minute", "second", "player_off_permanent"]].copy()
event_index = player_off_df.groupby(["match_id", "player_id", "player"]).cumcount()
player_off_df["total_second_off"] = player_off_df["minute"] * 60 + player_off_df["second"]
player_off_df["event_index"] = event_index
player_off_df

In [None]:
# Spot check: "Player Off" events of player 166561 in match 3817891.
player_off_df[(player_off_df.match_id == 3817891) & (player_off_df.player_id == 166561)]

In [None]:
# Pair each "Player Off" event with the "Player On" event that has the same
# running number for that player and match, then measure the gap in seconds.
pair_keys = ["match_id", "player_id", "player", "event_index"]
off_on_pairs = pd.merge(player_off_df, player_on_df, on=pair_keys)
merged = off_on_pairs[["match_id", "player_id", "player", "total_second_off", "total_second_on"]]
merged["total_time_off_pitch"] = merged["total_second_on"] - merged["total_second_off"]
merged

In [None]:
# Spot check: paired off/on times of player 166561 in match 3817891.
merged[(merged.match_id == 3817891) & (merged.player_id == 166561)]

In [None]:
# Substitution events with the time of the substitution in seconds.
# `.copy()` makes this an independent frame, so adding the column below does
# not write into a slice of `events`.
substitutions_df = events[events["type"] == "Substitution"][["match_id", "player_id", "player", "substitution_replacement", "minute", "second"]].copy()
substitutions_df["time_of_sub"] = (substitutions_df["minute"] * 60) + substitutions_df["second"]
substitutions_df

In [None]:
# Spot check: substitution events of player 166561 in match 3817891.
substitutions_df[(substitutions_df.match_id == 3817891) & (substitutions_df.player_id == 166561)]

In [None]:
# Attach the substitution events of the same player and match to each off/on pair.
subs_with_time_off_and_on_df = pd.merge(
    merged, substitutions_df, on=["match_id", "player_id", "player"]
)

In [None]:
# Inspect the off/on pairs joined with the substitution events.
subs_with_time_off_and_on_df