In [123]:
import warnings
warnings.filterwarnings("ignore")

In [124]:
import soccerdata as sd
import socceraction.spadl as spadl
import socceraction.xthreat as xthreat
import pandas as pd
from socceraction.data.opta import OptaLoader

In [125]:
# Setup a scraper for the 2024/2025 Premier League season
# ws = sd.WhoScored(leagues="ENG-Premier League", seasons=2024)
# # Scrape all games and return a OptaLoader object
# api = ws.read_events(output_fmt='loader') #não está na biblioteca / não deve funcionar

In [126]:
api2 = OptaLoader(root="data/WhoScored/events/ENG-Premier League_2024", parser="whoscored")

In [127]:
# df_comp = api2.competitions()
# df_comp.head()
# df_test_jogo = api2.events(1821049)
# df_test_jogo.head()

In [128]:
# 1. Load a set of actions to train the model on
df_games = api2.games(competition_id=1, season_id=1)
dataset = [
    {
        **game,
        'actions': spadl.opta.convert_to_actions(
            events=api2.events(game['game_id']),
            home_team_id=game['home_team_id']
        )
    }
    for game in df_games.to_dict(orient='records')
]

In [129]:
# 2. Convert direction of play + add names
df_actions_ltr = pd.concat([
  spadl.play_left_to_right(game['actions'], game['home_team_id'])
  for game in dataset
])
df_actions_ltr = spadl.add_names(df_actions_ltr)

In [130]:
before = len(df_actions_ltr)

df_actions_ltr = (
    df_actions_ltr
    .replace([float("inf"), float("-inf")], pd.NA)  # turn inf into NaN
    .dropna()  # drop all NaN rows
)

after = len(df_actions_ltr)

print(f"Dropped rows: {before - after}")
print(f"Remaining rows: {after}")


Dropped rows: 126130
Remaining rows: 478845


In [131]:
# 3. Train xT model with 16 x 12 grid
xTModel = xthreat.ExpectedThreat(l=16, w=12)
xTModel.fit(df_actions_ltr)

# iterations:  30


<socceraction.xthreat.ExpectedThreat at 0x2a5e3698190>

In [132]:
# 4. Rate ball-progressing actions
# xT should only be used to value actions that move the ball
# and that keep the current team in possession of the ball
df_mov_actions = xthreat.get_successful_move_actions(df_actions_ltr)
df_mov_actions["xT_value"] = xTModel.rate(df_mov_actions)

In [133]:
df_mov_actions.head()

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,start_x,end_x,start_y,end_y,type_id,result_id,bodypart_id,action_id,type_name,result_name,bodypart_name,xT_value
0,1821049,2709313000.0,1,0.0,170,243254.0,52.395,41.16,33.932,31.756,0,1,0,0,pass,success,foot,-0.000879
1,1821049,2709313000.0,1,1.0,170,297395.0,41.16,25.41,31.756,24.956,0,1,0,1,pass,success,foot,-0.002196
3,1821049,2709313000.0,1,7.0,170,300359.0,32.025,68.985,14.008,7.208,0,1,0,3,pass,success,foot,0.007075
7,1821049,2709313000.0,1,12.0,32,460260.0,39.48,46.2,66.912,65.824,0,1,0,7,pass,success,foot,0.00091
14,1821049,2709314000.0,1,61.0,170,243254.0,71.82,70.455,20.672,29.308,0,1,1,14,pass,success,head,0.000665


In [134]:
df_sum_xt = (
    df_mov_actions
    .groupby(["team_id", "player_id"])["xT_value"]
    .sum()
    .reset_index()
    .sort_values(by="xT_value", ascending=False)
)
df_sum_xt.head()


Unnamed: 0,team_id,player_id,xT_value
172,26,108226.0,7.532105
178,26,318871.0,7.218087
440,170,306581.0,6.086037
432,170,136824.0,5.986651
235,30,362352.0,5.349771


In [135]:
df_games = api2.games(1, 1)
game_ids = df_games['game_id'].unique()

# --- Teams ---
team_list = []
for gid in game_ids:
    teams_df = api2.teams(gid)  # returns dataframe with team_id, team_name
    team_list.append(teams_df)

df_teams = pd.concat(team_list).drop_duplicates(subset="team_id")

# --- Players ---
player_list = []
for gid in game_ids:
    players_df = api2.players(gid)  # returns player_id, player_name, minutes_played
    player_list.append(players_df)

df_players = pd.concat(player_list)




In [136]:
# --- Step 1: Summarize players ---
# Sum minutes played per player across all games
df_minutes = df_players.groupby("player_id")["minutes_played"].sum().reset_index()

# Keep player names (drop duplicates)
# df_player_names = df_players[["player_id", "player_name"]].drop_duplicates()
df_player_names = (
    df_players
    .sort_values("game_id")  # optional, to take first starting_position chronologically
    .groupby("player_id")
    .agg(
        player_name=("player_name", "first"),
        starting_position=("starting_position", "first")
    )
    .reset_index()
)

# Merge names + total minutes
df_players_summary = df_player_names.merge(df_minutes, on="player_id", how="left")

# --- Step 2: Merge team names ---
df_sum_xt = df_sum_xt.merge(df_teams, on="team_id", how="left")

df_sum_xt = df_sum_xt.merge(df_players_summary, on="player_id", how="left")
df_sum_xt = df_sum_xt[["team_name", "player_name", "starting_position", "xT_value", "minutes_played"]]

# Check results
df_sum_xt.head(50)



Unnamed: 0,team_name,player_name,starting_position,xT_value,minutes_played
0,Liverpool,Mohamed Salah,AMR,7.532105,3715
1,Liverpool,Trent Alexander-Arnold,DR,7.218087,2583
2,Fulham,Antonee Robinson,DL,6.086037,3502
3,Fulham,Alex Iwobi,AML,5.986651,3209
4,Tottenham,Pedro Porro,DR,5.349771,2922
5,Newcastle,Jacob Murphy,FWR,5.33644,2547
6,Brentford,Bryan Mbeumo,FWR,5.281786,3663
7,Man Utd,Bruno Fernandes,FW,5.23947,3323
8,Brentford,Mikkel Damsgaard,Sub,4.978687,3041
9,Chelsea,Cole Palmer,AMR,4.6968,3534
