In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import soccerdata as sd
import socceraction.spadl as spadl
import socceraction.xthreat as xthreat
import pandas as pd
from socceraction.data.opta import OptaLoader

In [4]:
# Setup a scraper for the Brasileirao
ws = sd.WhoScored(leagues="BRA-Brasileirao", seasons=2025)
# # Scrape all games and return a OptaLoader object
api = ws.read_events(output_fmt='loader') #não está na biblioteca / não deve funcionar

In [5]:
api2 = OptaLoader(root="data/WhoScored/events/BRA-Brasileirao_2025", parser="whoscored")

In [6]:
# df_comp = api2.competitions()
# df_comp.head()
# df_test_jogo = api2.events(1821049)
# df_test_jogo.head()

In [7]:
# 1. Load a set of actions to train the model on
df_games = api2.games(competition_id=1, season_id=1)
dataset = [
    {
        **game,
        'actions': spadl.opta.convert_to_actions(
            events=api2.events(game['game_id']),
            home_team_id=game['home_team_id']
        )
    }
    for game in df_games.to_dict(orient='records')
]

In [8]:
# 2. Convert direction of play + add names
df_actions_ltr = pd.concat([
  spadl.play_left_to_right(game['actions'], game['home_team_id'])
  for game in dataset
])
df_actions_ltr = spadl.add_names(df_actions_ltr)

In [9]:
before = len(df_actions_ltr)

df_actions_ltr = (
    df_actions_ltr
    .replace([float("inf"), float("-inf")], pd.NA)  # turn inf into NaN
    .dropna()  # drop all NaN rows
)

after = len(df_actions_ltr)

print(f"Dropped rows: {before - after}")
print(f"Remaining rows: {after}")


Dropped rows: 63577
Remaining rows: 247003


In [10]:
# 3. Train xT model with 16 x 12 grid
xTModel = xthreat.ExpectedThreat(l=16, w=12)
xTModel.fit(df_actions_ltr)

# iterations:  29


<socceraction.xthreat.ExpectedThreat at 0x20122137520>

In [11]:
# 4. Rate ball-progressing actions
# xT should only be used to value actions that move the ball
# and that keep the current team in possession of the ball
df_mov_actions = xthreat.get_successful_move_actions(df_actions_ltr)
df_mov_actions["xT_value"] = xTModel.rate(df_mov_actions)

In [12]:
df_mov_actions.head()

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,start_x,end_x,start_y,end_y,type_id,result_id,bodypart_id,action_id,type_name,result_name,bodypart_name,xT_value
0,1889883,2793263000.0,1,0.0,1239,320350.0,52.605,44.52,34.0,32.708,0,1,0,0,pass,success,foot,-0.001639
1,1889883,2793263000.0,1,2.0,1239,345389.0,45.36,35.175,33.252,18.904,0,1,0,1,pass,success,foot,-0.000965
5,1889883,2793264000.0,1,19.0,1219,231141.0,13.44,15.12,32.232,41.82,0,1,0,5,pass,success,foot,0.000179
6,1889883,2793264000.0,1,21.0,1219,31958.0,15.12,16.275,42.772,15.912,0,1,0,6,pass,success,foot,-0.00038
7,1889883,2793264000.0,1,24.0,1219,397766.0,16.275,25.515,15.912,5.44,0,1,0,7,pass,success,foot,-0.000495


In [17]:
type_counts = df_mov_actions['type_name'].value_counts()

print(type_counts)

type_name
pass       138635
dribble      6208
cross        1180
Name: count, dtype: int64


In [13]:
df_sum_xt = (
    df_mov_actions
    .groupby(["team_id", "player_id"])["xT_value"]
    .sum()
    .reset_index()
    .sort_values(by="xT_value", ascending=False)
)
df_sum_xt.head()


Unnamed: 0,team_id,player_id,xT_value
342,1235,149738.0,3.006802
218,1230,297389.0,2.99145
149,1226,373513.0,2.953047
217,1230,295012.0,2.571165
588,5438,429475.0,2.544033


In [14]:
df_games = api2.games(1, 1)
game_ids = df_games['game_id'].unique()

# --- Teams ---
team_list = []
for gid in game_ids:
    teams_df = api2.teams(gid)  # returns dataframe with team_id, team_name
    team_list.append(teams_df)

df_teams = pd.concat(team_list).drop_duplicates(subset="team_id")

# --- Players ---
player_list = []
for gid in game_ids:
    players_df = api2.players(gid)  # returns player_id, player_name, minutes_played
    player_list.append(players_df)

df_players = pd.concat(player_list)




In [15]:
# --- Step 1: Summarize players ---
# Sum minutes played per player across all games
df_minutes = df_players.groupby("player_id")["minutes_played"].sum().reset_index()

# Keep player names (drop duplicates)
# df_player_names = df_players[["player_id", "player_name"]].drop_duplicates()
df_player_names = (
    df_players
    .sort_values("game_id")  # optional, to take first starting_position chronologically
    .groupby("player_id")
    .agg(
        player_name=("player_name", "first"),
        starting_position=("starting_position", "first")
    )
    .reset_index()
)

# Merge names + total minutes
df_players_summary = df_player_names.merge(df_minutes, on="player_id", how="left")

# --- Step 2: Merge team names ---
df_sum_xt = df_sum_xt.merge(df_teams, on="team_id", how="left")

df_sum_xt = df_sum_xt.merge(df_players_summary, on="player_id", how="left")
df_sum_xt = df_sum_xt[["team_name", "player_name", "starting_position", "xT_value", "minutes_played"]]

# Check results
df_sum_xt.head(50)



Unnamed: 0,team_name,player_name,starting_position,xT_value,minutes_played
0,Atletico MG,Gustavo Scarpa,MR,3.006802,1603
1,Cruzeiro,Matheus Pereira,Sub,2.99145,1703
2,Vasco da Gama,Lucas Piton,DL,2.953047,1793
3,Cruzeiro,Wanderson,AMC,2.571165,1628
4,Red Bull Bragantino,Jhon Jhon,AMC,2.544033,1751
5,Mirassol,Reinaldo,DL,2.501871,1748
6,Internacional,Alexandro Bernabéi,DL,2.491807,1443
7,Sport Recife,Lucas Lima,MR,2.472165,1477
8,Corinthians,Matheus Bidu,DL,2.161422,1076
9,Palmeiras,Joaquín Piquerez,DL,2.156574,1682


In [18]:
df_sum_xt.to_csv('df_sum_xt_no_index.csv', index=False)