In [1]:
import pandas as pd
import os
import json
import numpy as np
import socceraction.spadl as spadl
import socceraction.xthreat as xthreat
import matplotsoccer as mps


In [2]:
# Load a pre-computed expected-threat (xT) grid from a remote JSON file.
# NOTE(review): this cell needs network access on a fresh run; the file name
# suggests a 12x8 grid -- confirm against the source of the model.
url_grid = "https://karun.in/blog/data/open_xt_12x8_v1.json"
xT_model = xthreat.load_model(url_grid)

### Get event data from the data directory 

In [3]:
def create_statsbomb_dataset(full_data_path='../man-city-data/StatsBomb/Data'):
    """
    Load every events file and the match-details file from a data directory.

    Parameters
    ----------
    full_data_path : str
        Directory holding the JSON files. Defaults to
        '../man-city-data/StatsBomb/Data', i.e. a man-city-data folder one
        level above the current working directory.

    Returns
    -------
    (list of pandas.DataFrame, pandas.DataFrame)
        One DataFrame per entry whose name contains 'events' (NOT
        concatenated; in sorted file-name order), and the DataFrame read from
        'FAWSL_22_23.json' in the same directory.

    Raises
    ------
    FileNotFoundError
        Propagated from os.listdir when the directory does not exist.
    """
    # Sort the listing: os.listdir returns entries in arbitrary order, which
    # would make the order of the returned frames vary between machines/runs.
    event_files = sorted(
        name for name in os.listdir(full_data_path) if 'events' in name
    )
    dfs = [
        pd.read_json(os.path.join(full_data_path, name)) for name in event_files
    ]

    match_details = pd.read_json(os.path.join(full_data_path, 'FAWSL_22_23.json'))
    return dfs, match_details

In [4]:
def normalize_dataframe(df):
    """
    Flatten nested dict-valued cells of ``df`` into separate columns.

    The frame is serialised to JSON records, parsed back into plain Python
    objects, and handed to ``pd.json_normalize``. Because the round-trip uses
    ``orient='records'``, only row values are carried over, not the row index.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    return pd.json_normalize(json.loads(df.to_json(orient='records')))

In [5]:
def rename_cols(df):
    """
    Return a copy of ``df`` with normalised column names.

    Every '.' in a column name is replaced with '_', then 'period' is renamed
    to 'period_id' and 'id' to 'event_id' (columns that are absent are simply
    skipped by ``DataFrame.rename``).

    The input frame is left untouched: the previous implementation assigned
    to ``df.columns`` and so silently renamed the caller's columns as well.
    """
    # rename() with a callable builds a new frame instead of mutating df.
    flattened = df.rename(columns=lambda name: name.replace('.', '_'))
    return flattened.rename(columns={
        'period': 'period_id',
        'id': 'event_id',
    })

In [6]:
# Read the per-file event frames and the match-details frame from disk.
dfs, match_details = create_statsbomb_dataset()
# Stack all event frames into one; row labels from each file are kept as-is
# (no ignore_index), so they may repeat across files.
full_df = pd.concat(dfs)

In [7]:
# Flatten nested JSON fields and apply the shared column renames to both frames.
event_data = rename_cols(normalize_dataframe(full_df))
# NOTE(review): match_details is overwritten with its flattened form here, so
# the raw frame from the previous cell is no longer available afterwards.
match_details = rename_cols(normalize_dataframe(match_details))
# Bare last expression so the notebook renders the frame.
match_details

Unnamed: 0,match_id,match_date,kick_off,home_score,away_score,attendance,behind_closed_doors,neutral_ground,play_status,match_status,...,competition_stage_name,stadium_id,stadium_name,stadium_country_id,stadium_country_name,referee_id,referee_name,referee_country_id,referee_country_name,referee
0,3856041,2023-03-05,14:30:00.000,5.0,1.0,,False,False,Normal,available,...,Regular Season,4979,Leigh Sports Village Stadium,68,England,1004975.0,Melissa Burgin,68.0,England,
1,3856039,2023-03-05,15:00:00.000,0.0,2.0,,False,False,Normal,available,...,Regular Season,4989,Walton Hall Park,68,England,898.0,Amy Fearn,68.0,England,
2,3852829,2023-02-12,14:00:00.000,1.0,2.0,,False,False,Normal,available,...,Regular Season,600,Tottenham Hotspur Stadium,68,England,898.0,Amy Fearn,68.0,England,
3,3856035,2023-02-05,14:00:00.000,0.0,0.0,,False,False,Normal,available,...,Regular Season,4979,Leigh Sports Village Stadium,68,England,1711.0,Emily Heaslip,68.0,England,
4,3856037,2023-02-05,20:45:00.000,0.0,0.0,,False,False,Normal,available,...,Regular Season,116891,Chigwell Construction Stadium,68,England,2127.0,Richie Watkins,68.0,England,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,3852835,2022-09-16,20:30:00.000,4.0,0.0,,False,False,Normal,available,...,Regular Season,456,Meadow Park,68,England,928.0,Lisa Benn,68.0,England,
128,3852833,2023-02-12,16:00:00.000,2.0,6.0,,False,False,Normal,available,...,Regular Season,1001076,Broadfield Stadium,68,England,926.0,Elizabeth Simms,68.0,England,
129,3852832,2023-02-11,14:30:00.000,2.0,1.0,,False,False,Normal,available,...,Regular Season,99,Academy Stadium,68,England,566.0,Rebecca Welch,68.0,England,
130,3852831,2022-11-24,21:00:00.000,3.0,3.0,,False,False,Normal,available,...,Regular Season,223,Select Car Leasing Stadium,68,England,898.0,Amy Fearn,68.0,England,


In [18]:
# List the column names of the flattened match-details frame.
match_details.columns

Index(['match_id', 'match_date', 'kick_off', 'home_score', 'away_score',
       'attendance', 'behind_closed_doors', 'neutral_ground', 'play_status',
       'match_status', 'match_status_360', 'last_updated', 'last_updated_360',
       'match_week', 'competition_competition_id', 'competition_country_name',
       'competition_competition_name', 'season_season_id',
       'season_season_name', 'home_team_home_team_id',
       'home_team_home_team_name', 'home_team_home_team_gender',
       'home_team_home_team_youth', 'home_team_home_team_group',
       'home_team_country_id', 'home_team_country_name', 'home_team_managers',
       'away_team_away_team_id', 'away_team_away_team_name',
       'away_team_away_team_gender', 'away_team_away_team_youth',
       'away_team_away_team_group', 'away_team_country_id',
       'away_team_country_name', 'away_team_managers', 'metadata_data_version',
       'metadata_shot_fidelity_version', 'metadata_xy_fidelity_version',
       'competition_stage_id'

Create the columns that SPADL requires but that are missing from the data source

In [8]:
# Placeholder columns: each row gets its own fresh empty list / dict so that
# no two rows share the same mutable object.
event_data['freeze_frame_360'] = [list() for _ in event_data.index]
event_data['visible_area_360'] = [list() for _ in event_data.index]
event_data['extra'] = [{} for _ in event_data.index]
# NOTE(review): the same hard-coded game_id is written to every row, although
# event_data is built from all events files -- confirm this is intended.
event_data['game_id'] = [3852832] * len(event_data)

In [14]:
# Show the event columns, including the placeholder columns added for SPADL.
print(event_data.columns)

Index(['event_id', 'index', 'period_id', 'timestamp', 'minute', 'second',
       'possession', 'obv_for_after', 'obv_for_before', 'obv_for_net',
       ...
       'clearance_other', 'block_save_block', 'shot_deflected',
       'foul_committed_offensive', 'foul_committed_penalty',
       'foul_won_penalty', 'freeze_frame_360', 'visible_area_360', 'extra',
       'game_id'],
      dtype='object', length=165)


### Give actions a value using xT model

In [15]:
# Convert the StatsBomb events into SPADL actions.
# NOTE(review): 746 is passed as the second argument (the home team id in
# socceraction's API) -- confirm it is the right team for every match in
# event_data, and consider naming it as a constant.
spadl_df = spadl.statsbomb.convert_to_actions(event_data, 746)
# Persist the converted actions to spadl.csv in the working directory.
spadl_df.to_csv("spadl.csv")

In [21]:
# Re-orient the actions with play_left_to_right, using the same team id (746)
# that was passed to the conversion step.
df_actions_ltr = spadl.play_left_to_right(spadl_df, 746)
# Rate the re-oriented actions with the xT model and store the result on
# spadl_df. NOTE(review): spadl.csv was written before this column was added,
# so the saved file does not contain xT_value.
spadl_df["xT_value"] = xT_model.rate(df_actions_ltr)

In [22]:
# Summary statistics of the per-action xT values.
spadl_df.xT_value.describe()

count    14570.000000
mean         0.000024
std          0.014777
min         -0.245306
25%          0.000000
50%          0.000000
75%          0.000000
max          0.244970
Name: xT_value, dtype: float64

In [23]:
# columns = ['counterpress', 
#            'duration',
#            'event_id',
#            'extra',
#            'freeze_frame_360',
#            'game_id',
#            'index',
#            'location',
#            'minute',
#            'period_id',
#            'play_pattern_id',
#            'play_pattern_name',
#            'player_id',
#            'player_name',
#            'position_id',
#            'position_name',
#            'possession',
#            'possession_team_id',
#            'possession_team_name',
#            'related_events',
#            'second',
#             'team_id',
#             'team_name',
#             'timestamp',
#             'type_id',
#             'type_name',
#             'under_pressure',
#             'visible_area_360'
#            ]