In [126]:
import pandas as pd
import os
import json
import socceraction.spadl as spadl
import socceraction.xthreat as xthreat

In [127]:
# Load the xT model from the JSON grid hosted at `url_grid`.
# NOTE(review): this fetches from the network on every run; the file name
# suggests a 12x8 grid — confirm before relying on the grid size.
url_grid = "https://karun.in/blog/data/open_xt_12x8_v1.json"
xT_model = xthreat.load_model(url_grid)

### Get event data from the data directory 

In [134]:
def create_statsbomb_dataset(data_dir=None):
    """
    Load every event file and the season's match details.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory holding the event files and ``FAWSL_22_23.json``. When
        omitted, defaults to ``man-city-data/StatsBomb/Data`` inside the
        parent of the current working directory.

    Returns
    -------
    tuple
        ``(dfs, match_details)``. ``dfs`` is a list with one DataFrame per
        file whose name contains ``'events'``, in sorted file-name order (the
        frames are NOT concatenated here). ``match_details`` is the DataFrame
        read from ``FAWSL_22_23.json``.
    """
    if data_dir is None:
        # os.path handles the platform's separator, unlike splitting on '/'.
        parent_dir = os.path.dirname(os.getcwd())
        data_dir = os.path.join(parent_dir, 'man-city-data', 'StatsBomb', 'Data')
    data_dir = os.fspath(data_dir)

    # os.listdir returns entries in arbitrary order; sort so the order of the
    # returned frames is reproducible between runs and machines.
    event_files = sorted(name for name in os.listdir(data_dir) if 'events' in name)
    dfs = [pd.read_json(os.path.join(data_dir, name)) for name in event_files]

    match_details = pd.read_json(os.path.join(data_dir, 'FAWSL_22_23.json'))
    return dfs, match_details

In [135]:
def normalize_dataframe(df):
    """
    Flatten dict-valued cells of ``df`` into top-level columns.

    The frame is serialised to JSON records, parsed back into Python objects
    and rebuilt with ``pd.json_normalize``, so nested keys become columns
    named with the default ``.`` separator (e.g. ``parent.child``).
    """
    serialized = df.to_json(orient='records')
    return pd.json_normalize(json.loads(serialized))

In [136]:
def rename_cols(df):
    """
    Return a renamed copy of ``df``; the input frame is left untouched.

    Two renames are applied in order:
    1. every ``.`` in a column name is replaced with ``_``;
    2. ``period`` becomes ``period_id`` and ``id`` becomes ``event_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns.
    """
    # Use rename (which returns a new frame) instead of assigning to
    # df.columns, so the caller's DataFrame is not mutated as a side effect.
    underscored = df.rename(columns=lambda name: name.replace('.', '_'))
    return underscored.rename(columns={
        'period': 'period_id',
        'id': 'event_id',
    })

In [137]:
# Read every event file plus the match details, then stack the per-file event
# frames into a single DataFrame (pd.concat keeps each frame's original index).
dfs, match_details = create_statsbomb_dataset()
full_df = pd.concat(dfs)

In [121]:
# Flatten nested columns and apply the column renames to both frames.
# NOTE(review): match_details is overwritten with its own transformed version,
# so re-running this cell re-processes the already-transformed frame.
event_data = rename_cols(normalize_dataframe(full_df))
match_details = rename_cols(normalize_dataframe(match_details))

In [114]:
# List the match-detail column names after flattening and renaming.
match_details.columns

Index(['match_id', 'match_date', 'kick_off', 'home_score', 'away_score',
       'attendance', 'behind_closed_doors', 'neutral_ground', 'play_status',
       'match_status', 'match_status_360', 'last_updated', 'last_updated_360',
       'match_week', 'competition_competition_id', 'competition_country_name',
       'competition_competition_name', 'season_season_id',
       'season_season_name', 'home_team_home_team_id',
       'home_team_home_team_name', 'home_team_home_team_gender',
       'home_team_home_team_youth', 'home_team_home_team_group',
       'home_team_country_id', 'home_team_country_name', 'home_team_managers',
       'away_team_away_team_id', 'away_team_away_team_name',
       'away_team_away_team_gender', 'away_team_away_team_youth',
       'away_team_away_team_group', 'away_team_country_id',
       'away_team_country_name', 'away_team_managers', 'metadata_data_version',
       'metadata_shot_fidelity_version', 'metadata_xy_fidelity_version',
       'competition_stage_id'

Create the columns that SPADL requires but that are missing from the data source.

In [122]:
# Add placeholder columns to event_data: an empty list per row for the two 360
# columns, an empty dict per row for `extra`, and a constant game_id.
n_events = len(event_data)
# A comprehension is required so every row gets its own list/dict object.
event_data['freeze_frame_360'] = [list() for _ in range(n_events)]
event_data['visible_area_360'] = [list() for _ in range(n_events)]
event_data['extra'] = [dict() for _ in range(n_events)]
# NOTE(review): the same hard-coded game_id is stamped on every row, although
# event_data is built from several event files — confirm this is intended.
event_data['game_id'] = [3852832] * n_events

### Give actions a value using xT model

In [125]:
# Convert the StatsBomb events into SPADL actions and preview the first rows.
# 746 is passed as the second positional argument (presumably the home team
# id — confirm against the socceraction API).
# NOTE(review): the preview output shows team_id 749, 2647 and 746 all at
# time 0.0 under one game_id, which suggests events from several matches are
# being converted as a single game — verify.
spadl_df = spadl.statsbomb.convert_to_actions(event_data, 746)
spadl_df.head(3)

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,type_id,result_id,bodypart_id,action_id
0,3852832,b81dcf83-537d-4f17-8f34-abb38d2dce13,1,0.0,749,63424.0,52.058824,33.655696,52.058824,33.655696,0,1,0,0
1,3852832,bc23853b-0aa8-42ac-9daa-deefa00d9de2,1,0.0,2647,5058.0,52.941176,33.56962,52.941176,33.56962,0,1,0,1
2,3852832,27bf5806-cf6a-4595-9989-af7e6605daae,1,0.0,746,25554.0,52.058824,34.43038,52.058824,34.43038,0,1,0,2


In [128]:
# Re-orient the actions with play_left_to_right (again passing 746, presumably
# as the home team id — confirm), rate them with the loaded xT model, and
# store the ratings as a new `xT_value` column on spadl_df (in-place change).
df_actions_ltr = spadl.play_left_to_right(spadl_df, 746)
spadl_df["xT_value"] = xT_model.rate(df_actions_ltr)

In [132]:
# Summary statistics of the per-action xT values.
spadl_df.xT_value.describe()

count    14598.000000
mean         0.000243
std          0.014347
min         -0.245306
25%          0.000000
50%          0.000000
75%          0.000000
max          0.244970
Name: xT_value, dtype: float64

In [87]:
# columns = ['counterpress', 
#            'duration',
#            'event_id',
#            'extra',
#            'freeze_frame_360',
#            'game_id',
#            'index',
#            'location',
#            'minute',
#            'period_id',
#            'play_pattern_id',
#            'play_pattern_name',
#            'player_id',
#            'player_name',
#            'position_id',
#            'position_name',
#            'possession',
#            'possession_team_id',
#            'possession_team_name',
#            'related_events',
#            'second',
#             'team_id',
#             'team_name',
#             'timestamp',
#             'type_id',
#             'type_name',
#             'under_pressure',
#             'visible_area_360'
#            ]