[Getting Started with StatsBomb Data](https://znstrider.github.io/2018-11-11-Getting-Started-with-StatsBomb-Data/)

In [1]:
import json
import pandas as pd
import numpy as np
from pandas import json_normalize
from os import listdir
from os.path import isfile, join
from tqdm.notebook import tqdm  # Import the notebook version of tqdm

# Root folder of the data set: point this at your open-data-master/data/
# directory. The loading cell below looks for the events/, matches/ and
# lineups/ sub-folders underneath it; the path is expected to end with a slash.
mypath = 'StatsBomb/data/'

In [None]:
# EVENTS AND FREEZE-FRAMES
def _coordinate(locations, axis):
    """Return component `axis` of every location in `locations` as floats.

    Parameters
    ----------
    locations : pd.Series
        Object column whose entries are coordinate lists such as [x, y] or
        [x, y, z]; entries may also be missing (NaN).
    axis : int
        Position of the wanted coordinate inside each list.

    Returns
    -------
    pd.Series
        float64 series on the same index. Entries that are not a list/tuple,
        or that are too short to have position `axis` (e.g. an end location
        without a z-part), become NaN.
    """
    return locations.map(
        lambda loc: loc[axis]
        if isinstance(loc, (list, tuple)) and len(loc) > axis
        else np.nan
    ).astype(float)


events_dir = join(mypath, 'events')
# Keep only the JSON files: this skips .DS_Store (and any other stray file)
# without a bare try/except, and guarantees that stripping the extension
# below really yields the file's base name. Sorting makes the row order of
# the concatenated frames reproducible, since listdir() order is arbitrary.
files = sorted(f for f in listdir(events_dir)
               if f.endswith('.json') and isfile(join(events_dir, f)))

dfs = {}  # match id -> events of that match, indexed by event id
ffs = {}  # shot event id -> freeze frame of that shot

for file in files:
    match_id = file[:-len('.json')]
    # JSON is UTF-8 by specification; do not rely on the platform default
    with open(join(events_dir, file), encoding='utf-8') as data_file:
        data = json.load(data_file)

    # get the nested structure into a flat dataframe
    df = json_normalize(data, sep="_").assign(match_id=match_id)
    # store the dataframe in a dictionary with the match id as key
    dfs[match_id] = df.set_index('id')

    # a file without any freeze frame has no such column at all
    if 'shot_freeze_frame' not in df.columns:
        continue

    # get the freeze frame information for every shot in the df
    shots = df.loc[df['type_name'] == 'Shot'].set_index('id')
    for id_, frame in shots['shot_freeze_frame'].items():
        # shots without a freeze frame hold NaN instead of a list: skip them
        # explicitly rather than swallowing every possible exception
        if not isinstance(frame, list) or not frame:
            continue
        ff = json_normalize(frame, sep="_")
        ffs[id_] = ff.assign(x=_coordinate(ff['location'], 0),
                             y=_coordinate(ff['location'], 1),
                             id=id_).drop('location', axis=1)

# concatenate all the per-match frames
# this creates a multi-index with the dictionary key (match id) as first level
df = pd.concat(dfs, axis=0)

# split locations into x and y components
df['location_x'] = _coordinate(df['location'], 0)
df['location_y'] = _coordinate(df['location'], 1)
df['pass_end_location_x'] = _coordinate(df['pass_end_location'], 0)
df['pass_end_location_y'] = _coordinate(df['pass_end_location'], 1)

# split the shot_end_locations into x, y and z components (some don't include
# the z-part, which then stays NaN); only rows of type 'Shot' are filled
is_shot = (df['type_name'] == 'Shot').to_numpy()
df['shot_end_location_x'] = _coordinate(df['shot_end_location'], 0).where(is_shot)
df['shot_end_location_y'] = _coordinate(df['shot_end_location'], 1).where(is_shot)
df['shot_end_location_z'] = _coordinate(df['shot_end_location'], 2).where(is_shot)
events_df = df.drop(['location', 'pass_end_location', 'shot_end_location'], axis=1)

# concatenate all the Freeze Frame dataframes (first index level = shot id);
# pd.concat raises on an empty dict, so fall back to an empty frame
ff_df = pd.concat(ffs, axis=0) if ffs else pd.DataFrame()


# MATCHES
matches_dir = join(mypath, 'matches')
# Keep only JSON files (skips .DS_Store and other stray files without a bare
# try/except) and sort them so the concatenation order is reproducible.
# NOTE(review): only files located directly inside matches/ are read;
# sub-folders are ignored by the isfile() filter.
files = sorted(f for f in listdir(matches_dir)
               if f.endswith('.json') and isfile(join(matches_dir, f)))

matches_dfs = {}
for file in files:
    # JSON is UTF-8 by specification; do not rely on the platform default
    with open(join(matches_dir, file), encoding='utf-8') as data_file:
        data = json.load(data_file)
    # flatten the nested structure and store it under the file's base name
    # (per the original author's note this is the competition id)
    matches_dfs[file[:-len('.json')]] = json_normalize(data, sep="_")

# one frame for all files; the dictionary key becomes the first index level
matches_df = pd.concat(matches_dfs)


# LINEUPS w Minutes played
lineups_dir = join(mypath, 'lineups')
# Keep only JSON files (skips .DS_Store and other stray files without a bare
# try/except) and sort them so the concatenation order is reproducible.
files = sorted(f for f in listdir(lineups_dir)
               if f.endswith('.json') and isfile(join(lineups_dir, f)))

dfs = {}  # match id -> players of all teams in that match

for file in files:
    match_id = file[:-len('.json')]
    # JSON is UTF-8 by specification; do not rely on the platform default
    with open(join(lineups_dir, file), encoding='utf-8') as data_file:
        data = json.load(data_file)
    # each top-level entry describes one team and carries its own player
    # list; flatten every player list and tag it with team and match
    dfs[match_id] = pd.concat([
        json_normalize(team['lineup'], sep="_").assign(
            team_id=team['team_id'],
            team_name=team['team_name'],
            match_id=match_id)
        for team in data])

lineups_df = pd.concat(dfs.values())

# length of a match = largest `minute` recorded among its events
match_lengths = events_df.groupby('match_id')['minute'].max()

# get all substitutions, indexed by match id
substitutions = events_df.loc[
    events_df['substitution_outcome_name'].notnull(),
    ['match_id', 'minute', 'player_name', 'substitution_replacement_name']
].set_index('match_id')

# One row per listed player, indexed by match id; the former positional index
# is kept as the 'index' column and becomes the second index level at the end.
lineups_minutes = lineups_df.reset_index().set_index('match_id')
squad_keys = pd.MultiIndex.from_arrays(
    [lineups_minutes.index, lineups_minutes['player_name']])

# minute at which a player left / entered the pitch, keyed by (match, player)
subs = substitutions.reset_index()
minute_off = subs.groupby(['match_id', 'player_name'])['minute'].max()
minute_on = subs.groupby(['match_id', 'substitution_replacement_name'])['minute'].min()

full_time = match_lengths.reindex(lineups_minutes.index).to_numpy(dtype=float)
left_at = minute_off.reindex(squad_keys).to_numpy(dtype=float)
entered_at = minute_on.reindex(squad_keys).to_numpy(dtype=float)

# minutes played = (minute left, or match length) - (minute entered, or 0).
# Computing both ends independently also covers a substitute who is later
# substituted off again: they get the time between the two events instead of
# the absolute minute at which they left.
# NOTE(review): players who never appear in a substitution are credited with
# the full match length. If the lineup files also list unused substitutes,
# those are over-counted; sendings-off are not considered either. TODO
# confirm against the data.
minutes_played = (np.where(np.isnan(left_at), full_time, left_at)
                  - np.where(np.isnan(entered_at), 0.0, entered_at))
if not np.isnan(minutes_played).any():
    # keep the integer dtype of `minute` when every value is known
    minutes_played = minutes_played.astype(match_lengths.dtype)

# assign all minutes played to the lineups_df
lineups_minutes = lineups_minutes.assign(minutes_played=minutes_played)
lineups_df = lineups_minutes.reset_index().set_index(['match_id', 'index'])