In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import json
import requests
from pandas.io.json import json_normalize
import numpy as np
import pandas as pd

In [27]:
# Get list of matches we are interested in from StatsBomb's GitHub page
fawsl = 'https://raw.githubusercontent.com/statsbomb/open-data/master/data/matches/37.json'
nwsl = 'https://raw.githubusercontent.com/statsbomb/open-data/master/data/matches/49.json'

def get_json(url):
    r = requests.get(url)
    j = r.json()
    df = json_normalize(j, sep = "_")
    return df

fawsl_df = get_json(fawsl)
nwsl_df = get_json(nwsl)

matches = fawsl_df.append(nwsl_df)
match_list = matches['match_id'].tolist()

In [28]:
# Get event data for all matches in match_list
master = pd.DataFrame()
for match in match_list:
    temp = get_json('https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/%s.json' % match)
    temp['match_id'] = match
    master = master.append(temp)
    

In [29]:
# Optional save to avoid scraping again
master.to_csv('master.csv')

In [3]:
# Optional read csv
master = pd.read_csv('master.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Display raw data
def display_all(df):
    with pd.option_context("display.max_rows", 1000): 
        with pd.option_context("display.max_columns", 1000): 
            display(df.tail().transpose())

display_all(master)

Unnamed: 0,288785,288786,288787,288788,288789
Unnamed: 0,2493,2494,2495,2496,2497
50_50_outcome_id,,,,,
50_50_outcome_name,,,,,
bad_behaviour_card_id,,,,,
bad_behaviour_card_name,,,,,
ball_receipt_outcome_id,,,,,
ball_receipt_outcome_name,,,,,
ball_recovery_offensive,,,,,
ball_recovery_recovery_failure,,,,,
block_deflection,,,,,


In [5]:
# Create separate dataframes for shots and passes
shots = master[master['type_id'] == 16]
passes = master[master['type_id'] == 30]

In [6]:
# Get possession chains
# The 'possession' column contains an ID for each possession in the match
# E.g. the first few passes from kickoff would be possession=1, until the other team wins the ball
# Here I chose to loop through each value of 'possession' in the shots df and get all events associated with that possession
# from the master dataframe
# There is probably a better way to do this, so try to speed it up if you can!
chains = pd.DataFrame()
for item in list(shots['possession'].unique()):
    temp = master[master['possession'] == item]
    chains = chains.append(temp)
    
# Remove pressures and receptions (we don't need to know about these in this case)
chains = chains[chains['type_id'] != 17]
chains = chains[chains['type_id'] != 42]
chains = chains.reset_index().drop('level_0', axis=1)

# Create previous events column
# .shift() will take the value from the previous row in the df
chains['previous_event'] = chains['type_name'].shift()
chains['previous_event'] = chains['previous_event'].str.lower().str.replace(' ', '_').str.replace('*', '')
chains['previous_event'] = np.where(chains['previous_event'] == 'goal_keeper', 'goalkeeper', chains['previous_event'])

In [7]:
# Print a list of all items in the new 'pre_shot_events' column
pre_shot_events = list(chains[chains['type_id'] == 16]['previous_event'].unique())
print(pre_shot_events)

['pass', 'ball_recovery', 'duel', 'dribble', 'error', 'interception', 'foul_won', 'block', 'clearance', 'goalkeeper', '50/50', 'substitution', 'miscontrol', 'camera_off', 'camera_on', 'injury_stoppage', 'referee_ball-drop', 'tactical_shift', 'dispossessed']


In [8]:
# Create 'follows_event' for each event we care about
important_pre_shot = ['pass', 'ball_recovery', 'duel', 'interception', 'clearance', 'goalkeeper', 'miscontrol',
                      'error', 'block', 'dispossessed', '50/50', 'dribble']

for event in important_pre_shot:
    chains['follows_%s' % event] = np.where(chains['previous_event'] == event, 1, 0)

In [9]:
# Get a cumulative value for the 'duration' column for each possession chain
chains_filt = chains[['match_id', 'possession', 'duration']]
chains_filt.columns = ['match_id', 'possession', 'chain_duration']
chains_dur = chains_filt.pivot_table(index=['match_id', 'possession'], values='chain_duration', aggfunc='sum')
chains_dur.reset_index(inplace=True)

In [10]:
# Add 'chain_duration' to the shots df
shots = chains[chains['type_id'] == 16]
shots = shots.merge(chains_dur, on=['match_id', 'possession'])

In [11]:
# Add pass info to shots df
assisted_shots = shots[shots['shot_key_pass_id'].notna()]
assist_passes = passes[passes['pass_assisted_shot_id'].notna()]
# Convert pass body part
assist_passes['pass_body_part_name'] = np.where((assist_passes['pass_body_part_name'] == 'Right Foot')
                                 | (assist_passes['pass_body_part_name'] == 'Left Foot'), 'foot',
                                np.where(assist_passes['pass_body_part_name'] == 'Head', 'head', 'other'))
pass_column = [i for i in assist_passes.columns if i.startswith('pass')]
assist_passes = assist_passes[pass_column]
assisted_shots = assisted_shots.drop(pass_column, axis=1)
shot_pass = assisted_shots.merge(assist_passes, left_on='id', right_on='pass_assisted_shot_id', how='left')
other_shots = shots[shots['shot_key_pass_id'].isna()]
shots = other_shots.append(shot_pass)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [12]:
# Split out location into x, y
shots['location'] = shots['location'].astype(str).str.strip('[]')
shots['x_loc'] = shots['location'].str.split(', ').str.get(0).str.strip().astype(float)
shots['y_loc'] = shots['location'].str.split(', ').str.get(1).str.strip().astype(float)

In [13]:
# Recreate features from previous exercises

# Remove penalties
shots = shots[shots['shot_type_name'] != 'Penalty']

# Reset index
shots = shots.reset_index().drop('level_0', axis=1)

# Create goal column for labels
shots['goal'] = np.where(shots['shot_outcome_name'] == 'Goal', 1, 0)

# Create assisted flag
shots['assisted'] = np.where(shots['shot_key_pass_id'].isna(), 0, 1)

# Create columns for distance and angle
# In this dataset, the pitch is 120 units long and 80 units wide
# We will measure distance and angle from the centre of the goal at (120, 40) to the starting point of the shot
# Note we are using radians this time to help keep things consistent with pass_angle
shots['x_dist'] = 120 - shots['x_loc']
shots['y_dist'] = abs(40 - shots['y_loc'])
shots['distance'] = np.sqrt((shots['x_dist']**2 + shots['y_dist']**2))
shots['angle'] = np.arctan((shots['y_dist'] / shots['x_dist']))

# We would only want to differentiate between left and right foot if we already knew the player's weak foot
shots['shot_body_part_name'] = np.where((shots['shot_body_part_name'] == 'Right Foot')
                                 | (shots['shot_body_part_name'] == 'Left Foot'), 'foot',
                                np.where(shots['shot_body_part_name'] == 'Head', 'head', 'other'))

# Add new features

# Add build-up duration
shots['build_up_duration'] = shots['chain_duration'] - shots['duration']

# Add direct free kick flag
shots['direct_free_kick'] = np.where((shots['play_pattern_name'] == 'From Free Kick')
                                      & (shots['pass_assisted_shot_id'].isna()), 1, 0)

# Convert pass angle to positive values, since we aren't differentiating between clockwise/anti-clockwise
shots['pass_angle'] = np.where(shots['pass_angle'] < 0, shots['pass_angle'] * -1, shots['pass_angle'])

In [14]:
# Get freeze frame info
# The shot_freeze_frame column contains a list of dictionaries containing information about the location of each player
# at the time of the shot
# The code below adds shot id, x_loc and y_loc to the freeze_frame info
# https://stackoverflow.com/questions/50782047/python-sees-list-of-dicts-as-string-how-to-parse
from itertools import chain
from ast import literal_eval

freeze_frame = shots[['shot_freeze_frame', 'id', 'x_loc', 'y_loc']]
freeze_frame_df = pd.DataFrame(freeze_frame)
freeze_frame_df['shot_freeze_frame'] = freeze_frame_df['shot_freeze_frame'].apply(literal_eval)

# Add shot_id and shot x, y locations to each dict in list for every row
def add_shot_id(row):
    return [item.update({'shot_id': row['id'], 'shot_x_loc': row['x_loc'], 'shot_y_loc': row['y_loc']}) 
            for item in row['shot_freeze_frame']]

freeze_frame_df.apply(add_shot_id, axis=1)
freeze_frame_df.head()

Unnamed: 0,shot_freeze_frame,id,x_loc,y_loc
0,"[{'location': [119.0, 37.0], 'player': {'id': ...",aabcd43f-68ca-45f6-9d84-e97cb858a940,113.0,33.0
1,"[{'location': [117.0, 48.0], 'player': {'id': ...",0e6703a6-fe2a-4f0b-864c-1f85068c9b74,113.0,45.0
2,"[{'location': [91.0, 36.0], 'player': {'id': 1...",a93acb2a-3fd7-4fb9-aa0b-a8c2337cdbac,95.0,46.0
3,"[{'location': [101.0, 30.0], 'player': {'id': ...",701abd4b-93b0-4579-904f-bd8d5f40cf3b,95.0,47.0
4,"[{'location': [99.0, 61.0], 'player': {'id': 1...",534f07ec-fb85-4f85-8ad0-a4596a4c04e3,107.0,33.0


In [15]:
# Expand shot_freeze_frame into a new df
chained = chain.from_iterable(freeze_frame_df['shot_freeze_frame'])
ff_expanded = pd.DataFrame(list(chained))
ff_expanded.head()

Unnamed: 0,location,player,position,shot_id,shot_x_loc,shot_y_loc,teammate
0,"[119.0, 37.0]","{'id': 15709, 'name': 'Megan Walsh'}","{'id': 1, 'name': 'Goalkeeper'}",aabcd43f-68ca-45f6-9d84-e97cb858a940,113.0,33.0,False
1,"[104.0, 33.0]","{'id': 15705, 'name': 'Emily Donovan'}","{'id': 15, 'name': 'Left Center Midfield'}",aabcd43f-68ca-45f6-9d84-e97cb858a940,113.0,33.0,False
2,"[109.0, 33.0]","{'id': 15712, 'name': 'Nicola Cousins'}","{'id': 2, 'name': 'Right Back'}",aabcd43f-68ca-45f6-9d84-e97cb858a940,113.0,33.0,False
3,"[112.0, 39.0]","{'id': 15715, 'name': 'Ellie Mason'}","{'id': 5, 'name': 'Left Center Back'}",aabcd43f-68ca-45f6-9d84-e97cb858a940,113.0,33.0,False
4,"[113.0, 37.0]","{'id': 15713, 'name': 'Bonnie Horwood'}","{'id': 10, 'name': 'Center Defensive Midfield'}",aabcd43f-68ca-45f6-9d84-e97cb858a940,113.0,33.0,False


In [16]:
# Split out location into x, y
ff_expanded['location'] = ff_expanded['location'].astype(str).str.strip('[]')
ff_expanded['x_loc'] = ff_expanded['location'].str.split(', ').str.get(0).str.strip().astype(float)
ff_expanded['y_loc'] = ff_expanded['location'].str.split(', ').str.get(1).str.strip().astype(float)

In [17]:
# Add distance and angle to shooter
ff_expanded['player_x_dist'] = abs(ff_expanded['shot_x_loc'] - ff_expanded['x_loc'])
ff_expanded['player_y_dist'] = abs(ff_expanded['shot_y_loc'] - ff_expanded['y_loc'])
ff_expanded['player_distance_from_shooter'] = np.where(ff_expanded['player_x_dist'] < 0,
                                                    -1*np.sqrt((ff_expanded['player_x_dist']**2 + ff_expanded['player_y_dist']**2)),
                                                           np.sqrt((ff_expanded['player_x_dist']**2 + ff_expanded['player_y_dist']**2)))
ff_expanded['player_angle_to_shooter'] = np.arctan((ff_expanded['player_y_dist'] / ff_expanded['player_x_dist']))

  import sys


In [18]:
# Add shot_cone and player_in_cone
# Assume goalposts are at (120, 44) and (120, 36)

# Define function to calculate area of triangle formed by (x1, y1), (x2, y2) and (x3, y3) 
# https://www.geeksforgeeks.org/check-whether-a-given-point-lies-inside-a-triangle-or-not/
def area(x1, y1, x2, y2, x3, y3):
    return abs((x1 * (y2 - y3) + x2 * (y3 - y1) + x3 * (y1 - y2)) / 2.0)

def get_shot_cone(row):
    return area(row['shot_x_loc'], row['shot_x_loc'], 120, 44, 120, 36)

ff_expanded['shot_cone'] = ff_expanded.apply(get_shot_cone, axis=1)

# Define function to check whether a player (point P) is inside shot_cone (area of triangle formed by ABC)
def in_shot_cone(x1, y1, x2, y2, x3, y3, x, y): 
    # Calculate area of triangle ABC 
    A = area (x1, y1, x2, y2, x3, y3) 
    # Calculate area of triangle PBC  
    A1 = area (x, y, x2, y2, x3, y3)  
    # Calculate area of triangle PAC  
    A2 = area (x1, y1, x, y, x3, y3)   
    # Calculate area of triangle PAB  
    A3 = area (x1, y1, x2, y2, x, y) 
    # Check if sum of A1, A2 and A3 is same as A 
    if(A == A1 + A2 + A3): 
        return 1
    else: 
        return 0

def player_in_cone(row):
    return in_shot_cone(row['shot_x_loc'], row['shot_x_loc'], 120, 44, 120, 36, row['x_loc'], row['y_loc'])

ff_expanded['player_in_shot_cone'] = ff_expanded.apply(player_in_cone, axis=1)

# If player location equals shot location, set player_in_shot_cone to 1
ff_expanded['player_in_shot_cone'] = np.where(ff_expanded['player_angle_to_shooter'].isna(), 
                                              1, ff_expanded['player_in_shot_cone'])

# Set angle to 0 if na (happens when player location equals shot location)
ff_expanded['player_angle_to_shooter'] = ff_expanded['player_angle_to_shooter'].fillna(0)

In [19]:
# Add new features to main shot df
shot_cone = pd.pivot_table(ff_expanded, index=['shot_id'], values=['shot_cone']).reset_index()
shot_cone.columns = ['id', 'shot_cone']
players_in_cone = pd.pivot_table(ff_expanded, index=['shot_id'], values=['player_in_shot_cone'], aggfunc=np.sum).reset_index()
players_in_cone.columns = ['id', 'num_players_in_shot_cone']
min_distance = pd.pivot_table(ff_expanded[ff_expanded['teammate'] == True], index=['shot_id'],
                                          values=['player_distance_from_shooter'], aggfunc=np.min).reset_index()
min_distance.columns = ['id', 'distance_to_nearest_opponent']
shots = shots.merge(shot_cone, on='id', how='left')
shots = shots.merge(players_in_cone, on='id', how='left')
shots = shots.merge(min_distance, on='id', how='left')

In [24]:
# Filter shots to important columns only
feature_cols = ['duration', 'follows_50/50', 'follows_ball_recovery', 'follows_block', 
                'follows_clearance', 'follows_dispossessed', 'follows_dribble', 'follows_duel', 'follows_error', 
                'follows_goalkeeper', 'follows_interception', 'follows_miscontrol', 'follows_pass', 
                'pass_aerial_won', 'pass_angle', 'pass_body_part_name', 
                'pass_cross', 'pass_cut_back', 'pass_deflected', 'pass_height_name', 'pass_length', 
                'pass_switch', 'pass_through_ball', 'play_pattern_name', 'shot_aerial_won', 
                'shot_body_part_name', 'shot_first_time', 'shot_one_on_one', 'shot_open_goal', 
                'shot_technique_name', 'under_pressure', 'goal', 'distance', 'angle', 
                'build_up_duration', 'direct_free_kick', 'shot_cone', 'num_players_in_shot_cone', 
                'distance_to_nearest_opponent']

shots_final = shots[feature_cols]

In [25]:
# Check for missing values
print(shots_final.isnull().sum())

duration                           0
follows_50/50                      0
follows_ball_recovery              0
follows_block                      0
follows_clearance                  0
follows_dispossessed               0
follows_dribble                    0
follows_duel                       0
follows_error                      0
follows_goalkeeper                 0
follows_interception               0
follows_miscontrol                 0
follows_pass                       0
pass_aerial_won                 3078
pass_angle                       926
pass_body_part_name              926
pass_cross                      2723
pass_cut_back                   3048
pass_deflected                  3084
pass_height_name                 926
pass_length                      926
pass_switch                     2915
pass_through_ball               3018
play_pattern_name                  0
shot_aerial_won                 2935
shot_body_part_name                0
shot_first_time                 2693
s

In [26]:
# Handle missing values
# Note that we don't need to fix categorical variables, as they will be set to -1 automatically
bool_cols = ['pass_aerial_won', 'pass_cross', 'pass_cut_back', 'pass_deflected',
             'pass_switch', 'pass_through_ball', 'shot_aerial_won', 'shot_first_time', 'shot_one_on_one',
             'shot_open_goal', 'under_pressure']

# for true/false columns, set to either missing(false) = 0 and true=1
for col in bool_cols:
    shots_final[col] = np.where(shots_final[col].isna(), 0, 1)
    
# Add flag to note that values were previously missing for numeric columns
fix_numeric = ['pass_angle', 'pass_length', 'distance_to_nearest_opponent']
for col in fix_numeric:
    shots_final[col+'_na'] = pd.isnull(shots_final[col])

# For distance to nearest opponent, set missing values to maximum
shots_final['distance_to_nearest_opponent'] = shots_final['distance_to_nearest_opponent'].fillna(shots_final['distance_to_nearest_opponent'].max())

# For pass length and pass angle, set missing values to zero
shots_final['pass_length'] = shots_final['pass_length'].fillna(0)
shots_final['pass_angle'] = shots_final['pass_angle'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-v

In [27]:
# Save to csv
shots_final.to_csv('shots.csv', index=False)