In [3]:
import pandas as pd

from tqdm import tqdm
import warnings

from statsbombpy import sb

In [4]:
# Download the events of every match in StatsBomb's open data and store them
# in a single dataframe called all_events.

# NOTE: this silences every warning for the rest of the kernel session
# (presumably to hide statsbombpy's per-request warnings -- TODO confirm that
# nothing important is being masked in later cells).
warnings.filterwarnings("ignore")

# Number of competition-seasons to scrape, taken from the top of
# sb.competitions(). Set to None to scrape every available season.
MAX_SEASONS = 41

seasons = sb.competitions()
if MAX_SEASONS is not None:
    seasons = seasons.head(MAX_SEASONS)

# Collect the per-match frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies all accumulated rows on every match.
event_frames = []
for _, season in seasons.iterrows():
    # The first two columns of a competitions row are used as the
    # competition id and the season id.
    matches = sb.matches(competition_id=season.iloc[0], season_id=season.iloc[1])
    for _, match in tqdm(matches.iterrows(), total=len(matches)):
        # The first column of a matches row is used as the match id.
        match_id = match.iloc[0]
        try:
            event_frames.append(sb.events(match_id=match_id, split=False, flatten_attrs=True))
        except Exception as e:
            # Best effort: report the failing match and carry on with the rest.
            print(f"match {match_id}: {e}")

# pd.concat raises on an empty list, so fall back to an empty frame.
all_events = pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame()

306it [06:17,  1.23s/it]
52it [01:24,  1.62s/it]
1it [00:01,  1.88s/it]
1it [00:01,  1.71s/it]
1it [00:01,  1.82s/it]
1it [00:01,  1.97s/it]
1it [00:01,  1.90s/it]
1it [00:01,  1.57s/it]
1it [00:03,  3.36s/it]
1it [00:01,  1.66s/it]
1it [00:01,  1.77s/it]


In [5]:
# Keep only the shot events: the rows of all_events whose "shot_type" is set.
is_shot = all_events["shot_type"].notna()
all_shots = all_events.loc[is_shot]

In [6]:
# Show the column names available on the shot rows.
all_shots.columns

Index(['ball_receipt_outcome', 'ball_recovery_recovery_failure',
       'block_deflection', 'carry_end_location', 'clearance_aerial_won',
       'clearance_body_part', 'clearance_head', 'clearance_left_foot',
       'clearance_right_foot', 'counterpress',
       ...
       'goalkeeper_shot_saved_off_target', 'shot_saved_off_target',
       'goalkeeper_shot_saved_to_post', 'shot_saved_to_post',
       'goalkeeper_lost_out', 'goalkeeper_success_in_play',
       'shot_follows_dribble', 'half_start_late_video_start',
       'player_off_permanent', 'pass_backheel'],
      dtype='object', length=115)

In [12]:
# Extract the attributes of interest from all_shots into a new dataframe,
# `data`, which keeps the row index of all_shots. Columns are renamed to
# shorter names, and is_goal is derived from the shot outcome.
shot_location = all_shots["location"]

data = pd.DataFrame({
    "period": all_shots["period"],
    "minute": all_shots["minute"],
    "possession": all_shots["possession"],
    "play_pattern": all_shots["play_pattern"],
    "position": all_shots["position"],
    # Each location is a sequence; elements 0 and 1 become the x and y columns.
    "location_x": shot_location.apply(lambda loc: loc[0]),
    "location_y": shot_location.apply(lambda loc: loc[1]),
    "duration": all_shots["duration"],
    "technique": all_shots["shot_technique"],
    "body_part": all_shots["shot_body_part"],
    "type": all_shots["shot_type"],
    # A missing flag counts as False. eq(True) always yields a real boolean
    # column, without relying on fillna() silently downcasting an object
    # column (deprecated in recent pandas releases).
    "first_time": all_shots["shot_first_time"].eq(True),
    "open_goal": all_shots["shot_open_goal"].eq(True),
    "one_on_one": all_shots["shot_one_on_one"].eq(True),
    "statsbomb_xg": all_shots["shot_statsbomb_xg"],
    # True only when the recorded outcome is exactly "Goal".
    "is_goal": all_shots["shot_outcome"].eq("Goal"),
})

In [13]:
# Display the extracted shot attributes.
data

Unnamed: 0,period,minute,possession,play_pattern,position,location_x,location_y,duration,technique,body_part,type,first_time,open_goal,one_on_one,statsbomb_xg,is_goal
3692,1,4,7,Regular Play,Left Wing Back,108.1,31.2,0.597523,Half Volley,Left Foot,Open Play,True,False,False,0.087901,False
3693,1,6,11,From Throw In,Center Forward,110.9,42.6,0.636048,Volley,Right Foot,Open Play,True,False,False,0.160274,True
3694,1,8,15,From Throw In,Left Attacking Midfield,117.9,29.1,0.125672,Half Volley,Left Foot,Open Play,True,False,False,0.016036,False
3695,1,13,27,From Counter,Left Midfield,101.8,27.6,1.303079,Normal,Left Foot,Open Play,False,False,False,0.527759,True
3696,1,17,33,Regular Play,Center Forward,109.3,26.5,0.666134,Normal,Right Foot,Open Play,False,False,False,0.074020,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267467,2,68,135,From Corner,Right Wing,100.0,45.0,0.981924,Normal,Right Foot,Open Play,False,False,False,0.081895,True
1267468,2,70,136,From Kick Off,Secondary Striker,99.0,32.0,1.341121,Normal,Right Foot,Open Play,False,False,False,0.052839,False
1267469,2,84,166,Regular Play,Right Midfield,99.0,48.0,1.635364,Normal,Left Foot,Open Play,False,False,False,0.045260,False
1267470,2,90,175,Regular Play,Left Center Midfield,86.0,56.0,1.504703,Lob,Right Foot,Open Play,True,False,False,0.011287,False


In [14]:
# One-hot encode the categorical and boolean columns of `data`.
CATEGORICAL_COLUMNS = [
    "play_pattern",
    "position",
    "technique",
    "body_part",
    "type",
    "first_time",
    "open_goal",
    "one_on_one",
]

# A single get_dummies call replaces each listed column with indicator columns
# named "<column>_<value>", appended after the remaining columns in list order.
# It returns a new frame, so `data` itself is left untouched.
encoded_data = pd.get_dummies(data, columns=CATEGORICAL_COLUMNS)

In [15]:
# Display the one-hot encoded dataframe.
encoded_data

Unnamed: 0,period,minute,possession,location_x,location_y,duration,statsbomb_xg,is_goal,play_pattern_From Corner,play_pattern_From Counter,...,type_Corner,type_Free Kick,type_Open Play,type_Penalty,first_time_False,first_time_True,open_goal_False,open_goal_True,one_on_one_False,one_on_one_True
3692,1,4,7,108.1,31.2,0.597523,0.087901,False,False,False,...,False,False,True,False,False,True,True,False,True,False
3693,1,6,11,110.9,42.6,0.636048,0.160274,True,False,False,...,False,False,True,False,False,True,True,False,True,False
3694,1,8,15,117.9,29.1,0.125672,0.016036,False,False,False,...,False,False,True,False,False,True,True,False,True,False
3695,1,13,27,101.8,27.6,1.303079,0.527759,True,False,True,...,False,False,True,False,True,False,True,False,True,False
3696,1,17,33,109.3,26.5,0.666134,0.074020,False,False,False,...,False,False,True,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267467,2,68,135,100.0,45.0,0.981924,0.081895,True,True,False,...,False,False,True,False,True,False,True,False,True,False
1267468,2,70,136,99.0,32.0,1.341121,0.052839,False,False,False,...,False,False,True,False,True,False,True,False,True,False
1267469,2,84,166,99.0,48.0,1.635364,0.045260,False,False,False,...,False,False,True,False,True,False,True,False,True,False
1267470,2,90,175,86.0,56.0,1.504703,0.011287,False,False,False,...,False,False,True,False,False,True,True,False,True,False


In [16]:
# Write the encoded dataframe to a CSV file, leaving out the row index.
encoded_data.to_csv(path_or_buf='./data.csv', index=False)