In [2]:
import pandas as pd

from tqdm import tqdm
import warnings

from statsbombpy import sb

In [3]:
# Download the events of every match in StatsBomb's open data (limited to the
# first 21 competition-seasons, see the counter below) and combine them into a
# single dataframe called all_events.
warnings.filterwarnings("ignore")  # silences every warning for the rest of the session
counter = 0

# Collect one frame per match and concatenate once after the loop: calling
# pd.concat inside the loop copies all previously accumulated rows each time.
event_frames = []
for season in sb.competitions().iterrows():
    # season is an (index, row) pair. The ids are read by position, which assumes
    # the first two columns are the competition id and the season id -- TODO
    # confirm against the installed statsbombpy version.
    for match in tqdm(sb.matches(competition_id=season[1].iloc[0], season_id=season[1].iloc[1]).iterrows()):
        try:
            # Same positional assumption: the first column of a match row is its id.
            event_frames.append(sb.events(match_id=match[1].iloc[0], split=False, flatten_attrs=True))
        except Exception as e:
            # Best effort: report the failure and continue with the next match.
            print(e)
    # Stop once 21 seasons have been processed.
    # Remove the three lines below to scrape every season instead.
    counter += 1
    if counter > 20:
        break

# pd.concat raises on an empty list, so fall back to an empty frame when
# nothing could be downloaded.
all_events = pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame()

306it [06:27,  1.27s/it]
52it [01:25,  1.65s/it]
1it [00:04,  4.55s/it]
1it [00:03,  3.09s/it]
1it [00:01,  1.60s/it]
1it [00:02,  2.92s/it]
1it [00:01,  1.46s/it]
1it [00:01,  1.56s/it]
1it [00:01,  1.69s/it]
1it [00:01,  1.59s/it]
1it [00:01,  1.52s/it]
1it [00:01,  1.79s/it]
1it [00:01,  1.78s/it]
1it [00:01,  1.58s/it]
1it [00:01,  1.61s/it]
1it [00:01,  1.55s/it]
1it [00:01,  1.85s/it]
1it [00:01,  1.67s/it]
1it [00:01,  1.69s/it]
1it [00:01,  1.95s/it]
1it [00:01,  1.89s/it]


In [21]:
# Display the combined events frame as the cell output.
all_events

Unnamed: 0,ball_receipt_outcome,ball_recovery_recovery_failure,block_deflection,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,clearance_left_foot,clearance_right_foot,counterpress,...,goalkeeper_shot_saved_to_post,shot_saved_to_post,goalkeeper_lost_out,goalkeeper_success_in_play,shot_follows_dribble,half_start_late_video_start,player_off_permanent,pass_backheel,goalkeeper_lost_in_play,half_end_early_video_end
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1301001,,,,,,,,,,,...,,,,,,,,,,
1301002,,,,,,,,,,,...,,,,,,,,,,
1301003,,,,,,,,,,,...,,,,,,,,,,
1301004,,,,,,,,,,,...,,,,,,,,,,


In [22]:
# Keep only the shot events, i.e. the rows whose "shot_type" value is present.
all_shots = all_events.dropna(subset=["shot_type"])

In [23]:
# List the column names available on the shot rows.
all_shots.columns

Index(['ball_receipt_outcome', 'ball_recovery_recovery_failure',
       'block_deflection', 'carry_end_location', 'clearance_aerial_won',
       'clearance_body_part', 'clearance_head', 'clearance_left_foot',
       'clearance_right_foot', 'counterpress',
       ...
       'goalkeeper_shot_saved_to_post', 'shot_saved_to_post',
       'goalkeeper_lost_out', 'goalkeeper_success_in_play',
       'shot_follows_dribble', 'half_start_late_video_start',
       'player_off_permanent', 'pass_backheel', 'goalkeeper_lost_in_play',
       'half_end_early_video_end'],
      dtype='object', length=117)

In [29]:
# Extract the relevant attributes from the all_shots dataframe, one row per shot.
# Every column is derived from all_shots, so the result keeps its index.
data = pd.DataFrame({
    "period": all_shots["period"],
    "minute": all_shots["minute"],
    "possession": all_shots["possession"],
    "play_pattern": all_shots["play_pattern"],
    "position": all_shots["position"],
    # "location" holds a sequence per shot; elements 0 and 1 are split into x and y.
    "location_x": all_shots["location"].apply(lambda xy: xy[0]),
    "location_y": all_shots["location"].apply(lambda xy: xy[1]),
    "duration": all_shots["duration"],
    "technique": all_shots["shot_technique"],
    "body_part": all_shots["shot_body_part"],
    "type": all_shots["shot_type"],
    # Missing flags are treated as False. The explicit cast pins the dtype to
    # bool instead of relying on fillna's implicit (deprecated) downcasting.
    "first_time": all_shots["shot_first_time"].fillna(False).astype(bool),
    "open_goal": all_shots["shot_open_goal"].fillna(False).astype(bool),
    "one_on_one": all_shots["shot_one_on_one"].fillna(False).astype(bool),
    "statsbomb_xg": all_shots["shot_statsbomb_xg"],
    # Only the first two elements of the end location are kept.
    "end_location_x": all_shots["shot_end_location"].apply(lambda xy: xy[0]),
    "end_location_y": all_shots["shot_end_location"].apply(lambda xy: xy[1]),
    # Target label: True only when the outcome is exactly "Goal"
    # (a missing outcome compares unequal and yields False).
    "is_goal": all_shots["shot_outcome"].eq("Goal"),
})

In [30]:
# Display the extracted shot attributes as the cell output.
data

Unnamed: 0,period,minute,possession,play_pattern,position,location_x,location_y,duration,technique,body_part,type,first_time,open_goal,one_on_one,statsbomb_xg,end_location_x,end_location_y,is_goal
3692,1,4,7,Regular Play,Left Wing Back,108.1,31.2,0.597523,Half Volley,Left Foot,Open Play,True,False,False,0.087901,120.0,35.2,False
3693,1,6,11,From Throw In,Center Forward,110.9,42.6,0.636048,Volley,Right Foot,Open Play,True,False,False,0.160274,120.0,39.3,True
3694,1,8,15,From Throw In,Left Attacking Midfield,117.9,29.1,0.125672,Half Volley,Left Foot,Open Play,True,False,False,0.016036,118.1,30.2,False
3695,1,13,27,From Counter,Left Midfield,101.8,27.6,1.303079,Normal,Left Foot,Open Play,False,False,False,0.527759,120.0,39.0,True
3696,1,17,33,Regular Play,Center Forward,109.3,26.5,0.666134,Normal,Right Foot,Open Play,False,False,False,0.074020,118.8,36.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1300989,2,83,185,From Corner,Right Center Back,117.1,42.9,0.121192,Half Volley,Right Foot,Open Play,False,False,True,0.354868,118.4,42.5,False
1300990,2,83,187,From Keeper,Right Wing,100.0,55.3,0.741536,Normal,Left Foot,Open Play,False,False,False,0.039881,118.3,43.9,False
1300991,2,84,190,From Free Kick,Right Back,96.0,49.7,0.759679,Normal,Right Foot,Open Play,False,False,False,0.032110,120.0,47.0,False
1300992,2,85,193,From Throw In,Right Center Back,84.9,36.3,1.141500,Normal,Right Foot,Open Play,True,False,False,0.014643,120.0,48.2,False


In [31]:
# One-hot encode the categorical columns.
# With `columns=` given, pd.get_dummies drops each listed column and appends its
# indicator columns (named "<column>_<value>") after the remaining columns, in
# the order the columns are listed. `data` itself is left unmodified.
encoded_data = pd.get_dummies(
    data,
    columns=["play_pattern", "position", "technique", "body_part", "type"],
)

In [32]:
# Display the one-hot encoded frame as the cell output.
encoded_data

Unnamed: 0,period,minute,possession,location_x,location_y,duration,first_time,open_goal,one_on_one,statsbomb_xg,...,technique_Overhead Kick,technique_Volley,body_part_Head,body_part_Left Foot,body_part_Other,body_part_Right Foot,type_Corner,type_Free Kick,type_Open Play,type_Penalty
3692,1,4,7,108.1,31.2,0.597523,True,False,False,0.087901,...,False,False,False,True,False,False,False,False,True,False
3693,1,6,11,110.9,42.6,0.636048,True,False,False,0.160274,...,False,True,False,False,False,True,False,False,True,False
3694,1,8,15,117.9,29.1,0.125672,True,False,False,0.016036,...,False,False,False,True,False,False,False,False,True,False
3695,1,13,27,101.8,27.6,1.303079,False,False,False,0.527759,...,False,False,False,True,False,False,False,False,True,False
3696,1,17,33,109.3,26.5,0.666134,False,False,False,0.074020,...,False,False,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1300989,2,83,185,117.1,42.9,0.121192,False,False,True,0.354868,...,False,False,False,False,False,True,False,False,True,False
1300990,2,83,187,100.0,55.3,0.741536,False,False,False,0.039881,...,False,False,False,True,False,False,False,False,True,False
1300991,2,84,190,96.0,49.7,0.759679,False,False,False,0.032110,...,False,False,False,False,False,True,False,False,True,False
1300992,2,85,193,84.9,36.3,1.141500,True,False,False,0.014643,...,False,False,False,False,False,True,False,False,True,False


In [33]:
# Write the encoded frame to data.csv in the current working directory.
# index=False leaves the dataframe index out of the file.
encoded_data.to_csv('./data.csv', index=False)