In [None]:
%matplotlib inline


# Using Statsbomb
Getting familiar with Statsbomb data


In [2]:
# Import the Sbopen class from mplsoccer; it is used to open the data.
from mplsoccer import Sbopen
# The first thing we have to do is open the data. We use the Sbopen parser available in mplsoccer.
parser = Sbopen()

## Alternate way to store SB data - Dict

In [11]:
# Build a dict holding the event data of every match listed in df_match.
# The dict is keyed by match_id; each value is itself a dict that stores the
# four dataframes returned by parser.event(match_id), in the order they are
# returned, under the keys 'event', 'related', 'freeze' and 'tactic'.
#
# NOTE: this cell requires `parser` and `df_match` to exist already. `df_match`
# is created in the "Match data" section further down, so run that cell first.
#
# A dict comprehension is used so that no loop variables leak into the notebook
# namespace (in particular, the builtin `id` is no longer shadowed).
df_matches = {
    match_id: dict(zip(('event', 'related', 'freeze', 'tactic'), parser.event(match_id)))
    for match_id in df_match['match_id']
}

In [28]:
# the shot_statsbomb_xg column of the event dataframe stored for match 22949
df_matches[22949]['event']['shot_statsbomb_xg']

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
3357   NaN
3358   NaN
3359   NaN
3360   NaN
3361   NaN
Name: shot_statsbomb_xg, Length: 3362, dtype: float64

In [22]:
# structure of the event dataframe stored for match 22949
df_matches[22949]['event'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3362 entries, 0 to 3361
Data columns (total 72 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              3362 non-null   object 
 1   index                           3362 non-null   int64  
 2   period                          3362 non-null   int64  
 3   timestamp                       3362 non-null   object 
 4   minute                          3362 non-null   int64  
 5   second                          3362 non-null   int64  
 6   possession                      3362 non-null   int64  
 7   duration                        2553 non-null   float64
 8   match_id                        3362 non-null   int64  
 9   type_id                         3362 non-null   int64  
 10  type_name                       3362 non-null   object 
 11  possession_team_id              3362 non-null   int64  
 12  possession_team_name            33

In [26]:
# preview of the event data for match 22949, keeping only rows whose type_name is 'Shot'
df_matches[22949]['event'].loc[lambda events: events['type_name'] == 'Shot'].head()

Unnamed: 0,id,index,period,timestamp,minute,second,possession,duration,match_id,type_id,...,foul_committed_advantage,dribble_overrun,ball_recovery_offensive,shot_open_goal,substitution_replacement_id,substitution_replacement_name,block_deflection,foul_committed_card_id,foul_committed_card_name,shot_one_on_one
47,61ba8fdd-7b2e-4967-8c3c-038cad3d44c1,48,1,00:01:45.395000,1,45,5,0.4137,22949,16,...,,,,,,,,,,
554,3473cc2a-c7b4-4c21-b4c5-3631b79d8ba8,555,1,00:13:55.191000,13,55,36,1.574044,22949,16,...,,,,,,,,,,
638,b987d5a3-0152-4919-8fcf-5ad26f4b2883,639,1,00:15:47.927000,15,47,40,0.926764,22949,16,...,,,,,,,,,,
674,cd838028-9847-4638-86d6-955feb6ec3e0,675,1,00:16:44.355000,16,44,43,0.576395,22949,16,...,,,,,,,,,,
738,7d9bced3-546a-4116-aded-67eeed94a1f5,739,1,00:17:46.017000,17,46,47,1.70309,22949,16,...,,,,,,,,,,


## Competition data
Using the *competition* method of the parser we can explore the competitions to find the one we are interested in.
The most important information for us is in *competition_id* (id of the competition) and *season_id*.
The first one is the key of a competition in the Statsbomb database, the second one is the key of a season
of this competition (for example WC 2018 would have a different *season_id* than WC 2014, but the same *competition_id*).



In [3]:
# open the competition data using the parser's competition method
df_competition = parser.competition()
# structure of data
df_competition.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   competition_id             74 non-null     int64 
 1   season_id                  74 non-null     int64 
 2   country_name               74 non-null     object
 3   competition_name           74 non-null     object
 4   competition_gender         74 non-null     object
 5   competition_youth          74 non-null     bool  
 6   competition_international  74 non-null     bool  
 7   season_name                74 non-null     object
 8   match_updated              74 non-null     object
 9   match_updated_360          56 non-null     object
 10  match_available_360        10 non-null     object
 11  match_available            74 non-null     object
dtypes: bool(2), int64(2), object(8)
memory usage: 6.1+ KB


## Match data
Using the *match* method of the parser we can explore the matches of a competition to find the match we are interested in.
To open it we need to know the *competition_id* (id of competition) and *season_id*.
We know that for the Women's World Cup *competition_id* is 72 and *season_id* is 30.
From this dataframe the most important information for us is provided in *match_id*,
*home_team_id* and *home_team_name*, and correspondingly *away_team_id* and *away_team_name*.



In [4]:
# open the match data using the parser's match method
# (competition_id and season_id select which competition and season to load)
df_match = parser.match(competition_id=72, season_id=30)
# structure of data
df_match.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 52 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   match_id                         52 non-null     int64         
 1   match_date                       52 non-null     datetime64[ns]
 2   kick_off                         52 non-null     datetime64[ns]
 3   home_score                       52 non-null     int64         
 4   away_score                       52 non-null     int64         
 5   match_status                     52 non-null     object        
 6   match_status_360                 52 non-null     object        
 7   last_updated                     52 non-null     datetime64[ns]
 8   last_updated_360                 52 non-null     datetime64[ns]
 9   match_week                       52 non-null     int64         
 10  competition_id                   52 non-null     int64         


## Lineup data
To check the lineups we use the *lineup* method. We do it for the England - Sweden WWC 2019 game, whose *game_id* is 69301
(you can check that in the *match_id* column of df_match). In this dataframe you will find all players who played in this game, their teams
and jersey numbers.
COMMENTED OUT BECAUSE OF CHANGE OF DATA FORMAT.



In [5]:
#opening data using lineup method
#df_lineup = parser.lineup(69301)
#structure of data
#df_lineup.info()

## Event data
The Statsbomb data that we will use the most during the course is event data.
Knowing *game_id* you can open all the events that occurred on the pitch.
In the event dataframe you will find events with additional information; we will mostly use this dataframe.
The tactics dataframe provides information about player positions on the pitch. The 'related' dataframe provides information
on events that were related to each other - for example a ball pass and the pressure applied. *df_freeze* consists of freeze
frames with player positions at the moment of shots. We will learn more about tracking data later in the course.
Below, an example of event data is presented.



In [6]:
# open the event data; the result of parser.event is unpacked into four dataframes
df_event, df_related, df_freeze, df_tactics = parser.event(69301)
# if you want only the event data you can use
#df_event = parser.event(69301)[0]
# structure of data
df_event.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3289 entries, 0 to 3288
Data columns (total 73 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              3289 non-null   object 
 1   index                           3289 non-null   int64  
 2   period                          3289 non-null   int64  
 3   timestamp                       3289 non-null   object 
 4   minute                          3289 non-null   int64  
 5   second                          3289 non-null   int64  
 6   possession                      3289 non-null   int64  
 7   duration                        2457 non-null   float64
 8   match_id                        3289 non-null   int64  
 9   type_id                         3289 non-null   int64  
 10  type_name                       3289 non-null   object 
 11  possession_team_id              3289 non-null   int64  
 12  possession_team_name            32

In [7]:
# structure of the tactics dataframe
df_tactics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   jersey_number     44 non-null     int64 
 1   match_id          44 non-null     int64 
 2   id                44 non-null     object
 3   player_id         44 non-null     int64 
 4   player_name       44 non-null     object
 5   position_id       44 non-null     int64 
 6   position_name     44 non-null     object
 7   event_tactics_id  44 non-null     int64 
dtypes: int64(5), object(3)
memory usage: 2.9+ KB


## 360 data
Statsbomb offers 360 data, which tracks not only the location of an event but also the players' locations. To open it we need
the id of the game. Later, we will also need the id of the event. In *df_frame* we find information on the players' positions (but only whether a player is a teammate, not all information about them)
and *df_visible* tells us which part of the pitch was tracked during an event.



In [8]:
# open the 360 data of a game; parser.frame is unpacked into two dataframes
df_frame, df_visible = parser.frame(3788741)

# exploring the data
df_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45737 entries, 0 to 45736
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   teammate  45737 non-null  bool   
 1   actor     45737 non-null  bool   
 2   keeper    45737 non-null  bool   
 3   match_id  45737 non-null  int64  
 4   id        45737 non-null  object 
 5   x         45737 non-null  float64
 6   y         45737 non-null  float64
dtypes: bool(3), float64(2), int64(1), object(1)
memory usage: 1.5+ MB


## Before you start
Run these lines in Spyder/Jupyter notebook and explore dataframes 
to get more familiar before you start working on the course.


In [10]:
# display the dataframe holding the visible area for each event
df_visible

Unnamed: 0,match_id,id,visible_area
0,3788741,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,"[0.0, 27.7649873094419, 19.2332231959735, 17.3..."
1,3788741,776e11e0-9e85-4226-9f9e-edefa3685288,"[34.5953633899854, 77.2328373069048, 6.5972315..."
2,3788741,bea4235d-7e40-461c-bb82-6d473f5bb324,"[34.5953633899854, 77.2328373069048, 6.5972315..."
3,3788741,84b9b798-0fbe-45bc-a4bf-3621959f29ce,"[35.4292984462803, 78.5704958529984, 7.7187164..."
4,3788741,6e2955cf-ad4f-4096-a4dd-053173a3802b,"[19.7831665231312, 80.0, 41.3426126170851, 0.0..."
...,...,...,...
3365,3788741,3a176721-5ca0-43d8-813c-482a2688c932,"[0.0, 80.0, 0.0, 30.7709117714099, 13.95558739..."
3366,3788741,98eaff6a-811b-4101-80ba-aa4bf7797d1f,"[0.0, 80.0, 0.0, 33.0363392144773, 12.95951717..."
3367,3788741,87f19c86-543d-4a7a-93bb-b576fe9be63d,"[0.0, 80.0, 0.0, 33.0363392144773, 12.95951717..."
3368,3788741,9a782a08-27d0-4d77-908f-fdfde7e5f3a4,"[104.1532652775, 67.3224942956356, 84.40446024..."
