<a href="https://colab.research.google.com/github/joaossmacedo/SoccerAnalysis/blob/main/notebooks/data/cleaning/shots.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!python3 -m pip install soccerdata
import soccerdata as sd
import seaborn as sns
import matplotlib.pyplot as plt

Collecting soccerdata
  Downloading soccerdata-1.8.7-py3-none-any.whl.metadata (5.6 kB)
Collecting Unidecode<2.0.0,>=1.2.0 (from soccerdata)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting cloudscraper<2.0.0,>=1.2.71 (from soccerdata)
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Collecting lxml<5.0.0,>=4.9.3 (from soccerdata)
  Downloading lxml-4.9.4-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Collecting undetected-chromedriver<4.0.0,>=3.5.0 (from soccerdata)
  Downloading undetected-chromedriver-3.5.5.tar.gz (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting unicode<3.0,>=2.7 (from soccerdata)
  Downloading unicode-2.9-py2.py3-none-any.whl.metadata (1.1 kB)
Collecting selenium>=4.9.0 (from undetected-chromedriver<4.0.0,>=3.5.0->soccerdata)
  Downloading selenium-4.33.0-py3-no

In [2]:
# Disabled by the author. NOTE(review): 'display.height' does not look like a
# current pandas option - confirm, then delete this line.
# pd.set_option('display.height', 500)
import pandas as pd
# Allow up to 100 columns in a displayed frame before pandas truncates it.
pd.set_option('display.max_columns', 100)

# Params

In [3]:
# Year folders to load: `st` is the first one, `ed` is exclusive
# (the load loop iterates over range(st, ed)).
st, ed = 2017, 2025

# Google Drive locations of the raw input and of the cleaned output.
path_input = '/content/drive/My Drive/database/soccerdata/fbref/raw/shot_events/'
path_output = '/content/drive/My Drive/database/soccerdata/fbref/raw_clean/shot_events/'

# Code

## Prepare colab

In [4]:
import os

from google.colab import drive

# Mount Google Drive first: the input and output paths both live under it.
drive.mount('/content/drive')

# Create the output directory; exist_ok makes re-running this cell harmless.
os.makedirs(path_output, exist_ok=True)

Mounted at /content/drive


## Utils

In [5]:
def flatten_columns(df):
  """Collapse multi-level column labels into single '_'-joined strings.

  Level values that start with 'Unnamed' (the placeholders pandas generates
  for blank header cells) are dropped; the remaining values are joined with
  '_'. Works for any number of header levels, not only three, and for
  non-string level values (they are converted with str()).

  Args:
    df: DataFrame whose columns should be flattened. Modified in place.

  Returns:
    The same DataFrame object, with flat string column names.
  """
  flat_names = []
  for key in df.columns:
    # A MultiIndex yields one tuple per column; a flat Index yields scalars.
    levels = key if isinstance(key, tuple) else (key,)
    kept = [str(value) for value in levels if not str(value).startswith('Unnamed')]
    flat_names.append('_'.join(kept))

  df.columns = flat_names
  return df

## Getting Data

In [6]:
# Read each year's raw shot-events CSV and stack them into a single frame.
# Frames are collected in a list and concatenated once, instead of growing
# `df` inside the loop (which re-copies all previous rows on every year).
frames_by_year = []
for y in range(st, ed):  # `ed` is exclusive
  print('-'*50)
  print(y)

  path_input_year = f'{path_input}{y}/'

  path_input_year_csv = f'{path_input_year}database.csv'
  if not os.path.exists(path_input_year_csv):
    # A year without a file is reported and skipped, not treated as an error.
    print('missing')
    continue

  # The raw files have a three-row header; read it as a column MultiIndex
  # and flatten it to plain string names.
  df_year = pd.read_csv(path_input_year_csv, header=[0, 1, 2])
  df_year = flatten_columns(df_year)
  frames_by_year.append(df_year)

if not frames_by_year:
  # Fail with an explicit message instead of an AttributeError on None below.
  raise FileNotFoundError(
    f'no database.csv found under {path_input} for years {st}..{ed - 1}'
  )

# ignore_index=True renumbers rows 0..n-1, like reset_index(drop=True).
df = pd.concat(frames_by_year, ignore_index=True)
df

--------------------------------------------------
2017
--------------------------------------------------
2018
--------------------------------------------------
2019
--------------------------------------------------
2020
--------------------------------------------------
2021
--------------------------------------------------
2022
--------------------------------------------------
2023
--------------------------------------------------
2024


Unnamed: 0,league,season,game,minute,player,team,xG,PSxG,outcome,distance,body_part,notes,SCA 1_player,SCA 1_event,SCA 2_player,SCA 2_event
0,ENG-Premier League,1718,2017-08-11 Arsenal-Leicester City,2,Alexandre Lacazette,Arsenal,0.06,0.37,Goal,13,Head,,Mohamed Elneny,Pass (Live),Héctor Bellerín,Pass (Live)
1,ENG-Premier League,1718,2017-08-11 Arsenal-Leicester City,4,Riyad Mahrez,Leicester City,0.08,,Off Target,17,Left Foot,,,,,
2,ENG-Premier League,1718,2017-08-11 Arsenal-Leicester City,5,Shinji Okazaki,Leicester City,0.42,0.49,Goal,3,Head,,Harry Maguire,Pass (Live),Marc Albrighton,Pass (Live)
3,ENG-Premier League,1718,2017-08-11 Arsenal-Leicester City,8,Alex Oxlade-Chamberlain,Arsenal,0.04,,Off Target,24,Left Foot,,Alex Oxlade-Chamberlain,Take-On,Mesut Özil,Pass (Live)
4,ENG-Premier League,1718,2017-08-11 Arsenal-Leicester City,14,Alex Oxlade-Chamberlain,Arsenal,0.03,0.35,Saved,25,Right Foot,,Mesut Özil,Pass (Live),Granit Xhaka,Pass (Live)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358019,ITA-Serie A,2425,2025-05-25 Venezia-Juventus,73,Manuel Locatelli (pen),Juventus,0.79,0.97,Goal,13,Right Foot,,Francisco Conceição,Fouled,Kenan Yıldız,Pass (Live)
358020,ITA-Serie A,2425,2025-05-25 Venezia-Juventus,78,Manuel Locatelli,Juventus,0.04,0.00,Off Target,24,Right Foot,,Francisco Conceição,Pass (Live),Nicolás González,Pass (Live)
358021,ITA-Serie A,2425,2025-05-25 Venezia-Juventus,82,Fali Candé,Venezia,0.05,0.00,Blocked,28,Left Foot,Free kick,Christian Gytkjær,Fouled,Ionuț Radu,Pass (Live)
358022,ITA-Serie A,2425,2025-05-25 Venezia-Juventus,84,Christian Gytkjær,Venezia,0.02,0.06,Saved,17,Head,,Gaetano Oristanio,Pass (Live),Kike Pérez,Pass (Live)


## Validating notes

In [7]:
# Show the five most frequent `notes` values to see which tags occur.
note_counts = df['notes'].value_counts()
display(note_counts.iloc[:5])

Unnamed: 0_level_0,count
notes,Unnamed: 1_level_1
Volley,40000
Free kick,13290
Deflected,7747
"Deflected, Volley",677
"Free kick, Deflected",274


## Splitting notes into columns

In [8]:
# Add one 0/1 column per note tag (is_volley, is_freekick, is_deflected).
# A tag counts as present when it is a substring of the row's `notes` text;
# rows whose `notes` value is not a string (e.g. missing) get 0.
for note in ('Volley', 'Free kick', 'Deflected'):
  flag_column = 'is_' + note.lower().replace(" ", "")
  has_note = df['notes'].map(lambda text: isinstance(text, str) and note in text)
  df[flag_column] = has_note.astype('int')
df.head(2)

Unnamed: 0,league,season,game,minute,player,team,xG,PSxG,outcome,distance,body_part,notes,SCA 1_player,SCA 1_event,SCA 2_player,SCA 2_event,is_volley,is_freekick,is_deflected
0,ENG-Premier League,1718,2017-08-11 Arsenal-Leicester City,2,Alexandre Lacazette,Arsenal,0.06,0.37,Goal,13,Head,,Mohamed Elneny,Pass (Live),Héctor Bellerín,Pass (Live),0,0,0
1,ENG-Premier League,1718,2017-08-11 Arsenal-Leicester City,4,Riyad Mahrez,Leicester City,0.08,,Off Target,17,Left Foot,,,,,,0,0,0


## Splitting game info

In [9]:
def _split_game(game, known_teams):
  """Split a '<date> <home>-<away>' label into (date, home, away).

  Team names can themselves contain '-' (e.g. 'Paris S-G', 'Saint-Étienne'),
  so taking the first two '-'-separated pieces truncates such names. Every
  hyphen is tried as the home/away separator and the first split whose two
  sides are both known team names wins.

  Args:
    game: game label, with the date separated from the matchup by one space.
    known_teams: set of team names used to validate a candidate split.

  Returns:
    Tuple (date, home_team, away_team), all strings.

  Raises:
    IndexError: if the label has no space or the matchup has no '-'.
  """
  date_and_matchup = game.split(' ', 1)
  game_date, matchup = date_and_matchup[0], date_and_matchup[1]
  pieces = matchup.split('-')
  for cut in range(1, len(pieces)):
    home, away = '-'.join(pieces[:cut]), '-'.join(pieces[cut:])
    if home in known_teams and away in known_teams:
      return game_date, home, away
  # No split matched two known teams: keep the previous behaviour
  # (first and second '-'-separated pieces).
  return game_date, pieces[0], pieces[1]


# Names in `team` are assumed to be spelled as in `game` - the displayed rows
# agree (e.g. 'Leicester City'); verify for other leagues.
known_teams = set(df['team'].dropna())

# Parse each distinct game label once instead of three times per shot row.
game_parts = {game: _split_game(game, known_teams) for game in df['game'].unique()}

for position, column in enumerate(['game_date', 'game_team_h', 'game_team_a']):
  df[column] = df['game'].map(lambda game: game_parts[game][position])
df.head(2)

Unnamed: 0,league,season,game,minute,player,team,xG,PSxG,outcome,distance,body_part,notes,SCA 1_player,SCA 1_event,SCA 2_player,SCA 2_event,is_volley,is_freekick,is_deflected,game_date,game_team_h,game_team_a
0,ENG-Premier League,1718,2017-08-11 Arsenal-Leicester City,2,Alexandre Lacazette,Arsenal,0.06,0.37,Goal,13,Head,,Mohamed Elneny,Pass (Live),Héctor Bellerín,Pass (Live),0,0,0,2017-08-11,Arsenal,Leicester City
1,ENG-Premier League,1718,2017-08-11 Arsenal-Leicester City,4,Riyad Mahrez,Leicester City,0.08,,Off Target,17,Left Foot,,,,,,0,0,0,2017-08-11,Arsenal,Leicester City


## Save

In [10]:
# Echo the output directory so the save destination is visible in the notebook.
path_output

'/content/drive/My Drive/database/soccerdata/fbref/raw_clean/shot_events/'

In [11]:
# Write the cleaned shots table; index=False keeps the row index out of the file.
df.to_csv(f'{path_output}database.csv', index=False)