<a href="https://colab.research.google.com/github/joaossmacedo/SoccerAnalysis/blob/main/notebooks/data/extraction/player_season.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!python3 -m pip install soccerdata
import soccerdata as sd
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
import pandas as pd

# Let wide stat tables show up to 100 columns before pandas truncates them.
pd.options.display.max_columns = 100

# Params

In [3]:
# Seasons to process: `st` is the first year and `ed` is one past the last,
# because the loading cell iterates over range(st, ed).
st, ed = 2017, 2025

# Stat table to process; it selects the sub-folder used by both paths below.
# Other values that have been toggled here:
#   'shooting', 'passing', 'passing_types', 'goal_shot_creation', 'defense',
#   'possession', 'playing_time', 'misc', 'keeper', 'keeper_adv'
stat_type = 'standard'

# Google Drive folders: per-season raw CSVs are read from `path_input`,
# the combined CSV is written to `path_output`.
path_input = (
  '/content/drive/My Drive/database/soccerdata/fbref/raw/'
  f'player_season_stats/{stat_type}/'
)
path_output = (
  '/content/drive/My Drive/database/soccerdata/fbref/raw_clean/'
  f'player_season_stats/{stat_type}/'
)


# Code

## Prepare Colab

In [4]:
import os

from google.colab import drive

# Mount Google Drive so the '/content/drive/My Drive/...' input and output
# folders configured in the params cell are reachable from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Utils

In [5]:
def flatten_columns(df):
  """Collapse multi-level column labels into single underscore-joined names.

  For every column, the labels of all header levels are joined with '_',
  skipping any label that starts with 'Unnamed' (e.g. the 'Unnamed: ...'
  placeholders pd.read_csv produces for blank header cells). A column whose
  labels are all placeholders ends up named ''.

  Works for any number of header levels, including a plain single-level
  Index. Non-string labels are converted with str() before being checked
  and joined.

  Args:
    df: DataFrame whose columns should be flattened. It is modified in place.

  Returns:
    The same DataFrame object, with `columns` replaced by the flat names.
  """
  # One array of labels per header level; zip(*...) then walks them column
  # by column regardless of how many levels there are.
  levels = [df.columns.get_level_values(i) for i in range(df.columns.nlevels)]

  cols = []
  for labels in zip(*levels):
    parts = [str(label) for label in labels]
    cols.append('_'.join(p for p in parts if not p.startswith('Unnamed')))

  df.columns = cols
  return df

## Getting data

In [6]:
# Read every available per-season CSV (three header rows each), flatten its
# column labels, and stack all seasons into a single frame.
frames = []

for y in range(st, ed):  # `ed` itself is not included
  print('-'*50)
  print(y, stat_type)

  path_input_stat_year_csv = f'{path_input}{y}/database.csv'
  if not os.path.exists(path_input_stat_year_csv):
    print('missing')
    continue

  df_year = pd.read_csv(path_input_stat_year_csv, header=[0, 1, 2])
  frames.append(flatten_columns(df_year))

# Fail with an explicit message instead of an AttributeError on None when
# no season file was found at all.
if not frames:
  raise FileNotFoundError(
    f'No database.csv found under {path_input} for years {st}..{ed - 1}'
  )

# Concatenate once, rather than re-copying the growing frame on every
# iteration; ignore_index=True yields a fresh 0..n-1 index.
df = pd.concat(frames, ignore_index=True)
df

--------------------------------------------------
2017 standard
--------------------------------------------------
2018 standard
--------------------------------------------------
2019 standard
--------------------------------------------------
2020 standard
--------------------------------------------------
2021 standard
--------------------------------------------------
2022 standard
--------------------------------------------------
2023 standard
--------------------------------------------------
2024 standard


Unnamed: 0,league,season,team,player,nation,pos,age,born,Playing Time_MP,Playing Time_Starts,Playing Time_Min,Playing Time_90s,Performance_Gls,Performance_Ast,Performance_G+A,Performance_G-PK,Performance_PK,Performance_PKatt,Performance_CrdY,Performance_CrdR,Expected_xG,Expected_npxG,Expected_xAG,Expected_npxG+xAG,Progression_PrgC,Progression_PrgP,Progression_PrgR,Per 90 Minutes_Gls,Per 90 Minutes_Ast,Per 90 Minutes_G+A,Per 90 Minutes_G-PK,Per 90 Minutes_G+A-PK,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG
0,ENG-Premier League,1718,Arsenal,Aaron Ramsey,WAL,MF,26.0,1990.0,24,21,1846,20.5,7,8,15,7,0,0,0,0,6.1,6.1,5.4,11.5,61.0,134.0,161.0,0.34,0.39,0.73,0.34,0.73,0.30,0.26,0.56,0.30,0.56
1,ENG-Premier League,1718,Arsenal,Ainsley Maitland-Niles,ENG,"DF,MF",19.0,1997.0,15,8,914,10.2,0,0,0,0,0,0,1,0,0.2,0.2,0.9,1.1,21.0,35.0,58.0,0.00,0.00,0.00,0.00,0.00,0.02,0.08,0.11,0.02,0.11
2,ENG-Premier League,1718,Arsenal,Alex Iwobi,NGA,"MF,FW",21.0,1996.0,26,22,1830,20.3,3,5,8,3,0,0,1,0,3.9,3.9,3.5,7.3,93.0,138.0,146.0,0.15,0.25,0.39,0.15,0.39,0.19,0.17,0.36,0.19,0.36
3,ENG-Premier League,1718,Arsenal,Alex Oxlade-Chamberlain,ENG,DF,23.0,1993.0,3,3,241,2.7,0,0,0,0,0,0,0,0,0.3,0.3,0.5,0.8,19.0,16.0,21.0,0.00,0.00,0.00,0.00,0.00,0.09,0.19,0.29,0.09,0.29
4,ENG-Premier League,1718,Arsenal,Alexandre Lacazette,FRA,FW,26.0,1991.0,32,26,2202,24.5,14,4,18,12,2,2,1,0,13.7,12.1,4.4,16.5,39.0,59.0,233.0,0.57,0.16,0.74,0.49,0.65,0.56,0.18,0.74,0.49,0.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22314,ITA-Serie A,2425,Venezia,Nicholas Pierini,ITA,MF,25.0,1998.0,2,0,83,0.9,0,0,0,0,0,0,0,0,0.0,0.0,0.1,0.1,1.0,2.0,4.0,0.00,0.00,0.00,0.00,0.00,0.04,0.10,0.13,0.04,0.13
22315,ITA-Serie A,2425,Venezia,Nunzio Lella,ITA,MF,24.0,2000.0,1,0,5,0.1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
22316,ITA-Serie A,2425,Venezia,Richie Sagrado,BEL,DF,20.0,2004.0,3,3,140,1.6,0,0,0,0,0,0,1,0,0.1,0.1,0.0,0.1,0.0,2.0,5.0,0.00,0.00,0.00,0.00,0.00,0.03,0.00,0.03,0.03,0.03
22317,ITA-Serie A,2425,Venezia,Ridgeciano Haps,SUR,DF,31.0,1993.0,25,14,1369,15.2,1,1,2,1,0,0,3,0,1.2,1.2,0.7,1.9,26.0,52.0,47.0,0.07,0.07,0.13,0.07,0.13,0.08,0.05,0.13,0.08,0.13


## Save

In [7]:
# Display the destination folder as a quick check before writing to it.
path_output

'/content/drive/My Drive/database/soccerdata/fbref/raw_clean/player_season_stats/standard/'

In [8]:
# Create the destination folder if it does not exist yet, then write the
# combined frame there as database.csv without the row index.
os.makedirs(path_output, exist_ok=True)
df.to_csv(f'{path_output}database.csv', index=False)