<a href="https://colab.research.google.com/github/joaossmacedo/SoccerAnalysis/blob/main/notebooks/data/cleaning/soccerdata_fbref_raw_clean_data_team_standard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!python3 -m pip install soccerdata
import soccerdata as sd
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
import pandas as pd
pd.set_option('display.max_columns', 100)

# Params

In [3]:
st = 2017
ed = 2025

stat_type = 'standard'
# stat_type = 'keeper'
# stat_type = 'keeper_adv'
# stat_type = 'shooting'
# stat_type = 'passing'
# stat_type = 'passing_types'
# stat_type = 'goal_shot_creation'
# stat_type = 'defense'
# stat_type = 'possession'
# stat_type = 'playing_time'
# stat_type = 'misc'

path_input = f'/content/drive/My Drive/database/soccerdata/fbref/raw/team_season_stats/{stat_type}/'
path_output = f'/content/drive/My Drive/database/soccerdata/fbref/raw_clean/team_season_stats/{stat_type}/'


# Code

## Prepare Colab

In [4]:
from google.colab import drive
import os

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Utils

In [5]:
rename_dict = {
  'players_used': 'team_qty_players_used',
 'Age': 'team_avg_age',
 'Poss': 'team_avg_poss',
 'Playing Time_MP': 'team_matches_played',
 'Playing Time_Starts': 'team_minutes_played',

 'Performance_Gls': 'team_goals',
 'Performance_Ast': 'team_assists',
 'Performance_G-PK': 'team_np_goals',
 'Performance_PK': 'team_penalties_converted',
 'Performance_PKatt': 'team_penalties_att',
 'Performance_CrdY': 'team_card_yellow',
 'Performance_CrdR': 'team_card_red',
 'Expected_xG': 'team_xG',
 'Expected_npxG': 'team_npxG',
 'Expected_xAG': 'team_xAG',
 'Progression_PrgC': 'carries_progressive',
 'Progression_PrgP': 'passes_progressive',
}
drop_list = ['Playing Time_Min', 'Playing Time_90s', 'Performance_G+A', 'Expected_npxG+xAG', 'Per 90 Minutes_Gls', 'Per 90 Minutes_Ast', 'Per 90 Minutes_G+A', 'Per 90 Minutes_G-PK', 'Per 90 Minutes_G+A-PK', 'Per 90 Minutes_xG', 'Per 90 Minutes_xAG', 'Per 90 Minutes_xG+xAG', 'Per 90 Minutes_npxG', 'Per 90 Minutes_npxG+xAG']

In [6]:
def flatten_columns(df):
  cols = []
  for v0, v1, v2 in zip(df.columns.get_level_values(0), df.columns.get_level_values(1), df.columns.get_level_values(2)):
    text = ''
    if not v0.startswith('Unnamed'):
      text += f'{v0}_'
    if not v1.startswith('Unnamed'):
      text += f'{v1}_'
    if not v2.startswith('Unnamed'):
      text += f'{v2}_'
    text = text[:-1]

    cols.append(text)

  df.columns = cols
  return df

## Getting data

In [7]:
df = None

for y in range(st, ed):
  print('-'*50)
  print(y, stat_type)

  path_input_stat_year_csv = f'{path_input}{y}/database.csv'
  if not os.path.exists(path_input_stat_year_csv):
    print('missing')
    continue

  df_year = pd.read_csv(path_input_stat_year_csv, header=[0, 1, 2])
  df_year = flatten_columns(df_year)

  if df is None:
    df = df_year
  else:
    df = pd.concat([df, df_year])

df = df.reset_index(drop=True)
df

--------------------------------------------------
2017 standard
--------------------------------------------------
2018 standard
--------------------------------------------------
2019 standard
--------------------------------------------------
2020 standard
--------------------------------------------------
2021 standard
--------------------------------------------------
2022 standard
--------------------------------------------------
2023 standard
--------------------------------------------------
2024 standard


Unnamed: 0,league,season,team,players_used,Age,Poss,Playing Time_MP,Playing Time_Starts,Playing Time_Min,Playing Time_90s,Performance_Gls,Performance_Ast,Performance_G+A,Performance_G-PK,Performance_PK,Performance_PKatt,Performance_CrdY,Performance_CrdR,Expected_xG,Expected_npxG,Expected_xAG,Expected_npxG+xAG,Progression_PrgC,Progression_PrgP,Per 90 Minutes_Gls,Per 90 Minutes_Ast,Per 90 Minutes_G+A,Per 90 Minutes_G-PK,Per 90 Minutes_G+A-PK,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG,url
0,ENG-Premier League,1718,Arsenal,30,26.8,61.4,38,418,3420,38,73,61,134,69,4,5,57,,68.3,64.3,53.3,117.6,915,2223,1.92,1.61,3.53,1.82,3.42,1.80,1.40,3.20,1.69,3.10,/en/squads/18bb7c10/2017-2018/Arsenal-Stats
1,ENG-Premier League,1718,Bournemouth,22,26.9,48.1,38,418,3420,38,45,36,81,42,3,3,57,,38.8,36.4,29.1,65.5,702,1526,1.18,0.95,2.13,1.11,2.05,1.02,0.77,1.79,0.96,1.72,/en/squads/4ba7cbea/2017-2018/Bournemouth-Stats
2,ENG-Premier League,1718,Brighton,24,27.5,43.7,38,418,3420,38,33,25,58,28,5,7,54,,37.0,31.8,26.2,58.0,523,1230,0.87,0.66,1.53,0.74,1.39,0.97,0.69,1.66,0.84,1.53,/en/squads/d07537b9/2017-2018/Brighton-and-Hov...
3,ENG-Premier League,1718,Burnley,24,27.5,43.7,38,418,3420,38,35,26,61,35,0,0,65,,32.3,32.3,23.6,55.9,431,1103,0.92,0.68,1.61,0.92,1.61,0.85,0.62,1.47,0.85,1.47,/en/squads/943e8050/2017-2018/Burnley-Stats
4,ENG-Premier League,1718,Chelsea,26,26.7,55.6,38,418,3420,38,60,42,102,57,3,3,46,,54.4,52.0,41.5,93.5,1045,1998,1.58,1.11,2.68,1.50,2.61,1.43,1.09,2.52,1.37,2.46,/en/squads/cff3d9bb/2017-2018/Chelsea-Stats
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,ITA-Serie A,2425,Parma,32,23.8,44.2,38,418,3420,38,43,31,74,37,6,7,78,6.0,43.1,37.6,29.3,66.9,578,1059,1.13,0.82,1.95,0.97,1.79,1.13,0.77,1.90,0.99,1.76,/en/squads/eab4234c/Parma-Stats
776,ITA-Serie A,2425,Roma,28,26.5,53.9,38,418,3420,38,55,35,90,47,8,8,66,1.0,53.0,46.7,35.0,81.7,691,1501,1.45,0.92,2.37,1.24,2.16,1.39,0.92,2.32,1.23,2.15,/en/squads/cf74a709/Roma-Stats
777,ITA-Serie A,2425,Torino,31,26.7,47.5,38,418,3420,38,36,21,57,34,2,4,80,3.0,34.6,31.6,22.5,54.0,529,1113,0.95,0.55,1.50,0.89,1.45,0.91,0.59,1.50,0.83,1.42,/en/squads/105360fe/Torino-Stats
778,ITA-Serie A,2425,Udinese,31,26.4,47.2,38,418,3420,38,41,28,69,39,2,4,88,5.0,38.2,35.0,27.2,62.1,566,1160,1.08,0.74,1.82,1.03,1.76,1.01,0.71,1.72,0.92,1.64,/en/squads/04eea015/Udinese-Stats


## Pre-process

In [8]:
df = df.rename(columns=rename_dict)
for c in drop_list:
  if not (c in df.columns):
    continue
  df = df.drop(c, axis=1)
df.head(2)

Unnamed: 0,league,season,team,team_qty_players_used,team_avg_age,team_avg_poss,team_matches_played,team_minutes_played,team_goals,team_assists,team_np_goals,team_penalties_converted,team_penalties_att,team_card_yellow,team_card_red,team_xG,team_npxG,team_xAG,carries_progressive,passes_progressive,url
0,ENG-Premier League,1718,Arsenal,30,26.8,61.4,38,418,73,61,69,4,5,57,,68.3,64.3,53.3,915,2223,/en/squads/18bb7c10/2017-2018/Arsenal-Stats
1,ENG-Premier League,1718,Bournemouth,22,26.9,48.1,38,418,45,36,42,3,3,57,,38.8,36.4,29.1,702,1526,/en/squads/4ba7cbea/2017-2018/Bournemouth-Stats


## Save

In [9]:
path_output

'/content/drive/My Drive/database/soccerdata/fbref/raw_clean/team_season_stats/standard/'

In [12]:
os.makedirs(path_output, exist_ok=True)
df.to_csv(path_output + 'database.csv', index=False)