# Bundesliga-Auswertung
https://www.kaggle.com/code/slehkyi/web-scraping-football-statistics-per-game-data/notebook das ist etwas ausführlicher

Quelle: https://www.sergilehkyi.com/web-scraping-advanced-football-statistics/ 

In [1]:

import pandas as pd
from bs4 import BeautifulSoup
import requests


In [2]:
# create urls for all seasons of all leagues
base_url = 'https://understat.com/league'
leagues = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'Ligue_1', 'RFPL']
seasons = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']

In [3]:
# Starting with latest data for Spanish league, because I'm a Barcelona fan
url = base_url+'/'+leagues[2]+'/'+seasons[7]
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")

# Based on the structure of the webpage, I found that data is in the JSON variable, under 'script' tags
scripts = soup.find_all('script')

In [13]:
import json

string_with_json_obj = ''

# Find data for teams
for el in scripts:
    if 'teamsData' in str(el):
      string_with_json_obj = str(el).strip()
      
#print(string_with_json_obj)

# strip unnecessary symbols and get only JSON data
ind_start = string_with_json_obj.index("('")+2
ind_end = string_with_json_obj.index("')")
json_data = string_with_json_obj[ind_start:ind_end]

json_data = json_data.encode('utf8').decode('unicode_escape')
#print(json_data)
aDict = json.loads(json_data)

for id in aDict.keys():    print (id,aDict[id]['title'])

117 Bayern Munich
119 Bayer Leverkusen
120 Hoffenheim
121 Augsburg
122 Hertha Berlin
125 Mainz 05
129 Borussia Dortmund
130 Borussia M.Gladbach
131 Wolfsburg
132 Eintracht Frankfurt
133 VfB Stuttgart
134 FC Cologne
135 Freiburg
136 RasenBallsport Leipzig
240 Union Berlin
262 Arminia Bielefeld
267 Greuther Fuerth
268 Bochum


In [14]:
# Get teams and their relevant ids and put them into separate dictionary
teams = {}
teams = {}
for id in aDict.keys():
    teams[id] = aDict[id]['title']
#for id in aDict.keys():    print (id,aDict[id]['title'],aDict[id]['history'])
columns = []
# Check the sample of values per each column
values = []
for id in aDict.keys():
    columns = list(aDict[id]['history'][0].keys())
    values = list(aDict[id]['history'][0].values())
    break
columns

['h_a',
 'xG',
 'xGA',
 'npxG',
 'npxGA',
 'ppda',
 'ppda_allowed',
 'deep',
 'deep_allowed',
 'scored',
 'missed',
 'xpts',
 'result',
 'date',
 'wins',
 'draws',
 'loses',
 'pts',
 'npxGD']

In [15]:
dataframes = {}
for id, team in teams.items():
    teams_data = []
    for row in aDict[id]['history']:
        teams_data.append(list(row.values()))
    
    df = pd.DataFrame(teams_data, columns=columns)
    dataframes[team] = df
    print('Added data for {}.'.format(team))

Added data for Bayern Munich.
Added data for Bayer Leverkusen.
Added data for Hoffenheim.
Added data for Augsburg.
Added data for Hertha Berlin.
Added data for Mainz 05.
Added data for Borussia Dortmund.
Added data for Borussia M.Gladbach.
Added data for Wolfsburg.
Added data for Eintracht Frankfurt.
Added data for VfB Stuttgart.
Added data for FC Cologne.
Added data for Freiburg.
Added data for RasenBallsport Leipzig.
Added data for Union Berlin.
Added data for Arminia Bielefeld.
Added data for Greuther Fuerth.
Added data for Bochum.


In [16]:
for team, df in dataframes.items():
    dataframes[team]['ppda_coef'] = dataframes[team]['ppda'].apply(lambda x: x['att']/x['def'] if x['def'] != 0 else 0)
    dataframes[team]['oppda_coef'] = dataframes[team]['ppda_allowed'].apply(lambda x: x['att']/x['def'] if x['def'] != 0 else 0)
#berechnen der Koeffizienten ppda und oppda
#https://totalfootballanalysis.com/data-analysis/data-analysis-ppda-its-definition-advantages-and-disadvantages

# Spalten zum Summieren und zum Mitteln zusammenfassen
cols_to_sum = ['xG', 'xGA', 'npxG', 'npxGA', 'deep', 'deep_allowed', 'scored', 'missed', 'xpts', 'wins', 'draws', 'loses', 'pts', 'npxGD']
cols_to_mean = ['ppda_coef', 'oppda_coef']

In [19]:
frames = []
for team, df in dataframes.items():
    sum_data = pd.DataFrame(df[cols_to_sum].sum()).transpose()
    mean_data = pd.DataFrame(df[cols_to_mean].mean()).transpose()
    final_df = sum_data.join(mean_data)
    final_df['team'] = team
    final_df['matches'] = len(df)
    frames.append(final_df)
  
full_stat = pd.concat(frames)
full_stat = full_stat[['team', 'matches', 'wins', 'draws', 'loses', 'scored', 'missed', 'pts', 'xG', 'npxG', 'xGA', 'npxGA', 'npxGD', 'ppda_coef', 'oppda_coef', 'deep', 'deep_allowed', 'xpts']]
full_stat.sort_values('pts', ascending=False, inplace=True)
full_stat.reset_index(inplace=True, drop=True)
full_stat['position'] = range(1,len(full_stat)+1)

full_stat['xG_diff'] = full_stat['xG'] - full_stat['scored']
full_stat['xGA_diff'] = full_stat['xGA'] - full_stat['missed']
full_stat['xpts_diff'] = full_stat['xpts'] - full_stat['pts']
cols_to_int = ['wins', 'draws', 'loses', 'scored', 'missed', 'pts', 'deep', 'deep_allowed']
full_stat[cols_to_int] = full_stat[cols_to_int].astype(int)
col_order = ['position','team', 'matches', 'wins', 'draws', 'loses', 'scored', 'missed', 'pts', 'xG', 'xG_diff', 'npxG', 'xGA', 'xGA_diff', 'npxGA', 'npxGD', 'ppda_coef', 'oppda_coef', 'deep', 'deep_allowed', 'xpts', 'xpts_diff']
full_stat = full_stat[col_order]
full_stat.columns = ['#', 'team', 'M', 'W', 'D', 'L', 'G', 'GA', 'PTS', 'xG', 'xG_diff', 'NPxG', 'xGA', 'xGA_diff', 'NPxGA', 'NPxGD', 'PPDA', 'OPPDA', 'DC', 'ODC', 'xPTS', 'xPTS_diff']
pd.options.display.float_format = '{:,.2f}'.format
full_stat

Unnamed: 0,#,team,M,W,D,L,G,GA,PTS,xG,...,xGA,xGA_diff,NPxGA,NPxGD,PPDA,OPPDA,DC,ODC,xPTS,xPTS_diff
0,1,Bayern Munich,34,24,5,5,97,37,77,99.91,...,38.61,1.61,37.09,59.02,8.41,18.69,522,162,78.83,1.83
1,2,Borussia Dortmund,34,22,3,9,85,52,69,65.65,...,47.67,-4.33,43.13,14.18,10.64,15.73,306,198,58.3,-10.7
2,3,Bayer Leverkusen,34,19,7,8,80,47,64,66.31,...,45.02,-1.98,41.98,19.78,12.97,13.36,283,232,59.23,-4.77
3,4,RasenBallsport Leipzig,34,17,7,10,72,37,58,67.4,...,44.38,7.38,42.1,18.47,12.0,15.36,357,166,60.59,2.59
4,5,Union Berlin,34,16,9,9,50,44,57,55.2,...,39.83,-4.17,38.31,13.1,15.44,12.01,173,177,58.28,1.28
5,6,Freiburg,34,15,10,9,58,46,55,57.73,...,49.08,3.08,47.57,5.55,12.35,12.4,241,247,52.93,-2.07
6,7,FC Cologne,34,14,10,10,52,49,52,52.3,...,53.09,4.09,52.33,-0.79,7.88,10.92,231,208,49.04,-2.96
7,8,Hoffenheim,34,13,7,14,58,60,46,59.77,...,57.27,-2.73,54.17,5.6,12.19,15.8,269,176,46.7,0.7
8,9,Mainz 05,34,13,7,14,50,45,46,52.08,...,43.41,-1.59,38.86,10.19,11.26,10.02,223,182,50.62,4.62
9,10,Borussia M.Gladbach,34,12,9,13,54,61,45,59.92,...,60.71,-0.29,56.92,-2.31,12.12,12.11,250,206,49.74,4.74
