In [176]:
# Importing Librarys
import kaggle
import os
import pandas as pd
import requests
from sklearn.preprocessing import MinMaxScaler

# Where to save the data
data_path = 'data/football-database'
os.makedirs(data_path, exist_ok=True)

# Download the data + unzip
kaggle.api.dataset_download_files('technika148/football-database', path=data_path, unzip=True)

# check if the file was downloaded
if os.path.exists(os.path.join(data_path, 'appearances.csv')):
    print('Die Datei games.csv wurde erfolgreich heruntergeladen.')
else:
    print('Fehler beim Herunterladen der Datei.')




Dataset URL: https://www.kaggle.com/datasets/technika148/football-database
Die Datei games.csv wurde erfolgreich heruntergeladen.


In [186]:
# Laden des Datensatzes
file_path = 'data/football-database/appearances.csv'
data = pd.read_csv(file_path)

# Überblick über den Datensatz
data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356513 entries, 0 to 356512
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   gameID         356513 non-null  int64  
 1   playerID       356513 non-null  int64  
 2   goals          356513 non-null  int64  
 3   ownGoals       356513 non-null  int64  
 4   shots          356513 non-null  int64  
 5   xGoals         356513 non-null  float64
 6   xGoalsChain    356513 non-null  float64
 7   xGoalsBuildup  356513 non-null  float64
 8   assists        356513 non-null  int64  
 9   keyPasses      356513 non-null  int64  
 10  xAssists       356513 non-null  float64
 11  position       356513 non-null  object 
 12  positionOrder  356513 non-null  int64  
 13  yellowCard     356513 non-null  int64  
 14  redCard        356513 non-null  int64  
 15  time           356513 non-null  int64  
 16  substituteIn   356513 non-null  int64  
 17  substituteOut  356513 non-nul

Unnamed: 0,gameID,playerID,goals,ownGoals,shots,xGoals,xGoalsChain,xGoalsBuildup,assists,keyPasses,xAssists,position,positionOrder,yellowCard,redCard,time,substituteIn,substituteOut,leagueID
0,81,560,0,0,0,0.0,0.0,0.0,0,0,0.0,GK,1,0,0,90,0,0,1
1,81,557,0,0,0,0.0,0.106513,0.106513,0,1,0.106513,DR,2,0,0,82,222605,0,1
2,81,548,0,0,0,0.0,0.127738,0.127738,0,0,0.0,DC,3,0,0,90,0,0,1
3,81,628,0,0,0,0.0,0.106513,0.106513,0,0,0.0,DC,3,0,0,90,0,0,1
4,81,1006,0,0,0,0.0,0.021225,0.021225,0,0,0.0,DL,4,0,0,90,0,0,1


In [187]:
# Ein Überblick über die einzigartigen Werte in der "position"-Spalte
unique_positions = data['position'].unique()

unique_positions


array(['GK', 'DR', 'DC', 'DL', 'DMC', 'AMR', 'AMC', 'AML', 'FW', 'Sub',
       'MR', 'MC', 'ML', 'FWR', 'FWL', 'DMR', 'DML'], dtype=object)

In [198]:
# Beispielhafte Standardisierung der Positionsbezeichnungen
position_mapping = {
    'GK': 'Goalkeeper',
    'DR': 'Defender',
    'DC': 'Defender',
    'DL': 'Defender',
    'DMC': 'Midfielder',
    'DMR': 'Midfielder',
    'DML': 'Midfielder',
    'AMR': 'Midfielder',
    'AMC': 'Midfielder',
    'AML': 'Midfielder',
    'MC': 'Midfielder',
    'MR': 'Midfielder',
    'ML': 'Midfielder',
    'FW': 'Forward',
    'FWL': 'Forward',
    'FWR': 'Forward',
    'Sub': 'Substitute',
}

data['position'] = data['position'].replace(position_mapping)

# Normalisierung der numerischen Spalten
numeric_columns = ['goals', 'ownGoals', 'shots', 'xGoals', 'xGoalsChain', 'xGoalsBuildup', 'assists', 'keyPasses', 'xAssists', 'yellowCard', 'redCard', 'time', 'substituteIn', 'substituteOut']
scaler = MinMaxScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Berechnung eines neuen Merkmals
data['MinutesPerGoal'] = data['time'] / (data['goals'] + 1e-6)  # Hinzufügen einer kleinen Zahl, um Division durch Null zu vermeiden

# Überprüfen der finalen Daten
data.head()


Unnamed: 0,gameID,playerID,goals,ownGoals,shots,xGoals,xGoalsChain,xGoalsBuildup,assists,keyPasses,...,position,positionOrder,yellowCard,redCard,time,substituteIn,substituteOut,leagueID,position_encoded,MinutesPerGoal
0,81,560,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Goalkeeper,1,0.0,0.0,1.0,0.0,0.0,1,2,1000000.0
1,81,557,0.0,0.0,0.0,0.0,0.02108,0.029934,0.0,0.083333,...,Defender,2,0.0,0.0,0.910112,0.469525,0.0,1,0,910112.359551
2,81,548,0.0,0.0,0.0,0.0,0.025281,0.035899,0.0,0.0,...,Defender,3,0.0,0.0,1.0,0.0,0.0,1,0,1000000.0
3,81,628,0.0,0.0,0.0,0.0,0.02108,0.029934,0.0,0.0,...,Defender,3,0.0,0.0,1.0,0.0,0.0,1,0,1000000.0
4,81,1006,0.0,0.0,0.0,0.0,0.004201,0.005965,0.0,0.0,...,Defender,4,0.0,0.0,1.0,0.0,0.0,1,0,1000000.0


In [199]:
# Define a new variable, which we use for our analysis
position_goals = {}

for index, row in data.iterrows():
    position = row['position']
    goals = row['goals']
    
    if position not in position_goals:
        position_goals[position] = {'sum_goals': 0, 'count': 0}
    
    position_goals[position]['sum_goals'] += goals
    position_goals[position]['count'] += 1

# Berechnung des Durchschnitts
average_goals_per_position = {position: info['sum_goals'] / info['count'] for position, info in position_goals.items()}

average_goals_per_position

{'Goalkeeper': 3.943528669453427e-05,
 'Defender': 0.007227629255472269,
 'Midfielder': 0.021289132459345713,
 'Forward': 0.0641342438673686,
 'Substitute': 0.010391552673311242}