In [3]:
import pandas as pd
import numpy as np

import soccerdata as sd

# Raise pandas' display limit so wide frames show up to 500 columns before being truncated.
pd.set_option('display.max_columns', 500)

# ELO data

In [4]:
# Premier League clubs.
# key   = team name as matched against the `team` field of the ClubElo data
# value = club name that replaces it in the exported CSV
premier_league_dict = {
    'Arsenal': 'Arsenal',
    'Aston Villa': 'Aston Villa',
    'Brighton': 'Brighton',
    'Burnley': 'Burnley',
    'Chelsea': 'Chelsea',
    'Crystal Palace': 'Crystal Palace',
    'Everton': 'Everton',
    'Fulham': 'Fulham',
    'Leeds': 'Leeds United',
    'Leicester': 'Leicester City',
    'Liverpool': 'Liverpool',
    'Man City': 'Manchester City',
    'Man United': 'Manchester Utd',
    'Newcastle': 'Newcastle Utd',
    'Sheffield United': 'Sheffield Utd',
    'Southampton': 'Southampton',
    'Tottenham': 'Tottenham',
    'West Brom': 'West Brom',
    'West Ham': 'West Ham',
    'Wolves': 'Wolves',
    'Brentford': 'Brentford',
    'Watford': 'Watford',
    'Norwich': 'Norwich City',
    "Nott'm Forest": "Nott'ham Forest",
    'Bournemouth': 'Bournemouth',
}

# Bundesliga clubs: team name matched against the ClubElo `team` field (key)
# -> club name written to the exported CSV (value).
# The variable name is a misspelling of "bundesliga"; it is kept because the
# merge loop further down this cell refers to it by this exact name.
budneslinga_dict = {
    'Bielefeld': 'Arminia',
    'Augsburg': 'Augsburg',
    'Bayern Munich': 'Bayern Munich',
    'Dortmund': 'Dortmund',
    'Ein Frankfurt': 'Eint Frankfurt',
    'Freiburg': 'Freiburg',
    'Hertha': 'Hertha BSC',
    'Hoffenheim': 'Hoffenheim',
    'FC Koln': 'Köln',
    'Leverkusen': 'Leverkusen',
    "M'gladbach": "M'Gladbach",
    'Mainz': 'Mainz 05',
    'RB Leipzig': 'RB Leipzig',
    'Schalke 04': 'Schalke 04',
    'Stuttgart': 'Stuttgart',
    'Union Berlin': 'Union Berlin',
    'Werder Bremen': 'Werder Bremen',
    'Wolfsburg': 'Wolfsburg',
    'Greuther Furth': 'Greuther Fürth',
    'Bochum': 'Bochum',
}

# Serie A clubs: team name matched against the ClubElo `team` field (key)
# -> club name written to the exported CSV (value).
serie_a_dict = {
    'Atalanta': 'Atalanta',
    'Benevento': 'Benevento',
    'Bologna': 'Bologna',
    'Cagliari': 'Cagliari',
    'Crotone': 'Crotone',
    'Fiorentina': 'Fiorentina',
    'Genoa': 'Genoa',
    'Verona': 'Hellas Verona',
    'Inter': 'Inter',
    'Juventus': 'Juventus',
    'Lazio': 'Lazio',
    'Milan': 'Milan',
    'Napoli': 'Napoli',
    'Parma': 'Parma',
    'Roma': 'Roma',
    'Sampdoria': 'Sampdoria',
    'Sassuolo': 'Sassuolo',
    'Spezia': 'Spezia',
    'Torino': 'Torino',
    'Udinese': 'Udinese',
    'Salernitana': 'Salernitana',
    'Venezia': 'Venezia',
    'Empoli': 'Empoli',
    'Monza': 'Monza',
    'Lecce': 'Lecce',
    'Cremonese': 'Cremonese',
}

# La Liga clubs: team name matched against the ClubElo `team` field (key)
# -> club name written to the exported CSV (value).
la_liga_dict = {
    'Alaves': 'Alavés',
    'Ath Bilbao': 'Athletic Club',
    'Ath Madrid': 'Atlético Madrid',
    'Barcelona': 'Barcelona',
    'Betis': 'Betis',
    'Celta': 'Celta Vigo',
    'Cadiz': 'Cádiz',
    'Eibar': 'Eibar',
    'Elche': 'Elche',
    'Getafe': 'Getafe',
    'Granada': 'Granada',
    'Huesca': 'Huesca',
    'Levante': 'Levante',
    'Osasuna': 'Osasuna',
    'Real Madrid': 'Real Madrid',
    'Sociedad': 'Real Sociedad',
    'Sevilla': 'Sevilla',
    'Valencia': 'Valencia',
    'Valladolid': 'Valladolid',
    'Villarreal': 'Villarreal',
    'Espanol': 'Espanyol',
    'Mallorca': 'Mallorca',
    'Vallecano': 'Rayo Vallecano',
    'Almeria': 'Almería',
    'Girona': 'Girona',
}

# Primeira Liga clubs: team name matched against the ClubElo `team` field (key)
# -> club name written to the exported CSV (value).
primeira_liga_dict = {
    'Belenenses': 'B-SAD',
    'Benfica': 'Benfica',
    'Boavista': 'Boavista',
    'Sp Braga': 'Braga',
    'Famalicao': 'Famalicão',
    'Farense': 'Farense',
    'Gil Vicente': 'Gil Vicente FC',
    'Maritimo': 'Marítimo',
    'Moreirense': 'Moreirense',
    'Nacional': 'Nacional',
    'Pacos Ferreira': 'Paços',
    'Portimonense': 'Portimonense',
    'Porto': 'Porto',
    'Rio Ave': 'Rio Ave',
    'Santa Clara': 'Santa Clara',
    'Sp Lisbon': 'Sporting CP',
    'Tondela': 'Tondela',
    'Guimaraes': 'Vitória',
    'Arouca': 'Arouca',
    'Estoril': 'Estoril',
    'Vizela': 'Vizela',
    'Casa Pia': 'Casa Pia',
    'Chaves': 'Chaves',
}


# Merge the five per-league tables into one lookup: source team name -> exported club name.
# If a key appears in more than one league table, the later table wins.
# The loop variable is deliberately not called `dict`: binding that name at
# notebook scope would hide the built-in type from every cell executed afterwards.
club_names = {}
for league_names in [premier_league_dict, budneslinga_dict, serie_a_dict, la_liga_dict, primeira_liga_dict]:
    club_names.update(league_names)


# Snapshot dates: the first day of every month from January 2020 to December 2023,
# formatted without zero padding (e.g. "2020-1-1").
years = np.arange(2020, 2024)
months = np.arange(1, 13)
dates = []
for year in years:
    dates.extend(f"{year}-{month}-1" for month in months)

# Read elo for months
elo = sd.ClubElo(no_store=True)

# Fetch every monthly snapshot first and concatenate once. Growing the frame
# with pd.concat inside the loop re-copies all previously collected rows on
# each iteration and relies on concatenating onto an empty seed DataFrame.
monthly_snapshots = [elo.read_by_date(date) for date in dates]

# In each snapshot `team` lives in the index (it only becomes a column through
# reset_index), and DataFrame.drop_duplicates() ignores the index. Move `team`
# into the columns *before* de-duplicating so that two different clubs whose
# remaining columns happen to be identical are both kept; rows repeated for the
# same club across consecutive months are still collapsed.
elo_scores = (
    pd.concat(monthly_snapshots)
    .reset_index()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Keep only the clubs listed in the translation table, then rename them.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the unfiltered frame.
# NOTE(review): teams whose name is not a key of club_names are dropped silently
# here - confirm ClubElo spells every club exactly as the keys do.
elo_scores = elo_scores[elo_scores['team'].isin(club_names.keys())].copy()
elo_scores['team'] = elo_scores['team'].map(club_names)

elo_scores.sort_values(["team", "from"]).to_csv("data/elo_scores.csv", sep=";")

# FBref data

In [None]:
# Leagues to download. The keys are the league identifiers handed to soccerdata.
# The club lists stored as values are not read by any cell visible in this
# notebook - only the keys are iterated.
leagues_dict = {
    'ESP-La Liga': ['Real Madrid'],
    'ENG-Premier League': ['Chelsea', 'Man City'],
    'GER-Bundesliga': ['Bayern'],
    'ITA-Serie A': ['Milan', 'Napoli', 'Inter'],
    'POR-Primeira Liga': ['Benfica'],
}

In [None]:
def fbref_team_stats(stat_type, leagues_dict):
    """Download FBref team season stats for every league/season and save them as one CSV.

    For each of the seasons "20-21", "21-22" and "22-23" and for each key of
    ``leagues_dict`` a fresh ``sd.FBref`` reader is created, the requested team
    stat table is read, its column names are flattened, and all tables are
    stacked and written to ``data/fbref_team_season_stats_{stat_type}.csv``
    (``;``-separated).

    Args:
        stat_type: Forwarded to ``read_team_season_stats(stat_type=...)`` and
            embedded in the output file name.
        leagues_dict: Mapping whose keys are the league identifiers passed to
            ``sd.FBref(leagues=...)``. The values are not read.

    Returns:
        None. The result is only written to disk.
    """
    # Team season stats: collect one frame per (season, league) and concatenate
    # once at the end instead of re-copying a growing frame on every iteration.
    collected = []
    for season in ["20-21", "21-22", "22-23"]:
        for league in leagues_dict:
            fbref = sd.FBref(seasons=season, leagues=league, no_store=True)
            team_season_stats = fbref.read_team_season_stats(stat_type=stat_type).reset_index()
            # Flatten the column tuples: "<level0>_<level1>", or just "<level0>"
            # when the second level is empty; spaces become underscores.
            # Assumes every column label is a two-element tuple.
            team_season_stats.columns = [
                "_".join(col).replace(" ", "_") if col[1] != '' else col[0].replace(" ", "_")
                for col in team_season_stats.columns
            ]
            collected.append(team_season_stats)

    # An empty leagues_dict still produces an (empty) CSV rather than an error.
    fbref_team_season_stats = pd.concat(collected) if collected else pd.DataFrame()
    fbref_team_season_stats.to_csv(f"data/fbref_team_season_stats_{stat_type}.csv", sep=";")


def fbref_potistion_stats(stat_type, leagues_dict):
    """Aggregate FBref player season stats per team and position group and save them as one CSV.

    For each of the seasons "20-21", "21-22" and "22-23" and for each key of
    ``leagues_dict`` the player stat table is read, the ``player``, ``Nation``
    and ``Born`` columns are dropped, and players without a position are
    removed. For every position group (MF, DF, FW, GK) the per-team maximum and
    minimum of each remaining column is computed; the four results are joined
    side by side on league/season/team. Everything is stacked and written to
    ``data/fbref_position_season_stats_{stat_type}.csv`` (``;``-separated).

    The function name is misspelled ("potistion"); it is kept unchanged because
    callers use this name.

    Args:
        stat_type: Forwarded to ``read_player_season_stats(stat_type=...)`` and
            embedded in the output file name.
        leagues_dict: Mapping whose keys are the league identifiers passed to
            ``sd.FBref(leagues=...)``. The values are not read.

    Returns:
        None. The result is only written to disk.
    """
    key_columns = ["league", "season", "team"]

    # Positions season stats: one frame per (season, league), concatenated once
    # after the loops instead of re-copying a growing frame on every iteration.
    collected = []
    for season in ["20-21", "21-22", "22-23"]:
        for league in leagues_dict:
            fbref = sd.FBref(seasons=season, leagues=league, no_store=True)
            position_season_stats = fbref.read_player_season_stats(stat_type=stat_type).reset_index()
            # Flatten the column tuples: "<level0>_<level1>", or just "<level0>"
            # when the second level is empty; spaces become underscores.
            position_season_stats.columns = [
                "_".join(col).replace(" ", "_") if col[1] != '' else col[0].replace(" ", "_")
                for col in position_season_stats.columns
            ]
            position_season_stats = position_season_stats.drop(columns=["player", "Nation", "Born"])
            position_season_stats = position_season_stats[position_season_stats["Pos"].notna()]

            team_position_season_stats = None
            for position in ["MF", "DF", "FW", "GK"]:
                # Substring match: a player whose Pos value mentions several
                # positions is counted in each of those groups.
                plays_position = position_season_stats["Pos"].str.contains(position, regex=False, na=False)
                position_season_stats_pos_agg = (
                    position_season_stats[plays_position]
                    .drop(columns=["Pos"])
                    .groupby(key_columns)
                    .agg(["max", "min"])
                    .reset_index()
                )
                # Prefix aggregated columns with the position, e.g. "MF_<column>_max";
                # the grouping keys (empty second level) keep their plain names.
                position_season_stats_pos_agg.columns = [
                    position + "_" + "_".join(col) if col[1] != '' else col[0]
                    for col in position_season_stats_pos_agg.columns
                ]
                age_columns = [col for col in position_season_stats_pos_agg if "Age" in col]
                for col in age_columns:
                    # The first two characters are taken as the age in years (same
                    # slice as before). Casting to str first also accepts ages that
                    # arrive as plain numbers rather than strings.
                    position_season_stats_pos_agg[col] = (
                        position_season_stats_pos_agg[col].astype(str).str[:2].astype(int)
                    )
                if team_position_season_stats is None:
                    team_position_season_stats = position_season_stats_pos_agg
                else:
                    # Default (inner) join: a team that has no player in one of
                    # the position groups is dropped from the result.
                    team_position_season_stats = team_position_season_stats.merge(
                        position_season_stats_pos_agg, on=key_columns
                    )

            collected.append(team_position_season_stats)

    # An empty leagues_dict still produces an (empty) CSV rather than an error.
    fbref_position_season_stats = pd.concat(collected, axis=0) if collected else pd.DataFrame()
    fbref_position_season_stats.to_csv(f"data/fbref_position_season_stats_{stat_type}.csv", sep=";")

In [None]:
# Team stats: write one CSV per FBref team stat table.
team_stats_types = [
    'standard',
    'keeper',
    'keeper_adv',
    'shooting',
    'passing',
    'goal_shot_creation',
    'defense',
    'possession',
    'playing_time',
    'misc',
]
for stat_name in team_stats_types:
    fbref_team_stats(stat_name, leagues_dict)

# Team position stats: write one CSV per FBref player stat table,
# aggregated per team and position group.
player_stats_types = [
    'standard',
    'shooting',
    'passing',
    'goal_shot_creation',
    'defense',
    'possession',
    'playing_time',
    'misc',
]
for stat_name in player_stats_types:
    fbref_potistion_stats(stat_name, leagues_dict)

# Match History data

In [1]:
# Leagues to download match history for. The keys are the league identifiers
# handed to soccerdata. The club lists stored as values are not read by any
# cell visible in this notebook - only the keys are iterated.
leagues_dict = {
    'ESP-La Liga': ['Real Madrid'],
    'ENG-Premier League': ['Chelsea', 'Man City'],
    'GER-Bundesliga': ['Bayern'],
    'ITA-Serie A': ['Milan', 'Napoli', 'Inter'],
    'POR-Primeira Liga': ['Benfica'],
}

In [2]:
# Columns description: https://www.football-data.co.uk/notes.txt

# Dicts to translate club names.
# In this cell the keys are matched against the home_team / away_team values of
# the match-history data, and the values are the club names written to the CSV.

# Premier League clubs.
premier_league_dict = {
    'Arsenal': 'Arsenal',
    'Aston Villa': 'Aston Villa',
    'Brighton': 'Brighton',
    'Burnley': 'Burnley',
    'Chelsea': 'Chelsea',
    'Crystal Palace': 'Crystal Palace',
    'Everton': 'Everton',
    'Fulham': 'Fulham',
    'Leeds': 'Leeds United',
    'Leicester': 'Leicester City',
    'Liverpool': 'Liverpool',
    'Man City': 'Manchester City',
    'Man United': 'Manchester Utd',
    'Newcastle': 'Newcastle Utd',
    'Sheffield United': 'Sheffield Utd',
    'Southampton': 'Southampton',
    'Tottenham': 'Tottenham',
    'West Brom': 'West Brom',
    'West Ham': 'West Ham',
    'Wolves': 'Wolves',
    'Brentford': 'Brentford',
    'Watford': 'Watford',
    'Norwich': 'Norwich City',
    "Nott'm Forest": "Nott'ham Forest",
    'Bournemouth': 'Bournemouth',
}

# Bundesliga clubs: match-history team name (key) -> club name written to the CSV (value).
# The variable name is a misspelling of "bundesliga"; it is kept because the
# merge loop further down this cell refers to it by this exact name.
budneslinga_dict = {
    'Bielefeld': 'Arminia',
    'Augsburg': 'Augsburg',
    'Bayern Munich': 'Bayern Munich',
    'Dortmund': 'Dortmund',
    'Ein Frankfurt': 'Eint Frankfurt',
    'Freiburg': 'Freiburg',
    'Hertha': 'Hertha BSC',
    'Hoffenheim': 'Hoffenheim',
    'FC Koln': 'Köln',
    'Leverkusen': 'Leverkusen',
    "M'gladbach": "M'Gladbach",
    'Mainz': 'Mainz 05',
    'RB Leipzig': 'RB Leipzig',
    'Schalke 04': 'Schalke 04',
    'Stuttgart': 'Stuttgart',
    'Union Berlin': 'Union Berlin',
    'Werder Bremen': 'Werder Bremen',
    'Wolfsburg': 'Wolfsburg',
    'Greuther Furth': 'Greuther Fürth',
    'Bochum': 'Bochum',
}

# Serie A clubs: match-history team name (key) -> club name written to the CSV (value).
serie_a_dict = {
    'Atalanta': 'Atalanta',
    'Benevento': 'Benevento',
    'Bologna': 'Bologna',
    'Cagliari': 'Cagliari',
    'Crotone': 'Crotone',
    'Fiorentina': 'Fiorentina',
    'Genoa': 'Genoa',
    'Verona': 'Hellas Verona',
    'Inter': 'Inter',
    'Juventus': 'Juventus',
    'Lazio': 'Lazio',
    'Milan': 'Milan',
    'Napoli': 'Napoli',
    'Parma': 'Parma',
    'Roma': 'Roma',
    'Sampdoria': 'Sampdoria',
    'Sassuolo': 'Sassuolo',
    'Spezia': 'Spezia',
    'Torino': 'Torino',
    'Udinese': 'Udinese',
    'Salernitana': 'Salernitana',
    'Venezia': 'Venezia',
    'Empoli': 'Empoli',
    'Monza': 'Monza',
    'Lecce': 'Lecce',
    'Cremonese': 'Cremonese',
}

# La Liga clubs: match-history team name (key) -> club name written to the CSV (value).
la_liga_dict = {
    'Alaves': 'Alavés',
    'Ath Bilbao': 'Athletic Club',
    'Ath Madrid': 'Atlético Madrid',
    'Barcelona': 'Barcelona',
    'Betis': 'Betis',
    'Celta': 'Celta Vigo',
    'Cadiz': 'Cádiz',
    'Eibar': 'Eibar',
    'Elche': 'Elche',
    'Getafe': 'Getafe',
    'Granada': 'Granada',
    'Huesca': 'Huesca',
    'Levante': 'Levante',
    'Osasuna': 'Osasuna',
    'Real Madrid': 'Real Madrid',
    'Sociedad': 'Real Sociedad',
    'Sevilla': 'Sevilla',
    'Valencia': 'Valencia',
    'Valladolid': 'Valladolid',
    'Villarreal': 'Villarreal',
    'Espanol': 'Espanyol',
    'Mallorca': 'Mallorca',
    'Vallecano': 'Rayo Vallecano',
    'Almeria': 'Almería',
    'Girona': 'Girona',
}

# Primeira Liga clubs: match-history team name (key) -> club name written to the CSV (value).
primeira_liga_dict = {
    'Belenenses': 'B-SAD',
    'Benfica': 'Benfica',
    'Boavista': 'Boavista',
    'Sp Braga': 'Braga',
    'Famalicao': 'Famalicão',
    'Farense': 'Farense',
    'Gil Vicente': 'Gil Vicente FC',
    'Maritimo': 'Marítimo',
    'Moreirense': 'Moreirense',
    'Nacional': 'Nacional',
    'Pacos Ferreira': 'Paços',
    'Portimonense': 'Portimonense',
    'Porto': 'Porto',
    'Rio Ave': 'Rio Ave',
    'Santa Clara': 'Santa Clara',
    'Sp Lisbon': 'Sporting CP',
    'Tondela': 'Tondela',
    'Guimaraes': 'Vitória',
    'Arouca': 'Arouca',
    'Estoril': 'Estoril',
    'Vizela': 'Vizela',
    'Casa Pia': 'Casa Pia',
    'Chaves': 'Chaves',
}

# Column renames applied to the match-history frame: short source column name
# (key) -> descriptive column name used in the exported CSV (value).
# `league` and `season` are also selected from the source but have no entry
# here, so they keep their names.
description_dict = {
    'date': 'Match_Date',

    'home_team': 'Home_Team',
    'away_team': 'Away_Team',
    
    # Full-time score and result code (compared against 'H', 'A' and 'D' later in this cell)
    'FTHG': 'Full_Time_Home_Team_Goals',
    'FTAG': 'Full_Time_Away_Team_Goals',
    'FTR': 'Full_Time_Result',
    
    'HST': 'Home_Team_Shots_on_Target',
    'AST': 'Away_Team_Shots_on_Target',

    # Bet365 odds for home win / draw / away win
    'B365H': 'Bet365_home_win_odds',
    'B365D': 'Bet365_draw_odds',
    'B365A': 'Bet365_away_win_odds',
}


# Merge the five per-league tables into one lookup: match-history team name -> exported club name.
# If a key appears in more than one league table, the later table wins.
# The loop variable is deliberately not called `dict`: binding that name at
# notebook scope would hide the built-in type from every cell executed afterwards.
club_names = {}
for league_names in [premier_league_dict, budneslinga_dict, serie_a_dict, la_liga_dict, primeira_liga_dict]:
    club_names.update(league_names)

# Columns kept from the match-history source before renaming.
match_history_columns = [
    'league', 'season', 'date', 'home_team', 'away_team',
    'FTHG', 'FTAG', 'FTR', 'HST', 'AST',
    'B365H', 'B365D', 'B365A',
]

# Collect one frame per (season, league) and concatenate once after the loops;
# growing a frame with pd.concat inside the loop re-copies all earlier rows on
# every iteration and relies on concatenating onto an empty seed DataFrame.
collected = []
for season in ["20-21", "21-22", "22-23"]:
    for league in leagues_dict:
        mh = sd.MatchHistory(leagues=league, seasons=season, no_store=True)
        # Historic match results and betting odds
        hist = mh.read_games().reset_index()[match_history_columns]
        hist = hist.rename(columns=description_dict)

        # One-hot encode the full-time result code, then drop the code itself.
        full_time_result = hist['Full_Time_Result']
        hist['Result_Home_Team_Win'] = (full_time_result == 'H').astype(int)
        hist['Result_Away_Team_Win'] = (full_time_result == 'A').astype(int)
        hist['Result_Draw'] = (full_time_result == 'D').astype(int)
        hist = hist.drop(columns=['Full_Time_Result'])

        # Translate club names. A name that is not a key of club_names becomes
        # NaN here, because Series.map with a dict yields NaN for missing keys.
        hist['Home_Team'] = hist['Home_Team'].map(club_names)
        hist['Away_Team'] = hist['Away_Team'].map(club_names)

        collected.append(hist)

# An empty leagues_dict still produces an (empty) CSV rather than an error.
match_history_data = pd.concat(collected) if collected else pd.DataFrame()
match_history_data.to_csv("data/match_history_data.csv", sep=";")

NameError: name 'pd' is not defined

# FIFA 20, 21, 22 data

In [18]:
# Dicts to translate club names.
# In this cell the keys are matched against the `Club` column of the FIFA CSV
# files, and the values are the club names written to the output. Players whose
# club is not a key in any of these dicts are removed further down this cell.

# Premier League clubs.
premier_league_dict = {
    'Arsenal': 'Arsenal',
    'Aston Villa': 'Aston Villa',
    'Brighton & Hove Albion': 'Brighton',
    'Burnley': 'Burnley',
    'Chelsea': 'Chelsea',
    'Crystal Palace': 'Crystal Palace',
    'Everton': 'Everton',
    'Fulham': 'Fulham',
    'Leeds United': 'Leeds United',
    'Leicester City': 'Leicester City',
    'Liverpool': 'Liverpool',
    'Manchester City': 'Manchester City',
    'Manchester United': 'Manchester Utd',
    'Newcastle United': 'Newcastle Utd',
    'Sheffield United': 'Sheffield Utd',
    'Southampton': 'Southampton',
    'Tottenham Hotspur': 'Tottenham',
    'West Bromwich Albion': 'West Brom',
    'West Ham United': 'West Ham',
    'Wolverhampton Wanderers': 'Wolves',
    'Brentford': 'Brentford',
    'Watford': 'Watford',
    'Norwich City': 'Norwich City',
    'Nottingham Forest': "Nott'ham Forest",
    'AFC Bournemouth': 'Bournemouth',
}

# Bundesliga clubs: FIFA `Club` value (key) -> club name written to the output (value).
# The variable name is a misspelling of "bundesliga"; it is kept because the
# merge loop further down this cell refers to it by this exact name.
budneslinga_dict = {
    'DSC Arminia Bielefeld': 'Arminia',
    'FC Augsburg': 'Augsburg',
    'FC Bayern München': 'Bayern Munich',
    'Borussia Dortmund': 'Dortmund',
    'Eintracht Frankfurt': 'Eint Frankfurt',
    'SC Freiburg': 'Freiburg',
    'Hertha BSC': 'Hertha BSC',
    'TSG 1899 Hoffenheim': 'Hoffenheim',
    '1. FC Köln': 'Köln',
    'Bayer 04 Leverkusen': 'Leverkusen',
    'Borussia Mönchengladbach': "M'Gladbach",
    '1. FSV Mainz 05': 'Mainz 05',
    'RB Leipzig': 'RB Leipzig',
    'FC Schalke 04': 'Schalke 04',
    'VfB Stuttgart': 'Stuttgart',
    '1. FC Union Berlin': 'Union Berlin',
    'SV Werder Bremen': 'Werder Bremen',
    'VfL Wolfsburg': 'Wolfsburg',
    'SpVgg Greuther Fürth': 'Greuther Fürth',
    'VfL Bochum 1848': 'Bochum',
}

# Serie A clubs: FIFA `Club` value (key) -> club name written to the output (value).
serie_a_dict = {
    'Atalanta': 'Atalanta',
    'Benevento': 'Benevento',
    'Bologna': 'Bologna',
    'Cagliari': 'Cagliari',
    'Crotone': 'Crotone',
    'Fiorentina': 'Fiorentina',
    'Genoa': 'Genoa',
    'Hellas Verona': 'Hellas Verona',
    'Inter': 'Inter',
    'Juventus': 'Juventus',
    'Lazio': 'Lazio',
    'Milan': 'Milan',
    'Napoli': 'Napoli',
    'Parma': 'Parma',
    'Roma': 'Roma',
    'Sampdoria': 'Sampdoria',
    'Sassuolo': 'Sassuolo',
    'Spezia': 'Spezia',
    'Torino': 'Torino',
    'Udinese': 'Udinese',
    'US Salernitana 1919': 'Salernitana',
    'Venezia FC': 'Venezia',
    'Empoli': 'Empoli',
    'Monza': 'Monza',
    'Lecce': 'Lecce',
    'US Cremonese': 'Cremonese',
}

# La Liga clubs: FIFA `Club` value (key) -> club name written to the output (value).
la_liga_dict = {
    'Deportivo Alavés': 'Alavés',
    'Athletic Club de Bilbao': 'Athletic Club',
    'Atlético Madrid': 'Atlético Madrid',
    'FC Barcelona': 'Barcelona',
    'Real Betis': 'Betis',
    'RC Celta': 'Celta Vigo',
    'Cádiz CF': 'Cádiz',
    'SD Eibar': 'Eibar',
    'Elche CF': 'Elche',
    'Getafe CF': 'Getafe',
    'Granada CF': 'Granada',
    'SD Huesca': 'Huesca',
    'Levante UD': 'Levante',
    'CA Osasuna': 'Osasuna',
    'Real Madrid': 'Real Madrid',
    'Real Sociedad': 'Real Sociedad',
    'Sevilla FC': 'Sevilla',
    'Valencia CF': 'Valencia',
    'Real Valladolid CF': 'Valladolid',
    'Villarreal CF': 'Villarreal',
    'RCD Espanyol': 'Espanyol',
    'RCD Mallorca': 'Mallorca',
    'Rayo Vallecano': 'Rayo Vallecano',
    'UD Almería': 'Almería',
    'Girona FC': 'Girona',
}

# Primeira Liga clubs: FIFA `Club` value (key) -> club name written to the output (value).
primeira_liga_dict = {
    'Os Belenenses': 'B-SAD',
    'SL Benfica': 'Benfica',
    'Boavista FC': 'Boavista',
    'SC Braga': 'Braga',
    'Famalicão': 'Famalicão',
    'SC Farense': 'Farense',
    'Gil Vicente FC': 'Gil Vicente FC',
    'Clube Sport Marítimo': 'Marítimo',
    'Moreirense FC': 'Moreirense',
    'CD Nacional': 'Nacional',
    'FC Paços de Ferreira': 'Paços',
    'Portimonense SC': 'Portimonense',
    'FC Porto': 'Porto',
    'Rio Ave FC': 'Rio Ave',
    'Santa Clara': 'Santa Clara',
    'Sporting CP': 'Sporting CP',
    'CD Tondela': 'Tondela',
    'Vitória Guimarães': 'Vitória',
    'FC Arouca': 'Arouca',
    'Estoril Praia': 'Estoril',
    'FC Vizela': 'Vizela',
    'Casa Pia AC': 'Casa Pia',
    'GD Chaves': 'Chaves',
}

# Data source: https://www.kaggle.com/datasets/bryanb/fifa-player-stats-database

# Columns removed from every FIFA CSV right after loading. Every column that
# is left (other than the Club and season grouping keys) is aggregated with
# max/min/mean/median further down this cell.
cols_to_drop = [
    'Photo', 'Flag', 'Club Logo', 'Body Type', 'Real Face', 
    'Position', 'Jersey Number', 'Joined', 'Loaned From', 
    'Preferred Foot', 'Special', 'Work Rate', 'Height', 
    'Weight', 'Contract Valid Until', 'Nationality', 'Name', 
    'ID', 'Release Clause', 'Best Position', 'Value', 'Wage'
]

# Load each FIFA edition, discard the unused columns and tag every row with a season label.
# NOTE(review): FIFA20 is labelled '2021', FIFA21 '2122' and FIFA22 '2223' -
# presumably so each edition lines up with the following season; confirm the offset is intended.
FIFA20_data, FIFA21_data, FIFA22_data = [
    pd.read_csv(csv_path).drop(columns=cols_to_drop).assign(season=season_label)
    for csv_path, season_label in [
        ('raw/FIFA20_official_data.csv', '2021'),
        ('raw/FIFA21_official_data.csv', '2122'),
        ('raw/FIFA22_official_data.csv', '2223'),
    ]
]

# Merge the five per-league tables into one lookup: FIFA club name -> exported club name.
# If a key appears in more than one league table, the later table wins.
# The loop variable is deliberately not called `dict`: binding that name at
# notebook scope would hide the built-in type from every cell executed afterwards.
club_names = {}
for league_names in [premier_league_dict, budneslinga_dict, serie_a_dict, la_liga_dict, primeira_liga_dict]:
    club_names.update(league_names)


# Translate club names and keep only players whose club is in the lookup
# (clubs missing from club_names map to NaN and are removed).
FIFA20_data, FIFA21_data, FIFA22_data = (
    edition.assign(Club=edition['Club'].map(club_names)).dropna(subset=['Club'])
    for edition in (FIFA20_data, FIFA21_data, FIFA22_data)
)

# Stack the three editions and summarise every remaining column per club and season.
FIFA_data = pd.concat([FIFA20_data, FIFA21_data, FIFA22_data]).reset_index(drop=True)
FIFA_data = (
    FIFA_data
    .groupby(['Club', 'season'])
    .agg(['max', 'min', 'mean', 'median'])
    .reset_index()
)

# Flatten the (column, statistic) labels to "FIFA_<column>_<statistic>"; the
# grouping keys, whose statistic level is empty, keep their plain names.
# Spaces become underscores in both cases.
FIFA_data.columns = [
    ("FIFA_" + column + "_" + statistic).replace(' ', '_') if statistic != '' else column.replace(' ', '_')
    for column, statistic in FIFA_data.columns
]

FIFA_data.to_csv("data/FIFA_data.csv", sep=";")