In [1]:
import requests
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import time
import random
from datetime import datetime, timedelta
from rapidfuzz import process

### Scrapowanie meczów, które właśnie się odbyły

In [2]:
def scrap_match(url, season, league):
    data = {}
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        data["season"] = season
        data["league"] = league
        venuetime_span = soup.find('span', class_='venuetime')
        if venuetime_span:
            data["venue_date"] = venuetime_span.get('data-venue-date')
            data["venue_time"] = venuetime_span.get('data-venue-time')
        else:
            print("Date/Time information not found")

        matchweek_div = soup.select_one('div:-soup-contains("Matchweek")')
        if matchweek_div:
            data["round"] = int(re.search(r'Matchweek (\d+)', matchweek_div.text).group(1))
        else:
            print("Date/Time information not found")
        
        
        
        attendance_div = soup.select_one('div strong small:-soup-contains("Attendance")')
        if attendance_div:
            data["attendance_value"] = int((attendance_div.find_parent('strong').find_next_sibling().text).replace(",", ""))
        else:
            print("Attendance information not found")

        referee_span = soup.select_one('span:-soup-contains("(Referee)")')
        if referee_span:
            data["referee"] = referee_span.text.split()[0] + " " + referee_span.text.split()[1]
        else:
            print("Referee not found")

        manager_spans = soup.select('.datapoint:-soup-contains("Manager:")')
        if manager_spans[0]:
            data["home_manager"] = manager_spans[0].text.replace("Manager: ", "")
        if manager_spans[1]:
            data["away_manager"] = manager_spans[1].text.replace("Manager: ", "")
        else:
            print("Managers not found")

        captain_spans = soup.select('.datapoint:-soup-contains("Captain:")')
        if captain_spans[0]:
            data["home_captain"] = captain_spans[0].text.replace("Captain: ", "")
        if captain_spans[1]:
            data["away_captain"] = captain_spans[1].text.replace("Captain: ", "")
        else:
            print("Captains not found")

        data["formation_home"] = re.findall(r'\((.*?)\)', soup.select_one('#a.lineup th').text)[0]
        data["formation_away"] = re.findall(r'\((.*?)\)', soup.select_one('#b.lineup th').text)[0]

        data["home_possession"] = int(soup.select('#team_stats tr')[2].text.split()[0][:-1])/100
        data["away_possession"] = int(soup.select('#team_stats tr')[2].text.split()[1][:-1])/100

        home_lineup = []
        home_lineup_el = soup.select("#a.lineup tr")
        for el in home_lineup_el:
            txt = el.text
            if txt and txt[0].isdigit():
                home_lineup.append(txt.lstrip('0123456789').strip())

        away_lineup = []
        away_lineup_el = soup.select("#a.lineup tr")
        for el in away_lineup_el:
            txt = el.text
            if txt and txt[0].isdigit():
                away_lineup.append(txt.lstrip('0123456789').strip())

        data["home_lineup"] = home_lineup
        data["away_lineup"] = away_lineup

        
        teams = re.findall(r'(.+?)\s+vs\.\s+(.+?)\s+Match Report', soup.select_one('#content h1').text)
        data["home_team"], data["away_team"] = teams[0]

        table_names = [div.text.strip() for div in soup.select('.filter.switcher')[0].select('div')]

        tables = soup.select('.table_wrapper.tabbed')
        home_tables = tables[0].select('.stats_table')
        away_tables = tables[1].select('.stats_table')
        all_players_data = {}

        # home team scraping
        for i in range(len(home_tables)):
            table = home_tables[i]

            ths = table.select_one('tr:nth-of-type(2)').select('th')
            df_columns = [th.get('data-stat') for th in ths]

            rows = table.select('tbody tr')
            for row in rows:
                columns = row.select('th, td')
                player_name = columns[0].text.strip()
                if player_name not in all_players_data:
                    all_players_data[player_name] = {}

                all_players_data[player_name]["team"] = data["home_team"]
                all_players_data[player_name]["where"] = "home"
                all_players_data[player_name]["date"] = data["venue_date"]
                all_players_data[player_name]["round"] = data["round"]
                all_players_data[player_name]["season"] = season

                for j in range(len(columns)):
                    column_name = df_columns[j]
                    player_stat = columns[j].text.replace('\xa0\xa0\xa0', '').strip()
                    all_players_data[player_name][column_name] = player_stat
                    
                if len(all_players_data[player_name]["nationality"].split())>1:
                    all_players_data[player_name]["nationality"] = all_players_data[player_name]["nationality"].split()[1]
                else:
                    all_players_data[player_name]["nationality"] = "none"
            match_stat_tds = table.select('tfoot td')
            for td in match_stat_tds:
                data["home_" + td.get('data-stat')] = td.text.strip()

        # away team scraping
        for i in range(len(away_tables)):
            table = away_tables[i]

            ths = table.select_one('tr:nth-of-type(2)').select('th')
            df_columns = [th.get('data-stat') for th in ths]

            rows = table.select('tbody tr')
            for row in rows:
                columns = row.select('th, td')
                player_name = columns[0].text.strip()
                if player_name not in all_players_data:
                    all_players_data[player_name] = {}

                all_players_data[player_name]["team"] = data["away_team"]
                all_players_data[player_name]["where"] = "away"
                all_players_data[player_name]["date"] = data["venue_date"]
                all_players_data[player_name]["round"] = data["round"]
                all_players_data[player_name]["season"] = season

                for j in range(len(columns)):
                    column_name = df_columns[j]
                    player_stat = columns[j].text.replace('\xa0\xa0\xa0', '').strip()
                    all_players_data[player_name][column_name] = player_stat

                if len(all_players_data[player_name]["nationality"].split())>1:
                    all_players_data[player_name]["nationality"] = all_players_data[player_name]["nationality"].split()[1]
                else:
                    all_players_data[player_name]["nationality"] = "none"

            match_stat_tds = table.select('tfoot td')
            for td in match_stat_tds:
                data["away_" + td.get('data-stat')] = td.text.strip()

        return data, all_players_data
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None, None

In [6]:
def is_newer(date_str, last_date):
    target_date = datetime.strptime(date_str, "%Y-%m-%d")
    last_date = datetime.strptime(last_date, "%Y-%m-%d")
    
    return target_date >= last_date

def scrape_season(season):
    historic_matches = pd.read_csv("data/final_prepared_data_with_new.csv").sort_values(by=['date'])
    urls = []
    urls.append('https://fbref.com/en/comps/9/' + season + '/schedule/' + season + '-Premier-League-Scores-and-Fixtures')
    urls.append('https://fbref.com/en/comps/12/' + season + '/schedule/' + season + '-La-Liga-Scores-and-Fixtures')
    urls.append('https://fbref.com/en/comps/11/' + season + '/schedule/' + season + '-Serie-A-Scores-and-Fixtures')
    urls.append('https://fbref.com/en/comps/20/' + season + '/schedule/' + season + '-Bundesliga-Scores-and-Fixtures')
    urls.append('https://fbref.com/en/comps/13/' + season + '/schedule/' + season + '-Ligue-1-Scores-and-Fixtures')
    leagues = ["pl", "ll", "sa", "bl", "l1"]
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
    }
    links = []
    leagues_array = []
    for i in range(len(urls)):
        url = urls[i]
        league = leagues[i]
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            last_date = historic_matches[historic_matches["league"] == league]["date"].tail(1).iloc[0]
            print(last_date)
            soup = BeautifulSoup(response.content, 'html.parser')
            link_elements = soup.select('td[data-stat="match_report"] a')
            for el in link_elements:
                if el.text == "Match Report":
                    row = el.parent.parent
                    if is_newer(row.select_one('td[data-stat="date"] a').text, last_date):
                        links.append("https://fbref.com" + el.get('href') )
                        leagues_array.append(league)

        else:
            if response.status_code == 429:
                retry_after = response.headers.get('Retry-After')
        
            if retry_after:
                print(f"Rate limited. Retry after: {retry_after} seconds.")
            else:
                print("Rate limited but no 'Retry-After' header found")
            print(f"Failed to retrieve the page. Status code: {response.status_code}")

    all_teams_data = []
    all_players_data = []
    for i in range(len(links)):
        link = links[i]
        league = leagues_array[i]
        print(link)
        time.sleep(random.uniform(3, 5))
        teams_data, players_data = scrap_match(link, season, league)
        if not teams_data:
            return
        all_teams_data.append(teams_data)
        all_players_data.append(players_data)

    flattened_data = []
    for player_dict in all_players_data:
        for key, player_info in player_dict.items():
            flattened_data.append(player_info)

    df_players = pd.DataFrame(flattened_data)
    df_teams = pd.DataFrame(all_teams_data)

    return df_teams, df_players

In [7]:
updated_teams, updated_players = scrape_season("2024-2025")
historic_matches = pd.read_csv("data/final_prepared_data_with_new.csv").sort_values(by=['date'])
updated_teams.to_csv("data/latest_historic_matches.csv", index=False)
updated_teams = pd.read_csv("data/latest_historic_matches.csv")
updated_teams.rename(columns={"venue_date": "date", "venue_time": "time"}, inplace=True)
updated_teams = updated_teams.drop(columns = ["home_shirtnumber", "home_nationality", "home_position", "home_age", "away_shirtnumber", "away_nationality", "away_position", "away_age"])

updated_teams["home_goals"] = updated_teams["home_goals"] + updated_teams["away_own_goals"]
updated_teams["away_goals"] = updated_teams["away_goals"] + updated_teams["home_own_goals"]
updated_teams["total_goals"] = updated_teams["home_goals"] + updated_teams["away_goals"]
stadiums = pd.read_csv("final_data/stadiumsTop5.csv")
matches_with_stadiums = pd.merge(updated_teams, stadiums, left_on=['home_team'], right_on=['Team'], how='left')
def determine_outcome_and_points(row):
    if row['home_goals'] > row['away_goals']:
        return 1, 3, 0
    elif row['home_goals'] < row['away_goals']:
        return 2, 0, 3
    else:
        return 0, 1, 1
    
matches_with_stadiums[['outcome', 'home_points', 'away_points']] = matches_with_stadiums.apply(lambda row: pd.Series(determine_outcome_and_points(row)), axis=1)
historic_set = set(zip(historic_matches['date'], historic_matches['home_team']))
filtered_matches = [
    row for _, row in matches_with_stadiums.iterrows() 
    if (row['date'], row['home_team']) not in historic_set
]
filtered_matches = pd.DataFrame(filtered_matches, columns=matches_with_stadiums.columns)

2025-01-18
2025-01-18
2025-01-18
2025-01-18
2025-01-18
https://fbref.com/en/matches/99d11a39/Newcastle-United-Bournemouth-January-18-2025-Premier-League
https://fbref.com/en/matches/03ac4a9c/West-Ham-United-Crystal-Palace-January-18-2025-Premier-League
https://fbref.com/en/matches/5e8445c1/Brentford-Liverpool-January-18-2025-Premier-League
https://fbref.com/en/matches/bc3ae18e/Leicester-City-Fulham-January-18-2025-Premier-League
https://fbref.com/en/matches/1fdaaaba/Arsenal-Aston-Villa-January-18-2025-Premier-League
https://fbref.com/en/matches/45028d5b/Manchester-United-Brighton-and-Hove-Albion-January-19-2025-Premier-League
https://fbref.com/en/matches/e0f90407/Nottingham-Forest-Southampton-January-19-2025-Premier-League
https://fbref.com/en/matches/e9f61cb0/Everton-Tottenham-Hotspur-January-19-2025-Premier-League
https://fbref.com/en/matches/6c829b8f/Ipswich-Town-Manchester-City-January-19-2025-Premier-League
https://fbref.com/en/matches/e62cfa12/Chelsea-Wolverhampton-Wanderers-Janu

In [8]:
standings_historic = pd.read_csv("data/standings_with_new.csv").sort_values(["date"])
standings_historic["date"] = pd.to_datetime(standings_historic["date"])
cols = matches_with_stadiums.columns
attributes = [col.replace("home_", "") for col in cols if col.startswith("home_") and col != 'home_manager' and col != 'home_captain' and col != 'home_lineup' and col != 'home_team']

standings_columns = ['matches_played', 'wins', 'draws', 'defeats', 'goal_difference', 'goals_conceded'] + attributes
standings = []
matches_sorted = filtered_matches.sort_values(by=['league', 'season', 'date'])


for league in matches_sorted['league'].unique():
    league_data = matches_sorted[matches_sorted['league'] == league]
    for season in matches_sorted['season'].unique():
        season_data = league_data.loc[matches_sorted['season'] == season]
        teams = standings_historic[(standings_historic["season"] == season) & (standings_historic["league"] == league)]["team"].unique()
        
        standings_dict = {team: {attribute: standings_historic[standings_historic["team"] == team].tail(1)[attribute].iloc[0] for attribute in standings_columns} for team in teams}
        
        # tu można dodać punkty karne, z którymi drużyny zaczynają sezon, bardzo niewiele to zmieni ale jednak w teorii dokładniejszy model + żeby tabele w aplikacji się zgadzały!!!!!!
        
        for dt in season_data['date'].unique():
            round_data = season_data[season_data['date'] == dt]
            
            for index, row in round_data.iterrows():
                home_team = row['home_team']
                away_team = row['away_team']
                home_goals = row['home_goals']
                away_goals = row['away_goals']
                home_points = row['home_points']
                away_points = row['away_points']
                outcome = row['outcome']
                
                standings_dict[home_team]['matches_played'] += 1
                standings_dict[home_team]['goals_conceded'] += away_goals
                
                for attr in attributes:
                    standings_dict[home_team][attr] += row["home_" + attr]

                standings_dict[home_team]['goal_difference'] = standings_dict[home_team]['goals'] - standings_dict[home_team]['goals_conceded']
                
                standings_dict[away_team]['matches_played'] += 1
                standings_dict[away_team]['goals_conceded'] += home_goals
                
                for attr in attributes:
                    standings_dict[away_team][attr] += row["away_" + attr]

                standings_dict[away_team]['goal_difference'] = standings_dict[away_team]['goals'] - standings_dict[away_team]['goals_conceded']

                if outcome == 1:
                    standings_dict[home_team]['wins'] += 1
                    standings_dict[away_team]['defeats'] += 1
                elif outcome == 0:
                    standings_dict[home_team]['draws'] += 1
                    standings_dict[away_team]['draws'] += 1
                else:
                    standings_dict[home_team]['defeats'] += 1
                    standings_dict[away_team]['wins'] += 1

            temp_standings_data = {
            'league': league,
            'season': season,
            'date': dt,
            'team': list(standings_dict.keys()),
            }
            for attr in standings_columns:
                temp_standings_data[attr] = [standings_dict[team][attr] for team in standings_dict]

            temp_standings = pd.DataFrame(temp_standings_data)

            standings.append(temp_standings)
new_standings = pd.concat(standings).sort_values(by=['season', 'date', 'points', 'goal_difference', 'goals'], ascending=[True, True, False, False, False]).reset_index(drop=True)
standings = pd.concat([standings_historic, new_standings], ignore_index=True)
standings.to_csv("data/standings_with_new.csv", index=False)

In [9]:
def get_team_stats(season, team, matches_played):
    return standings[(standings["season"] == season) & (standings["team"] == team) & (standings["matches_played"] == matches_played)].iloc[0]


def calculate_rolling_stats(n, stats, stats_old, matches_played, real_matches_played):
    rolling_stats = {}
    for feature in standings_columns:
        if matches_played == 0:
            rolling_stats[feature] = stats[feature]
            if real_matches_played != 0:
                rolling_stats[feature] = rolling_stats[feature] / real_matches_played
        else:
            rolling_stats[feature] = stats[feature] - stats_old[feature]
            rolling_stats[feature] = rolling_stats[feature] / n
    return rolling_stats

def create_rolling_stats(n, df):
    home_stats_list = []
    away_stats_list = []
    for index, row in df.iterrows():
        date = row["date"]
        season = row["season"]
        home_team = row["home_team"]
        away_team = row["away_team"]

        home_team_matches_played = standings[(standings["date"] == date) & (standings["team"] == home_team)]["matches_played"].values[0]
        away_team_matches_played = standings[(standings["date"] == date) & (standings["team"] == away_team)]["matches_played"].values[0]

        home_team_stats = get_team_stats(season, home_team, home_team_matches_played)
        away_team_stats = get_team_stats(season, away_team, away_team_matches_played)

        home_team_matches_played_old = max(0, home_team_matches_played - n)
        away_team_matches_played_old = max(0, away_team_matches_played - n)

        if home_team_matches_played_old > 0:
            home_team_stats_old = get_team_stats(season, home_team, home_team_matches_played_old)
        if away_team_matches_played_old > 0:
            away_team_stats_old = get_team_stats(season, away_team, away_team_matches_played_old)

        home_rolling_stats = calculate_rolling_stats(n, home_team_stats, home_team_stats_old if home_team_matches_played_old > 0 else None, home_team_matches_played_old, home_team_matches_played)
        away_rolling_stats = calculate_rolling_stats(n, away_team_stats, away_team_stats_old if away_team_matches_played_old > 0 else None, away_team_matches_played_old, away_team_matches_played)

        home_stats_list.append({f"home_last{n}_{feature}": value for feature, value in home_rolling_stats.items()})
        away_stats_list.append({f"away_last{n}_{feature}": value for feature, value in away_rolling_stats.items()})

    home_stats_df = pd.DataFrame(home_stats_list, index=df.index)
    away_stats_df = pd.DataFrame(away_stats_list, index=df.index)

    return pd.concat([df, home_stats_df, away_stats_df], axis=1)

df = create_rolling_stats(n = 5, df = filtered_matches)

In [10]:
matches = df
matches = matches.fillna(0)
matches['date'] = pd.to_datetime(matches['date'])
matches["formation_home"] = matches["formation_home"].str.replace(r"-1-1$", "-2", regex=True)
matches["formation_away"] = matches["formation_away"].str.replace(r"-1-1$", "-2", regex=True)
matches["formation_home"] = matches["formation_home"].str.replace("4-1-2-1-2", "4-3-1-2", regex=True)
matches["formation_away"] = matches["formation_away"].str.replace("4-1-2-1-2", "4-3-1-2", regex=True)
matches["formation_back_line"] = matches["formation_home"].apply(lambda text: int(text[0] == "4"))

In [11]:
def calculate_tiredness_factor(days_since_last_match, decay_rate=0.1):
    return np.exp(-decay_rate * days_since_last_match)

matches['last_match_date'] = pd.NaT
matches['date'] = pd.to_datetime(matches['date'])
matches = matches.sort_values(by='date', ignore_index=True)
for i, row in matches.iterrows():
    home_team = row['home_team']
    away_team = row['away_team']
    
    last_home_date = pd.NaT
    last_away_date = pd.NaT

    home_team_last_manager = None
    away_team_last_manager = None
    
    for j in range(i-1, -1, -1):
        if matches.iloc[j]['home_team'] == home_team:
            last_home_date = matches.iloc[j]['date']
            home_team_last_manager = matches.iloc[j]["home_manager"]
            break  # Stop once the match is found
        if matches.iloc[j]['away_team'] == home_team:
            last_home_date = matches.iloc[j]['date']
            home_team_last_manager = matches.iloc[j]["away_manager"]
            break
    
    for j in range(i-1, -1, -1):
        if matches.iloc[j]['home_team'] == away_team:
            last_away_date = matches.iloc[j]['date']
            away_team_last_manager = matches.iloc[j]["home_manager"]
            break
        if matches.iloc[j]['away_team'] == away_team:
            last_away_date = matches.iloc[j]['date']
            away_team_last_manager = matches.iloc[j]["away_manager"]
            break

    home_matches_since_last_manager = None
    away_matches_since_last_manager = None
    if home_team_last_manager == None:
        home_team_last_manager = row["home_manager"]
        home_matches_since_last_manager = 20
    if away_team_last_manager == None:
        away_team_last_manager = row["away_manager"]
        away_matches_since_last_manager = 20

    if home_team_last_manager == row["home_manager"]:
        new_home_manager = False
    else:
        new_home_manager = True

    if away_team_last_manager and away_team_last_manager == row["away_manager"]:
        new_away_manager = False
    else:
        new_away_manager = True

    matches.at[i, 'last_match_date_home'] = last_home_date
    matches.at[i, 'last_match_date_away'] = last_away_date
    matches.at[i, 'last_home_manager'] = home_team_last_manager
    matches.at[i, 'last_away_manager'] = away_team_last_manager
    matches.at[i, 'new_home_manager'] = new_home_manager
    matches.at[i, 'new_away_manager'] = new_away_manager
    matches.at[i, 'matches_since_new_home_manager'] = home_matches_since_last_manager
    matches.at[i, 'matches_since_new_away_manager'] = away_matches_since_last_manager    

matches['days_since_last_home'] = (matches['date'] - matches['last_match_date_home']).dt.days
matches['days_since_last_away'] = (matches['date'] - matches['last_match_date_away']).dt.days
matches['days_since_last_home'] = matches['days_since_last_home'].fillna(7)
matches['days_since_last_away'] = matches['days_since_last_away'].fillna(7)
matches['home_tiredness'] = calculate_tiredness_factor(matches['days_since_last_home'])
matches['away_tiredness'] = calculate_tiredness_factor(matches['days_since_last_away'])

In [12]:
for i in range (len(matches)):
    home_team = matches.iloc[i]['home_team']
    away_team = matches.iloc[i]['away_team']

    new_home_manager = matches.iloc[i]["new_home_manager"]
    new_away_manager = matches.iloc[i]["new_away_manager"]

    if new_home_manager:
        matches.at[i, 'matches_since_new_home_manager'] = 0
    if new_away_manager:
        matches.at[i, 'matches_since_new_away_manager'] = 0

    for j in range(i+1, len(matches)):
        if matches.iloc[j]['home_team'] == home_team:
            matches.at[j, 'matches_since_new_home_manager'] = matches.iloc[i]["matches_since_new_home_manager"] + 1
            break
        if matches.iloc[j]['away_team'] == home_team:
            matches.at[j, 'matches_since_new_away_manager'] = matches.iloc[i]["matches_since_new_home_manager"] + 1
            break

    for j in range(i+1, len(matches)):
        if matches.iloc[j]['home_team'] == away_team:
            matches.at[j, 'matches_since_new_home_manager'] = matches.iloc[i]["matches_since_new_away_manager"] + 1
            break
        if matches.iloc[j]['away_team'] == away_team:
            matches.at[j, 'matches_since_new_away_manager'] = matches.iloc[i]["matches_since_new_away_manager"] + 1
            break

In [13]:
def get_h2h_metrics(row, df, num_matches=6):
    past_h2h = df[((df['home_team'] == row['home_team']) & (df['away_team'] == row['away_team'])) |
                  ((df['home_team'] == row['away_team']) & (df['away_team'] == row['home_team']))]
    past_h2h["date"] = pd.to_datetime(past_h2h["date"])
    past_h2h = past_h2h[past_h2h['date'] < row['date']].sort_values(by='date', ascending=False)
    
    past_h2h = past_h2h.head(num_matches)
    num_past_h2h = len(past_h2h)

    past_h2h_home_home = past_h2h[past_h2h["home_team"] == row["home_team"]]
    past_h2h_home_away = past_h2h[past_h2h["home_team"] == row["away_team"]]

    if not past_h2h.empty:
        metrics = {
            'h2h_win_ratio': ((past_h2h_home_home['outcome'] == 1).sum() + (past_h2h_home_away['outcome'] == 2).sum()) / num_past_h2h,
            'h2h_draw_ratio': (past_h2h['outcome'] == 0).sum() / num_past_h2h,
            'h2h_avg_goals_scored_home_team': (past_h2h_home_home['home_goals'].sum() + past_h2h_home_away['away_goals'].sum()) / num_past_h2h,
            'h2h_avg_goals_scored_away_team': (past_h2h_home_home['away_goals'].sum() + past_h2h_home_away['home_goals'].sum()) / num_past_h2h,
            'h2h_avg_xG_home_team': (past_h2h_home_home['home_xg'].sum() + past_h2h_home_away['away_xg'].sum()) / num_past_h2h,
            'h2h_avg_xG_away_team': (past_h2h_home_home['away_xg'].sum() + past_h2h_home_away['home_xg'].sum()) / num_past_h2h,
        }
    else:
        metrics = {
            'h2h_win_ratio': 0,
            'h2h_draw_ratio': 0,
            'h2h_avg_goals_scored_home_team': 0,
            'h2h_avg_goals_scored_away_team': 0,
            'h2h_avg_xG_home_team': 0,
            'h2h_avg_xG_away_team': 0,
        }

    return pd.Series(metrics)

matches = matches.join(matches.apply(lambda row: get_h2h_metrics(row, historic_matches), axis=1))

In [14]:
fifa1 = pd.read_csv("final_data/sofifa_players_17_18.csv")
fifa2 = pd.read_csv("final_data/sofifa_players_18_19.csv")
fifa3 = pd.read_csv("final_data/sofifa_players_19_20.csv")
fifa4 = pd.read_csv("final_data/sofifa_players_20_21.csv")
fifa5 = pd.read_csv("final_data/sofifa_players_21_22.csv")
fifa6 = pd.read_csv("final_data/sofifa_players_22_23.csv")
fifa7 = pd.read_csv("final_data/sofifa_players_23_24.csv")
fifa8 = pd.read_csv("final_data/sofifa_players_24_25.csv")
fifa = pd.concat([fifa1, fifa2, fifa3, fifa4, fifa5, fifa6, fifa7, fifa8], ignore_index=True)

In [15]:
import json
with open("names_mapping.json", "r") as file:
    mapping_dict = json.load(file)

In [16]:
updated_players["player_name"] = updated_players["player"].map(mapping_dict)
updated_players['date'] = pd.to_datetime(updated_players['date'])
players_joined = pd.merge(updated_players, fifa, left_on=["player_name","season"], right_on=["name", "season"], how="left")

for idx, row in players_joined[players_joined["name"].isnull()].iterrows():
    matching = fifa[fifa["name"] == row["player_name"]]
    if not matching.empty:
        for col in matching.columns:
            players_joined.loc[idx, col] = matching.iloc[0][col]

In [17]:
def get_starters(group):
    starters = []
    group = group.sort_index()
    used_indices = set()
    for idx, row in group.iterrows():
        if idx in used_indices:
            continue

        row["minutes"] = int(row["minutes"])

        if row['minutes'] == 90:
            starters.append(group.index.get_loc(idx))
            used_indices.add(idx)
        elif row['minutes'] < 90:
            starters.append(group.index.get_loc(idx))
            used_indices.add(idx)
            minutes_sum = row['minutes']
            next_row = row
            next_idx_global = idx
            while minutes_sum < 90 and int(next_row['cards_red']) < 1:
                next_idx = group.index.get_loc(next_idx_global) + 1
                next_idx_global = next_idx_global + 1
                if next_idx < len(group):
                    next_row = group.iloc[next_idx]
                    minutes_sum += int(next_row['minutes'])
                    if minutes_sum > 91:
                        starters.append(next_idx)
                    used_indices.add(next_idx_global)
                else:
                    minutes_sum = 90
                    
    group = group.iloc[starters]
    return group

In [18]:
player_stat_columns = [col for col in players_joined.columns if col.startswith('stat_') or col == "overall_rating"]
for _, match in matches.iterrows():
    match_date = match['date']
    home_team = match['home_team']
    away_team = match['away_team']
    
    home_players = get_starters(players_joined[(players_joined['team'] == home_team) & (players_joined['date'] == match_date)])
    away_players = get_starters(players_joined[(players_joined['team'] == away_team) & (players_joined['date'] == match_date)])
    
    home_stats_avg = home_players[player_stat_columns].mean()
    away_stats_avg = away_players[player_stat_columns].mean()
    
    for stat in player_stat_columns:
        matches.at[_, f'overall_home_{stat}'] = home_stats_avg[stat]
        matches.at[_, f'overall_away_{stat}'] = away_stats_avg[stat]

### Scrapowanie nowych meczów, które się jeszcze nie odbyły

In [19]:
updated_historic_matches = pd.concat([historic_matches, matches], ignore_index=True)
updated_historic_matches["date"] = pd.to_datetime(updated_historic_matches["date"])
updated_historic_matches.to_csv("data/final_prepared_data_with_new.csv", index=False)

In [22]:
historic_players = pd.read_csv("data/new_players.csv")
updated_historic_players = pd.concat([historic_players, updated_players], ignore_index=True)
updated_players.to_csv("data/new_players.csv", index=False)

In [23]:
def is_date_in_next_month(date_str):
    target_date = datetime.strptime(date_str, "%Y-%m-%d")
    
    now = datetime.now()
    
    one_month_later = now + timedelta(days=20)
    
    return now <= target_date <= one_month_later

def scrape_season(season):
    urls = []
    urls.append('https://fbref.com/en/comps/9/' + season + '/schedule/' + season + '-Premier-League-Scores-and-Fixtures')
    urls.append('https://fbref.com/en/comps/12/' + season + '/schedule/' + season + '-La-Liga-Scores-and-Fixtures')
    urls.append('https://fbref.com/en/comps/11/' + season + '/schedule/' + season + '-Serie-A-Scores-and-Fixtures')
    urls.append('https://fbref.com/en/comps/20/' + season + '/schedule/' + season + '-Bundesliga-Scores-and-Fixtures')
    urls.append('https://fbref.com/en/comps/13/' + season + '/schedule/' + season + '-Ligue-1-Scores-and-Fixtures')
    leagues = ["pl", "ll", "sa", "bl", "l1"]

    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
    }
    new_matches = []
    for i in range(len(urls)):
        url = urls[i]
        league = leagues[i]
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            link_elements = soup.select('td[data-stat="match_report"] a')
            for el in link_elements:
                row = el.parent.parent
                if is_date_in_next_month(row.select_one('td[data-stat="date"] a').text):
                    match_data = {}
                    match_data["date"] = row.select_one('td[data-stat="date"] a').text
                    match_data["round"] = row.select_one('th[data-stat="gameweek"]').text
                    match_data["time"] = row.select_one('.venuetime').text if row.select_one('.venuetime') else None
                    match_data["home_team"] = row.select_one('td[data-stat="home_team"]').text
                    match_data["away_team"] = row.select_one('td[data-stat="away_team"]').text
                    match_data["season"] = season
                    match_data["league"] = league
                    new_matches.append(match_data)
        else:
            if response.status_code == 429:
                retry_after = response.headers.get('Retry-After')
        
            if retry_after:
                print(f"Rate limited. Retry after: {retry_after} seconds.")
            else:
                print("Rate limited but no 'Retry-After' header found")
            print(f"Failed to retrieve the page. Status code: {response.status_code}")

    df_new_matches = pd.DataFrame(new_matches)

    return df_new_matches


In [24]:
df_matches = scrape_season('2024-2025')

In [28]:
df_matches["historical"] = False
historic_matches = pd.read_csv("data/final_prepared_data_with_new.csv").sort_values(by=['date'])
standings = pd.read_csv("data/standings_with_new.csv").sort_values(["date"])

In [29]:
all_teams = standings["team"].unique()
df_matches["home_team"] = df_matches["home_team"].apply(lambda x:process.extractOne(x, all_teams)[0])
df_matches["away_team"] = df_matches["away_team"].apply(lambda x:process.extractOne(x, all_teams)[0])
stadiums = pd.read_csv("final_data/stadiumsTop5.csv")
matches = pd.merge(df_matches, stadiums, left_on=['home_team'], right_on=['Team'], how='left')
last_date = standings.tail(1)["date"].iloc[0]
standings_columns = ['matches_played', 'wins', 'draws', 'defeats', 'goal_difference', 'goals_conceded', 'possession', 'minutes', 'goals', 'assists', 'pens_made', 'pens_att', 'shots', 'shots_on_target', 'cards_yellow', 'cards_red', 'touches', 'tackles', 'interceptions', 'blocks', 'xg', 'npxg', 'xg_assist', 'sca', 'gca', 'passes_completed', 'passes', 'passes_pct', 'progressive_passes', 'carries', 'progressive_carries', 'take_ons', 'take_ons_won', 'passes_total_distance', 'passes_progressive_distance', 'passes_completed_short', 'passes_short', 'passes_pct_short', 'passes_completed_medium', 'passes_medium', 'passes_pct_medium', 'passes_completed_long', 'passes_long', 'passes_pct_long', 'pass_xa', 'assisted_shots', 'passes_into_final_third', 'passes_into_penalty_area', 'crosses_into_penalty_area', 'passes_live', 'passes_dead', 'passes_free_kicks', 'through_balls', 'passes_switches', 'crosses', 'throw_ins', 'corner_kicks', 'corner_kicks_in', 'corner_kicks_out', 'corner_kicks_straight', 'passes_offsides', 'passes_blocked', 'tackles_won', 'tackles_def_3rd', 'tackles_mid_3rd', 'tackles_att_3rd', 'challenge_tackles', 'challenges', 'challenge_tackles_pct', 'challenges_lost', 'blocked_shots', 'blocked_passes', 'tackles_interceptions', 'clearances', 'errors', 'touches_def_pen_area', 'touches_def_3rd', 'touches_mid_3rd', 'touches_att_3rd', 'touches_att_pen_area', 'touches_live_ball', 'take_ons_won_pct', 'take_ons_tackled', 'take_ons_tackled_pct', 'carries_distance', 'carries_progressive_distance', 'carries_into_final_third', 'carries_into_penalty_area', 'miscontrols', 'dispossessed', 'passes_received', 'progressive_passes_received', 'cards_yellow_red', 'fouls', 'fouled', 'offsides', 'pens_won', 'pens_conceded', 'own_goals', 'ball_recoveries', 'aerials_won', 'aerials_lost', 'aerials_won_pct', 'points']

In [39]:
def get_team_stats(season, team, matches_played):
    return standings[(standings["season"] == season) & (standings["team"] == team) & (standings["matches_played"] == matches_played)].iloc[0]


def calculate_rolling_stats(n, stats, stats_old, matches_played, real_matches_played):
    rolling_stats = {}
    for feature in standings_columns:
        if matches_played == 0:
            rolling_stats[feature] = stats[feature]
            if real_matches_played != 0:
                rolling_stats[feature] = rolling_stats[feature] / real_matches_played
        else:
            rolling_stats[feature] = stats[feature] - stats_old[feature]
            rolling_stats[feature] = rolling_stats[feature] / n
    return rolling_stats

def create_rolling_stats(n, df):
    home_stats_list = []
    away_stats_list = []
    for index, row in df.iterrows():
        season = row["season"]
        home_team = row["home_team"]
        away_team = row["away_team"]
        home_last_date = standings[standings["team"]==home_team].tail(1)["date"].iloc[0]
        away_last_date = standings[standings["team"]==home_team].tail(1)["date"].iloc[0]

        home_team_matches_played = standings[(standings["date"] == home_last_date) & (standings["team"] == home_team)]["matches_played"].values[0]
        away_team_matches_played = standings[(standings["date"] == away_last_date) & (standings["team"] == away_team)]["matches_played"].values[0]

        home_team_stats = get_team_stats(season, home_team, home_team_matches_played)
        away_team_stats = get_team_stats(season, away_team, away_team_matches_played)

        home_team_matches_played_old = max(0, home_team_matches_played - n)
        away_team_matches_played_old = max(0, away_team_matches_played - n)

        if home_team_matches_played_old > 0:
            home_team_stats_old = get_team_stats(season, home_team, home_team_matches_played_old)
        if away_team_matches_played_old > 0:
            away_team_stats_old = get_team_stats(season, away_team, away_team_matches_played_old)

        home_rolling_stats = calculate_rolling_stats(n, home_team_stats, home_team_stats_old if home_team_matches_played_old > 0 else None, home_team_matches_played_old, home_team_matches_played)
        away_rolling_stats = calculate_rolling_stats(n, away_team_stats, away_team_stats_old if away_team_matches_played_old > 0 else None, away_team_matches_played_old, away_team_matches_played)

        home_stats_list.append({f"home_last{n}_{feature}": value for feature, value in home_rolling_stats.items()})
        away_stats_list.append({f"away_last{n}_{feature}": value for feature, value in away_rolling_stats.items()})

    home_stats_df = pd.DataFrame(home_stats_list, index=df.index)
    away_stats_df = pd.DataFrame(away_stats_list, index=df.index)

    return pd.concat([df, home_stats_df, away_stats_df], axis=1)

df = create_rolling_stats(n = 5, df = matches[matches["historical"]==False])

In [42]:
df.head(2)

Unnamed: 0,date,round,time,home_team,away_team,season,league,historical,Team,City,Stadium,Capacity,Latitude,Longitude,Country,home_last5_matches_played,home_last5_wins,home_last5_draws,home_last5_defeats,home_last5_goal_difference,home_last5_goals_conceded,home_last5_possession,home_last5_minutes,home_last5_goals,home_last5_assists,home_last5_pens_made,home_last5_pens_att,home_last5_shots,home_last5_shots_on_target,home_last5_cards_yellow,home_last5_cards_red,home_last5_touches,home_last5_tackles,home_last5_interceptions,home_last5_blocks,home_last5_xg,home_last5_npxg,home_last5_xg_assist,home_last5_sca,home_last5_gca,home_last5_passes_completed,home_last5_passes,home_last5_passes_pct,home_last5_progressive_passes,home_last5_carries,home_last5_progressive_carries,home_last5_take_ons,home_last5_take_ons_won,home_last5_passes_total_distance,home_last5_passes_progressive_distance,home_last5_passes_completed_short,home_last5_passes_short,home_last5_passes_pct_short,home_last5_passes_completed_medium,home_last5_passes_medium,home_last5_passes_pct_medium,home_last5_passes_completed_long,home_last5_passes_long,home_last5_passes_pct_long,home_last5_pass_xa,home_last5_assisted_shots,home_last5_passes_into_final_third,home_last5_passes_into_penalty_area,home_last5_crosses_into_penalty_area,home_last5_passes_live,home_last5_passes_dead,home_last5_passes_free_kicks,home_last5_through_balls,home_last5_passes_switches,home_last5_crosses,home_last5_throw_ins,home_last5_corner_kicks,home_last5_corner_kicks_in,home_last5_corner_kicks_out,home_last5_corner_kicks_straight,home_last5_passes_offsides,home_last5_passes_blocked,home_last5_tackles_won,home_last5_tackles_def_3rd,home_last5_tackles_mid_3rd,home_last5_tackles_att_3rd,home_last5_challenge_tackles,home_last5_challenges,home_last5_challenge_tackles_pct,home_last5_challenges_lost,home_last5_blocked_shots,home_last5_blocked_passes,home_last5_tackles_interceptions,home_last5_clearances,home_last5_errors,home_last5_touches_def_pen_area,home_last5_touches_def_3rd,home_last5_touches_mid_3rd,home_last5_touches_att_3rd,home_last5_touches_att_pen_area,home_last5_touches_live_ball,home_last5_take_ons_won_pct,home_last5_take_ons_tackled,home_last5_take_ons_tackled_pct,home_last5_carries_distance,home_last5_carries_progressive_distance,home_last5_carries_into_final_third,home_last5_carries_into_penalty_area,home_last5_miscontrols,home_last5_dispossessed,home_last5_passes_received,home_last5_progressive_passes_received,home_last5_cards_yellow_red,home_last5_fouls,home_last5_fouled,home_last5_offsides,home_last5_pens_won,home_last5_pens_conceded,home_last5_own_goals,home_last5_ball_recoveries,home_last5_aerials_won,home_last5_aerials_lost,home_last5_aerials_won_pct,home_last5_points,away_last5_matches_played,away_last5_wins,away_last5_draws,away_last5_defeats,away_last5_goal_difference,away_last5_goals_conceded,away_last5_possession,away_last5_minutes,away_last5_goals,away_last5_assists,away_last5_pens_made,away_last5_pens_att,away_last5_shots,away_last5_shots_on_target,away_last5_cards_yellow,away_last5_cards_red,away_last5_touches,away_last5_tackles,away_last5_interceptions,away_last5_blocks,away_last5_xg,away_last5_npxg,away_last5_xg_assist,away_last5_sca,away_last5_gca,away_last5_passes_completed,away_last5_passes,away_last5_passes_pct,away_last5_progressive_passes,away_last5_carries,away_last5_progressive_carries,away_last5_take_ons,away_last5_take_ons_won,away_last5_passes_total_distance,away_last5_passes_progressive_distance,away_last5_passes_completed_short,away_last5_passes_short,away_last5_passes_pct_short,away_last5_passes_completed_medium,away_last5_passes_medium,away_last5_passes_pct_medium,away_last5_passes_completed_long,away_last5_passes_long,away_last5_passes_pct_long,away_last5_pass_xa,away_last5_assisted_shots,away_last5_passes_into_final_third,away_last5_passes_into_penalty_area,away_last5_crosses_into_penalty_area,away_last5_passes_live,away_last5_passes_dead,away_last5_passes_free_kicks,away_last5_through_balls,away_last5_passes_switches,away_last5_crosses,away_last5_throw_ins,away_last5_corner_kicks,away_last5_corner_kicks_in,away_last5_corner_kicks_out,away_last5_corner_kicks_straight,away_last5_passes_offsides,away_last5_passes_blocked,away_last5_tackles_won,away_last5_tackles_def_3rd,away_last5_tackles_mid_3rd,away_last5_tackles_att_3rd,away_last5_challenge_tackles,away_last5_challenges,away_last5_challenge_tackles_pct,away_last5_challenges_lost,away_last5_blocked_shots,away_last5_blocked_passes,away_last5_tackles_interceptions,away_last5_clearances,away_last5_errors,away_last5_touches_def_pen_area,away_last5_touches_def_3rd,away_last5_touches_mid_3rd,away_last5_touches_att_3rd,away_last5_touches_att_pen_area,away_last5_touches_live_ball,away_last5_take_ons_won_pct,away_last5_take_ons_tackled,away_last5_take_ons_tackled_pct,away_last5_carries_distance,away_last5_carries_progressive_distance,away_last5_carries_into_final_third,away_last5_carries_into_penalty_area,away_last5_miscontrols,away_last5_dispossessed,away_last5_passes_received,away_last5_progressive_passes_received,away_last5_cards_yellow_red,away_last5_fouls,away_last5_fouled,away_last5_offsides,away_last5_pens_won,away_last5_pens_conceded,away_last5_own_goals,away_last5_ball_recoveries,away_last5_aerials_won,away_last5_aerials_lost,away_last5_aerials_won_pct,away_last5_points
0,2025-01-26,23,14:00,Crystal Palace,Brentford,2024-2025,pl,False,Crystal Palace,London,Selhurst Park,25486,51.398333,-0.085556,England,1.0,0.6,0.4,0.0,1.0,0.4,0.448,990.0,1.4,1.0,0.2,0.2,12.2,5.8,0.8,0.0,572.6,21.2,10.0,13.2,1.32,1.16,0.98,21.4,2.8,346.4,452.6,76.32,33.2,304.2,12.4,14.0,6.6,6404.4,2411.2,147.0,174.2,84.38,160.0,189.2,84.22,35.2,72.4,47.9,0.98,9.0,25.4,9.4,2.0,407.8,43.2,12.8,2.6,1.8,16.4,15.4,4.6,2.4,1.0,0.0,1.6,9.0,13.0,9.6,8.6,3.0,9.0,15.8,58.06,6.8,3.8,9.4,31.2,25.0,0.4,69.8,217.4,244.6,118.0,19.4,572.4,45.7,6.6,48.66,1469.6,635.6,8.6,3.0,13.4,9.2,344.8,33.2,0.0,11.0,12.6,1.6,0.2,0.0,0.0,50.8,15.2,15.4,50.66,2.2,1.0,0.2,0.4,0.4,0.2,1.4,0.456,990.0,1.6,1.2,0.2,0.2,12.2,5.0,1.0,0.0,547.6,14.0,8.2,12.4,1.82,1.68,1.46,22.8,2.6,345.6,439.2,78.3,32.2,283.8,17.6,14.0,5.4,6186.2,2378.4,154.4,176.0,87.2,146.4,171.2,85.36,35.8,69.8,51.3,1.12,10.4,27.0,7.6,1.0,397.8,38.0,9.0,1.2,0.8,12.8,12.4,3.0,2.6,0.0,0.0,3.4,6.0,8.4,8.2,5.0,0.8,8.0,16.2,48.74,8.2,5.8,6.6,22.2,26.8,0.4,99.6,238.8,194.2,120.6,25.4,547.4,39.48,6.2,44.12,1576.2,842.6,10.0,7.2,10.6,7.8,339.8,31.6,0.0,6.4,8.0,3.4,0.2,0.0,0.0,41.4,13.4,16.8,44.54,1.0
1,2025-01-26,23,14:00,Tottenham Hotspur,Leicester City,2024-2025,pl,False,Tottenham Hotspur,London,Tottenham Hotspur Stadium,62850,51.604722,-0.066389,England,1.0,0.0,0.2,0.8,-1.0,2.2,0.57,989.8,1.2,0.8,0.0,0.2,11.8,3.4,1.6,0.2,654.4,19.0,5.8,12.2,1.18,1.04,0.86,20.6,2.2,442.2,542.4,80.98,44.6,366.8,21.8,20.2,8.0,7325.6,2568.0,206.6,229.0,90.04,195.4,223.4,86.84,29.0,57.2,50.34,0.82,9.0,37.4,10.0,2.0,488.2,52.6,15.4,1.2,1.4,19.2,19.4,6.6,2.8,1.6,0.0,1.6,15.2,13.0,8.2,7.8,3.0,8.6,17.0,50.1,8.4,4.0,8.2,24.8,18.4,1.8,71.4,210.8,277.0,174.2,24.4,654.2,38.38,8.4,41.22,2027.2,1031.4,13.4,7.4,15.0,12.0,438.8,44.0,0.2,11.2,12.8,1.6,0.2,0.0,0.4,40.8,13.8,14.0,49.74,0.2,1.0,0.0,0.0,1.0,-1.8,2.2,0.442,990.0,0.4,0.2,0.0,0.0,9.6,2.8,1.6,0.0,589.6,15.4,10.6,10.4,0.84,0.84,0.64,15.6,0.6,403.0,488.2,81.3,28.4,318.6,16.2,15.4,6.2,6558.4,2344.0,207.4,227.0,90.42,161.2,184.0,86.72,26.8,55.8,48.04,0.82,6.8,26.4,6.2,2.2,443.4,43.0,11.2,1.8,1.4,15.0,16.8,3.2,2.4,0.6,0.0,1.8,6.2,8.6,7.4,6.6,1.4,6.2,13.6,45.54,7.4,4.2,6.2,26.0,24.0,0.4,71.0,212.4,273.4,108.8,19.2,589.6,39.0,7.6,50.98,1692.0,844.2,10.8,3.8,10.8,9.4,400.2,27.8,0.0,8.4,9.2,1.8,0.0,0.0,0.0,38.0,12.0,12.0,48.12,0.0


In [43]:
def get_last_formation(team):
    last_match = historic_matches[(historic_matches["home_team"] == team) | (historic_matches["away_team"] == team)].tail(1)
    if last_match["home_team"].iloc[0] == team:
        return last_match["formation_home"].iloc[0]
    else:
        return last_match["formation_away"].iloc[0]
    
def get_last_lineup(team):
    last_match = historic_matches[(historic_matches["home_team"] == team) | (historic_matches["away_team"] == team)].tail(1)
    if last_match["home_team"].iloc[0] == team:
        return last_match["home_lineup"].iloc[0]
    else:
        return last_match["away_lineup"].iloc[0]

def get_last_manager_days(team):
    last_match = historic_matches[(historic_matches["home_team"] == team) | (historic_matches["away_team"] == team)].tail(1)
    if last_match["home_team"].iloc[0] == team:
        return last_match["matches_since_new_home_manager"].iloc[0]
    else:
        return last_match["matches_since_new_away_manager"].iloc[0]
    
def get_days_since_match(team, date):
    last_match = historic_matches[(historic_matches["home_team"] == team) | (historic_matches["away_team"] == team)].tail(1)
    return (date - pd.to_datetime(last_match["date"].iloc[0])).days

def calculate_tiredness_factor(days_since_last_match, decay_rate=0.1):
    return np.exp(-decay_rate * days_since_last_match)

In [44]:
matches = df
matches['date'] = pd.to_datetime(matches['date'])
matches["formation_home"] = matches["home_team"].apply(lambda team: get_last_formation(team))
matches["formation_away"] = matches["away_team"].apply(lambda team: get_last_formation(team))
matches["home_lineup"] = matches["home_team"].apply(lambda team: get_last_lineup(team))
matches["away_lineup"] = matches["away_team"].apply(lambda team: get_last_lineup(team))
matches["matches_since_new_home_manager"] = matches["home_team"].apply(lambda team: get_last_manager_days(team)) + 1
matches["matches_since_new_away_manager"] = matches["away_team"].apply(lambda team: get_last_manager_days(team)) + 1
matches['days_since_last_home'] = matches.apply(
    lambda row: get_days_since_match(row["home_team"], row["date"]), axis=1
)
matches['days_since_last_away'] = matches.apply(
    lambda row: get_days_since_match(row["away_team"], row["date"]), axis=1
)
matches['home_tiredness'] = calculate_tiredness_factor(matches['days_since_last_home'])
matches['away_tiredness'] = calculate_tiredness_factor(matches['days_since_last_away'])

matches["formation_home"] = matches["formation_home"].str.replace(r"-1-1$", "-2", regex=True)
matches["formation_away"] = matches["formation_away"].str.replace(r"-1-1$", "-2", regex=True)
matches["formation_home"] = matches["formation_home"].str.replace("4-1-2-1-2", "4-3-1-2", regex=True)
matches["formation_away"] = matches["formation_away"].str.replace("4-1-2-1-2", "4-3-1-2", regex=True)
matches["formation_back_line"] = matches["formation_home"].apply(lambda text: int(text[0] == "4"))

In [45]:
def get_h2h_metrics(row, df, num_matches=6):
    past_h2h = df[((df['home_team'] == row['home_team']) & (df['away_team'] == row['away_team'])) |
                  ((df['home_team'] == row['away_team']) & (df['away_team'] == row['home_team']))]
    past_h2h = past_h2h[pd.to_datetime(past_h2h['date']) < row['date']].sort_values(by='date', ascending=False)
    
    past_h2h = past_h2h.head(num_matches)
    num_past_h2h = len(past_h2h)

    past_h2h_home_home = past_h2h[past_h2h["home_team"] == row["home_team"]]
    past_h2h_home_away = past_h2h[past_h2h["home_team"] == row["away_team"]]

    if not past_h2h.empty:
        metrics = {
            'h2h_win_ratio': ((past_h2h_home_home['outcome'] == 1).sum() + (past_h2h_home_away['outcome'] == 2).sum()) / num_past_h2h,
            'h2h_draw_ratio': (past_h2h['outcome'] == 0).sum() / num_past_h2h,
            'h2h_avg_goals_scored_home_team': (past_h2h_home_home['home_goals'].sum() + past_h2h_home_away['away_goals'].sum()) / num_past_h2h,
            'h2h_avg_goals_scored_away_team': (past_h2h_home_home['away_goals'].sum() + past_h2h_home_away['home_goals'].sum()) / num_past_h2h,
            'h2h_avg_xG_home_team': (past_h2h_home_home['home_xg'].sum() + past_h2h_home_away['away_xg'].sum()) / num_past_h2h,
            'h2h_avg_xG_away_team': (past_h2h_home_home['away_xg'].sum() + past_h2h_home_away['home_xg'].sum()) / num_past_h2h,
        }
    else:
        metrics = {
            'h2h_win_ratio': 0,
            'h2h_draw_ratio': 0,
            'h2h_avg_goals_scored_home_team': 0,
            'h2h_avg_goals_scored_away_team': 0,
            'h2h_avg_xG_home_team': 0,
            'h2h_avg_xG_away_team': 0,
        }

    return pd.Series(metrics)

matches_with_h2h = matches.join(matches.apply(lambda row: get_h2h_metrics(row, historic_matches), axis=1))

In [46]:
def get_fifa_stats(team, col):
    last_match = historic_matches[(historic_matches["home_team"] == team) | (historic_matches["away_team"] == team)].tail(1)
    return last_match[col].iloc[0]

In [47]:
cols_home = [col for col in historic_matches.columns if 'overall' in col and 'home' in col]
cols_away = [col for col in historic_matches.columns if 'overall' in col and 'away' in col]
for col in cols_home:
    matches[col] = matches["home_team"].apply(lambda team: get_fifa_stats(team, col))
for col in cols_away:
    matches[col] = matches["away_team"].apply(lambda team: get_fifa_stats(team, col))

In [48]:
matches.to_csv("data/new_matches_fbref.csv", index=False)

In [49]:
matches.head(2)

Unnamed: 0,date,round,time,home_team,away_team,season,league,historical,Team,City,Stadium,Capacity,Latitude,Longitude,Country,home_last5_matches_played,home_last5_wins,home_last5_draws,home_last5_defeats,home_last5_goal_difference,home_last5_goals_conceded,home_last5_possession,home_last5_minutes,home_last5_goals,home_last5_assists,home_last5_pens_made,home_last5_pens_att,home_last5_shots,home_last5_shots_on_target,home_last5_cards_yellow,home_last5_cards_red,home_last5_touches,home_last5_tackles,home_last5_interceptions,home_last5_blocks,home_last5_xg,home_last5_npxg,home_last5_xg_assist,home_last5_sca,home_last5_gca,home_last5_passes_completed,home_last5_passes,home_last5_passes_pct,home_last5_progressive_passes,home_last5_carries,home_last5_progressive_carries,home_last5_take_ons,home_last5_take_ons_won,home_last5_passes_total_distance,home_last5_passes_progressive_distance,home_last5_passes_completed_short,home_last5_passes_short,home_last5_passes_pct_short,home_last5_passes_completed_medium,home_last5_passes_medium,home_last5_passes_pct_medium,home_last5_passes_completed_long,home_last5_passes_long,home_last5_passes_pct_long,home_last5_pass_xa,home_last5_assisted_shots,home_last5_passes_into_final_third,home_last5_passes_into_penalty_area,home_last5_crosses_into_penalty_area,home_last5_passes_live,home_last5_passes_dead,home_last5_passes_free_kicks,home_last5_through_balls,home_last5_passes_switches,home_last5_crosses,home_last5_throw_ins,home_last5_corner_kicks,home_last5_corner_kicks_in,home_last5_corner_kicks_out,home_last5_corner_kicks_straight,home_last5_passes_offsides,home_last5_passes_blocked,home_last5_tackles_won,home_last5_tackles_def_3rd,home_last5_tackles_mid_3rd,home_last5_tackles_att_3rd,home_last5_challenge_tackles,home_last5_challenges,home_last5_challenge_tackles_pct,home_last5_challenges_lost,home_last5_blocked_shots,home_last5_blocked_passes,home_last5_tackles_interceptions,home_last5_clearances,home_last5_errors,home_last5_touches_def_pen_area,home_last5_touches_def_3rd,home_last5_touches_mid_3rd,home_last5_touches_att_3rd,home_last5_touches_att_pen_area,home_last5_touches_live_ball,home_last5_take_ons_won_pct,home_last5_take_ons_tackled,home_last5_take_ons_tackled_pct,home_last5_carries_distance,home_last5_carries_progressive_distance,home_last5_carries_into_final_third,home_last5_carries_into_penalty_area,home_last5_miscontrols,home_last5_dispossessed,home_last5_passes_received,home_last5_progressive_passes_received,home_last5_cards_yellow_red,home_last5_fouls,home_last5_fouled,home_last5_offsides,home_last5_pens_won,home_last5_pens_conceded,home_last5_own_goals,home_last5_ball_recoveries,home_last5_aerials_won,home_last5_aerials_lost,home_last5_aerials_won_pct,home_last5_points,away_last5_matches_played,away_last5_wins,away_last5_draws,away_last5_defeats,away_last5_goal_difference,away_last5_goals_conceded,away_last5_possession,away_last5_minutes,away_last5_goals,away_last5_assists,away_last5_pens_made,away_last5_pens_att,away_last5_shots,away_last5_shots_on_target,away_last5_cards_yellow,away_last5_cards_red,away_last5_touches,away_last5_tackles,away_last5_interceptions,away_last5_blocks,away_last5_xg,away_last5_npxg,away_last5_xg_assist,away_last5_sca,away_last5_gca,away_last5_passes_completed,away_last5_passes,away_last5_passes_pct,away_last5_progressive_passes,away_last5_carries,away_last5_progressive_carries,away_last5_take_ons,away_last5_take_ons_won,away_last5_passes_total_distance,away_last5_passes_progressive_distance,away_last5_passes_completed_short,away_last5_passes_short,away_last5_passes_pct_short,away_last5_passes_completed_medium,away_last5_passes_medium,away_last5_passes_pct_medium,away_last5_passes_completed_long,away_last5_passes_long,away_last5_passes_pct_long,away_last5_pass_xa,away_last5_assisted_shots,away_last5_passes_into_final_third,away_last5_passes_into_penalty_area,away_last5_crosses_into_penalty_area,away_last5_passes_live,away_last5_passes_dead,away_last5_passes_free_kicks,away_last5_through_balls,away_last5_passes_switches,away_last5_crosses,away_last5_throw_ins,away_last5_corner_kicks,away_last5_corner_kicks_in,away_last5_corner_kicks_out,away_last5_corner_kicks_straight,away_last5_passes_offsides,away_last5_passes_blocked,away_last5_tackles_won,away_last5_tackles_def_3rd,away_last5_tackles_mid_3rd,away_last5_tackles_att_3rd,away_last5_challenge_tackles,away_last5_challenges,away_last5_challenge_tackles_pct,away_last5_challenges_lost,away_last5_blocked_shots,away_last5_blocked_passes,away_last5_tackles_interceptions,away_last5_clearances,away_last5_errors,away_last5_touches_def_pen_area,away_last5_touches_def_3rd,away_last5_touches_mid_3rd,away_last5_touches_att_3rd,away_last5_touches_att_pen_area,away_last5_touches_live_ball,away_last5_take_ons_won_pct,away_last5_take_ons_tackled,away_last5_take_ons_tackled_pct,away_last5_carries_distance,away_last5_carries_progressive_distance,away_last5_carries_into_final_third,away_last5_carries_into_penalty_area,away_last5_miscontrols,away_last5_dispossessed,away_last5_passes_received,away_last5_progressive_passes_received,away_last5_cards_yellow_red,away_last5_fouls,away_last5_fouled,away_last5_offsides,away_last5_pens_won,away_last5_pens_conceded,away_last5_own_goals,away_last5_ball_recoveries,away_last5_aerials_won,away_last5_aerials_lost,away_last5_aerials_won_pct,away_last5_points,formation_home,formation_away,home_lineup,away_lineup,matches_since_new_home_manager,matches_since_new_away_manager,days_since_last_home,days_since_last_away,home_tiredness,away_tiredness,formation_back_line,overall_home_overall_rating,overall_home_stat_crossing,overall_home_stat_finishing,overall_home_stat_heading_accuracy,overall_home_stat_short_passing,overall_home_stat_volleys,overall_home_stat_dribbling,overall_home_stat_curve,overall_home_stat_fk_accuracy,overall_home_stat_long_passing,overall_home_stat_ball_control,overall_home_stat_accelaration,overall_home_stat_sprint_speed,overall_home_stat_agility,overall_home_stat_reactions,overall_home_stat_balance,overall_home_stat_jumping,overall_home_stat_stamina,overall_home_stat_strength,overall_home_stat_long_shots,overall_home_stat_aggression,overall_home_stat_interceptions,overall_home_stat_att_position,overall_home_stat_vision,overall_home_stat_penalties,overall_home_stat_composure,overall_home_stat_marking,overall_home_stat_standing_tackle,overall_home_stat_sliding_tackle,overall_home_stat_gk_diving,overall_home_stat_gk_handling,overall_home_stat_gk_kicking,overall_home_stat_gk_positioning,overall_home_stat_gk_reflexes,overall_away_overall_rating,overall_away_stat_crossing,overall_away_stat_finishing,overall_away_stat_heading_accuracy,overall_away_stat_short_passing,overall_away_stat_volleys,overall_away_stat_dribbling,overall_away_stat_curve,overall_away_stat_fk_accuracy,overall_away_stat_long_passing,overall_away_stat_ball_control,overall_away_stat_accelaration,overall_away_stat_sprint_speed,overall_away_stat_agility,overall_away_stat_reactions,overall_away_stat_balance,overall_away_stat_jumping,overall_away_stat_stamina,overall_away_stat_strength,overall_away_stat_long_shots,overall_away_stat_aggression,overall_away_stat_interceptions,overall_away_stat_att_position,overall_away_stat_vision,overall_away_stat_penalties,overall_away_stat_composure,overall_away_stat_marking,overall_away_stat_standing_tackle,overall_away_stat_sliding_tackle,overall_away_stat_gk_diving,overall_away_stat_gk_handling,overall_away_stat_gk_kicking,overall_away_stat_gk_positioning,overall_away_stat_gk_reflexes
0,2025-01-26,23,14:00,Crystal Palace,Brentford,2024-2025,pl,False,Crystal Palace,London,Selhurst Park,25486,51.398333,-0.085556,England,1.0,0.6,0.4,0.0,1.0,0.4,0.448,990.0,1.4,1.0,0.2,0.2,12.2,5.8,0.8,0.0,572.6,21.2,10.0,13.2,1.32,1.16,0.98,21.4,2.8,346.4,452.6,76.32,33.2,304.2,12.4,14.0,6.6,6404.4,2411.2,147.0,174.2,84.38,160.0,189.2,84.22,35.2,72.4,47.9,0.98,9.0,25.4,9.4,2.0,407.8,43.2,12.8,2.6,1.8,16.4,15.4,4.6,2.4,1.0,0.0,1.6,9.0,13.0,9.6,8.6,3.0,9.0,15.8,58.06,6.8,3.8,9.4,31.2,25.0,0.4,69.8,217.4,244.6,118.0,19.4,572.4,45.7,6.6,48.66,1469.6,635.6,8.6,3.0,13.4,9.2,344.8,33.2,0.0,11.0,12.6,1.6,0.2,0.0,0.0,50.8,15.2,15.4,50.66,2.2,1.0,0.2,0.4,0.4,0.2,1.4,0.456,990.0,1.6,1.2,0.2,0.2,12.2,5.0,1.0,0.0,547.6,14.0,8.2,12.4,1.82,1.68,1.46,22.8,2.6,345.6,439.2,78.3,32.2,283.8,17.6,14.0,5.4,6186.2,2378.4,154.4,176.0,87.2,146.4,171.2,85.36,35.8,69.8,51.3,1.12,10.4,27.0,7.6,1.0,397.8,38.0,9.0,1.2,0.8,12.8,12.4,3.0,2.6,0.0,0.0,3.4,6.0,8.4,8.2,5.0,0.8,8.0,16.2,48.74,8.2,5.8,6.6,22.2,26.8,0.4,99.6,238.8,194.2,120.6,25.4,547.4,39.48,6.2,44.12,1576.2,842.6,10.0,7.2,10.6,7.8,339.8,31.6,0.0,6.4,8.0,3.4,0.2,0.0,0.0,41.4,13.4,16.8,44.54,1.0,3-4-3,4-2-3-1,"['Łukasz Fabiański', 'Aaron Cresswell', 'Lucas...","['Mark Flekken', 'Sepp van den Berg', 'Christi...",27.0,27.0,8,8,0.449329,0.449329,0,73.636364,60.0,50.0,58.0,64.090909,48.636364,63.454545,51.636364,48.636364,61.818182,66.545455,69.363636,70.727273,68.181818,63.090909,66.090909,66.818182,70.090909,70.545455,55.454545,64.727273,58.090909,55.909091,58.454545,55.727273,65.090909,55.909091,58.0,56.363636,16.454545,17.363636,13.909091,15.181818,16.090909,78.636364,64.090909,61.181818,59.181818,71.0,56.090909,69.545455,63.727273,62.454545,64.727273,72.181818,78.0,76.454545,71.363636,68.0,64.909091,69.727273,72.545455,72.909091,62.363636,59.818182,52.181818,61.818182,69.272727,60.090909,72.909091,52.909091,53.727273,49.272727,18.545455,18.0,17.636364,18.727273,19.363636
1,2025-01-26,23,14:00,Tottenham Hotspur,Leicester City,2024-2025,pl,False,Tottenham Hotspur,London,Tottenham Hotspur Stadium,62850,51.604722,-0.066389,England,1.0,0.0,0.2,0.8,-1.0,2.2,0.57,989.8,1.2,0.8,0.0,0.2,11.8,3.4,1.6,0.2,654.4,19.0,5.8,12.2,1.18,1.04,0.86,20.6,2.2,442.2,542.4,80.98,44.6,366.8,21.8,20.2,8.0,7325.6,2568.0,206.6,229.0,90.04,195.4,223.4,86.84,29.0,57.2,50.34,0.82,9.0,37.4,10.0,2.0,488.2,52.6,15.4,1.2,1.4,19.2,19.4,6.6,2.8,1.6,0.0,1.6,15.2,13.0,8.2,7.8,3.0,8.6,17.0,50.1,8.4,4.0,8.2,24.8,18.4,1.8,71.4,210.8,277.0,174.2,24.4,654.2,38.38,8.4,41.22,2027.2,1031.4,13.4,7.4,15.0,12.0,438.8,44.0,0.2,11.2,12.8,1.6,0.2,0.0,0.4,40.8,13.8,14.0,49.74,0.2,1.0,0.0,0.0,1.0,-1.8,2.2,0.442,990.0,0.4,0.2,0.0,0.0,9.6,2.8,1.6,0.0,589.6,15.4,10.6,10.4,0.84,0.84,0.64,15.6,0.6,403.0,488.2,81.3,28.4,318.6,16.2,15.4,6.2,6558.4,2344.0,207.4,227.0,90.42,161.2,184.0,86.72,26.8,55.8,48.04,0.82,6.8,26.4,6.2,2.2,443.4,43.0,11.2,1.8,1.4,15.0,16.8,3.2,2.4,0.6,0.0,1.8,6.2,8.6,7.4,6.6,1.4,6.2,13.6,45.54,7.4,4.2,6.2,26.0,24.0,0.4,71.0,212.4,273.4,108.8,19.2,589.6,39.0,7.6,50.98,1692.0,844.2,10.8,3.8,10.8,9.4,400.2,27.8,0.0,8.4,9.2,1.8,0.0,0.0,0.0,38.0,12.0,12.0,48.12,0.0,3-4-3,4-2-3-1,"['Jordan Pickford', 'James Tarkowski', 'Orel M...","['Jakub Stolarczyk', 'James Justin', 'Wout Fae...",21.0,27.0,7,8,0.496585,0.449329,0,70.818182,50.090909,46.363636,54.545455,63.272727,42.181818,58.0,41.272727,42.272727,56.363636,63.090909,70.181818,71.272727,63.636364,59.545455,66.0,68.181818,69.0,65.0,48.727273,58.272727,56.272727,50.363636,58.0,42.181818,59.545455,53.363636,57.363636,53.090909,16.272727,15.363636,17.181818,16.454545,16.818182,70.272727,54.545455,47.0,52.636364,62.0,43.909091,63.545455,50.636364,43.272727,55.909091,62.818182,69.454545,71.363636,64.181818,55.363636,62.909091,62.727273,66.0,65.272727,47.272727,56.0,48.909091,53.727273,57.818182,51.454545,62.454545,50.0,50.636364,48.0,17.727273,16.0,17.363636,18.181818,16.545455
