In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the processed match-log table. Judging by the file name it already carries
# Elo ratings; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/processed/test_match_logs_with_elos_20250528.csv")

In [3]:
# Number of rows with a missing team Elo, shown next to the total row count.
df["team_elo_from_club_elo"].isna().sum(), len(df)

(np.int64(1698), 40485)

In [4]:
# Preview the first rows whose team Elo is missing.
df[df["team_elo_from_club_elo"].isna()].head()

Unnamed: 0,date,dayofweek,round,venue,result,team,opponent,game_started,position,minutes,...,pens_conceded,transfer_id,transfer_date,from_club,to_club,is_post_transfer,is_pre_transfer,days_since_transfer,match_number_after_transfer,team_elo_from_club_elo
611,2021-08-14,Sat,Matchweek 1,Away,L 1–5,Greuther Fürth,Stuttgart,N,CM,21.0,...,,Adrian Fein_Bayern Munich_Greuther Fürth_2021-...,2021-07-14,Bayern Munich,Greuther Fürth,True,False,31,1,
612,2021-08-21,Sat,Matchweek 2,Home,D 1–1,Greuther Fürth,Arminia,N,,,...,,Adrian Fein_Bayern Munich_Greuther Fürth_2021-...,2021-07-14,Bayern Munich,Greuther Fürth,True,False,38,2,
613,2021-08-28,Sat,Matchweek 3,Away,L 0–3,Greuther Fürth,Mainz 05,N,,,...,,Adrian Fein_Bayern Munich_Greuther Fürth_2021-...,2021-07-14,Bayern Munich,Greuther Fürth,True,False,45,3,
614,2021-09-11,Sat,Matchweek 4,Home,L 0–2,Greuther Fürth,Wolfsburg,N,,,...,,Adrian Fein_Bayern Munich_Greuther Fürth_2021-...,2021-07-14,Bayern Munich,Greuther Fürth,True,False,59,4,
615,2021-09-17,Fri,Matchweek 5,Away,L 1–2,Greuther Fürth,Hertha BSC,N,DM,3.0,...,,Adrian Fein_Bayern Munich_Greuther Fürth_2021-...,2021-07-14,Bayern Munich,Greuther Fürth,True,False,65,5,


In [5]:
# Distinct teams that have at least one row without an Elo value.
df[df["team_elo_from_club_elo"].isna()]["team"].unique()

array(['Greuther Fürth', 'Spartak Moscow', 'Real Sociedad',
       'Saint-Étienne', 'Portimonense', 'Köln', 'Chaves', 'Vitesse',
       'FC Copenhagen', 'Benevento', 'Dijon', 'Vizela', 'Ankaragücü'],
      dtype=object)

In [6]:
def calculate_elo_delta_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add ``delta_elo_post_transfer``: the team Elo of each post-transfer match minus
    the last known team Elo before the transfer.

    For every ``transfer_id`` the baseline is the ``team_elo_from_club_elo`` of the
    chronologically last row (ordered by ``date``) that is flagged ``is_pre_transfer``
    and has a non-null Elo. Every row flagged ``is_post_transfer`` with a non-null Elo
    then receives ``team_elo_from_club_elo - baseline``. All other rows stay NaN:
    pre-transfer rows, rows without an Elo, rows without a ``transfer_id``, and
    transfers that have no usable pre-transfer Elo.

    Parameters
    ----------
    df : pd.DataFrame
        Match logs containing the columns ``transfer_id``, ``date``,
        ``is_pre_transfer``, ``is_post_transfer`` and ``team_elo_from_club_elo``.
        ``date`` only needs to sort chronologically (ISO strings or datetimes).

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the float column ``delta_elo_post_transfer`` added
        (or overwritten). The input frame is not modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    required_columns = (
        'transfer_id',
        'date',
        'is_pre_transfer',
        'is_post_transfer',
        'team_elo_from_club_elo',
    )
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"calculate_elo_delta_feature: missing required columns {missing_columns}"
        )

    df = df.copy()
    elo = df['team_elo_from_club_elo']
    has_elo = elo.notna()
    # `.eq(True)` treats a missing flag as False instead of propagating NaN into the mask.
    is_pre = df['is_pre_transfer'].eq(True) & has_elo
    is_post = df['is_post_transfer'].eq(True) & has_elo

    # One pass for all transfers: order the usable pre-transfer rows by date and keep
    # the last Elo per transfer. The stable sort makes same-date ties deterministic
    # (the row that appears later in the frame wins).
    baseline_by_transfer = (
        df.loc[is_pre, ['transfer_id', 'date', 'team_elo_from_club_elo']]
        .sort_values('date', kind='stable')
        .groupby('transfer_id', sort=False)['team_elo_from_club_elo']
        .last()
    )

    # Broadcast each transfer's baseline back onto its rows; transfers without a
    # baseline (and rows without a transfer_id) map to NaN.
    baseline_per_row = df['transfer_id'].map(baseline_by_transfer).astype('float64')

    # Assign by position so duplicate index labels cannot leak values across rows.
    df['delta_elo_post_transfer'] = np.where(
        is_post.to_numpy(),
        (elo - baseline_per_row).to_numpy(),
        np.nan,
    )
    return df

In [7]:
# Attach delta_elo_post_transfer. The function works on a copy, so df is rebound
# to the returned frame rather than mutated in place.
df = calculate_elo_delta_feature(df)

In [9]:
# Coverage check: rows that received an Elo delta vs. rows flagged as post-transfer.
df["delta_elo_post_transfer"].notna().sum(), df["is_post_transfer"].sum()

(np.int64(6640), np.int64(24120))

In [14]:
def _build_team_stat_lookup(stats_df: pd.DataFrame, feature: str) -> dict:
    """Map each (league, season, team_name) key to the first listed value of ``feature``."""
    key_columns = ['league', 'season', 'team_name']
    # Rows with a missing key part can never match a lookup, so drop them up front;
    # keep='first' selects the first matching row for each key.
    first_rows = (
        stats_df
        .dropna(subset=key_columns)
        .drop_duplicates(subset=key_columns, keep='first')
    )
    keys = zip(first_rows['league'], first_rows['season'], first_rows['team_name'])
    return dict(zip(keys, first_rows[feature]))


def add_team_delta_features(
    df: pd.DataFrame,
    team_stats_dfs: dict,
    feature_df_mapping: dict
) -> pd.DataFrame:
    """
    Add team delta features to the test match logs dataframe.

    For every transfer that has at least one pre-transfer and one post-transfer row,
    each requested feature gets ``post-transfer team stat - pre-transfer team stat``.
    Both stats are looked up in the *same* reference season: the season of the first
    pre-transfer row of that transfer. The resulting value is written to every row of
    the transfer (pre- and post-transfer rows alike).

    Parameters:
    -----------
    df : pd.DataFrame
        The test match logs dataframe with transfer information. Must contain
        ``transfer_id``, ``is_pre_transfer``, ``is_post_transfer``, ``season``,
        ``league`` and ``team``.
    team_stats_dfs : dict
        Dictionary mapping stat type names to their corresponding dataframes
        e.g., {'shooting': shooting_df, 'standard': standard_df}.
        Each dataframe must contain ``league``, ``season`` and ``team_name``.
    feature_df_mapping : dict
        Dictionary mapping feature names to the dataframe they should be sourced from
        e.g., {'goals': 'standard', 'shots': 'shooting', 'assists': 'standard'}

    Returns:
    --------
    pd.DataFrame
        A copy of ``df`` with one float column ``<feature>_delta`` per requested
        feature. Values are NaN when the transfer lacks pre- or post-transfer rows,
        when either team has no stats row for the reference season, or when the
        feature / stats dataframe named in the mapping does not exist (a warning is
        printed once per such feature).
    """

    result_df = df.copy()

    # Validate the mapping and index the stats once, instead of once per transfer.
    lookups = {}
    for feature, df_name in feature_df_mapping.items():
        if df_name not in team_stats_dfs:
            print(f"Warning: Dataframe '{df_name}' not found in team_stats_dfs")
            continue
        stats_df = team_stats_dfs[df_name]
        if feature not in stats_df.columns:
            print(f"Warning: Feature '{feature}' not found in '{df_name}' dataframe")
            continue
        lookups[feature] = _build_team_stat_lookup(stats_df, feature)

    # feature -> {transfer_id -> delta}
    deltas_by_feature = {feature: {} for feature in lookups}

    for transfer_id, transfer_group in result_df.groupby('transfer_id'):
        pre_transfer_matches = transfer_group[transfer_group['is_pre_transfer'] == True]
        post_transfer_matches = transfer_group[transfer_group['is_post_transfer'] == True]

        # Skip if we don't have both pre and post transfer matches
        if pre_transfer_matches.empty or post_transfer_matches.empty:
            continue

        # NOTE(review): "first" follows the row order of `df`; this presumably relies on
        # the logs being sorted chronologically within each transfer - confirm.
        pre_transfer_sample = pre_transfer_matches.iloc[0]
        post_transfer_sample = post_transfer_matches.iloc[0]
        reference_season = pre_transfer_sample['season']
        pre_key = (pre_transfer_sample['league'], reference_season, pre_transfer_sample['team'])
        post_key = (post_transfer_sample['league'], reference_season, post_transfer_sample['team'])

        # A missing league/season/team can never identify a stats row.
        if any(pd.isna(part) for part in pre_key + post_key):
            continue

        for feature, lookup in lookups.items():
            if pre_key in lookup and post_key in lookup:
                # Calculate delta (post - pre)
                deltas_by_feature[feature][transfer_id] = lookup[post_key] - lookup[pre_key]

    # Write the columns by position (not by index label) and as float64 so that the
    # features are numeric with NaN for "no value" rather than object dtype.
    transfer_ids = result_df['transfer_id'].tolist()
    for feature in feature_df_mapping:
        per_transfer = deltas_by_feature.get(feature, {})
        result_df[f'{feature}_delta'] = np.array(
            [per_transfer.get(transfer_id, np.nan) for transfer_id in transfer_ids],
            dtype='float64',
        )

    return result_df

In [15]:
# Load the two team-stat tables (one row per league/season/team in the previews below).
# Paths are relative to the notebook's working directory.
standard_df = pd.read_csv("../data/processed/team_stats_standard_20250530.csv")
shooting_df = pd.read_csv("../data/processed/team_stats_shooting_20250530.csv")

In [16]:
# Preview the standard team stats.
standard_df.head()

Unnamed: 0,league,season,team_name,team_url,assists,assists_per90,avg_age,cards_red,cards_yellow,goals,...,players_used,possession,progressive_carries,progressive_passes,starts,xg,xg_assist,xg_assist_per90,xg_per90,xg_xg_assist_per90
0,SerieA,2021-2022,Atalanta,https://fbref.com/en/squads/922493f3/2021-2022...,48,1.26,27.5,2,87,62,...,32,55.0,796.0,1858.0,418,62.4,48.5,1.28,1.64,2.92
1,SerieA,2021-2022,Bologna,https://fbref.com/en/squads/1d8099f8/2021-2022...,34,0.89,26.6,7,92,43,...,36,50.6,540.0,1276.0,418,44.9,33.6,0.88,1.18,2.06
2,SerieA,2021-2022,Cagliari,https://fbref.com/en/squads/c4260e09/2021-2022...,26,0.68,26.5,5,96,34,...,33,44.5,474.0,1088.0,418,39.0,29.6,0.78,1.03,1.81
3,SerieA,2021-2022,Empoli,https://fbref.com/en/squads/a3d88bd8/2021-2022...,27,0.71,24.5,5,87,47,...,28,47.4,658.0,1477.0,418,44.9,28.9,0.76,1.18,1.94
4,SerieA,2021-2022,Fiorentina,https://fbref.com/en/squads/421387cf/2021-2022...,33,0.87,26.4,8,89,59,...,28,57.7,755.0,1584.0,418,60.0,39.2,1.03,1.58,2.61


In [17]:
# Preview the shooting team stats.
shooting_df.head()

Unnamed: 0,league,season,team_name,team_url,average_shot_distance,goals,goals_per_shot,goals_per_shot_on_target,minutes_90s,npxg,...,pens_made,players_used,shots,shots_free_kicks,shots_on_target,shots_on_target_pct,shots_on_target_per90,shots_per90,xg,xg_net
0,SerieA,2021-2022,Atalanta,https://fbref.com/en/squads/922493f3/2021-2022...,16.5,62,0.1,0.34,38.0,58.1,...,5,32,598,16.0,167,27.9,4.39,15.74,62.4,-0.4
1,SerieA,2021-2022,Bologna,https://fbref.com/en/squads/1d8099f8/2021-2022...,18.1,43,0.09,0.27,38.0,41.0,...,4,36,434,14.0,143,32.9,3.76,11.42,44.9,-1.9
2,SerieA,2021-2022,Cagliari,https://fbref.com/en/squads/c4260e09/2021-2022...,17.3,34,0.07,0.26,38.0,36.4,...,3,33,420,17.0,117,27.9,3.08,11.05,39.0,-5.0
3,SerieA,2021-2022,Empoli,https://fbref.com/en/squads/a3d88bd8/2021-2022...,18.5,47,0.08,0.26,38.0,39.3,...,7,28,490,16.0,151,30.8,3.97,12.89,44.9,2.1
4,SerieA,2021-2022,Fiorentina,https://fbref.com/en/squads/421387cf/2021-2022...,17.2,59,0.1,0.29,38.0,50.8,...,9,28,501,25.0,172,34.3,4.53,13.18,60.0,-1.0


In [18]:
# Stat-type name -> team stats dataframe; the names are the values referenced
# by feature_df_mapping below.
team_stats_dfs = {
    'standard': standard_df,
    'shooting': shooting_df
}

# Feature (a column of the named stats dataframe) -> stat-type name to read it from.
# Each entry produces one '<feature>_delta' column.
feature_df_mapping = {
    'goals': 'standard', # Use goals from standard stats since it's more complete
    'assists': 'standard',
    'shots': 'shooting',
    'shots_on_target': 'shooting',
}

In [19]:
# Build the team-stat delta columns on a copy of df; df itself is left unchanged.
result_df = add_team_delta_features(df, team_stats_dfs, feature_df_mapping)

In [25]:
# Inspect rows that received a goals delta. The delta is written to every row of a
# transfer, so pre-transfer rows show up here as well.
result_df[result_df["goals_delta"].notna()].head(30)

Unnamed: 0,date,dayofweek,round,venue,result,team,opponent,game_started,position,minutes,...,is_post_transfer,is_pre_transfer,days_since_transfer,match_number_after_transfer,team_elo_from_club_elo,delta_elo_post_transfer,goals_delta,assists_delta,shots_delta,shots_on_target_delta
0,2021-08-22,Sun,Matchweek 1,Home,W 3–2,Bologna,Salernitana,Y,LB,55.0,...,False,True,-321,-36,1594.298096,,3,-1,2,-2
1,2021-08-28,Sat,Matchweek 2,Away,D 0–0,Bologna,Atalanta,Y,LB,80.0,...,False,True,-315,-35,1599.851929,,3,-1,2,-2
2,2021-09-13,Mon,Matchweek 3,Home,W 1–0,Bologna,Hellas Verona,Y,LB,90.0,...,False,True,-299,-34,1606.636597,,3,-1,2,-2
3,2021-09-18,Sat,Matchweek 4,Away,L 1–6,Bologna,Inter,Y,LB,55.0,...,False,True,-294,-33,1609.807983,,3,-1,2,-2
4,2021-09-21,Tue,Matchweek 5,Home,D 2–2,Bologna,Genoa,Y,LB,79.0,...,False,True,-291,-32,1605.707153,,3,-1,2,-2
5,2021-09-26,Sun,Matchweek 6,Away,L 2–4,Bologna,Empoli,Y,LB,45.0,...,False,True,-286,-31,1604.374756,,3,-1,2,-2
6,2021-10-03,Sun,Matchweek 7,Home,W 3–0,Bologna,Lazio,Y,WB,86.0,...,False,True,-279,-30,1590.879883,,3,-1,2,-2
7,2021-10-17,Sun,Matchweek 8,Away,D 1–1,Bologna,Udinese,Y,WB,90.0,...,False,True,-265,-29,1608.852173,,3,-1,2,-2
8,2021-10-23,Sat,Matchweek 9,Home,L 2–4,Bologna,Milan,Y,"LB,WB",62.0,...,False,True,-259,-28,1598.778809,,3,-1,2,-2
9,2021-10-28,Thu,Matchweek 10,Away,L 0–3,Bologna,Napoli,Y,WB,80.0,...,False,True,-254,-27,1593.022949,,3,-1,2,-2


In [24]:
# Inspect rows that received an Elo delta (only post-transfer rows are assigned one).
result_df[result_df["delta_elo_post_transfer"].notna()].head(30)

Unnamed: 0,date,dayofweek,round,venue,result,team,opponent,game_started,position,minutes,...,is_post_transfer,is_pre_transfer,days_since_transfer,match_number_after_transfer,team_elo_from_club_elo,delta_elo_post_transfer,goals_delta,assists_delta,shots_delta,shots_on_target_delta
36,2022-08-07,Sun,Matchweek 1,Away,D 2–2,Brentford,Leicester City,Y,RB,82.0,...,True,False,29,1,1714.308105,125.2677,3.0,-1.0,2.0,-2.0
37,2022-08-13,Sat,Matchweek 2,Home,W 4–0,Brentford,Manchester Utd,Y,RB,79.0,...,True,False,35,2,1717.469116,128.428711,3.0,-1.0,2.0,-2.0
38,2022-08-20,Sat,Matchweek 3,Away,L 2–3,Brentford,Fulham,Y,RB,58.0,...,True,False,42,3,1735.649536,146.609131,3.0,-1.0,2.0,-2.0
39,2022-08-27,Sat,Matchweek 4,Home,D 1–1,Brentford,Everton,Y,"RB,CB",90.0,...,True,False,49,4,1727.202026,138.161621,3.0,-1.0,2.0,-2.0
40,2022-08-30,Tue,Matchweek 5,Away,D 1–1,Brentford,Crystal Palace,Y,RB,78.0,...,True,False,52,5,1724.475098,135.434692,3.0,-1.0,2.0,-2.0
41,2022-09-03,Sat,Matchweek 6,Home,W 5–2,Brentford,Leeds United,Y,"RB,WB",90.0,...,True,False,56,6,1726.939697,137.899292,3.0,-1.0,2.0,-2.0
42,2022-09-18,Sun,Matchweek 8,Home,L 0–3,Brentford,Arsenal,Y,WB,90.0,...,True,False,71,7,1728.763794,139.723389,3.0,-1.0,2.0,-2.0
43,2022-10-01,Sat,Matchweek 9,Away,D 0–0,Brentford,Bournemouth,N,RB,30.0,...,True,False,84,8,1718.621216,129.580811,3.0,-1.0,2.0,-2.0
44,2022-10-08,Sat,Matchweek 10,Away,L 1–5,Brentford,Newcastle Utd,Y,"WB,RB",82.0,...,True,False,91,9,1724.353516,135.31311,3.0,-1.0,2.0,-2.0
45,2023-01-22,Sun,Matchweek 21,Away,D 0–0,Brentford,Leeds United,N,,,...,True,False,197,10,1779.646729,190.606323,3.0,-1.0,2.0,-2.0
