In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the processed match-log CSV (path is relative to the notebook) into `df`.
MATCH_LOGS_CSV = "../data/processed/test_match_logs_with_elos_20250528.csv"
df = pd.read_csv(MATCH_LOGS_CSV)

In [3]:
# Number of rows with no `team_elo_from_club_elo` value, next to the total row count.
elo_is_missing = df["team_elo_from_club_elo"].isna()
elo_is_missing.sum(), len(df)

(np.int64(1698), 40485)

In [4]:
# Preview the first rows whose `team_elo_from_club_elo` is missing.
rows_without_elo = df.loc[df["team_elo_from_club_elo"].isna()]
rows_without_elo.head()

Unnamed: 0,date,dayofweek,round,venue,result,team,opponent,game_started,position,minutes,...,pens_conceded,transfer_id,transfer_date,from_club,to_club,is_post_transfer,is_pre_transfer,days_since_transfer,match_number_after_transfer,team_elo_from_club_elo
611,2021-08-14,Sat,Matchweek 1,Away,L 1–5,Greuther Fürth,Stuttgart,N,CM,21.0,...,,Adrian Fein_Bayern Munich_Greuther Fürth_2021-...,2021-07-14,Bayern Munich,Greuther Fürth,True,False,31,1,
612,2021-08-21,Sat,Matchweek 2,Home,D 1–1,Greuther Fürth,Arminia,N,,,...,,Adrian Fein_Bayern Munich_Greuther Fürth_2021-...,2021-07-14,Bayern Munich,Greuther Fürth,True,False,38,2,
613,2021-08-28,Sat,Matchweek 3,Away,L 0–3,Greuther Fürth,Mainz 05,N,,,...,,Adrian Fein_Bayern Munich_Greuther Fürth_2021-...,2021-07-14,Bayern Munich,Greuther Fürth,True,False,45,3,
614,2021-09-11,Sat,Matchweek 4,Home,L 0–2,Greuther Fürth,Wolfsburg,N,,,...,,Adrian Fein_Bayern Munich_Greuther Fürth_2021-...,2021-07-14,Bayern Munich,Greuther Fürth,True,False,59,4,
615,2021-09-17,Fri,Matchweek 5,Away,L 1–2,Greuther Fürth,Hertha BSC,N,DM,3.0,...,,Adrian Fein_Bayern Munich_Greuther Fürth_2021-...,2021-07-14,Bayern Munich,Greuther Fürth,True,False,65,5,


In [5]:
# Distinct teams that have at least one row with a missing `team_elo_from_club_elo`.
df.loc[df["team_elo_from_club_elo"].isna(), "team"].unique()

array(['Greuther Fürth', 'Spartak Moscow', 'Real Sociedad',
       'Saint-Étienne', 'Portimonense', 'Köln', 'Chaves', 'Vitesse',
       'FC Copenhagen', 'Benevento', 'Dijon', 'Vizela', 'Ankaragücü'],
      dtype=object)

In [6]:
def calculate_elo_delta_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add ``delta_elo_post_transfer``: the Elo change relative to the pre-transfer baseline.

    For every ``transfer_id`` the baseline is the ``team_elo_from_club_elo`` value of the
    last pre-transfer row (ordered by ``date``) that has a non-null Elo. Each post-transfer
    row of that transfer with a non-null Elo then receives ``its Elo - baseline``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``transfer_id``, ``date``, ``is_pre_transfer``,
        ``is_post_transfer`` and ``team_elo_from_club_elo``. A row counts as
        pre-/post-transfer when the corresponding flag equals ``True``.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` (the input is not modified) with a float column
        ``delta_elo_post_transfer``. The value is NaN for rows that are not
        post-transfer, rows without an Elo, rows without a ``transfer_id``, and
        rows of transfers that have no pre-transfer row with an Elo.

    Notes
    -----
    Rows are ordered by the ``date`` column exactly as stored. Presumably these are
    ISO ``YYYY-MM-DD`` strings (or datetimes), which sort chronologically -- TODO
    confirm if other date formats can occur.
    """
    df = df.copy()
    elo = df['team_elo_from_club_elo']

    # One baseline per transfer, computed in a single pass instead of rebuilding a
    # full-frame boolean mask for every transfer_id. The stable sort keeps the input
    # order among rows sharing a date, so the "last" row is deterministic.
    has_baseline_elo = df['is_pre_transfer'].eq(True) & elo.notna()
    baseline_by_transfer = (
        df.loc[has_baseline_elo, ['transfer_id', 'date', 'team_elo_from_club_elo']]
        .sort_values('date', kind='stable')
        .groupby('transfer_id')['team_elo_from_club_elo']
        .last()
    )

    # Broadcast each transfer's baseline back onto its rows. Transfers without a
    # baseline (and rows without a transfer_id) map to NaN, and a NaN Elo keeps the
    # delta NaN, so only the post-transfer flag still has to be applied explicitly.
    delta = elo - df['transfer_id'].map(baseline_by_transfer)
    df['delta_elo_post_transfer'] = (
        delta.where(df['is_post_transfer'].eq(True)).astype('float64')
    )
    return df

In [7]:
# Rebind `df` to the frame returned by the feature function, which now carries
# the `delta_elo_post_transfer` column.
df = df.pipe(calculate_elo_delta_feature)

In [9]:
# Rows that received a delta vs. all rows flagged as post-transfer.
has_delta = df["delta_elo_post_transfer"].notna()
has_delta.sum(), df["is_post_transfer"].sum()

(np.int64(6640), np.int64(24120))