In [1]:
import pandas as pd
import numpy as np
from time import time

from preprocess.DataLoader import execute_data_loader
from preprocess.XmlProcessor import XmlProcessor

# execute_data_loader()

In [2]:
# Load the raw match table and the dated per-player attribute table from CSV.
matches_data = pd.read_csv('../data/match_details.csv')
players_data = pd.read_csv('../data/player_attributes.csv')

# Run the project's XmlProcessor over the raw matches; its output is the frame
# that every later cell extends. (What process_data() does is defined in
# preprocess.XmlProcessor, not in this notebook.)
df_transformed = XmlProcessor(matches_data).process_data()

In [3]:
# Parse the date columns so the later `<` / `<=` date filters compare timestamps.
df_transformed['date'] = pd.to_datetime(df_transformed['date'])
players_data['date'] = pd.to_datetime(players_data['date'])
# Put matches in chronological order (in place; the index is not reset here).
df_transformed.sort_values(by='date', inplace=True)

In [4]:
# Store the season and match-result labels as pandas categoricals.
df_transformed[['season', 'result_match']] = df_transformed[['season', 'result_match']].astype('category')

In [5]:
start = time()

# Lineup slot column names: home_player_1..home_player_11, then away_player_1..away_player_11.
players = []
for side in ('home', 'away'):
    for slot in range(1, 12):
        players.append(f'{side}_player_{slot}')

def get_player_overall_rating(player_id, match_date):
    """Return a player's most recent known overall rating as of a match date.

    Reads the module-level ``players_data`` frame, keeps the rows for
    ``player_id`` dated on or before ``match_date`` that carry a non-missing
    ``overall_rating``, and returns the rating of the newest such row.

    Args:
        player_id: value compared against ``players_data['player_api_id']``.
        match_date: upper bound (inclusive) for ``players_data['date']``.

    Returns:
        The ``overall_rating`` value of the latest qualifying row.

    Raises:
        IndexError: if the player has no rated entry on or before ``match_date``.
    """
    history = players_data[
        (players_data['player_api_id'] == player_id)
        & (players_data['date'] <= match_date)
        # Skip entries without a rating so the caller's int() never sees NaN.
        & players_data['overall_rating'].notna()
    ]

    if history.empty:
        # Same exception type the bare `.iloc[0]` raised before, now with context.
        raise IndexError(
            f"No overall_rating for player {player_id} on or before {match_date}"
        )

    # argmax is positional, so this works for any index and avoids a full sort.
    latest_position = history['date'].argmax()
    return history['overall_rating'].iloc[latest_position]

def get_player_id_for_team(row, player, team_type):
    """Return the player id in a lineup slot, imputing it when it is missing.

    Args:
        row: one match row (must expose ``player``, ``{team_type}_team`` and
            ``match_api_id``).
        player: lineup slot column name, e.g. ``'home_player_3'``.
        team_type: ``'home'`` or ``'away'``; selects the ``{team_type}_team`` column.

    Returns:
        The id stored in ``row[player]`` when it is not NaN; otherwise the most
        frequent value of that slot column among the rows of the global
        ``df_transformed`` where the same team appears on the same side.

    Side effects:
        When a value is imputed it is also written back into the global
        ``df_transformed`` for the rows sharing this ``match_api_id``. This runs
        while ``df_transformed`` is being iterated with ``apply``.
    """
    player_id = row[player]

    # Slot already filled: nothing to impute.
    if not np.isnan(player_id):
        return player_id

    # Most common occupant of this slot for the team on this side
    # (value_counts ignores NaN by default; presumably idxmax fails if the team
    # has no known occupant at all - TODO confirm against the data).
    player_id = df_transformed.loc[df_transformed[f'{team_type}_team'].eq(row[f'{team_type}_team'])][player].value_counts().idxmax()
    # Persist the imputed id on the global frame so later lookups see it.
    df_transformed.loc[df_transformed.match_api_id == row.match_api_id, player] = int(player_id)
    
    return player_id

def calculate_player_stat(match_row):
    """Collect the overall rating of every lineup slot for one match.

    For each slot name in the global ``players`` list the player id is resolved
    with ``get_player_id_for_team`` and the rating as of the match date is
    fetched with ``get_player_overall_rating``.

    Returns:
        dict with one ``player_rating_<slot>`` integer per slot plus the
        ``match_api_id`` of the row.
    """
    ratings_by_slot = {}
    played_on = match_row['date']

    for slot in players:
        side = 'away'
        if 'home' in slot:
            side = 'home'

        resolved_id = get_player_id_for_team(match_row, slot, side)
        rating = get_player_overall_rating(resolved_id, played_on)

        ratings_by_slot[f'player_rating_{slot}'] = int(rating)
        # Set on every pass; the key keeps the position of its first insertion.
        ratings_by_slot['match_api_id'] = match_row.match_api_id

    return ratings_by_slot

# One dict of slot ratings per match (row-wise apply; see the timing printed below).
player_stats_dict = df_transformed.apply(lambda row: calculate_player_stat(row), axis=1)
# Expand the dicts into columns: one player_rating_* column per slot plus match_api_id.
new_player_stats = pd.json_normalize(player_stats_dict)
# Attach the ratings to their match via match_api_id.
df_transformed = pd.merge(df_transformed, new_player_stats, how='left', on='match_api_id')
# The raw player-id slot columns are no longer needed once the ratings are attached.
df_transformed = df_transformed.drop(players, axis=1)

end = time()
print("Calculate player stats in {:.3f} minutes".format((end - start) / 60))

Calculate player stats in 0.938 minutes


In [6]:
# Check on which positions given player played
# List every raw-match column that contains the value 30843 at least once
# (used to see in which lineup slots that player id appears).
columns_with_value = matches_data.columns[matches_data.eq(30843).any()].tolist()
print(columns_with_value)

['home_player_9', 'home_player_10', 'home_player_11', 'away_player_10', 'away_player_11']


In [7]:
# Test: Test that the overall ratings fetched are indeed the latest
# Spot check: the rating stored for one player/slot must equal the latest raw
# rating dated on or before that match.
player_id = 30843
player_col = 'home_player_11'
rating_col = 'player_rating_' + player_col

# Fetch rating from transformed df: take the latest raw match with this player in this slot.
# NOTE(review): matches_data['date'] is not converted with pd.to_datetime in this notebook,
# so this sort and the comparison below use the CSV values as loaded - confirm that is intended.
match = matches_data.loc[matches_data[player_col].eq(player_id)].sort_values(by='date',ascending=False).iloc[0]
match_id = match['match_api_id']
transformed_rating = df_transformed[df_transformed['match_api_id'] == match_id].iloc[0][rating_col]

# Fetch player rating from players data by id and latest date
raw_rating = players_data[
    (players_data['player_api_id'] == player_id) & (players_data['date'] <= match['date'])
].sort_values(by='date', ascending=False).iloc[0]['overall_rating']

assert transformed_rating == raw_rating, f"Incorrect rating for {player_col}"
print("test passed")

test passed


In [8]:
alpha = 0.001

def calculate_team_ema(df, alpha):
    """Add each side's exponentially weighted mean of goals scored in earlier matches.

    Args:
        df: match frame with ``date``, ``home_team``, ``away_team``,
            ``home_team_goal`` and ``away_team_goal`` columns.
        alpha: smoothing factor passed to ``Series.ewm``.

    Returns:
        A new frame (``df`` is not modified) with two extra columns,
        ``home_team_ewm_goals`` and ``away_team_ewm_goals``, rounded to two
        decimals. The current match is excluded from its own average, so a
        team's first match gets NaN.
    """
    # Long format: one row per (match, team) with the goals that team scored.
    home_df = df[['date', 'home_team', 'home_team_goal']].rename(columns={'home_team': 'team', 'home_team_goal': 'goals'})
    away_df = df[['date', 'away_team', 'away_team_goal']].rename(columns={'away_team': 'team', 'away_team_goal': 'goals'})

    # ignore_index gives every long row a unique label, so the per-team result
    # below aligns by label instead of relying on duplicated labels lining up.
    long_df = (
        pd.concat([home_df, away_df], ignore_index=True)
        .sort_values(by=['team', 'date'])
    )

    # shift(1) drops the current match from its own average; transform returns
    # a Series indexed like long_df, so no index surgery is needed.
    long_df['ema_goals'] = (
        long_df.groupby('team')['goals']
        .transform(lambda goals: goals.shift(1).ewm(adjust=True, alpha=alpha).mean())
        .round(2)
    )

    # Two lookup tables keyed by (date, team), one per side of the match.
    ema_home = long_df.rename(columns={'team': 'home_team', 'ema_goals': 'home_team_ewm_goals'}).drop(columns=['goals'])
    ema_away = long_df.rename(columns={'team': 'away_team', 'ema_goals': 'away_team_ewm_goals'}).drop(columns=['goals'])

    # Merge the EMA data back to the original DataFrame
    merged_df = (
        df.merge(ema_home, on=['date', 'home_team'], how='left')
        .merge(ema_away, on=['date', 'away_team'], how='left')
    )

    return merged_df

# Attach the goals-scored EMA columns; a copy is passed in and the returned
# frame replaces df_transformed.
df_transformed = calculate_team_ema(df_transformed.copy(), alpha=alpha)

In [None]:
def test_count_average_goals_from_last_n_matches():
    """Check the goals-scored EMA of the last mock match for both sides.

    Expects the ``calculate_team_ema`` variant that produces the
    ``home_team_ewm_goals`` / ``away_team_ewm_goals`` columns.
    """
    sides = ['TeamA', 'TeamB']
    df_mock = pd.DataFrame({
        'date': ['2023-01-0{}'.format(day) for day in range(1, 7)],
        'home_team': sides * 3,
        'away_team': sides[::-1] * 3,
        'home_team_goal': [2, 1, 3, 1, 3, 2],
        'away_team_goal': [1, 2, 0, 1, 1, 1],
    })

    # Calculating EMA for home and away teams, then inspect the final match only
    df_ = calculate_team_ema(df_mock, 0.01)
    final_match = df_.iloc[-1]

    home_value = final_match['home_team_ewm_goals']
    assert home_value == 0.8, f"Expected 0.8 but got {home_value}"
    away_value = final_match['away_team_ewm_goals']
    assert away_value == 2.2, f"Expected 2.2 but got {away_value}"
    print("Test passed!")

test_count_average_goals_from_last_n_matches()

In [None]:
def calculate_team_ema(df, alpha):
    """Add each side's exponentially weighted mean of goals conceded in earlier matches.

    Args:
        df: match frame with ``date``, ``home_team``, ``away_team``,
            ``home_team_goal`` and ``away_team_goal`` columns.
        alpha: smoothing factor passed to ``Series.ewm``.

    Returns:
        A new frame (``df`` is not modified) with two extra columns,
        ``home_team_ewm_goals_conceded`` and ``away_team_ewm_goals_conceded``,
        rounded to two decimals. The current match is excluded from its own
        average, so a team's first match gets NaN.
    """
    # Long format: one row per (match, team); a team concedes what its opponent
    # scored, hence the crossed goal columns.
    home_df = df[['date', 'home_team', 'away_team_goal']].rename(columns={'home_team': 'team', 'away_team_goal': 'goals'})
    away_df = df[['date', 'away_team', 'home_team_goal']].rename(columns={'away_team': 'team', 'home_team_goal': 'goals'})

    # ignore_index gives every long row a unique label, so the per-team result
    # below aligns by label instead of relying on duplicated labels lining up.
    long_df = (
        pd.concat([home_df, away_df], ignore_index=True)
        .sort_values(by=['team', 'date'])
    )

    # shift(1) drops the current match from its own average; transform returns
    # a Series indexed like long_df, so no index surgery is needed.
    long_df['ema_goals'] = (
        long_df.groupby('team')['goals']
        .transform(lambda goals: goals.shift(1).ewm(adjust=True, alpha=alpha).mean())
        .round(2)
    )

    # Two lookup tables keyed by (date, team), one per side of the match.
    ema_home = long_df.rename(columns={'team': 'home_team', 'ema_goals': 'home_team_ewm_goals_conceded'}).drop(columns=['goals'])
    ema_away = long_df.rename(columns={'team': 'away_team', 'ema_goals': 'away_team_ewm_goals_conceded'}).drop(columns=['goals'])

    # Merge the EMA data back to the original DataFrame
    merged_df = (
        df.merge(ema_home, on=['date', 'home_team'], how='left')
        .merge(ema_away, on=['date', 'away_team'], how='left')
    )

    return merged_df

# Attach the goals-conceded EMA columns using the calculate_team_ema defined
# just above in this cell (it replaces the earlier goals-scored definition).
df_transformed = calculate_team_ema(df_transformed.copy(), alpha=alpha)

In [None]:
def test_count_average_goals_conceded_from_last_n_matches():
    """Check the goals-conceded EMA of the last mock match for both sides.

    Expects the ``calculate_team_ema`` variant that produces the
    ``*_ewm_goals_conceded`` columns.
    """
    data = {
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'],
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'home_team_goal': [2, 1, 3, 1, 0, 2],
        'away_team_goal': [1, 2, 0, 1, 1, 1]
    }

    df_mock = pd.DataFrame(data)

    # Calculating the goals-conceded EMA for home and away teams
    df_ = calculate_team_ema(df_mock, 0.01)

    # Failure messages report the value from the frame under test (df_),
    # not from the unrelated global df_transformed.
    last_match = df_.iloc[-1]
    assert last_match['away_team_ewm_goals_conceded'] == 0.8, f"Expected 0.8 but got {last_match['away_team_ewm_goals_conceded']}"
    assert last_match['home_team_ewm_goals_conceded'] == 1.59, f"Expected 1.59 but got {last_match['home_team_ewm_goals_conceded']}"
    print("Test passed!")

test_count_average_goals_conceded_from_last_n_matches()

In [None]:
def calculate_weighted_wins(df, team, match_date, is_home_team, n=10, decay_factor=0.9):
    """
    Calculate the weighted number of wins of a team before a given match_date.
    More recent wins carry more weight.
    If is_home_team is True, count only home wins, else count only away wins.

    Args:
        df: match frame with ``date``, ``home_team``, ``away_team``,
            ``home_team_goal`` and ``away_team_goal`` columns (any row order).
        team: team identifier to look up on the chosen side.
        match_date: only matches strictly before this date are considered.
        is_home_team: True to use the team's home matches, False for away matches.
        n: number of most recent matches to consider.
        decay_factor: weight multiplier per step back in time; the latest
            match has weight 1, the one before ``decay_factor``, and so on.

    Returns:
        float: sum of the decay weights of the won matches (0.0 if there are none).
    """
    side_col = 'home_team' if is_home_team else 'away_team'
    earlier = df[(df[side_col] == team) & (df['date'] < match_date)]

    # Newest first, so head(n) really is the n most recent matches and weight 1
    # lands on the latest one. Without this sort a chronologically ordered
    # frame would select and favour the oldest matches instead.
    recent = earlier.sort_values(by='date', ascending=False).head(n)

    if is_home_team:
        wins = recent['home_team_goal'] > recent['away_team_goal']
    else:
        wins = recent['home_team_goal'] < recent['away_team_goal']

    # Apply decay factor positionally: 1, d, d**2, ... from newest to oldest.
    decay_weights = decay_factor ** np.arange(len(wins))
    weighted_wins = float((wins.to_numpy() * decay_weights).sum())

    return weighted_wins

def get_team_weighted_wins(row, df, n, decay_factor=0.9):
    """
    Return ``(home_wins, away_wins)`` for one match row: the weighted win count
    of the home team over its earlier home matches and of the away team over
    its earlier away matches, both computed by ``calculate_weighted_wins``.
    """
    hosts, visitors, kickoff = row['home_team'], row['away_team'], row['date']

    weighted = []
    for side_team, playing_at_home in ((hosts, True), (visitors, False)):
        weighted.append(
            calculate_weighted_wins(
                df, side_team, kickoff,
                is_home_team=playing_at_home, n=n, decay_factor=decay_factor,
            )
        )

    return weighted[0], weighted[1]

start = time()

# Define the number of matches to consider and the decay factor
# (this rebinds the notebook-level name decay_factor to 0.85).
N = 10
decay_factor = 0.85

# Compute both weighted-win features for every match (row-wise apply over the whole frame).
df_transformed[['home_weighted_wins', 'away_weighted_wins']] = df_transformed.apply(lambda row: get_team_weighted_wins(row, df_transformed, N, decay_factor), axis=1, result_type='expand')

end = time()
print("Calculate team weighted wins in {:.3f} minutes".format((end - start) / 60))


In [None]:
df_transformed[['home_weighted_wins', 'away_weighted_wins']].describe().round(2)

In [None]:
# def test_get_team_wins():
#     data = {
#         'home_team': ['TeamA', 'TeamB', 'TeamB', 'TeamB'],
#         'away_team': ['TeamB', 'TeamA', 'TeamC', 'TeamA'],
#         'home_team_goal': [2, 1, 3, 0],
#         'away_team_goal': [1, 2, 0, 1],
#         'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
#         'season': ['2023', '2023', '2023', '2023']
#     }

#     mock_df = pd.DataFrame(data)
#     mock_df['date'] = pd.to_datetime(mock_df['date'])

#     expected_home_wins = 1
#     expected_away_wins = 1

#     home_wins, away_wins = get_team_wins(mock_df.iloc[-1], mock_df, 3)

#     assert home_wins == expected_home_wins, "home_wins doesn't match expected values"
#     assert away_wins == expected_away_wins, "away_wins doesn't match expected values"

#     print("Test for get_team_wins passed!")

# test_get_team_wins()

In [None]:
start = time()

def get_overall_player_strength(match_row):
    """Average the player ratings of each side for one match row.

    Entries whose label matches ``player_rating_home_player`` are averaged into
    the home strength, those matching ``player_rating_away_player`` into the
    away strength; both means are rounded to two decimals.

    Returns:
        tuple ``(home_team_strength, away_team_strength)``.
    """
    side_means = []
    for label_pattern in ('player_rating_home_player', 'player_rating_away_player'):
        side_ratings = match_row.filter(regex=label_pattern)
        side_means.append(round(side_ratings.values.mean(), 2))

    home_team_strength, away_team_strength = side_means
    return home_team_strength, away_team_strength


# Add the per-side mean player rating of every match.
df_transformed[['avg_home_team_rating', 'avg_away_team_rating']] = df_transformed.apply(lambda row: get_overall_player_strength(row), axis=1, result_type='expand')

end = time()
# NOTE(review): the label below is the same text as the player-rating cell's message,
# although this cell times the team-average computation.
print("Calculate player stats in {:.3f} minutes".format((end - start) / 60))

In [None]:
start = time()

def calculate_weighted_streak(df, team, match_date, is_home_team, decay_factor=0.95):
    """Sum of decayed, opponent-rating-weighted wins of a team before a date.

    Only the team's home matches (``is_home_team=True``) or away matches
    (``False``) strictly before ``match_date`` are used. Each win contributes
    the opponent side's average rating multiplied by ``decay_factor ** k``,
    where ``k`` is 0 for the latest match, 1 for the one before, and so on.
    Matches that were not won contribute 0.
    """
    if is_home_team:
        own_side, opponent_rating = 'home_team', 'avg_away_team_rating'
    else:
        own_side, opponent_rating = 'away_team', 'avg_home_team_rating'

    earlier = df[(df[own_side] == team) & (df['date'] < match_date)]

    if is_home_team:
        won = earlier['home_team_goal'] > earlier['away_team_goal']
    else:
        won = earlier['home_team_goal'] < earlier['away_team_goal']

    # Win flag and opponent strength side by side, newest match first.
    ranked = earlier.assign(
        win=won.astype(int),
        quality=earlier[opponent_rating],
    ).sort_values(by='date', ascending=False)

    # Positional weights: 1 for the latest match, shrinking going back in time.
    weights = decay_factor ** np.arange(len(ranked))

    return sum(weights * ranked['quality'] * ranked['win'])

def count_streak_wins(match_row, df):
    """Return the rounded weighted streak of the home and the away team of a match.

    The home team is scored on its earlier home matches and the away team on
    its earlier away matches, both via ``calculate_weighted_streak`` with its
    default decay factor; each value is rounded to two decimals.
    """
    kickoff = match_row['date']
    hosts = match_row['home_team']
    visitors = match_row['away_team']

    streaks = (
        calculate_weighted_streak(df, hosts, kickoff, is_home_team=True),
        calculate_weighted_streak(df, visitors, kickoff, is_home_team=False),
    )

    return tuple(round(value, 2) for value in streaks)


# Add both weighted-streak features for every match (row-wise apply; each row
# filters the whole frame again, see the timing printed below).
df_transformed[['home_streak_wins', 'away_streak_wins']] = df_transformed.apply(lambda row: count_streak_wins(row, df_transformed), axis=1, result_type='expand')

end = time()
print("Count streak wins in {:.3f} minutes".format((end - start) / 60))

In [None]:
def test_get_team_wins():
    """Check ``count_streak_wins`` on the last row of a seven-match mock frame."""
    mock_df = pd.DataFrame({
        'home_team': ['TeamB', 'TeamB', 'TeamA', 'TeamB', 'TeamB', 'TeamB', 'TeamB'],
        'away_team': ['TeamA', 'TeamA', 'TeamB', 'TeamA', 'TeamA', 'TeamC', 'TeamA'],
        'home_team_goal': [2, 1, 3, 0, 0, 2, 0],
        'away_team_goal': [1, 2, 0, 1, 1, 1, 1],
        'avg_away_team_rating': [75, 85, 83, 74, 73, 66, 78],
        'avg_home_team_rating': [90, 76, 78, 85, 77, 65, 82],
        'date': ['2023-01-0{}'.format(day) for day in range(1, 8)],
        'season': ['2023'] * 7,
    })
    mock_df['date'] = pd.to_datetime(mock_df['date'])

    # Hand-computed values for TeamB at home / TeamA away before 2023-01-07.
    expected_home_wins, expected_away_wins = 127.09, 226.34

    latest_match = mock_df.iloc[-1]
    home_streak_wins, away_streak_wins = count_streak_wins(latest_match, mock_df)

    assert home_streak_wins == expected_home_wins, f"home_wins doesn't match expected values, {expected_home_wins}"
    assert away_streak_wins == expected_away_wins, f"away_wins doesn't match expected values, {expected_away_wins}"

    print("Test for get_team_wins passed!")

test_get_team_wins()

In [None]:
start = time()

def get_points(row, team):
    """Return the league points ``team`` earned in the match described by ``row``.

    ``row['result_match']`` must be ``'H'`` (home win), ``'D'`` (draw) or
    ``'A'`` (away win). The team is treated as the home side when it equals
    ``row['home_team']`` and as the away side otherwise.
    """
    plays_at_home = row['home_team'] == team
    outcome = row['result_match']

    if plays_at_home:
        awarded = {'H': 3, 'D': 1, 'A': 0}[outcome]
    else:
        awarded = {'H': 0, 'D': 1, 'A': 3}[outcome]

    return int(awarded)

def process_points(team, df, match_date, match_season):
    """Total points ``team`` collected in ``match_season`` strictly before ``match_date``.

    Selects the rows of ``df`` where the team played on either side, in the
    given season and before the given date, and sums ``get_points`` over them.
    Returns 0 when there is no such match.
    """
    # The query reads team / match_date / match_season from this function's locals.
    earlier_in_season = df.query('(home_team == @team | away_team == @team) & date < @match_date & season == @match_season')

    if not len(earlier_in_season):
        return 0

    per_match_points = earlier_in_season.apply(lambda played: get_points(played, team), axis=1)
    return per_match_points.sum()

def count_points(match_row, df):
    """Return the season-to-date points of the home and away team of a match.

    Both totals come from ``process_points`` and cover matches of the row's
    season played before the row's date.
    """
    kickoff, season = match_row['date'], match_row['season']
    hosts, visitors = match_row['home_team'], match_row['away_team']

    totals = [process_points(side, df, kickoff, season) for side in (hosts, visitors)]

    return totals[0], totals[1]

# Add the season-to-date points of both sides for every match. get_points reads
# result_match as 'H'/'D'/'A', so this has to run before that column is integer-encoded.
df_transformed[['points_home', 'points_away']] = df_transformed.apply(lambda row: count_points(row, df_transformed), axis=1, result_type='expand')

end = time()
print("Count points in {:.3f} minutes".format((end - start) / 60))

In [None]:
def calculate_ewma_team_stat(df, alpha, stat_type):
    """Add each side's exponentially weighted mean of a per-team match statistic.

    Args:
        df: match frame with ``date``, ``home_team``, ``away_team`` and the
            ``home_<stat_type>`` / ``away_<stat_type>`` columns.
        alpha: smoothing factor passed to ``Series.ewm``.
        stat_type: statistic suffix, e.g. ``'shoton'``.

    Returns:
        A new frame (``df`` is not modified) with two extra columns,
        ``ewma_<stat_type>_home`` and ``ewma_<stat_type>_away``, rounded to
        three decimals. The current match is excluded from its own average, so
        a team's first match gets NaN.
    """
    ewma_col = f'ewma_{stat_type}'

    # Long format: one row per (match, team) with that team's value of the statistic.
    home_df = df[['date', 'home_team', f'home_{stat_type}']].rename(
        columns={'home_team': 'team', f'home_{stat_type}': stat_type})
    away_df = df[['date', 'away_team', f'away_{stat_type}']].rename(
        columns={'away_team': 'team', f'away_{stat_type}': stat_type})

    # ignore_index gives every long row a unique label, so the per-team result
    # below aligns by label instead of relying on duplicated labels lining up.
    long_df = (
        pd.concat([home_df, away_df], ignore_index=True)
        .sort_values(by=['team', 'date'])
    )

    # shift(1) drops the current match from its own average; transform returns
    # a Series indexed like long_df, so no index surgery is needed.
    long_df[ewma_col] = (
        long_df.groupby('team')[stat_type]
        .transform(lambda values: values.shift(1).ewm(adjust=True, alpha=alpha).mean())
        .round(3)
    )

    # Two lookup tables keyed by (date, team), one per side of the match.
    ewma_home = long_df.rename(columns={'team': 'home_team', ewma_col: f'ewma_{stat_type}_home'}).drop(columns=[stat_type])
    ewma_away = long_df.rename(columns={'team': 'away_team', ewma_col: f'ewma_{stat_type}_away'}).drop(columns=[stat_type])

    # Merge the EWMA data back to the original DataFrame
    merged_df = (
        df.merge(ewma_home, on=['date', 'home_team'], how='left')
        .merge(ewma_away, on=['date', 'away_team'], how='left')
    )

    return merged_df

# Define the alpha for EWMA and the stat_type
# (rebinds the notebook-level alpha, which the goal-EMA cells set to 0.001).
alpha = 0.01
stat_type = 'shoton'

# Attach the ewma_shoton_home / ewma_shoton_away columns; the returned frame replaces df_transformed.
df_transformed = calculate_ewma_team_stat(df_transformed.copy(), alpha, stat_type)

In [None]:
def test_count_average_stat_from_last_n_matches():
    """Check the shots-on-target EWMA of the last mock match for both sides."""
    sides = ['TeamA', 'TeamB']
    df_mock = pd.DataFrame({
        'date': ['2023-01-0{}'.format(day) for day in range(1, 7)],
        'home_team': sides * 3,
        'away_team': sides[::-1] * 3,
        'home_shoton': [5, 4, 6, 3, 5, 4],
        'away_shoton': [4, 5, 5, 4, 4, 3],
    })
    stat_type = 'shoton'

    df_ = calculate_ewma_team_stat(df_mock, 0.01, stat_type)
    final_match = df_.iloc[-1]

    home_value = final_match['ewma_shoton_home']
    assert home_value == 3.998, f"Expected 3.998 but got {home_value}"
    away_value = final_match['ewma_shoton_away']
    assert away_value == 4.998, f"Expected 4.998 but got {away_value}"
    print("Test passed!")

test_count_average_stat_from_last_n_matches()

In [20]:
def test_count_average_stat_from_last_n_matches():
    """Check the shots-on-target EWMA of the last mock match for both sides.

    NOTE(review): a function with this same name and the same assertions is
    already defined in the previous cell; this definition replaces it.
    """
    # Six alternating fixtures between two teams.
    data = {
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'],
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'home_shoton': [5, 4, 6, 3, 5, 4],
        'away_shoton': [4, 5, 5, 4, 4, 3]
    }
    
    df_mock = pd.DataFrame(data)
    stat_type = 'shoton'
    
    df_ = calculate_ewma_team_stat(df_mock, 0.01, stat_type)
    
    # Last row: TeamB at home, TeamA away; values are rounded to 3 decimals by the function.
    assert df_.iloc[-1]['ewma_shoton_home'] == 3.998, f"Expected 3.998 but got {df_.iloc[-1]['ewma_shoton_home']}"
    assert df_.iloc[-1]['ewma_shoton_away'] == 4.998, f"Expected 4.998 but got {df_.iloc[-1]['ewma_shoton_away']}"
    print("Test passed!")

test_count_average_stat_from_last_n_matches()

Test passed!


In [21]:
from sklearn.preprocessing import LabelEncoder

# Map result_match to int
def prepare_target(y_):
    """Integer-encode the target labels with a fresh ``LabelEncoder``.

    Returns:
        tuple ``(encoded, classes)``: the encoded labels and the encoder's
        ``classes_`` array, whose position gives the integer code of each label.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(y_)
    label_order = encoder.classes_  # 0: 'A', 1: 'D', 2: 'H'
    return encoded, label_order

# Replace the result labels with their integer codes and keep the label order in `classes`.
df_transformed['result_match'], classes = prepare_target(df_transformed['result_match'])

In [22]:
df_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 49 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   match_api_id                  3040 non-null   int64         
 1   season                        3040 non-null   category      
 2   stage                         3040 non-null   int64         
 3   date                          3040 non-null   datetime64[ns]
 4   away_team                     3040 non-null   int64         
 5   home_team                     3040 non-null   int64         
 6   home_team_goal                3040 non-null   int64         
 7   away_team_goal                3040 non-null   int64         
 8   result_match                  3040 non-null   int32         
 9   away_possession               3040 non-null   int64         
 10  home_shoton                   3040 non-null   int64         
 11  home_possession               

In [31]:
import os

# Destination of the engineered feature table.
output_dir = "../data/transform/"
filename= "transformed_data.csv"

full_path = os.path.join(output_dir, filename)
# exist_ok=True creates the directory (and any parents) only when it is missing,
# without the separate exists() check and its check-then-create race.
os.makedirs(output_dir, exist_ok=True)

df_transformed.to_csv(full_path, index=False)