In [1]:
import pandas as pd
import numpy as np
from time import time

from preprocess.DataLoader import execute_data_loader
from preprocess.XmlProcessor import XmlProcessor

# execute_data_loader()

In [2]:
# Load the raw match and player-attribute tables from CSV (paths are relative to the notebook).
matches_data = pd.read_csv('../data/match_details.csv')
players_data = pd.read_csv('../data/player_attributes.csv')

# Run the project's XmlProcessor over the match table; the result is the working frame
# that every later cell extends.
# NOTE(review): what process_data() adds or changes is not visible here - confirm in preprocess.XmlProcessor.
df_transformed = XmlProcessor(matches_data).process_data()

In [3]:
# Parse the date columns of both tables into pandas datetimes so later cells compare
# timestamps rather than raw CSV values.
df_transformed['date'] = pd.to_datetime(df_transformed['date'])
players_data['date'] = pd.to_datetime(players_data['date'])
# Order the matches chronologically; the frame is modified in place.
df_transformed.sort_values(by='date', inplace=True)

In [4]:
# Store the season label and the match result as pandas categoricals.
df_transformed[['season', 'result_match']] = df_transformed[['season', 'result_match']].astype('category')

In [5]:
# Start the wall-clock timer for this cell.
start = time()

# Names of the 22 line-up columns: home_player_1..home_player_11 followed by
# away_player_1..away_player_11.
players = ['{}_player_{}'.format(team, i) for team in ['home', 'away'] for i in range(1, 12)]

def get_player_overall_rating(player_id, match_date):
    """
    Return the most recent ``overall_rating`` of a player on or before a match date.

    Reads the module-level ``players_data`` frame.

    Args:
        player_id: value matched against ``players_data['player_api_id']``.
        match_date: upper bound (inclusive) for ``players_data['date']``.

    Returns:
        The ``overall_rating`` value of the newest qualifying row.

    Raises:
        IndexError: if the player has no rating row on or before ``match_date``.
    """
    same_player = players_data['player_api_id'] == player_id
    not_after_match = players_data['date'] <= match_date
    rating_history = players_data[same_player & not_after_match]

    # Newest rating first, then take the top row.
    newest_first = rating_history.sort_values(by='date', ascending=False)
    return newest_first.iloc[0]['overall_rating']

def get_player_id_for_team(row, player, team_type):
    """
    Return the player id stored in a line-up slot, imputing it when it is missing.

    When the slot is NaN, the id that appears most often in the same slot across all
    matches of the same home/away team is used instead, and that id is also written
    back into the module-level ``df_transformed`` for the current match.

    Args:
        row: match row; must provide ``player``, ``f'{team_type}_team'`` and ``match_api_id``.
        player: name of the line-up column (e.g. ``'home_player_3'``).
        team_type: ``'home'`` or ``'away'``; selects the team column used for the fallback.

    Returns:
        The stored id, or the imputed most-frequent id when the stored one is NaN.
    """
    team_column = f'{team_type}_team'
    stored_id = row[player]

    if np.isnan(stored_id):
        # Most common occupant of this slot among matches where the same team
        # played on the same side.
        same_team = df_transformed[team_column].eq(row[team_column])
        fallback_id = df_transformed.loc[same_team][player].value_counts().idxmax()

        # Persist the imputed id on the current match.
        this_match = df_transformed.match_api_id == row.match_api_id
        df_transformed.loc[this_match, player] = int(fallback_id)

        return fallback_id

    return stored_id

def calculate_player_stat(match_row):
    """
    Build a dict of integer player ratings for every line-up slot of one match.

    Iterates over the module-level ``players`` column names, resolves each slot's
    player id (imputing missing ones) and looks up the latest rating on or before
    the match date.

    Args:
        match_row: match row with ``date``, ``match_api_id`` and the line-up columns.

    Returns:
        dict with one ``player_rating_<slot>`` entry per slot plus ``match_api_id``.
    """
    kickoff = match_row['date']
    ratings = {}

    for slot in players:
        side = 'home' if 'home' in slot else 'away'
        slot_player_id = get_player_id_for_team(match_row, slot, side)
        slot_rating = get_player_overall_rating(slot_player_id, kickoff)

        ratings['player_rating_{}'.format(slot)] = int(slot_rating)
        # Merge key used to join the ratings back onto the match table.
        ratings['match_api_id'] = match_row.match_api_id

    return ratings

# One ratings dict per match, flattened into a frame keyed by match_api_id.
player_stats_dict = df_transformed.apply(calculate_player_stat, axis=1)
new_player_stats = pd.json_normalize(player_stats_dict)

# Attach the rating columns, then discard the raw player-id columns.
df_transformed = pd.merge(
    df_transformed,
    new_player_stats,
    how='left',
    on='match_api_id',
)
df_transformed = df_transformed.drop(players, axis=1)

end = time()
print(
    "Calculate player stats in {:.3f} minutes".format((end - start) / 60)
)

Calculate player stats in 0.915 minutes


In [6]:
# Check on which positions given player played:
# list every column of the raw match table that contains the value 30843 at least once.
# NOTE(review): eq() is applied to all columns, not only line-up columns, so any column holding 30843 would be listed.
columns_with_value = matches_data.columns[matches_data.eq(30843).any()].tolist()
print(columns_with_value)

['home_player_9', 'home_player_10', 'home_player_11', 'away_player_10', 'away_player_11']


In [7]:
# Test 3: Test that the overall ratings fetched are indeed the latest
# Spot check for one player in one line-up slot.
player_id = 30843
player_col = 'home_player_11'
rating_col = 'player_rating_' + player_col

# Fetch rating from transformed df:
# take the last match (by the raw 'date' column) in which the player filled this slot,
# then read the rating stored for that match in the transformed frame.
match = matches_data.loc[matches_data[player_col].eq(player_id)].sort_values(by='date',ascending=False).iloc[0]
match_id = match['match_api_id']
rating = df_transformed[df_transformed['match_api_id'] == match_id].iloc[0][rating_col]
print(rating)

# Fetch player rating from players data by id and latest date
# (recomputed independently of get_player_overall_rating, using the same filter and ordering).
latest_rating = players_data[
    (players_data['player_api_id'] == player_id) & (players_data['date'] <= match['date'])
].sort_values(by='date', ascending=False).iloc[0]['overall_rating']
print(latest_rating)

# The stored rating is an int and the source rating a float; equality compares their values.
assert rating == latest_rating, f"Incorrect rating for {player_col}"
print("test passed")

85
85.0
test passed


In [29]:
# Start the wall-clock timer for the scored-goals feature cell.
start = time()

def get_scored_goals(row, team):
    """
    Return the goals credited to ``team`` in one match.

    The home goal count is returned when ``team`` is the home side; in every other
    case the away goal count is returned.
    """
    if row['home_team'] == team:
        return row['home_team_goal']
    return row['away_team_goal']

def average_scored_goals(team, match_date, df, n):
    """
    Mean number of goals a team scored in its ``n`` most recent matches before a date.

    Args:
        team: team identifier compared against ``home_team`` / ``away_team``.
        match_date: only matches with ``date`` strictly earlier are considered.
        df: match table with ``home_team``, ``away_team``, ``date`` and goal columns.
        n: maximum number of most recent matches to average over.

    Returns:
        The mean of the scored goals, or ``0`` when the team has no earlier match.
    """
    involves_team = (df['home_team'] == team) | (df['away_team'] == team)
    played_earlier = df['date'] < match_date

    recent_matches = (
        df[involves_team & played_earlier]
        .sort_values(by='date', ascending=False)
        .iloc[:n]
    )

    if recent_matches.empty:
        return 0

    scored = recent_matches.apply(get_scored_goals, axis=1, args=(team,))
    return scored.mean()

def count_average_goals_from_last_n_matches(row, df, n=3):
    """
    Average goals scored by the home and the away team before the given match.

    Args:
        row: match row with ``home_team``, ``away_team`` and ``date``.
        df: match table searched for earlier matches.
        n: number of most recent earlier matches to average over (default 3).

    Returns:
        ``(home_average, away_average)``, each rounded to two decimals.
    """
    kickoff = row['date']

    home_average = average_scored_goals(row['home_team'], kickoff, df, n)
    away_average = average_scored_goals(row['away_team'], kickoff, df, n)

    return round(home_average, 2), round(away_average, 2)

# Usage: add the rolling scored-goals averages for both sides of every match.
N = 4
df_transformed[['home_avg_goals_scored', 'away_avg_goals_scored']] = df_transformed.apply(
    count_average_goals_from_last_n_matches,
    axis=1,
    result_type='expand',
    args=(df_transformed,),
    n=N,
)

end = time()
print(
    "Calculate average goals from {} matches in {:.3f} minutes".format(
        N, (end - start) / 60
    )
)

Calculate average goals from 4 matches in 0.134 minutes


In [9]:
def test_count_average_goals_from_last_n_matches():
    """Check the scored-goals averages for the last fixture of a six-match mock table."""
    # Mock dataframe: TeamA and TeamB alternate as home side on consecutive days.
    fixtures = pd.DataFrame({
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'],
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'home_team_goal': [2, 1, 3, 1, 0, 2],
        'away_team_goal': [1, 2, 0, 1, 1, 1]
    })
    latest_fixture = fixtures.iloc[-1]

    avg_goals_home, avg_goals_away = count_average_goals_from_last_n_matches(latest_fixture, fixtures, n=3)

    # Expected values over the three fixtures before 2023-01-06:
    # TeamB (home) -> Goals: 1, 1, 0 -> Average: 0.67
    # TeamA (away) -> Goals: 0, 1, 3 -> Average: 1.33
    assert avg_goals_home == 0.67, f"Expected 0.67 but got {avg_goals_home}"
    assert avg_goals_away == 1.33, f"Expected 1.33 but got {avg_goals_away}"
    print("Test passed!")

test_count_average_goals_from_last_n_matches()

Test passed!


In [23]:
# Window size (number of previous matches) used for the conceded-goals averages below.
N = 10
# Start the wall-clock timer for this cell.
start = time()

def get_conceded_goals(row, team):
    """
    Return the goals ``team`` conceded in one match.

    The away goal count is returned when ``team`` is the home side; in every other
    case the home goal count is returned.
    """
    if row['home_team'] == team:
        return row['away_team_goal']
    return row['home_team_goal']

def average_conceded_goals(team, match_date, n, df):
    """
    Mean number of goals a team conceded in its ``n`` most recent matches before a date.

    Args:
        team: team identifier compared against ``home_team`` / ``away_team``.
        match_date: only matches with ``date`` strictly earlier are considered.
        n: maximum number of most recent matches to average over.
        df: match table with ``home_team``, ``away_team``, ``date`` and goal columns.

    Returns:
        The mean of the conceded goals, or ``0`` when the team has no earlier match.
    """
    involves_team = (df['home_team'] == team) | (df['away_team'] == team)
    played_earlier = df['date'] < match_date

    recent_matches = (
        df[involves_team & played_earlier]
        .sort_values(by='date', ascending=False)
        .head(n)
    )

    if recent_matches.empty:
        return 0

    conceded = recent_matches.apply(get_conceded_goals, axis=1, args=(team,))
    return conceded.mean()

def count_average_conceded_goals_from_last_n_matches(row, df, n=3):
    """
    Average goals conceded by the home and the away team before the given match.

    Args:
        row: match row with ``home_team``, ``away_team`` and ``date``.
        df: match table searched for earlier matches.
        n: number of most recent earlier matches to average over (default 3).

    Returns:
        ``(home_average, away_average)``, each rounded to two decimals.
    """
    kickoff = row['date']

    home_average = average_conceded_goals(row['home_team'], kickoff, n, df)
    away_average = average_conceded_goals(row['away_team'], kickoff, n, df)

    return round(home_average, 2), round(away_average, 2)

# Add the rolling conceded-goals averages for both sides of every match.
df_transformed[['home_avg_goals_conceded', 'away_avg_goals_conceded']] = df_transformed.apply(
    count_average_conceded_goals_from_last_n_matches,
    axis=1,
    result_type='expand',
    args=(df_transformed,),
    n=N,
)

end = time()
print(
    "Calculate average goals from {} matches in {:.3f} minutes".format(
        N, (end - start) / 60
    )
)

Calculate average goals from 10 matches in 0.138 minutes


In [24]:
def test_count_average_goals_conceded_from_last_n_matches():
    """Check the conceded-goals averages for the last fixture of a six-match mock table."""
    # Mock dataframe
    data = {
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'],
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'home_team_goal': [2, 1, 3, 1, 0, 2],
        'away_team_goal': [1, 2, 0, 1, 1, 1]
    }
    
    df_mock = pd.DataFrame(data)

    avg_goals_home, avg_goals_away = count_average_conceded_goals_from_last_n_matches(df_mock.iloc[-1], df_mock, n=3)
    
    # Expected values (three fixtures before 2023-01-06, newest first):
    # TeamB (home) -> Conceded: 0, 1, 3 -> Average: 1.33
    # TeamA (away) -> Conceded: 1, 1, 0 -> Average: 0.67
    
    assert avg_goals_home == 1.33, f"Expected 1.33 but got {avg_goals_home}"
    assert avg_goals_away == 0.67, f"Expected 0.67 but got {avg_goals_away}"
    print("Test passed!")

test_count_average_goals_conceded_from_last_n_matches()

Test passed!


In [25]:
# Start the wall-clock timer for the season-wins feature cell.
start = time()

def count_previous_wins(df, team, match_date, season):
    """
    Number of matches a team won earlier in the given season.

    Args:
        df: match table with team, goal, ``date`` and ``season`` columns.
        team: team identifier compared against ``home_team`` / ``away_team``.
        match_date: only matches with ``date`` strictly earlier are counted.
        season: only matches whose ``season`` equals this value are counted.

    Returns:
        Home wins plus away wins of ``team`` among the qualifying matches.
    """
    involves_team = (df['home_team'] == team) | (df['away_team'] == team)
    eligible = involves_team & (df['date'] < match_date) & (df['season'] == season)
    earlier_matches = df[eligible]

    home_goals = earlier_matches['home_team_goal']
    away_goals = earlier_matches['away_team_goal']

    won_at_home = (earlier_matches['home_team'] == team) & (home_goals > away_goals)
    won_away = (earlier_matches['away_team'] == team) & (home_goals < away_goals)

    return won_at_home.sum() + won_away.sum()

def get_team_wins(row, df):
    """
    Season-to-date win counts of both teams of a match.

    Args:
        row: match row with ``home_team``, ``away_team``, ``date`` and ``season``.
        df: match table searched for earlier matches.

    Returns:
        ``(home_wins, away_wins)`` counted before the match date within the match's season.
    """
    kickoff, season = row['date'], row['season']

    wins_of_home_side = count_previous_wins(df, row['home_team'], kickoff, season)
    wins_of_away_side = count_previous_wins(df, row['away_team'], kickoff, season)

    return wins_of_home_side, wins_of_away_side

# Add the season-to-date win counts for both sides of every match.
df_transformed[['home_wins', 'away_wins']] = df_transformed.apply(
    get_team_wins,
    axis=1,
    result_type='expand',
    args=(df_transformed,),
)

end = time()
print(
    "Calculate team wins in {:.3f} minutes".format((end - start) / 60)
)

Calculate team wins in 0.096 minutes


In [26]:
def test_get_team_wins():
    """Check season win counts for the last fixture of a four-match mock table."""
    # 1. Mock data: TeamA wins each of the first three fixtures.
    fixtures = pd.DataFrame({
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'home_team_goal': [2, 1, 3, 0],
        'away_team_goal': [1, 2, 0, 1],
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
        'season': ['2023', '2023', '2023', '2023']
    })
    fixtures['date'] = pd.to_datetime(fixtures['date'])

    # 2. Expected results for the last fixture (TeamB at home, TeamA away).
    expected_home_wins, expected_away_wins = 0, 3

    # 3. Run the method on the mock data.
    latest_fixture = fixtures.iloc[-1]
    home_wins, away_wins = get_team_wins(latest_fixture, fixtures)

    # 4. Assert
    assert home_wins == expected_home_wins, "home_wins doesn't match expected values"
    assert away_wins == expected_away_wins, "away_wins doesn't match expected values"

    print("Test for get_team_wins passed!")

test_get_team_wins()

Test for get_team_wins passed!


In [27]:
# Start the wall-clock timer for the team-strength feature cell.
start = time()

def get_overall_player_strength(match_row):
    """
    Mean player rating of the home and the away line-up of one match.

    Args:
        match_row: Series whose labels include the ``player_rating_home_player*`` and
            ``player_rating_away_player*`` entries.

    Returns:
        ``(home_team_strength, away_team_strength)``, each rounded to two decimals.
    """
    def _mean_rating(label_pattern):
        # Select the entries whose label matches the pattern and average their values.
        ratings = match_row.filter(regex=label_pattern)
        return round(ratings.values.mean(), 2)

    home_team_strength = _mean_rating('player_rating_home_player')
    away_team_strength = _mean_rating('player_rating_away_player')
    return home_team_strength, away_team_strength


# Add the mean line-up rating for both sides of every match.
df_transformed[['home_team_strength', 'away_team_strength']] = df_transformed.apply(
    get_overall_player_strength,
    axis=1,
    result_type='expand',
)

# cols_to_remove = df_transformed.filter(like='player_rating').columns
# df = df_transformed.drop(columns=cols_to_remove)

end = time()
print(
    "Calculate player stats in {:.3f} minutes".format((end - start) / 60)
)

Calculate player stats in 0.014 minutes


In [31]:
# Difference between the strength-weighted season win counts of the home and away side,
# scaled down by 10 and truncated to an integer by astype(int).
# NOTE(review): the divisor 10 is unexplained in this notebook - confirm the intended scaling.
df_transformed['weighted_wins_diff'] = ((df_transformed['home_wins'] * df_transformed['home_team_strength'] - df_transformed['away_wins'] * df_transformed['away_team_strength']) / 10).astype(int)

In [32]:
# Start the wall-clock timer for the win-streak feature cell.
start = time()

def get_streak_wins(df, team, match_date):
    """
    Length of a team's current winning streak going into a match.

    Walks the team's earlier matches from newest to oldest and counts wins until the
    first match that was not a win.

    Args:
        df: match table with team, goal and ``date`` columns.
        team: team identifier compared against ``home_team`` / ``away_team``.
        match_date: only matches with ``date`` strictly earlier are considered.

    Returns:
        Number of consecutive wins immediately before ``match_date`` (0 if none).
    """
    involved = df[(df['home_team'] == team) | (df['away_team'] == team)]
    newest_first = involved[involved['date'] < match_date].sort_values(by='date', ascending=False)

    streak = 0
    fixtures = zip(
        newest_first['home_team'],
        newest_first['away_team'],
        newest_first['home_team_goal'],
        newest_first['away_team_goal'],
    )
    for home_side, away_side, home_goals, away_goals in fixtures:
        won_at_home = home_side == team and home_goals > away_goals
        won_away = away_side == team and home_goals < away_goals
        if not (won_at_home or won_away):
            # First non-win ends the streak.
            break
        streak += 1
    return streak

def count_streak_wins(match_row, df=None):
    """
    Current winning streaks of the home and away team going into a match.

    Args:
        match_row: match row with ``date``, ``home_team`` and ``away_team``.
        df: match table searched for earlier matches. Defaults to the module-level
            ``df_transformed`` so existing single-argument calls keep working; passing
            it explicitly matches the other row-level feature functions in this
            notebook and allows the function to be exercised on a mock table.

    Returns:
        ``(home_streak, away_streak)`` as computed by ``get_streak_wins``.
    """
    if df is None:
        # Backward-compatible fallback to the notebook-wide working frame.
        df = df_transformed

    match_date = match_row['date']
    home_team = match_row['home_team']
    away_team = match_row['away_team']

    home_streak = get_streak_wins(df, home_team, match_date)
    away_streak = get_streak_wins(df, away_team, match_date)

    return home_streak, away_streak

# Add the pre-match winning streak for both sides of every match.
df_transformed[['home_streak_wins', 'away_streak_wins']] = df_transformed.apply(
    count_streak_wins,
    axis=1,
    result_type='expand',
)

end = time()
print(
    "Count streak wins in {:.3f} minutes".format((end - start) / 60)
)

Count streak wins in 0.151 minutes


In [33]:
# Start the wall-clock timer for the season-points cell.
start = time()

def get_points(row, team):
    """
    League points ``team`` earned in one match.

    ``row['result_match']`` must be ``'H'`` (home win), ``'D'`` (draw) or ``'A'``
    (away win); any other value raises ``KeyError``. ``team`` is treated as the home
    side when it equals ``row['home_team']`` and as the away side otherwise.

    Returns:
        3 for a win, 1 for a draw, 0 for a loss, as an ``int``.
    """
    # (home points, away points) per result code.
    points_by_result = {
        'H': (3, 0),
        'D': (1, 1),
        'A': (0, 3),
    }
    plays_at_home = row['home_team'] == team
    home_points, away_points = points_by_result[row['result_match']]
    if plays_at_home:
        return int(home_points)
    return int(away_points)

def process_points(team, df, match_date, match_season):
    """
    Total league points a team collected earlier in the given season.

    Args:
        team: team identifier compared against ``home_team`` / ``away_team``.
        df: match table with team, ``date``, ``season`` and ``result_match`` columns.
        match_date: only matches with ``date`` strictly earlier are counted.
        match_season: only matches of this season are counted.

    Returns:
        Sum of the per-match points, or ``0`` when no earlier match qualifies.
    """
    # The @-names in the query refer to this function's parameters.
    earlier_matches = df.query('(home_team == @team | away_team == @team) & date < @match_date & season == @match_season')
    if not len(earlier_matches):
        return 0
    points_per_match = earlier_matches.apply(get_points, axis=1, args=(team,))
    return points_per_match.sum()

def count_points(match_row, df):
    """
    Season-to-date league points of both teams of a match.

    Args:
        match_row: match row with ``date``, ``season``, ``home_team`` and ``away_team``.
        df: match table searched for earlier matches.

    Returns:
        ``(home_team_points, away_team_points)`` collected before the match date
        within the match's season.
    """
    kickoff, season = match_row['date'], match_row['season']

    home_team_points = process_points(match_row['home_team'], df, kickoff, season)
    away_team_points = process_points(match_row['away_team'], df, kickoff, season)

    return home_team_points, away_team_points

# Disabled: the season-to-date points columns are currently not added to df_transformed,
# so the timing printed below covers only the function definitions in this cell.
# df_transformed[['points_home', 'points_away']] = df_transformed.apply(lambda row: count_points(row, df_transformed), axis=1, result_type='expand')

end = time()
print("Count points in {:.3f} minutes".format((end - start) / 60))

Count points in 0.000 minutes


In [34]:
def test_count_points():
    """Check season-to-date points for the last fixture of a four-match mock table."""
    data = {
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
        'season': ['2022-2023', '2022-2023', '2022-2023', '2022-2023'],
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'result_match': ['H', 'D', 'H', 'A']
    }
    
    df_mock = pd.DataFrame(data)
    home_points, away_points = count_points(df_mock.iloc[-1], df_mock)
    
    # Before 2023-01-04:
    # TeamB (home) -> loss, draw, loss -> 0 + 1 + 0 = 1
    # TeamA (away) -> win, draw, win   -> 3 + 1 + 3 = 7
    assert home_points == 1, f"Expected 1 but got {home_points}"
    # The failure message now states the value that is actually asserted.
    assert away_points == 7, f"Expected 7 but got {away_points}"

    print("Test passed!")

test_count_points()

Test passed!


In [35]:
# Start the wall-clock timer for the recent-form points cell.
start = time()
# Window size (number of previous matches) whose points are summed below.
N = 10
def get_last_n_matches_points(df, team, match_date, n):
    """
    League points a team collected in its ``n`` most recent matches before a date.

    Args:
        df: match table with team, ``date`` and ``result_match`` columns.
        team: team identifier compared against ``home_team`` / ``away_team``.
        match_date: only matches with ``date`` strictly earlier are considered.
        n: maximum number of most recent matches to sum over.

    Returns:
        Sum of the per-match points, or ``0`` when the team has no earlier match.
    """
    involved = df[(df['home_team'] == team) | (df['away_team'] == team)]
    played_earlier = involved['date'].lt(match_date)
    recent_matches = involved[played_earlier].sort_values(by='date', ascending=False).iloc[:n]

    if recent_matches.empty:
        return 0

    return recent_matches.apply(get_points, axis=1, args=(team,)).sum()

def count_points_from_n_last_matches(row, df, n):
    """
    Recent-form points of both teams of a match.

    Returns:
        ``(home_points, away_points)``: points each side collected in its ``n`` most
        recent matches before ``row['date']``.
    """
    kickoff = row['date']
    return (
        get_last_n_matches_points(df, row['home_team'], kickoff, n),
        get_last_n_matches_points(df, row['away_team'], kickoff, n),
    )

# Add the recent-form points for both sides of every match, plus their difference.
df_transformed[['home_sum_points', 'away_sum_points']] = df_transformed.apply(
    count_points_from_n_last_matches,
    axis=1,
    result_type='expand',
    args=(df_transformed,),
    n=N,
)
df_transformed['points_diff'] = df_transformed['home_sum_points'].sub(df_transformed['away_sum_points'])

end = time()
print(
    "Count team sum of points from n last matches in {:.3f} minutes".format((end - start) / 60)
)

Count team sum of points from n last matches in 0.181 minutes


In [41]:
def test_count_points_from_n_last_matches():
    """Check recent-form points for the last fixture of a six-match mock table."""
    # Mock dataframe
    data = {
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'],
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'home_team_goal': [2, 1, 3, 1, 0, 2],
        'away_team_goal': [1, 2, 0, 1, 1, 1],
        'result_match': ['H', 'A', 'H', 'D', 'A', 'H']
    }
    
    df_mock = pd.DataFrame(data)
    N = 3
    home_points, away_points = count_points_from_n_last_matches(df_mock.iloc[-1], df_mock, N)
    
    # Three fixtures before 2023-01-06, newest first:
    # TeamB (home) -> win, draw, loss -> 3 + 1 + 0 = 4
    # TeamA (away) -> loss, draw, win -> 0 + 1 + 3 = 4
    # The failure message now states the value that is actually asserted.
    assert home_points == 4, f"Expected 4 but got {home_points}"
    assert away_points == 4, f"Expected 4 but got {away_points}"
    
    print("Test passed!")

test_count_points_from_n_last_matches()

Test passed!


In [42]:
# Start the wall-clock timer for the head-to-head feature cell.
start = time()

def get_last_winners(row):
    """
    Name the winner of one match.

    Returns:
        ``row['home_team']`` when the home side scored more, ``row['away_team']`` when
        the away side scored more, and the string ``'draw'`` otherwise.
    """
    home_goals = row['home_team_goal']
    away_goals = row['away_team_goal']

    if home_goals > away_goals:
        return row['home_team']
    if home_goals < away_goals:
        return row['away_team']
    return 'draw'

def get_teams_match_mask(home_team, away_team, df):
    """
    Boolean mask of the rows in which both sides are one of the two given teams.

    A row is selected when its ``home_team`` is either of the two teams and its
    ``away_team`` is either of the two teams, regardless of which one hosted.
    """
    home_side_matches = (df['home_team'] == home_team) | (df['home_team'] == away_team)
    away_side_matches = (df['away_team'] == home_team) | (df['away_team'] == away_team)
    return home_side_matches & away_side_matches

def count_wins_eachother(match_row, df):
    """
    Head-to-head win counts of the two teams of a match, over their earlier meetings.

    Args:
        match_row: match row with ``date``, ``home_team`` and ``away_team``.
        df: match table with team, goal and ``date`` columns.

    Returns:
        ``(home_team_wins, away_team_wins)`` counted over all meetings of the two teams
        dated strictly before ``match_row['date']``; a team with no win gets ``0``.
    """
    kickoff = match_row['date']
    hosting_team = match_row['home_team']
    visiting_team = match_row['away_team']

    is_head_to_head = get_teams_match_mask(hosting_team, visiting_team, df)
    earlier_meetings = df.loc[is_head_to_head & df['date'].lt(kickoff)].sort_values(by='date', ascending=False)

    # Winner name (or 'draw') per meeting, tallied by name.
    win_counts = earlier_meetings.apply(get_last_winners, axis=1).value_counts()

    def _wins_of(team_name):
        if team_name in win_counts:
            return win_counts.loc[team_name]
        return 0

    return _wins_of(hosting_team), _wins_of(visiting_team)

# Add the head-to-head win counts for both sides of every match.
df_transformed[['win_eachother_home', 'win_eachother_away']] = df_transformed.apply(
    count_wins_eachother,
    axis=1,
    result_type='expand',
    args=(df_transformed,),
)

end = time()
print(
    "Get last match winner in {:.3f} minutes".format((end - start) / 60)
)

Get last match winner in 0.086 minutes


In [43]:
def test_count_wins_eachother():
    """Check head-to-head win counts for the last fixture of a five-match mock table."""
    # TeamA wins the first four meetings; the fifth (the row under test) is excluded by date.
    fixtures = pd.DataFrame({
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'],
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'home_team_goal': [2, 1, 3, 0, 0],
        'away_team_goal': [1, 2, 0, 1, 0]
    })
    latest_fixture = fixtures.iloc[-1]

    home_wins, away_wins = count_wins_eachother(latest_fixture, fixtures)

    assert home_wins == 4, f"Expected 4 but got {home_wins}"
    assert away_wins == 0, f"Expected 0 but got {away_wins}"

    print("Test passed!")

test_count_wins_eachother()

Test passed!


In [51]:
from sklearn.preprocessing import LabelEncoder

# Map result_match to int
def prepare_target(y_):
    """
    Encode the target labels as integers with a freshly fitted ``LabelEncoder``.

    Args:
        y_: array-like of class labels.

    Returns:
        ``(encoded_labels, classes)`` where ``classes`` is the encoder's ``classes_``
        array; position ``i`` in it is the label encoded as ``i``.
    """
    encoder = LabelEncoder().fit(y_)
    encoded_labels = encoder.transform(y_)
    # For the match results this yields 0: 'A', 1: 'D', 2: 'H'.
    return encoded_labels, encoder.classes_

# Replace the result column with its integer encoding and keep the label order in `classes`.
# Note: get_points indexes its mapping by the letter codes 'H'/'D'/'A', so it cannot be
# applied to df_transformed after this line has run.
df_transformed['result_match'], classes = prepare_target(df_transformed['result_match'])

In [55]:
# Remove the 'stage' and 'date' columns from the working frame (in place).
# Running this cell a second time raises KeyError because the columns are already gone.
df_transformed.drop(['stage', 'date'], axis=1, inplace=True)

In [57]:
import os

# Destination of the engineered feature table (relative to the notebook).
output_dir = "../data/transform/"
filename= "match_details_transformed.csv"

full_path = os.path.join(output_dir, filename)
# exist_ok=True creates the directory when needed and is a no-op when it already
# exists, avoiding the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# The frame's index is written as the first CSV column (pandas default).
df_transformed.to_csv(full_path)