In [1]:
import pandas as pd
import numpy as np
from time import time

from loaddata.DataLoader import execute_data_loader
from loaddata.XmlProcessor import XmlProcessor

In [2]:
# execute_data_loader()

In [3]:
matches_data = pd.read_csv('../data/raw/match_details.csv')
players_data = pd.read_csv('../data/raw/player_attributes.csv')

df_transformed = XmlProcessor(matches_data).process_data()

In [4]:
df_transformed['date'] = pd.to_datetime(df_transformed['date'])
players_data['date'] = pd.to_datetime(players_data['date'])
df_transformed.sort_values(by='date', inplace=True)

In [5]:
df_transformed[['season', 'result_match']] = df_transformed[['season', 'result_match']].astype('category')

In [6]:
# Check on which positions given player played
columns_with_value = matches_data.columns[matches_data.eq(30843).any()].tolist()
print(columns_with_value)

['home_player_9', 'home_player_10', 'home_player_11', 'away_player_10', 'away_player_11']


In [7]:
start = time()

players = ['{}_player_{}'.format(team, i) for team in ['home', 'away'] for i in range(1, 12)]

def get_player_overall_rating(player_id, match_date):
    player_ratings_on_or_before_match = players_data[
        (players_data['player_api_id'] == player_id) & (players_data['date'] <= match_date)
    ]
    latest_rating = player_ratings_on_or_before_match.sort_values(by='date', ascending=False).iloc[0]['overall_rating']
    return latest_rating

def get_player_id_for_team(row, player, team_type):
    player_id = row[player]

    if not np.isnan(player_id):
        return player_id

    player_id = df_transformed.loc[df_transformed[f'{team_type}_team'].eq(row[f'{team_type}_team'])][player].value_counts().idxmax()
    df_transformed.loc[df_transformed.match_api_id == row.match_api_id, player] = int(player_id)
    
    return player_id

def calculate_player_stat(match_row):
    player_stats_dict = {}
    match_date = match_row['date']

    for player in players:
        team_type = 'home' if 'home' in player else 'away'
        player_id = get_player_id_for_team(match_row, player, team_type)

        overall_ranking = get_player_overall_rating(player_id, match_date)
        name = 'player_rating_{}'.format(player)

        player_stats_dict[name] = int(overall_ranking)
        player_stats_dict['match_api_id'] = match_row.match_api_id

    return player_stats_dict

player_stats_dict = df_transformed.apply(lambda row: calculate_player_stat(row), axis=1)
new_player_stats = pd.json_normalize(player_stats_dict)
df_transformed = pd.merge(df_transformed, new_player_stats, how='left', on='match_api_id')
df_transformed = df_transformed.drop(players, axis=1)

end = time()
print("Calculate player stats in {:.3f} minutes".format((end - start) / 60))

Calculate player stats in 0.960 minutes


In [8]:
# Test: Test that the overall ratings fetched are indeed the latest
player_id = 30843
player_col = 'home_player_11'
rating_col = 'player_rating_' + player_col

# Fetch rating from transformed df
match = matches_data.loc[matches_data[player_col].eq(player_id)].sort_values(by='date',ascending=False).iloc[0]
match_id = match['match_api_id']
transformed_rating = df_transformed[df_transformed['match_api_id'] == match_id].iloc[0][rating_col]

# Fetch player rating from players data by id and latest date
raw_rating = players_data[
    (players_data['player_api_id'] == player_id) & (players_data['date'] <= match['date'])
].sort_values(by='date', ascending=False).iloc[0]['overall_rating']

assert transformed_rating == raw_rating, f"Incorrect rating for {player_col}"
print("test passed")

test passed


In [9]:
alpha = 0.3

def calculate_team_ema(df, alpha):
    # Create a long format DataFrame where each row is a team's performance in a match
    home_df = df[['date', 'home_team', 'home_team_goal']].rename(columns={'home_team': 'team', 'home_team_goal': 'goals'})
    away_df = df[['date', 'away_team', 'away_team_goal']].rename(columns={'away_team': 'team', 'away_team_goal': 'goals'})
    long_df = pd.concat([home_df, away_df])

    # Sort by team and date to ensure correct order for EMA calculation
    long_df.sort_values(by=['team', 'date'], inplace=True)

    # Calculate EMA with shift to exclude the current match
    long_df['ema_goals'] = (
        long_df.groupby('team')['goals']
        .apply(lambda x: x.shift(1).ewm(adjust=True, alpha=alpha).mean())
        .reset_index(level=0, drop=True)
        .round(2)
    )

    # Pivot the long format back to wide format with separate columns for home and away teams
    ema_home = long_df.rename(columns={'team': 'home_team', 'ema_goals': 'ewm_home_team_goals'}).drop(columns=['goals'])
    ema_away = long_df.rename(columns={'team': 'away_team', 'ema_goals': 'ewm_away_team_goals'}).drop(columns=['goals'])

    # Merge the EMA data back to the original DataFrame
    merged_df = df.merge(ema_home, on=['date', 'home_team'], how='left').merge(ema_away, on=['date', 'away_team'], how='left')

    return merged_df

df_transformed = calculate_team_ema(df_transformed.copy(), alpha=alpha)

In [10]:
def test_count_average_goals_from_last_n_matches():
    data = {
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'],
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'home_team_goal': [2, 1, 3, 1, 3, 2],
        'away_team_goal': [1, 2, 0, 1, 1, 1]
    }
    
    df_mock = pd.DataFrame(data)

    # Calculating EMA for home and away teams
    df_ = calculate_team_ema(df_mock, 0.01)

    assert df_.iloc[-1]['ewm_home_team_goals'] == 0.8, f"Expected 0.8 but got {df_.iloc[-1]['ewm_home_team_goals']}"
    assert df_.iloc[-1]['ewm_away_team_goals'] == 2.2, f"Expected 2.2 but got {df_.iloc[-1]['ewm_away_team_goals']}"
    print("Test passed!")

test_count_average_goals_from_last_n_matches()

Test passed!


In [11]:
alpha = 0.3

def calculate_team_ema(df, alpha):
    # Create a long format DataFrame where each row is a team's performance in a match
    home_df = df[['date', 'home_team', 'away_team_goal']].rename(columns={'home_team': 'team', 'away_team_goal': 'goals'})
    away_df = df[['date', 'away_team', 'home_team_goal']].rename(columns={'away_team': 'team', 'home_team_goal': 'goals'})
    long_df = pd.concat([home_df, away_df])

    # Sort by team and date to ensure correct order for EMA calculation
    long_df.sort_values(by=['team', 'date'], inplace=True)

    # Calculate EMA with shift to exclude the current match
    long_df['ema_goals'] = (
        long_df.groupby('team')['goals']
        .apply(lambda x: x.shift(1).ewm(adjust=True, alpha=alpha).mean())
        .reset_index(level=0, drop=True)
        .round(2)
    )

    # Pivot the long format back to wide format with separate columns for home and away teams
    ema_home = long_df.rename(columns={'team': 'home_team', 'ema_goals': 'ewm_home_team_goals_conceded'}).drop(columns=['goals'])
    ema_away = long_df.rename(columns={'team': 'away_team', 'ema_goals': 'ewm_away_team_goals_conceded'}).drop(columns=['goals'])

    # Merge the EMA data back to the original DataFrame
    merged_df = df.merge(ema_home, on=['date', 'home_team'], how='left').merge(ema_away, on=['date', 'away_team'], how='left')

    return merged_df

df_transformed = calculate_team_ema(df_transformed.copy(), alpha=alpha)

In [12]:
def test_count_average_goals_conceded_from_last_n_matches():
    data = {
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'],
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'home_team_goal': [2, 1, 3, 1, 0, 2],
        'away_team_goal': [1, 2, 0, 1, 1, 1]
    }
    
    df_mock = pd.DataFrame(data)

   # Calculating EMA for home and away teams
    df_ = calculate_team_ema(df_mock, 0.01)
    
    assert df_.iloc[-1]['ewm_away_team_goals_conceded'] == 0.8, f"Expected 0.8 but got {df_transformed.iloc[-1]['ewm_away_team_goals_conceded']}"
    assert df_.iloc[-1]['ewm_home_team_goals_conceded'] == 1.59, f"Expected 1.59 but got {df_transformed.iloc[-1]['ewm_home_team_goals_conceded']}"
    print("Test passed!")

test_count_average_goals_conceded_from_last_n_matches()

Test passed!


In [13]:
start = time()

def get_points(row, team):
    points_mapping = {
        'H': {'home': 3, 'away': 0},
        'D': {'home': 1, 'away': 1},
        'A': {'home': 0, 'away': 3}
    }
    team_type = 'home' if row['home_team'] == team else 'away'    
    return int(points_mapping[row['result_match']][team_type])

def process_points(team, df, match_date, match_season):
    team_matches = df.query('(home_team == @team | away_team == @team) & date < @match_date & season == @match_season')
    if len(team_matches) == 0:
        return 0

    return team_matches.apply(lambda row: get_points(row, team), axis=1).sum()

def count_points(match_row, df):
    match_date = match_row['date']
    match_season = match_row['season']
    home_team = match_row['home_team']
    away_team = match_row['away_team']

    home_team_points = process_points(home_team, df, match_date, match_season)
    away_team_points = process_points(away_team, df, match_date, match_season)

    return home_team_points, away_team_points

df_transformed[['points_home', 'points_away']] = df_transformed.apply(lambda row: count_points(row, df_transformed), axis=1, result_type='expand')

end = time()
print("Count points in {:.3f} minutes".format((end - start) / 60))

Count points in 0.246 minutes


In [14]:
def calculate_weighted_wins(df, team, match_date, is_home_team, n=10, decay_factor=0.9):
    """
    Calculate the weighted number of wins of a team before a given match_date.
    
    Parameters:
    - df: DataFrame containing match data
    - team: The team for which to calculate weighted wins
    - match_date: The date of the match for which to calculate
    - is_home_team: True if the team is the home team, False for away team
    - n: Number of past matches to consider
    - decay_factor: The factor to decay the weight of older wins (default is 0.9)
    
    Returns:
    - weighted_wins: The calculated weighted number of wins
    """
    point_difference = 0
    if is_home_team:
        # Filter for home matches only
        team_matches = df[(df['home_team'] == team) & (df['date'] < match_date)].head(n)
        wins = (team_matches['home_team_goal'] > team_matches['away_team_goal'])
        # point_difference = team_matches['points_away'] - team_matches['points_home']
    else:
        # Filter for away matches only
        team_matches = df[(df['away_team'] == team) & (df['date'] < match_date)].head(n)
        wins = (team_matches['home_team_goal'] < team_matches['away_team_goal'])
        # point_difference = team_matches['points_home'] - - team_matches['points_away']

    # Weight wins by the difference in points
    weighted_wins = sum(wins * (decay_factor ** np.arange(len(wins))))

    return weighted_wins

def get_team_weighted_wins(row, df, n, decay_factor=0.9):
    """
    Get the weighted number of wins for both the home and away teams for a given match row,
    considering only previous home or away matches respectively.
    
    Parameters:
    - row: A row of match data
    - df: DataFrame containing match data
    - n: Number of past matches to consider
    - decay_factor: The factor to decay the weight of older wins (default is 0.9)
    
    Returns:
    - home_wins: The calculated weighted number of wins for the home team
    - away_wins: The calculated weighted number of wins for the away team
    """
    home_team = row['home_team']
    away_team = row['away_team']
    match_date = row['date']

    home_wins = calculate_weighted_wins(df, home_team, match_date, is_home_team=True, n=n, decay_factor=decay_factor)
    away_wins = calculate_weighted_wins(df, away_team, match_date, is_home_team=False, n=n, decay_factor=decay_factor)

    return home_wins, away_wins

start = time()

N = 25
decay_factor = 0.85

df_transformed[['home_weighted_wins', 'away_weighted_wins']] = df_transformed.apply(
    lambda row: get_team_weighted_wins(row, df_transformed, N, decay_factor), axis=1, result_type='expand'
)

end = time()

print("Calculate team weighted wins in {:.3f} minutes".format((end - start) / 60))

Calculate team weighted wins in 0.059 minutes


In [15]:
df_transformed[['home_weighted_wins', 'away_weighted_wins']].describe().round(2)

Unnamed: 0,home_weighted_wins,away_weighted_wins
count,3040.0,3040.0
mean,2.7,1.91
std,1.2,1.48
min,0.0,0.0
25%,2.02,0.76
50%,2.4,1.56
75%,3.58,3.08
max,5.35,5.53


In [16]:
start = time()

def get_overall_player_strength(match_row):
    stats_home_players = match_row.filter(regex='player_rating_home_player')
    stats_away_players = match_row.filter(regex='player_rating_away_player')

    home_team_strength = round(stats_home_players.values.mean(), 2)
    away_team_strength = round(stats_away_players.values.mean(), 2)

    return home_team_strength, away_team_strength


df_transformed[['avg_home_team_rating', 'avg_away_team_rating']] = df_transformed.apply(lambda row: get_overall_player_strength(row), axis=1, result_type='expand')

end = time()
print("Calculate player stats in {:.3f} minutes".format((end - start) / 60))

Calculate player stats in 0.015 minutes


In [17]:
start = time()

def calculate_weighted_streak(df, team, match_date, is_home_team, decay_factor=0.95):
    if is_home_team:
        team_matches = df[(df['home_team'] == team) & (df['date'] < match_date)].copy()
        team_matches.loc[:, 'win'] = (team_matches['home_team_goal'] > team_matches['away_team_goal']).astype(int)
        # Weight wins by opponent strength (example: higher rating means stronger team)
        team_matches.loc[:, 'quality'] =  team_matches['avg_away_team_rating']
    else:
        team_matches = df[(df['away_team'] == team) & (df['date'] < match_date)].copy()
        team_matches.loc[:, 'win'] = (team_matches['home_team_goal'] < team_matches['away_team_goal']).astype(int)
        team_matches.loc[:, 'quality'] = team_matches['avg_home_team_rating']

    # Sort matches from most recent to oldest
    team_matches = team_matches.sort_values(by='date', ascending=False)

    # Apply decay factor
    decay_weights = decay_factor ** np.arange(len(team_matches))

    # Calculate the weighted streak
    weighted_streak = sum(decay_weights * team_matches['quality'] * team_matches['win'])

    return weighted_streak

def count_streak_wins(match_row, df, decay_factor):
    match_date = match_row['date']
    home_team = match_row['home_team']
    away_team = match_row['away_team']

    home_team_points = match_row['points_home']
    away_team_points = match_row['points_away']

    home_streak = calculate_weighted_streak(df, home_team, match_date, is_home_team=True, decay_factor=decay_factor)
    away_streak = calculate_weighted_streak(df, away_team, match_date, is_home_team=False, decay_factor=decay_factor)

    return round(home_streak, 2), round(away_streak, 2)

decay_factor = 0.9

df_transformed[['home_streak_wins', 'away_streak_wins']] = df_transformed.apply(lambda row: count_streak_wins(row, df_transformed, decay_factor), axis=1, result_type='expand')

end = time()
print("Count streak wins in {:.3f} minutes".format((end - start) / 60))

Count streak wins in 0.153 minutes


In [18]:
def test_count_streak_wins():
    data = {
        'home_team': ['TeamB', 'TeamB', 'TeamA', 'TeamB', 'TeamB', 'TeamB', 'TeamB'],
        'away_team': ['TeamA', 'TeamA', 'TeamB', 'TeamA', 'TeamA', 'TeamC', 'TeamA'],
        'home_team_goal': [2, 1, 3, 0, 0, 2, 0],
        'away_team_goal': [1, 2, 0, 1, 1, 1, 1],
        'avg_away_team_rating': [75, 85, 83, 74, 73, 66, 78],
        'avg_home_team_rating': [90, 76, 78, 85, 77, 65, 82],
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06', '2023-01-07'],
        'season': ['2023', '2023', '2023', '2023', '2023', '2023', '2023'],
        'points_home': [3, 1, 3, 0, 0, 3, 0],
        'points_away': [0, 3, 0, 1, 1, 0, 3]
    }

    mock_df = pd.DataFrame(data)
    mock_df['date'] = pd.to_datetime(mock_df['date'])

    expected_home_streak_wins = 115.21
    expected_away_streak_wins = 215.06

    decay_factor = 0.9
    home_streak_wins, away_streak_wins = count_streak_wins(mock_df.iloc[-1], mock_df, decay_factor)

    assert round(home_streak_wins, 2) == expected_home_streak_wins, f"Home streak wins don't match expected value: {home_streak_wins}"
    assert round(away_streak_wins, 2) == expected_away_streak_wins, f"Away streak wins don't match expected value: {away_streak_wins}"

    print("Test for count_streak_wins passed!")

test_count_streak_wins()

Test for count_streak_wins passed!


In [19]:
def calculate_ewm_team_stat(df, alpha, stat_type):
    # Create a long format DataFrame where each row is a team's performance in a match
    home_df = df[['date', 'home_team', f'home_{stat_type}']].rename(
        columns={'home_team': 'team', f'home_{stat_type}': stat_type})
    away_df = df[['date', 'away_team', f'away_{stat_type}']].rename(
        columns={'away_team': 'team', f'away_{stat_type}': stat_type})
    long_df = pd.concat([home_df, away_df])

    # Sort by team and date to ensure correct order for EMA calculation
    long_df.sort_values(by=['team', 'date'], inplace=True)

    # Calculate EWMA with shift to exclude the current match
    long_df[f'ewm_{stat_type}'] = (
        long_df.groupby('team')[stat_type]
        .apply(lambda x: x.shift(1).ewm(adjust=True, alpha=alpha).mean())
        .reset_index(level=0, drop=True)
        .round(3)
    )

    # Pivot the long format back to wide format with separate columns for home and away teams
    ewm_home = long_df.rename(columns={'team': 'home_team', f'ewm_{stat_type}': f'ewm_{stat_type}_home'}).drop(columns=[stat_type])
    ewm_away = long_df.rename(columns={'team': 'away_team', f'ewm_{stat_type}': f'ewm_{stat_type}_away'}).drop(columns=[stat_type])

    # Merge the EWMA data back to the original DataFrame
    merged_df = df.merge(ewm_home, on=['date', 'home_team'], how='left').merge(ewm_away, on=['date', 'away_team'], how='left')

    return merged_df

alpha = 0.0005
shoton_type = 'shoton'
possesion_type = 'possession'

df_transformed = calculate_ewm_team_stat(df_transformed.copy(), alpha, shoton_type)
df_transformed = calculate_ewm_team_stat(df_transformed.copy(), alpha, possesion_type)

In [20]:
df_transformed.filter(like='possession').describe().round(2)

Unnamed: 0,home_possession,away_possession,ewm_possession_home,ewm_possession_away
count,3040.0,3040.0,3024.0,3022.0
mean,51.31,47.9,48.12,48.19
std,9.55,9.62,5.12,5.0
min,0.0,0.0,0.0,0.0
25%,46.0,43.0,47.05,47.06
50%,51.0,48.0,48.67,48.68
75%,57.0,54.0,50.44,50.47
max,80.0,80.0,57.52,57.51


In [21]:
def test_count_average_stat_from_last_n_matches():
    data = {
        'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'],
        'home_team': ['TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB'],
        'away_team': ['TeamB', 'TeamA', 'TeamB', 'TeamA', 'TeamB', 'TeamA'],
        'home_shoton': [5, 4, 6, 3, 5, 4],
        'away_shoton': [4, 5, 5, 4, 4, 3]
    }
    
    df_mock = pd.DataFrame(data)
    stat_type = 'shoton'
    
    df_ = calculate_ewm_team_stat(df_mock, 0.01, stat_type)
    
    assert df_.iloc[-1]['ewm_shoton_home'] == 3.998, f"Expected 3.998 but got {df_.iloc[-1]['ewm_shoton_home']}"
    assert df_.iloc[-1]['ewm_shoton_away'] == 4.998, f"Expected 4.998 but got {df_.iloc[-1]['ewm_shoton_away']}"
    print("Test passed!")

test_count_average_stat_from_last_n_matches()

Test passed!


In [22]:
from sklearn.preprocessing import LabelEncoder

def prepare_target(y_):
    le = LabelEncoder()
    transform = le.fit_transform(y_)
    return transform, le.classes_

df_transformed['result_match'], classes = prepare_target(df_transformed['result_match'])

In [23]:
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')

X = df_transformed.drop(['result_match', 'match_api_id', 'season', 'date', 'away_team', 'home_team', 'away_team_goal', 'home_team_goal', 'away_possession', 'home_shoton', 'home_possession', 'away_shoton',], axis=1)
y = df_transformed['result_match']
X_train_full, X_val_full, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

xgb_full = XGBClassifier(random_state=42, enable_categorical='True')

xgb_full.fit(X_train_full, y_train)

y_pred_full = xgb_full.predict(X_val_full)
f1 = f1_score(y_val, y_pred_full, average='weighted')

f1

0.479134685236962

In [24]:
import os

output_dir = "../data/"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)


X.to_csv(output_dir + "start_dataset.csv", index=False)
y.to_csv(output_dir + 'y.csv', index=False)