In [1]:
import pandas as pd
import numpy as np
from scipy.stats import gamma, poisson

In [2]:
# Read the three input tables; convert_dtypes() moves every column to the
# best-fitting pandas nullable dtype (e.g. Int64 instead of int64).
teams, results, fixtures = (
    pd.read_csv(csv_path).convert_dtypes()
    for csv_path in ('data/teams.csv', 'data/results.csv', 'data/fixtures.csv')
)

In [3]:
# Per-team goal totals and match counts, computed separately for home and away matches
def _venue_totals(matches, team_col, column_names):
    """Sum both score columns and count matches per team in `team_col`, then relabel the columns."""
    return (
        matches
        .groupby(team_col)
        .agg({'HomeScore': 'sum', 'AwayScore': 'sum', team_col: 'count'})
        .rename(columns=column_names)
        .reset_index()
    )


team_stats_home = _venue_totals(
    results,
    'HomeTeamID',
    {'HomeScore': 'home_goals_scored', 'AwayScore': 'home_goals_conceded', 'HomeTeamID': 'home_games_played'},
)
team_stats_away = _venue_totals(
    results,
    'AwayTeamID',
    {'HomeScore': 'away_goals_conceded', 'AwayScore': 'away_goals_scored', 'AwayTeamID': 'away_games_played'},
)

# Join the home and away summaries on the team id (default inner join), keeping both id columns.
team_stats = team_stats_home.merge(team_stats_away, left_on='HomeTeamID', right_on='AwayTeamID')

In [4]:
# Sanity-check the merged per-team table: row count, non-null counts and column dtypes.
team_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   HomeTeamID           28 non-null     Int64
 1   home_goals_scored    28 non-null     Int64
 2   home_goals_conceded  28 non-null     Int64
 3   home_games_played    28 non-null     Int64
 4   AwayTeamID           28 non-null     Int64
 5   away_goals_conceded  28 non-null     Int64
 6   away_goals_scored    28 non-null     Int64
 7   away_games_played    28 non-null     Int64
dtypes: Int64(8)
memory usage: 2.1 KB


In [5]:
# League-wide mean and standard deviation of goals per match, by venue
league_avg_home_goals = results['HomeScore'].mean()
league_avg_away_goals = results['AwayScore'].mean()
league_std_home_goals = results['HomeScore'].std()
league_std_away_goals = results['AwayScore'].std()


def _gamma_prior(mean, std):
    """Moment-match a Gamma prior: return (shape, rate) with the given mean and standard deviation."""
    variance = std**2
    return mean**2 / variance, mean / variance


# Prior (shape, rate) pairs for goals scored at each venue
alpha_home_scored, beta_home_scored = _gamma_prior(league_avg_home_goals, league_std_home_goals)
alpha_away_scored, beta_away_scored = _gamma_prior(league_avg_away_goals, league_std_away_goals)

# Goals conceded at home are the visitors' goals, so that prior is built from the
# away-score moments; goals conceded away likewise use the home-score moments.
alpha_home_conceded, beta_home_conceded = _gamma_prior(league_avg_away_goals, league_std_away_goals)
alpha_away_conceded, beta_away_conceded = _gamma_prior(league_avg_home_goals, league_std_home_goals)

In [6]:
# Gamma-Poisson update: each team's posterior shape is the prior shape plus its
# goal total, and its posterior rate is the prior rate plus its number of matches.
posterior_specs = (
    # (alpha column, beta column, prior alpha, prior beta, goals column, games column)
    ('alpha_home_goals_scored', 'beta_home_goals_scored',
     alpha_home_scored, beta_home_scored, 'home_goals_scored', 'home_games_played'),
    ('alpha_away_goals_scored', 'beta_away_goals_scored',
     alpha_away_scored, beta_away_scored, 'away_goals_scored', 'away_games_played'),
    ('alpha_home_goals_conceded', 'beta_home_goals_conceded',
     alpha_home_conceded, beta_home_conceded, 'home_goals_conceded', 'home_games_played'),
    ('alpha_away_goals_conceded', 'beta_away_goals_conceded',
     alpha_away_conceded, beta_away_conceded, 'away_goals_conceded', 'away_games_played'),
)

for alpha_col, beta_col, prior_alpha, prior_beta, goals_col, games_col in posterior_specs:
    team_stats[alpha_col] = prior_alpha + team_stats[goals_col]
    team_stats[beta_col] = prior_beta + team_stats[games_col]

In [7]:
# Sampling from the posterior distributions to get predicted goals
def sample_goals(alpha, beta, size=1000, random_state=None):
    """Draw goal counts from the Gamma-Poisson posterior predictive distribution.

    A scoring rate is drawn from Gamma(shape=alpha, rate=beta) for every sample,
    then one Poisson goal count is drawn for each of those rates.

    Parameters
    ----------
    alpha : float
        Shape of the Gamma distribution over the scoring rate.
    beta : float
        Rate (inverse scale) of the Gamma distribution; passed to scipy as
        ``scale=1/beta``.
    size : int, default 1000
        Number of samples to draw.
    random_state : None, int, numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. ``None`` (the default) keeps the previous
        behaviour of drawing from NumPy's global random state. A seed is turned
        into a single ``Generator`` that is shared by both sampling steps.

    Returns
    -------
    numpy.ndarray
        Array of ``size`` sampled goal counts.
    """
    rng_types = (np.random.Generator, np.random.RandomState)
    if random_state is not None and not isinstance(random_state, rng_types):
        # Build ONE generator from the seed. Handing the raw seed to both rvs()
        # calls would seed two identical streams, so the Poisson draws would
        # reuse the random bits that produced the Gamma draws.
        random_state = np.random.default_rng(random_state)

    rate_samples = gamma.rvs(alpha, scale=1 / beta, size=size, random_state=random_state)
    return poisson.rvs(rate_samples, random_state=random_state)


In [8]:
# Predicting goals using the sampled rates: each prediction is the mean of
# `num_samples` draws from the team's posterior predictive distribution.
# (For a Gamma-Poisson model that mean converges to alpha / beta.)
num_samples = 10000

# Seed NumPy's global random state, which scipy's rvs() draws from when it is
# given no random_state, so that Restart & Run All reproduces the predictions.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Build team-id -> (alpha, beta) lookups once instead of filtering team_stats
# with a boolean mask twice per fixture. An unknown team id now fails with a
# KeyError naming the id rather than an IndexError on an empty selection.
home_scoring_posterior = dict(zip(
    team_stats['HomeTeamID'],
    zip(team_stats['alpha_home_goals_scored'], team_stats['beta_home_goals_scored']),
))
away_scoring_posterior = dict(zip(
    team_stats['AwayTeamID'],
    zip(team_stats['alpha_away_goals_scored'], team_stats['beta_away_goals_scored']),
))

# Home predictions are sampled for every fixture first, then away predictions.
# Only the goals-scored posteriors are used in this cell.
fixtures['predicted_home_goals'] = [
    sample_goals(*home_scoring_posterior[team_id], num_samples).mean()
    for team_id in fixtures['HomeTeamID']
]

fixtures['predicted_away_goals'] = [
    sample_goals(*away_scoring_posterior[team_id], num_samples).mean()
    for team_id in fixtures['AwayTeamID']
]

predicted_fixtures = fixtures[['MatchID', 'HomeTeamID', 'AwayTeamID', 'predicted_home_goals', 'predicted_away_goals']]

In [9]:
# Attach the real scores to each predicted fixture; MatchID is the only column the two frames share.
actual_scores = results[['MatchID', 'HomeScore', 'AwayScore']]
predict_actual_results = predicted_fixtures.merge(actual_scores, on='MatchID', how='inner')

In [12]:
# NOTE(review): this cell carries execution count 12 and its saved output already
# shows the points and 'Correct Match' columns that are only created in the cell
# below, so it was run out of order. On a fresh Restart & Run All it would display
# only the merged prediction and score columns.
predict_actual_results

Unnamed: 0,MatchID,HomeTeamID,AwayTeamID,predicted_home_goals,predicted_away_goals,HomeScore,AwayScore,Actual_HomePoints,Actual_AwayPoints,Pred_HomePoints,Pred_AwayPoints,Correct Match
0,757,2,1,1.1854,1.3396,1,2,0,3,0,3,1
1,758,28,3,1.5289,1.1080,5,1,3,0,3,0,1
2,759,27,4,2.0465,2.1043,0,1,0,3,0,3,1
3,760,26,5,1.0060,2.0959,2,6,0,3,0,3,1
4,761,25,6,0.9422,1.9395,0,3,0,3,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
751,1508,10,19,1.0574,2.0085,0,6,0,3,0,3,1
752,1509,11,18,1.8284,1.1592,5,2,3,0,3,0,1
753,1510,12,17,1.2391,1.0975,3,3,1,1,3,0,0
754,1511,13,16,1.7667,1.0000,5,1,3,0,3,0,1


In [10]:
# Calculate actual and predicted match points: win 3, draw 1, loss 0
def _match_points(goals_for, goals_against):
    """Points earned by the side that scored `goals_for` against `goals_against`."""
    if goals_for > goals_against:
        return 3
    if goals_for == goals_against:
        return 1
    return 0


home_actual = predict_actual_results['HomeScore']
away_actual = predict_actual_results['AwayScore']
home_pred = predict_actual_results['predicted_home_goals']
away_pred = predict_actual_results['predicted_away_goals']

predict_actual_results['Actual_HomePoints'] = [_match_points(h, a) for h, a in zip(home_actual, away_actual)]
predict_actual_results['Actual_AwayPoints'] = [_match_points(a, h) for h, a in zip(home_actual, away_actual)]

# NOTE(review): a predicted draw requires the two predicted goal values to be exactly
# equal; if these are non-integer averages that will presumably almost never happen,
# so actual draws are likely always counted as misses - confirm this is intended.
predict_actual_results['Pred_HomePoints'] = [_match_points(h, a) for h, a in zip(home_pred, away_pred)]
predict_actual_results['Pred_AwayPoints'] = [_match_points(a, h) for h, a in zip(home_pred, away_pred)]

# A match counts as correct when the predicted home points equal the actual home points.
predict_actual_results['Correct Match'] = [
    int(actual == predicted)
    for actual, predicted in zip(
        predict_actual_results['Actual_HomePoints'],
        predict_actual_results['Pred_HomePoints'],
    )
]

In [11]:
# Tally correctly (1) and incorrectly (0) predicted match outcomes.
# NOTE(review): the team statistics behind the predictions were aggregated from the same
# `results` table these matches are scored against, so this is an in-sample hit rate.
predict_actual_results['Correct Match'].value_counts()


Correct Match
1    461
0    295
Name: count, dtype: int64