In [159]:
import numpy as np
import pandas as pd
import math
from scipy.stats import norm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.model_selection import train_test_split

In [160]:
def train_model(df_train):
    """Fit a one-vs-rest logistic regression on rating differences and persist it.

    Args:
        df_train: DataFrame providing a 'match_outcome' column (labels) and a
            'rating_difference' column (the single feature). A
            'match_outcome_encoded' column is written onto this frame.

    Returns:
        Tuple ``(model, label_encoder)``: the fitted classifier and the encoder that
        maps between outcome labels and their integer codes.

    Side effects:
        Writes 'model.pkl' and 'label_encoder.pkl' to the working directory and
        prints the accuracy measured on the held-out 20% split.
    """
    # Turn the categorical outcome labels into integer class ids.
    label_encoder = LabelEncoder()
    encoded_outcomes = label_encoder.fit_transform(df_train['match_outcome'])
    df_train['match_outcome_encoded'] = encoded_outcomes

    # 'ovr' = One-Vs-Rest: one binary classifier per outcome class.
    model = LogisticRegression(multi_class='ovr')

    # The estimator expects a 2D feature matrix, hence the single-column reshape.
    features = df_train['rating_difference'].values.reshape(-1, 1)
    targets = df_train['match_outcome_encoded']

    # Fixed seed keeps the 80/20 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )

    model.fit(X_train, y_train)

    # Persist both artefacts; calculate_probabilities loads them back from disk.
    joblib.dump(model, 'model.pkl')
    joblib.dump(label_encoder, 'label_encoder.pkl')

    # Report accuracy on the held-out split.
    score = model.score(X_test, y_test)
    print(f'Model accuracy: {score*100:.2f}%')

    return model, label_encoder

In [161]:
# Function to initialize ratings
def initialize_ratings(df_results, df_fixtures):
    """Create a zeroed ratings entry for every team seen in results or fixtures.

    Args:
        df_results: DataFrame with 'home_team' and 'away_team' columns.
        df_fixtures: DataFrame with 'home_team' and 'away_team' columns.

    Returns:
        Dict mapping each team name to its own dict with keys 'brH', 'brA'
        (both 0.0) and 'continuous_overunderperformances' (0).

    Side effects:
        Prints the set of discovered teams.
    """
    # Collect team names column by column, home and away, results first.
    teams = set(df_results['home_team'])
    remaining_columns = (
        df_results['away_team'],
        df_fixtures['home_team'],
        df_fixtures['away_team'],
    )
    for column in remaining_columns:
        teams = teams.union(set(column))

    # Every team gets its own independent rating record.
    ratings = {
        team: {
            'brH': 0.0,
            'brA': 0.0,
            'continuous_overunderperformances': 0
        }
        for team in teams
    }

    print('Teams:', teams, end='\n\n')
    return ratings

In [162]:
# Function to update ratings based on results data
def update_ratings_multiple_games(df_results, ratings, max_games=1000):
    """Replay historical results in row order and update the ratings after each game.

    Args:
        df_results: DataFrame with 'home_team', 'away_team', 'home_goals' and
            'away_goals' columns, one row per match.
        ratings: Ratings dict as produced by initialize_ratings.
        max_games: Maximum number of rows to process. Defaults to 1000, which
            matches the previous hard-coded limit for a default integer index.
            Pass None to process every row.

    Returns:
        The ratings object returned by the last update_ratings_single_game call, or
        the input ``ratings`` unchanged when no row was processed.
    """
    # Count processed rows by position: the DataFrame index label is not guaranteed
    # to be a 0..n-1 counter (e.g. after filtering or sorting).
    for games_processed, (index, row) in enumerate(df_results.iterrows()):

        if max_games is not None and games_processed >= max_games:
            break

        print("Game Nr:", index)

        ratings = update_ratings_single_game(row['home_team'], row['away_team'], row['home_goals'], row['away_goals'], ratings)

    return ratings

In [163]:
def _expected_goal_difference_from_rating(rating):
    """Convert a background rating into a signed expected goal difference.

    The magnitude is 10 ** (|rating| / 3) - 1 and the result carries the sign of
    the rating, so a rating of 0 maps to 0 and negative ratings map to negative
    values.
    """
    magnitude = np.power(10, abs(rating) / 3) - 1
    return -magnitude if rating < 0 else magnitude


# Function to update ratings based on a single match result
def update_ratings_single_game(home_team, away_team, home_goals, away_goals, ratings):
    """Update both teams' background ratings and streak counters after one match.

    Args:
        home_team: Key of the home team in ``ratings``.
        away_team: Key of the away team in ``ratings``.
        home_goals: Goals scored by the home team.
        away_goals: Goals scored by the away team.
        ratings: Dict mapping team name to a dict with keys 'brH', 'brA' and
            'continuous_overunderperformances'. It is modified in place.

    Returns:
        The same ``ratings`` dict that was passed in, after the update.

    Side effects:
        Prints every intermediate quantity of the update.
    """

    #lambda: Determines to what extent the new match results influence the team ratings (could be improved to include temporal difference between matches)
    learning_rate_lambda = 0.054

    #psi: diminish the impact each additional goal difference error has on team ratings
    diminishing_function_psi = lambda error: 3 * np.log10(1 + error)

    #gamma: determines to what extent performances at the home grounds influence away team ratings and vice versa
    learning_rate_gamma = 0.79

    print(home_team, "-", away_team, home_goals, ":", away_goals)


    observed_goal_difference = home_goals - away_goals
    print("Observed Goal Difference:", observed_goal_difference)

    # Expectation for the home team, derived from its home rating. The sign of the
    # rating is kept: dropping it would give a team rated -0.5 the same positive
    # expectation as a team rated +0.5.
    expected_goal_x = _expected_goal_difference_from_rating(ratings[home_team]['brH'])
    print("Expected Goals x:", expected_goal_x)

    # Expectation for the away team, derived from its away rating (sign kept as above)
    expected_goal_y = _expected_goal_difference_from_rating(ratings[away_team]['brA'])
    print("Expected Goals y:", expected_goal_y)

    # Calculate expected goal difference based on ratings
    expected_goal_difference = expected_goal_x - expected_goal_y
    print("Expected Goal Difference:", expected_goal_difference)

    # Calculate the error between observed and expected goal difference
    error = abs(observed_goal_difference - expected_goal_difference)
    print("error:", error)

    psi_temp = diminishing_function_psi(error)

    # Diminish the impact of the goal difference error for both teams x and y respectively.
    # The home side is rewarded (and the away side penalised) when the result beat the
    # expectation; otherwise the signs are swapped.
    if (expected_goal_difference < observed_goal_difference):
        diminishing_function_psi_x = psi_temp
        diminishing_function_psi_y = -psi_temp
    else:
        diminishing_function_psi_x = -psi_temp
        diminishing_function_psi_y = psi_temp
    print("Diminishing Function psi x:", diminishing_function_psi_x)
    print("Diminishing Function psi y:", diminishing_function_psi_y)

    # Update the home team x background ratings
    previous_home_rating_x = ratings[home_team]['brH']
    previous_away_rating_x = ratings[home_team]['brA']
    print("Old brH x:", previous_home_rating_x)
    print("Old brA x:", previous_away_rating_x)

    # The home rating moves by lambda * psi; the away rating follows with a fraction
    # gamma of that movement.
    ratings[home_team]['brH'] = previous_home_rating_x + diminishing_function_psi_x * learning_rate_lambda
    ratings[home_team]['brA'] = previous_away_rating_x + (ratings[home_team]['brH'] - previous_home_rating_x) * learning_rate_gamma
    print("New brH x:", ratings[home_team]['brH'])
    print("New brA x:", ratings[home_team]['brA'])

    # Update the away team y background ratings
    previous_home_rating_y = ratings[away_team]['brH']
    previous_away_rating_y = ratings[away_team]['brA']
    print("Old brH y:", previous_home_rating_y)
    print("Old brA y:", previous_away_rating_y)

    # Mirror image of the home update: the away rating moves first, the home rating follows.
    ratings[away_team]['brA'] = previous_away_rating_y + diminishing_function_psi_y * learning_rate_lambda
    ratings[away_team]['brH'] = previous_home_rating_y + (ratings[away_team]['brA'] - previous_away_rating_y) * learning_rate_gamma
    print("New brH y:", ratings[away_team]['brH'])
    print("New brA y:", ratings[away_team]['brA'])

    print("Old overunderperformance x:", ratings[home_team]['continuous_overunderperformances'])
    print("Old overunderperformance y:", ratings[away_team]['continuous_overunderperformances'])

    # Update the continuous over/underperformance counters. max(1, n + 1) extends a
    # positive streak or restarts it at 1 after a negative one; min(-1, n - 1) does the
    # same in the negative direction. An exact match with the expectation resets both.
    if (observed_goal_difference > expected_goal_difference):
        ratings[home_team]['continuous_overunderperformances'] = max(1, ratings[home_team]['continuous_overunderperformances'] + 1)
        ratings[away_team]['continuous_overunderperformances'] = min(-1, ratings[away_team]['continuous_overunderperformances'] - 1)
    elif (observed_goal_difference < expected_goal_difference):
        ratings[home_team]['continuous_overunderperformances'] = min(-1, ratings[home_team]['continuous_overunderperformances'] - 1)
        ratings[away_team]['continuous_overunderperformances'] = max(1, ratings[away_team]['continuous_overunderperformances'] + 1)
    else:
        ratings[home_team]['continuous_overunderperformances'] = 0
        ratings[away_team]['continuous_overunderperformances'] = 0

    print("New overunderperformance x:", ratings[home_team]['continuous_overunderperformances'])
    print("New overunderperformance y:", ratings[away_team]['continuous_overunderperformances'], end='\n\n')

    return ratings


In [164]:
def calculate_provisional_ratings(ratings, team):
    """Return the team's (home, away) ratings adjusted for its current form streak.

    Args:
        ratings: Dict mapping team name to a dict with keys 'brH', 'brA' and
            'continuous_overunderperformances'.
        team: Key of the team in ``ratings``.

    Returns:
        Tuple ``(prH, prA)``. When the absolute streak does not exceed the form
        threshold these are the unmodified background ratings. Otherwise both are
        shifted by the same amount: upwards for a positive streak, downwards for a
        negative one.
    """

    #phi: Represents the number of continuous performances, above or below expectations, which do not trigger the form factor
    form_threshold_phi = 1

    #mu: represents the rating difference used to establish provisional ratings from background ratings
    rating_impact_mu = 0.01

    #delta: the level by which rating impact μ diminishes with each additional continuous over/under-performance
    diminishing_factor_delta = 2.5

    brH = ratings[team]['brH']  # Background rating home
    brA = ratings[team]['brA']  # Background rating away
    streak = ratings[team]['continuous_overunderperformances']

    # Work with the streak's magnitude beyond the threshold. Raising a negative number
    # to the fractional power delta yields a complex number in Python, which would
    # otherwise leak into the returned ratings for under-performing teams.
    excess = abs(streak) - form_threshold_phi
    if excess <= 0:
        return brH, brA

    form_factor = excess / excess ** diminishing_factor_delta
    adjustment = rating_impact_mu * form_factor

    # The direction of the streak decides the direction of the adjustment.
    if streak < 0:
        adjustment = -adjustment

    return brH + adjustment, brA + adjustment

In [165]:
def calculate_probabilities(rating_difference):
    """Predict outcome probabilities for one game from its rating difference.

    Loads the classifier and label encoder from 'model.pkl' and
    'label_encoder.pkl' in the working directory on every call.

    Args:
        rating_difference: Scalar rating difference for the game.

    Returns:
        Dict mapping each class label known to the label encoder to the predicted
        probability of that class.
    """
    # Artefacts written by train_model.
    model = joblib.load('model.pkl')
    label_encoder = joblib.load('label_encoder.pkl')

    # A single sample with a single feature -> shape (1, 1).
    features = np.array([[rating_difference]])
    class_probabilities = model.predict_proba(features)[0]

    # Pair every label with the probability in the same position.
    return dict(zip(label_encoder.classes_, class_probabilities))

In [166]:
# Function to calculate the rating difference between two teams
def calculate_rating_difference(home_team, away_team, ratings):
    """Return the home side's home rating minus the away side's away rating.

    For a team whose absolute streak counter exceeds 1, the corresponding
    provisional rating from calculate_provisional_ratings replaces the background
    rating.

    Args:
        home_team: Key of the home team in ``ratings``.
        away_team: Key of the away team in ``ratings``.
        ratings: Dict mapping team name to a dict with keys 'brH', 'brA' and
            'continuous_overunderperformances'.

    Returns:
        The rating difference (home minus away). The value is also printed.
    """
    # Home side: background home rating, replaced by the provisional one on a streak.
    home_entry = ratings[home_team]
    home_rating_x = home_entry['brH']
    if abs(home_entry['continuous_overunderperformances']) > 1:
        home_rating_x, _ = calculate_provisional_ratings(ratings, home_team)

    # Away side: background away rating, replaced by the provisional one on a streak.
    away_entry = ratings[away_team]
    away_rating_y = away_entry['brA']
    if abs(away_entry['continuous_overunderperformances']) > 1:
        _, away_rating_y = calculate_provisional_ratings(ratings, away_team)

    rating_difference = home_rating_x - away_rating_y
    print(rating_difference)

    return rating_difference

In [167]:
def predict_outcomes(df_fixtures, ratings):
    """Print outcome probabilities for every fixture, then fold its result into the ratings.

    Args:
        df_fixtures: DataFrame with 'home_team', 'away_team', 'home_goals' and
            'away_goals' columns, one row per fixture.
        ratings: Ratings dict; updated in place after each fixture through
            update_ratings_single_game.

    Returns:
        None. Predictions are printed.
    """
    for _, row in df_fixtures.iterrows():
        home_team = row['home_team']
        away_team = row['away_team']
        rating_difference = calculate_rating_difference(home_team, away_team, ratings)

        # calculate_probabilities returns a dict keyed by outcome label. Unpacking that
        # dict directly would yield the labels rather than the probabilities, and the
        # label order is decided by the encoder, so each label is printed with its own
        # probability instead of assuming a home/draw/away order.
        outcome_probabilities = calculate_probabilities(rating_difference)

        print(f"{home_team} - {away_team}: Outcome Predictions:")
        for label, probability in outcome_probabilities.items():
            print(f"{label}: {probability}")
        print()

        # The prediction above uses pre-match ratings; only now is the result applied.
        update_ratings_single_game(home_team, away_team, row['home_goals'], row['away_goals'], ratings)


In [168]:
def rps(probs, outcome):
    """Compute the Ranked Probability Score of one forecast.

    Args:
        probs: Forecast probabilities, one per ordered outcome category.
        outcome: Observed outcome as an indicator sequence of the same length
            (1 for the category that occurred, 0 elsewhere).

    Returns:
        Sum of squared differences between the cumulative forecast and the
        cumulative observation, divided by (number of categories - 1).

    Side effects:
        Prints the cumulative observation followed by the cumulative forecast.
    """
    n_categories = len(outcome)

    cumulative_forecast = np.cumsum(probs)
    cumulative_observed = np.cumsum(outcome)

    print(cumulative_observed)
    print(cumulative_forecast)

    # Accumulate the squared gaps one category at a time.
    squared_gap_total = 0
    for position in range(n_categories):
        gap = cumulative_forecast[position] - cumulative_observed[position]
        squared_gap_total += gap ** 2

    return squared_gap_total / (n_categories - 1)

In [169]:
# Main function
def main():
    """Run the rating pipeline end to end and print every team's final ratings.

    Reads the results and fixtures CSV files, creates a ratings entry for every team
    found in either file, replays the results through update_ratings_multiple_games
    and prints each team's background ratings and streak counter. Model training,
    fixture prediction, the RPS example and the rating-difference statistics are
    currently disabled (commented out or wrapped in string literals).
    """
    # Load the results data file for seasons 2006-07 to 2016-17
    df_results = pd.read_csv('../data/results.csv')

    # Load the fixtures data file for the season 2017-18
    df_fixtures = pd.read_csv('../data/smallFixtures.csv')

    # Load the training data file
    #df_train = pd.read_csv('../data/train.csv')

    # Initialize ratings based on the results data
    ratings = initialize_ratings(df_results, df_fixtures)

    # Disabled manual check: seeds two teams with fixed ratings and applies one result.
    """
    ratings['Leicester City']['brH'] = 0.463014
    ratings['Leicester City']['brA'] = 0.208624
    ratings['Leicester City']['continuous_overunderperformances'] = 3

    ratings['Stoke City']['brH'] = 0.537708
    ratings['Stoke City']['brA'] = 0.037819
    ratings['Stoke City']['continuous_overunderperformances'] = -1

    ratings = update_ratings_single_game("Leicester City","Stoke City", 2, 0, ratings)
    """

    # Update ratings based on the results data.
    # The return value is ignored: the ratings dict is updated in place.
    # NOTE(review): update_ratings_multiple_games stops after 1000 games, so the
    # ratings printed below may not reflect every row of the results file.
    update_ratings_multiple_games(df_results, ratings)

    # Train the model
    #train_model(df_train)

    # Predict the probabilities of home win, draw and away win for the fixtures data
    #predict_outcomes(df_fixtures, ratings)




    # Disabled example: RPS for a single hard-coded forecast.
    """
    # Call Ranked Probability Score function
    probs = [0.486, 0.261, 0.253]
    outcome = [1, 0, 0]
    rps_score = rps(probs, outcome)
    print("RPS Score:", rps_score, end='\n\n')
    """

    # Dump the final state of every team.
    for team, team_ratings in ratings.items():
        print(f"Team: {team}")
        print(f"Background Rating Home: {team_ratings['brH']}")
        print(f"Background Rating Away: {team_ratings['brA']}")
        print(f"Continuous Over/Underperformances: {team_ratings['continuous_overunderperformances']}")
        print()

    # Disabled statistics block.
    # NOTE(review): calculate_rating_difference takes (home_team, away_team, ratings),
    # so the two-argument call below would need rewriting before being re-enabled.
    """
    # Calculate the mean and standard deviation of the rating differences
    rating_differences = calculate_rating_difference(df_results, ratings)
    mean = np.mean(rating_differences)
    std = np.std(rating_differences)
    print("Mean:", mean)
    print("Standard Deviation:", std, end='\n\n')
    """



In [170]:
# Entry point: run the whole pipeline when this module's __name__ is '__main__'.
if __name__ == '__main__':
    main()

Teams: {'Chelsea', 'Hull City', 'Birmingham City', 'Stoke City', 'AFC Bournemouth', 'Sunderland', 'Cardiff City', 'Burnley', 'Sheffield United', 'Wolverhampton Wanderers', 'Middlesbrough', 'Norwich City', 'Tottenham Hotspur', 'Liverpool', 'Swansea City', 'Blackburn Rovers', 'Fulham', 'Portsmouth', 'West Bromwich Albion', 'Wigan Athletic', 'Crystal Palace', 'Derby County', 'Southampton', 'Bolton Wanderers', 'Everton', 'Arsenal', 'Aston Villa', 'Watford', 'Queens Park Rangers', 'Blackpool', 'Manchester City', 'West Ham United', 'Leicester City', 'Manchester United', 'Newcastle United', 'Charlton Athletic', 'Reading'}

Game Nr: 0
Sheffield United - Liverpool 1.0 : 1.0
Observed Goal Difference: 0.0
Expected Goals x: 0.0
Expected Goals y: 0.0
Expected Goal Difference: 0.0
error: 0.0
Diminishing Function psi x: -0.0
Diminishing Function psi y: 0.0
Old brH x: 0.0
Old brA x: 0.0
New brH x: 0.0
New brA x: 0.0
Old brH y: 0.0
Old brA y: 0.0
New brH y: 0.0
New brA y: 0.0
Old overunderperformance x