In [329]:
import numpy as np
import pandas as pd
import math
from scipy.stats import norm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.model_selection import train_test_split

In [330]:
def train_model(df):
    """Fit and persist a one-vs-rest logistic regression on rating differences.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``match_outcome`` (categorical target) and
        ``rating_difference`` (single numeric feature). A new column
        ``match_outcome_encoded`` is written onto ``df`` as a side effect.

    Returns
    -------
    tuple
        ``(model, label_encoder)``: the fitted ``LogisticRegression`` and the
        ``LabelEncoder`` that maps outcome labels to the integer classes.

    Side effects
    ------------
    Writes ``model.pkl`` and ``label_encoder.pkl`` to the working directory
    and prints the accuracy on the 20% hold-out split.
    """
    # Encode the categorical outcome as integers; the encoder is kept so the
    # predicted classes can be mapped back to labels later on.
    outcome_encoder = LabelEncoder()
    df['match_outcome_encoded'] = outcome_encoder.fit_transform(df['match_outcome'])

    # scikit-learn expects a 2D feature matrix, hence the (-1, 1) reshape.
    features = df['rating_difference'].values.reshape(-1, 1)
    targets = df['match_outcome_encoded']

    # Fixed seed keeps the 80/20 split reproducible between runs.
    features_train, features_test, targets_train, targets_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )

    # 'ovr' = One-Vs-Rest: one binary classifier per outcome class.
    classifier = LogisticRegression(multi_class='ovr').fit(features_train, targets_train)

    # Persist both artefacts; prediction code reloads them from these paths.
    for artefact, path in ((classifier, 'model.pkl'), (outcome_encoder, 'label_encoder.pkl')):
        joblib.dump(artefact, path)

    accuracy = classifier.score(features_test, targets_test)
    print(f'Model accuracy: {accuracy*100:.2f}%')

    return classifier, outcome_encoder

In [331]:
# Function to initialize ratings
def initialize_ratings(df_results, df_fixtures):
    """Create a zeroed rating record for every team seen in either table.

    Parameters
    ----------
    df_results, df_fixtures : pandas.DataFrame
        Both must provide ``home_team`` and ``away_team`` columns.

    Returns
    -------
    dict
        Maps each team name to its own dict with the keys ``brH`` (background
        rating at home), ``brA`` (background rating away) and
        ``continuous_overunderperformances`` (signed streak counter), all
        starting at zero.
    """
    # Union of every team name appearing as home or away side in either frame.
    teams = (
        set(df_results['home_team'])
        | set(df_results['away_team'])
        | set(df_fixtures['home_team'])
        | set(df_fixtures['away_team'])
    )

    # The inner dict literal is evaluated per team, so records are not shared.
    ratings = {
        team: {'brH': 0.0, 'brA': 0.0, 'continuous_overunderperformances': 0}
        for team in teams
    }

    print('Teams:', teams, end='\n\n')
    return ratings

In [332]:
# Function to update ratings based on results data
def update_ratings(df_results, ratings, max_matches=2000, verbose=True):
    """Update team ratings in place from a table of played matches.

    For every match the expected goal difference is derived from the home
    team's home rating and the away team's away rating (form-adjusted when a
    team's streak exceeds the form threshold). The damped prediction error
    then moves both teams' background ratings, and the streak counters of
    over/under-performance are updated.

    Parameters
    ----------
    df_results : pandas.DataFrame
        One row per match with the columns ``home_team``, ``away_team``,
        ``home_goals`` and ``away_goals``. Rows are processed in frame order.
    ratings : dict
        Maps team name to a dict with ``brH``, ``brA`` and
        ``continuous_overunderperformances``. Mutated in place.
    max_matches : int or None, optional
        Maximum number of rows to process, counted by position. Defaults to
        2000, which matches the previous hard-coded cut-off for a default
        ``RangeIndex``. ``None`` processes every row.
    verbose : bool, optional
        When true (default) the per-match trace is printed.

    Returns
    -------
    dict
        The same ``ratings`` object that was passed in.

    Raises
    ------
    KeyError
        If a team in ``df_results`` has no entry in ``ratings``.
    """
    # lambda: determines to what extent new match results influence the team
    # ratings (could be improved to include the time between matches)
    learning_rate_lambda = 0.054

    # psi: diminishes the impact each additional goal of error has on ratings
    diminishing_function_psi = lambda error: 3 * np.log10(1 + error)

    # gamma: determines to what extent performances at the home ground
    # influence a team's away rating and vice versa
    learning_rate_gamma = 0.79

    # phi: number of continuous performances, above or below expectations,
    # which do not trigger the form factor
    form_threshold_phi = 1

    # mu: rating difference used to derive provisional from background ratings
    rating_impact_mu = 0.01

    # delta: level by which the rating impact mu diminishes with each
    # additional continuous over/under-performance
    diminishing_factor_delta = 2.5

    log = print if verbose else (lambda *args, **kwargs: None)

    for match_number, (index, row) in enumerate(df_results.iterrows()):
        # Cap on the number of processed matches, counted by position so it
        # does not depend on the frame's index labels.
        if max_matches is not None and match_number >= max_matches:
            break

        log("Game Nr:", index)

        home_team = row['home_team']
        away_team = row['away_team']
        log("home_team:", home_team)
        log("away_team:", away_team)

        home = ratings[home_team]
        away = ratings[away_team]

        # Ratings used for the prediction: computed on the fly from the
        # background rating and the current streak. Nothing ever stored
        # 'prH'/'prA' in the dict, so reading those keys raised KeyError.
        home_rating_x, _ = _form_adjusted_ratings(
            home, form_threshold_phi, rating_impact_mu, diminishing_factor_delta)
        _, away_rating_y = _form_adjusted_ratings(
            away, form_threshold_phi, rating_impact_mu, diminishing_factor_delta)

        observed_goal_difference = row['home_goals'] - row['away_goals']
        log("observed_goal_difference:", observed_goal_difference)

        # Expected goal difference of each side against an average opponent.
        expected_goal_x = _expected_goals(home_rating_x)
        log("expected_goal_x:", expected_goal_x)
        expected_goal_y = _expected_goals(away_rating_y)
        log("expected_goal_y:", expected_goal_y)

        expected_goal_difference = expected_goal_x - expected_goal_y
        log("expected_goal_difference:", expected_goal_difference)

        # Size of the prediction error, damped by psi.
        error = abs(observed_goal_difference - expected_goal_difference)
        log("error:", error)
        psi_temp = diminishing_function_psi(error)

        # The home side is rewarded when it beat the expectation, the away
        # side receives the mirrored adjustment.
        if expected_goal_difference < observed_goal_difference:
            diminishing_function_psi_x = psi_temp
            diminishing_function_psi_y = -psi_temp
        else:
            diminishing_function_psi_x = -psi_temp
            diminishing_function_psi_y = psi_temp
        log("diminishing_function_psi_x:", diminishing_function_psi_x)
        log("diminishing_function_psi_y:", diminishing_function_psi_y)

        # Home team x: the home rating moves by lambda * psi, and the away
        # rating follows by a gamma share of that movement.
        previous_home_rating_x = home['brH']
        previous_away_rating_x = home['brA']
        log("previous_home_rating_x:", previous_home_rating_x)
        log("previous_away_rating_x:", previous_away_rating_x)

        home['brH'] = previous_home_rating_x + diminishing_function_psi_x * learning_rate_lambda
        home['brA'] = previous_away_rating_x + (home['brH'] - previous_home_rating_x) * learning_rate_gamma
        log("ratings[home_team]['brH']:", home['brH'])
        log("ratings[home_team]['brA']:", home['brA'])

        # Away team y: mirror image, the away rating is the primary one.
        previous_home_rating_y = away['brH']
        previous_away_rating_y = away['brA']
        log("previous_home_rating_y:", previous_home_rating_y)
        log("previous_away_rating_y:", previous_away_rating_y)

        away['brA'] = previous_away_rating_y + diminishing_function_psi_y * learning_rate_lambda
        away['brH'] = previous_home_rating_y + (away['brA'] - previous_away_rating_y) * learning_rate_gamma
        log("ratings[away_team]['brH']:", away['brH'])
        log("ratings[away_team]['brA']:", away['brA'])

        log("home team: previous overunderperformance:", home['continuous_overunderperformances'])
        log("away team: previous overunderperformance:", away['continuous_overunderperformances'])

        # Streak counters: continuing in the same direction extends the
        # streak, a change of direction restarts it at +1 / -1. An exact
        # match between observed and expected leaves both untouched.
        if observed_goal_difference > expected_goal_difference:
            home['continuous_overunderperformances'] = max(1, home['continuous_overunderperformances'] + 1)
            away['continuous_overunderperformances'] = min(-1, away['continuous_overunderperformances'] - 1)
        elif observed_goal_difference < expected_goal_difference:
            home['continuous_overunderperformances'] = min(-1, home['continuous_overunderperformances'] - 1)
            away['continuous_overunderperformances'] = max(1, away['continuous_overunderperformances'] + 1)

        log("home team: updated overunderperformance:", home['continuous_overunderperformances'])
        log("away team: updated overunderperformance:", away['continuous_overunderperformances'], end='\n\n')

    return ratings


def _expected_goals(rating):
    """Map a rating to an expected goal difference, keeping the rating's sign."""
    # Inverse of psi(e) = 3 * log10(1 + e). The magnitude comes from
    # abs(rating); the sign is restored so negative ratings predict a deficit.
    return np.sign(rating) * (np.power(10, abs(rating) / 3) - 1)


def _form_adjusted_ratings(team_ratings, form_threshold_phi, rating_impact_mu,
                           diminishing_factor_delta):
    """Return (home, away) ratings of one team, shifted by form when a streak is active."""
    background_home = team_ratings['brH']
    background_away = team_ratings['brA']
    streak = team_ratings['continuous_overunderperformances']

    # Number of streak matches beyond the threshold; the magnitude is used so
    # that under-performance streaks never raise a negative base to 2.5.
    excess = abs(streak) - form_threshold_phi
    if excess <= 0:
        return background_home, background_away

    shift = rating_impact_mu * (excess / excess ** diminishing_factor_delta)
    if streak < 0:
        shift = -shift
    return background_home + shift, background_away + shift

In [333]:
def calculate_provisional_ratings(ratings, team):
    """Return the form-adjusted (provisional) home and away ratings of a team.

    The background ratings are shifted up for an over-performance streak and
    down for an under-performance streak, but only once the streak length
    exceeds the form threshold. The size of the shift depends on how far the
    streak exceeds the threshold and is the same in both directions.

    Parameters
    ----------
    ratings : dict
        Maps team name to a dict with ``brH``, ``brA`` and
        ``continuous_overunderperformances``.
    team : hashable
        Key of the team in ``ratings``.

    Returns
    -------
    tuple
        ``(prH, prA)``. Equal to ``(brH, brA)`` when the absolute streak does
        not exceed the threshold.

    Raises
    ------
    KeyError
        If ``team`` or one of the required keys is missing.
    """
    # phi: number of continuous performances, above or below expectations,
    # which do not trigger the form factor
    form_threshold_phi = 1

    # mu: rating difference used to derive provisional from background ratings
    rating_impact_mu = 0.01

    # delta: level by which the rating impact mu diminishes with each
    # additional continuous over/under-performance
    diminishing_factor_delta = 2.5

    team_ratings = ratings[team]
    brH = team_ratings['brH']  # Background rating home
    brA = team_ratings['brA']  # Background rating away
    streak = team_ratings['continuous_overunderperformances']

    # Streak length beyond the threshold. Using the magnitude keeps the base
    # of the power non-negative: previously an under-performance streak gave
    # a negative base, and a negative number ** 2.5 is complex in Python.
    excess = abs(streak) - form_threshold_phi
    if excess <= 0:
        return brH, brA

    form_factor = excess / excess ** diminishing_factor_delta
    adjustment = rating_impact_mu * form_factor
    if streak < 0:
        adjustment = -adjustment

    return brH + adjustment, brA + adjustment

In [334]:
def calculate_probabilities(rating_difference):
    """Predict outcome probabilities for one match from its rating difference.

    Parameters
    ----------
    rating_difference : float
        Single feature value passed to the persisted classifier.

    Returns
    -------
    dict
        Maps each outcome label (in the order of ``label_encoder.classes_``)
        to the predicted probability of that outcome.

    Notes
    -----
    Both artefacts are reloaded from ``model.pkl`` and ``label_encoder.pkl``
    in the working directory on every call. ``joblib.load`` unpickles the
    files, so they must come from a trusted source.
    """
    classifier = joblib.load('model.pkl')
    encoder = joblib.load('label_encoder.pkl')

    # predict_proba expects a 2D array: one row (the match), one column.
    sample = np.array([[rating_difference]])
    class_probabilities = classifier.predict_proba(sample)[0]

    return dict(zip(encoder.classes_, class_probabilities))

In [335]:
# Function to calculate the rating difference between two teams
def calculate_rating_difference(home_team, away_team, ratings):
    """Return the home side's home rating minus the away side's away rating.

    For each side the background rating is used unless the absolute value of
    its ``continuous_overunderperformances`` counter is greater than 1, in
    which case the matching element of ``calculate_provisional_ratings`` is
    used instead. The difference is printed before it is returned.

    Parameters
    ----------
    home_team, away_team : hashable
        Keys into ``ratings``.
    ratings : dict
        Maps team name to a dict with ``brH``, ``brA`` and
        ``continuous_overunderperformances``.

    Returns
    -------
    float
        The rating difference.
    """
    # (team, background key, index into the (prH, prA) tuple)
    sides = ((home_team, 'brH', 0), (away_team, 'brA', 1))

    effective = []
    for team, background_key, provisional_slot in sides:
        value = ratings[team][background_key]
        if abs(ratings[team]['continuous_overunderperformances']) > 1:
            # Streak is long enough for the form factor to apply.
            value = calculate_provisional_ratings(ratings, team)[provisional_slot]
        effective.append(value)

    rating_difference = effective[0] - effective[1]
    print(rating_difference)

    return rating_difference

In [336]:
def predict_outcomes(df_fixtures, ratings):
    """Print the predicted outcome probabilities for every fixture.

    Parameters
    ----------
    df_fixtures : pandas.DataFrame
        One row per fixture with the columns ``home_team`` and ``away_team``.
    ratings : dict
        Team ratings as consumed by ``calculate_rating_difference``.

    Returns
    -------
    None
        Results are printed, one block per fixture followed by a blank line.
    """
    for _, fixture in df_fixtures.iterrows():
        home_team = fixture['home_team']
        away_team = fixture['away_team']
        rating_difference = calculate_rating_difference(home_team, away_team, ratings)

        # calculate_probabilities returns a {label: probability} dict.
        # Unpacking that dict into three names would bind the labels (its
        # keys), not the probabilities, and would also rely on the encoder's
        # label order, so each label is printed next to its own probability.
        outcome_probabilities = calculate_probabilities(rating_difference)

        print(f"{home_team} - {away_team}: Outcome Predictions:")
        for label, probability in outcome_probabilities.items():
            print(f"{label}: {probability}")
        print()


In [337]:
def rps(probs, outcome):
    """Compute the Ranked Probability Score of one forecast.

    Parameters
    ----------
    probs : sequence of float
        Forecast probabilities, in outcome order.
    outcome : sequence of number
        Observed outcome in the same order (e.g. ``[1, 0, 0]``).

    Returns
    -------
    float
        Sum of squared differences between the cumulative forecast and the
        cumulative outcome, divided by ``len(outcome) - 1``.

    Notes
    -----
    Both cumulative vectors are printed (outcomes first) before scoring.
    """
    cumulative_forecast = np.cumsum(probs)
    cumulative_observed = np.cumsum(outcome)

    print(cumulative_observed)
    print(cumulative_forecast)

    n_outcomes = len(outcome)
    squared_gap_total = sum(
        (cumulative_forecast[position] - cumulative_observed[position]) ** 2
        for position in range(n_outcomes)
    )

    return squared_gap_total / (n_outcomes - 1)

In [338]:
# Main function
def main():
    """Run the pipeline: load data, seed ratings, train, predict, print ratings.

    Steps performed by the code below:
    1. Read results and fixtures (both from the same CSV path).
    2. Build zeroed ratings for every team and overwrite the entries for
       'Leicester City' and 'Stoke City' with hard-coded values.
    3. Call ``train_model`` and then ``predict_outcomes`` for the fixtures.
    4. Print the background ratings and streak counter of every team.

    The two triple-quoted blocks are disabled code kept as string literals.
    """
    # Load the results data file for seasons 2006-07 to 2016-17
    # NOTE(review): results and fixtures are read from the same file
    # ('smallFixtures.csv'); presumably a small test fixture - confirm the
    # intended results path.
    df_results = pd.read_csv('../data/smallFixtures.csv')

    # Load the fixtures data file for the season 2017-18
    df_fixtures = pd.read_csv('../data/smallFixtures.csv')



    # Initialize ratings based on the results data
    ratings = initialize_ratings(df_results, df_fixtures)

    # Hard-coded ratings for the two teams; raises KeyError if either name is
    # absent from the loaded CSV.
    ratings['Leicester City']['brH'] = 0.463014
    ratings['Leicester City']['brA'] = 0.208624
    ratings['Leicester City']['continuous_overunderperformances'] = 3

    ratings['Stoke City']['brH'] = 0.537708
    ratings['Stoke City']['brA'] = 0.037819
    ratings['Stoke City']['continuous_overunderperformances'] = -1


    # Update ratings based on the results data
    #ratings = update_ratings(df_results, ratings)

    # Train the model
    # NOTE(review): `df_train` is not defined in this function or anywhere in
    # the visible source, so this line raises NameError unless another cell
    # defines it as a global. train_model needs a frame with the columns
    # 'rating_difference' and 'match_outcome' - TODO supply the training data.
    train_model(df_train)

    # Predict the probabilities of home win, draw and away win for the fixtures data
    predict_outcomes(df_fixtures, ratings)

    #ratings = update_ratings(df_results, ratings)




    """
    # Call Ranked Probability Score function
    probs = [0.486, 0.261, 0.253]
    outcome = [1, 0, 0]
    rps_score = rps(probs, outcome)
    print("RPS Score:", rps_score, end='\n\n')
    """

    # Dump the final background ratings and streak counter of every team.
    for team, team_ratings in ratings.items():
        print(f"Team: {team}")
        print(f"Background Rating Home: {team_ratings['brH']}")
        print(f"Background Rating Away: {team_ratings['brA']}")
        print(f"Continuous Over/Underperformances: {team_ratings['continuous_overunderperformances']}")
        print()

    """
    # Calculate the mean and standard deviation of the rating differences
    rating_differences = calculate_rating_difference(df_results, ratings)
    mean = np.mean(rating_differences)
    std = np.std(rating_differences)
    print("Mean:", mean)
    print("Standard Deviation:", std, end='\n\n')
    """



In [339]:
# Entry point: run the full pipeline when this cell/script is executed directly.
if __name__ == '__main__':
    main()

Teams: {'Stoke City', 'Leicester City'}

0.42873053390593274


FileNotFoundError: [Errno 2] No such file or directory: 'model.pkl'