In [177]:
"""
Project in course: Advanced probabalistic machine learning.

Probabalistic predictions on tennis and fotball matches using gibbs sampling and the message passing algorithms.


Done by Jonas Wikström, Ella C, Adam E and Axel J """ 

In [None]:
#imports 
import pandas as pd
import numpy as np
from scipy.stats import truncnorm, norm
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
import math 
import random

In [178]:
def new_var(samples, mean):
  sum = 0
  for i in range(len(samples)):
    sum = sum + (samples[i]-mean)**2
  var = sum / len(samples)
  return var

def mu_s_t(s1,s2,var_s1,var_s2,t):
  #returns the mean value of S = [s1,s2] given a value of t
  mean1 = s1
  var1 = var_s1
  mean2 = s2
  var2 = var_s2
  var_t_s = (25/6)**2  
  sigma_s_t = var_s_t(s1,s2,var1,var2)
  mean_s_t = sigma_s_t.dot(np.array([(mean1/var1 + t/var_t_s),(mean2/var2 - t/var_t_s)]))
  return mean_s_t

def var_s_t(s1,s2,var_s1,var_s2):
  mean1 = s1
  var1 = var_s1
  mean2 = s2
  var2 = var_s2
  var_t_s = (25/6)**2 

  #returns the variance of S = [s1,s2] given t
  denom = var1+var_t_s+var2 #denominator
  cov = np.array([[var1*(var_t_s+var2)/denom, var1*var2/denom],[var1*var2/denom, var2*(var1+var_t_s)/denom]])

  return cov

def mu_t_s(s1,s2):
  return (s1-s2 + 1/100)

def gibbs_sampler(s_team1,s_team2,var_team1,var_team2,t_input):
    
    if (t_input > 0):
        y = 1
    else: 
        y = -1
    N = 520
    s1 = np.zeros(N)
    s2 = np.zeros(N)
    t = np.zeros(N)
    n = N
    s1[0] = s_team1
    s2[0] = s_team2
    t[0] = t_input 

# Med dessa hyperparametrar fann vi en burn-in på omkring 20
    mean1 = s_team1
    var1 = var_team1
    mean2 = s_team2
    var2 = var_team2
    var_t_s = (25/6)**2 
    fixed_var_s_t = var_s_t(mean1,mean2,var1,var2)


    for k in range(n-1):
        #s1[k+1], s2[k+1] = np.random.multivariate_normal(mu_s_t(s1[k],s2[k],var1,var2,t[k]),var_s_t(s1[k],s2[k],var1,var2))
        s1[k+1], s2[k+1] = np.random.multivariate_normal(mu_s_t(mean1,mean2,var1,var2,t[k]),fixed_var_s_t)
        if (mu_t_s(s1[k],s2[k]) > 0 and (y == 1)):
            t[k+1] = truncnorm.rvs(a=(0-mu_t_s(s1[k],s2[k]))/np.sqrt(var_t_s),b= np.inf,loc=mu_t_s(s1[k],s2[k]), scale= np.sqrt(var_t_s)) #a = 0, b = np.inf
        
        elif (mu_t_s(s1[k],s2[k]) < 0 and (y == -1)):
             t[k+1] = truncnorm.rvs(a=-np.inf,b=(0-mu_t_s(s1[k],s2[k]))/np.sqrt(var_t_s),loc=mu_t_s(s1[k],s2[k]), scale= np.sqrt(var_t_s))
        else:
            t[k+1] = 0
# Sätter burn-in till 20 och tar bort de samplesen
    burn_in = [x for x in range(21)]
    s1 = np.delete(s1, burn_in)
    s2 = np.delete(s2, burn_in)
    return (s1, s2)

#gibbs sampler implementation of Q5 is OK

def which_team_won(s1,s2):
  var_t_s = (25/6)**2 
  mu_t_s = s1-s2
  cumulative_val = (s1-s2)/var_t_s
  if (norm.cdf(cumulative_val) > 0.5):
    return(1)
  else:
    return(-1)

In [192]:
url = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2019.csv"
matches = pd.read_csv(url, na_values='?');
tourn_arr = ['Australian Open','Wimbledon','US Open','Roland Garros']
#matches contains 500+ matches from Grand Slams
matches = matches.loc[matches['tourney_name'].isin(tourn_arr)]
matches = matches.reset_index(drop=True)

players_arr = matches['winner_name'].unique()
players_arr2 = matches['loser_name'].unique()
list1 = players_arr.tolist()
list2 = players_arr2.tolist()
list3 = list1+list2

players_arr = np.unique(list3)
initial_skills_t = [25] *(len(players_arr))
initial_var_t = [(25/3)**2]*(len(players_arr))
players_df = pd.DataFrame(np.array([initial_skills_t,initial_var_t]), columns =players_arr)

train_t, test_t = train_test_split(matches, test_size=0.33)
#------------------------------------------------------------------
url1 = "http://www.it.uu.se/edu/course/homepage/apml/project/SerieA.csv"
games = pd.read_csv(url1, na_values='?');
teams_array = games['team1'].unique()
teams_array2 = games['team2'].unique()

#create data frame with teams as columns
initial_skills_f = [25] *(len(teams_array))
initial_var_f = [(25/3)**2]*(len(teams_array))
teams_df = pd.DataFrame(np.array([initial_skills_f,initial_var_f]), columns =teams_array)

#remove all draws
games = games.drop(games[(games.score1 == games.score2)].index)
train_f, test_f = train_test_split(games, test_size=0.33)
print(teams_df)

      Chievo      Lazio     Torino   Sassuolo      Parma     Empoli  \
0  25.000000  25.000000  25.000000  25.000000  25.000000  25.000000   
1  69.444444  69.444444  69.444444  69.444444  69.444444  69.444444   

     Bologna   Atalanta   Juventus     Napoli       Spal    Udinese  \
0  25.000000  25.000000  25.000000  25.000000  25.000000  25.000000   
1  69.444444  69.444444  69.444444  69.444444  69.444444  69.444444   

       Inter      Genoa  Frosinone  Fiorentina   Cagliari       Roma  \
0  25.000000  25.000000  25.000000   25.000000  25.000000  25.000000   
1  69.444444  69.444444  69.444444   69.444444  69.444444  69.444444   

       Milan  Sampdoria  
0  25.000000  25.000000  
1  69.444444  69.444444  


In [193]:
def main_function_q9(train,what_model,sport):
    if (sport == "tennis"):
        for index, row in train.iterrows():
            player1 = row['winner_name']
            player2 = row['loser_name']
    
            s_team1 = players_df_t[player1].values[0]
            s_team2 = players_df_t[player2].values[0]
            var_team1 = players_df_t[player1].values[1]
            var_team2 = players_df_t[player2].values[1]
            if(what_model == "gibbs"):
                initial_t = 2
                s1_vect,s2_vect = gibbs_sampler(s_team1,s_team2,var_team1,var_team2,initial_t) 
                new_s_team1 = np.mean(s1_vect)
                new_s_team2 = np.mean(s2_vect)
                new_var_team1 = new_var(s1_vect, new_s_team1)
                new_var_team2 = new_var(s2_vect, new_s_team2)
            if(what_model == "message_pass"):
                new_s_team1,new_var_team1,new_s_team2,new_var_team2 = messagePassingMethod(s_team1, var_team1,s_team2, var_team2)
              
            players_df.loc[0,player1] = new_s_team1
            players_df.loc[0,player2] = new_s_team2
            players_df.loc[1,player1] = new_var_team1
            players_df.loc[1,player2] = new_var_team2
        return(players_df)
    
    if(sport == "football"):
        for index, row in train.iterrows():
            team1 = row['team1']
            team2 = row['team2']
            s_team1 = teams_df[team1].values[0]
            s_team2 = teams_df[team2].values[0]
            var_team1 = teams_df[team1].values[1]
            var_team2 = teams_df[team2].values[1]
            initial_t = row['score1']-row['score2']
            if(what_model == "gibbs"):
                s1_vect,s2_vect = gibbs_sampler(s_team1,s_team2,var_team1,var_team2,initial_t) 
                new_s_team1 = np.mean(s1_vect)
                new_s_team2 = np.mean(s2_vect)
                new_var_team1 = new_var(s1_vect, new_s_team1)
                new_var_team2 = new_var(s2_vect, new_s_team2)
            if(what_model == "message_pass"):
                new_s_team1,new_var_team1,new_s_team2,new_var_team2 = messagePassingMethod(s_team1, var_team1,s_team2, var_team2)
    
            teams_df.loc[0,team1] = new_s_team1
            teams_df.loc[0,team2] = new_s_team2
            teams_df.loc[1,team1] = new_var_team1
            teams_df.loc[1,team2] = new_var_team2
        return(teams_df)
    #swap between Gibbs and Message passi here
  

  

def prediction_function(model,test,what_model,extension,sport):
    pred_arr = []
    real_winner_arr = []
    correct_predictions = 0
    random_cor_pred = 0
    random_arr = [] 
    
    if (sport == "tennis"):
        for index, row in test.iterrows():
            player1 = row['winner_name'] #namn
            player2 = row['loser_name']
            s_team1 = model[player1].values[0]
            s_team2 = model[player2].values[0]
            var_team1 = model[player1].values[1]
            var_team2 = model[player2].values[1]
            initial_t = 2
            real_winner = 1
            var_t_s = (25/6)**2
            cumul_val = (s_team1-s_team2) / math.sqrt(var_t_s)

            #do prediction
            prediction = which_team_won(s_team1,s_team2)
            real_winner_arr.append(real_winner)
            pred_arr.append(prediction)
            random_arr.append(np.sign(random.uniform(-0.5,0.5)))
            if (what_model == "gibbs"):
                initial_t = 2
                s1_vect,s2_vect = gibbs_sampler(s_team1,s_team2,var_team1,var_team2,initial_t)    
                new_s_team1 = np.mean(s1_vect)
                new_s_team2 = np.mean(s2_vect)
                new_var_team1 = new_var(s1_vect, new_s_team1)
                new_var_team2 = new_var(s2_vect, new_s_team2)
            if (what_model == "message_pass"):
                new_s_team1,new_var_team1,new_s_team2,new_var_team2 = messagePassingMethod(s_team1, var_team1,s_team2, var_team2)
            if(extension == "extension"):
                score_str = row['score']
                score_winner = 0
                score_loser = 0
                for element in range(0, len(score_str)):
                    if (score_str[element] == "-"):
                        score_winner += int(score_str[element-1])
                        score_loser += int(score_str[element+1])
                if (score_winner != 0):
                    score_diff_rate = score_winner/ (score_winner +score_loser)
                else:
                    score_diff_rate = 0

                if(score_diff_rate > 0.62):
                    new_s_team1 = 1.4* new_s_team1
            
            model.loc[0,player1] = new_s_team1
            model.loc[0,player2] = new_s_team2
            model.loc[1,player1] = new_var_team1
            model.loc[1,player2] = new_var_team2
    
    
    
    #--------------------------------------------
            
    if(sport == "football"):
        for index, row in test.iterrows():
            team1 = row['team1']
            team2 = row['team2']
            s_team1 = teams_df[team1].values[0]
            s_team2 = teams_df[team2].values[0]
            var_team1 = teams_df[team1].values[1]
            var_team2 = teams_df[team2].values[1]
            initial_t = row['score1']-row['score2']
            real_winner = np.sign(row['score1'] - row['score2'])
            var_t_s = (25/6)**2
            cumul_val = (s_team1-s_team2) / math.sqrt(var_t_s)
            #do prediction
            prediction = which_team_won(s_team1,s_team2)
            real_winner_arr.append(real_winner)
            pred_arr.append(prediction)
            random_arr.append(np.sign(random.uniform(-0.5,0.5)))
            if (what_model == "gibbs"):
                s1_vect,s2_vect = gibbs_sampler(s_team1,s_team2,var_team1,var_team2,initial_t)    
                new_s_team1 = np.mean(s1_vect)
                new_s_team2 = np.mean(s2_vect)
                new_var_team1 = new_var(s1_vect, new_s_team1)
                new_var_team2 = new_var(s2_vect, new_s_team2)
            if (what_model == "message_pass"):
                new_s_team1,new_var_team1,new_s_team2,new_var_team2 = messagePassingMethod(s_team1, var_team1,s_team2, var_team2)
            if (extension == "extension"):
                if (initial_t > 3):
                    new_s_team1 = 1.4* new_s_team1
                    new_s_team2 = 0.9*new_s_team2
                elif (initial_t < -3):
                    new_s_team1 = 0.9* new_s_team1
                    new_s_team2 = 1.3 *new_s_team2
                    
            teams_df.loc[0,team1] = new_s_team1
            teams_df.loc[0,team2] = new_s_team2
            teams_df.loc[1,team1] = new_var_team1
            teams_df.loc[1,team2] = new_var_team2
            
    
    #swap between gibbs sampler and Message passing
   
    #----------------------------------------------------------     

    for i in range(len(pred_arr)):
        if random_arr[i] == real_winner_arr[i]:
            random_cor_pred +=1
        if pred_arr[i] == real_winner_arr[i]:
            correct_predictions +=1

    return(correct_predictions/(len(pred_arr)), random_cor_pred/(len(pred_arr)))
      

In [176]:
model = main_function_q9(train_f,"gibbs","football")
pred_pred, random_pred = prediction_function(model,test_f,"gibbs","","football")
print("case: football, gibbs, extension off")
print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
print("----------------------------------------------------")
teams_df = pd.DataFrame(np.array([initial_skills_f,initial_var_f]), columns =teams_array)
model = main_function_q9(train_f,"message_pass","football")
pred_pred, random_pred = prediction_function(model,test_f,"message_pass","","football")
print("case: football, message passing, extension off")
print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
print("----------------------------------------------------")
teams_df = pd.DataFrame(np.array([initial_skills_f,initial_var_f]), columns =teams_array)
model = main_function_q9(train_f,"message_pass","football")
pred_pred, random_pred = prediction_function(model,test_f,"message_pass","extension","football")
print("case: football, message passing, extension ON")
print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
print("----------------------------------------------------")
teams_df = pd.DataFrame(np.array([initial_skills_f,initial_var_f]), columns =teams_array)
model = main_function_q9(train_f,"gibbs","football")
pred_pred, random_pred = prediction_function(model,test_f,"gibbs","extension","football")
print("case: football, gibbs, extension ON")
print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
print("----------------------------------------------------")
players_df = pd.DataFrame(np.array([initial_skills_t,initial_var_t]), columns =players_arr)
model = main_function_q9(train_f,"gibbs","tennis")
pred_pred, random_pred = prediction_function(model,test_f,"gibbs","","tennis")
print("case: football, gibbs, extension off")
print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
print("----------------------------------------------------")
players_df = pd.DataFrame(np.array([initial_skills_t,initial_var_t]), columns =players_arr)
model = main_function_q9(train_f,"message_pass","tennis")
pred_pred, random_pred = prediction_function(model,test_f,"message_pass","","tennis")
print("case: football, message passing, extension off")
print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
print("----------------------------------------------------")
players_df = pd.DataFrame(np.array([initial_skills_t,initial_var_t]), columns =players_arr)
model = main_function_q9(train_f,"message_pass","tennis")
pred_pred, random_pred = prediction_function(model,test_f,"message_pass","extension","tennis")
print("case: football, message passing, extension ON")
print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
print("----------------------------------------------------")
players_df = pd.DataFrame(np.array([initial_skills_t,initial_var_t]), columns =players_arr)
model = main_function_q9(train_f,"gibbs","tennis")
pred_pred, random_pred = prediction_function(model,test_f,"gibbs","extension","tennis")
print("case: football, gibbs, extension ON")
print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
print("----------------------------------------------------")


KeyboardInterrupt: 

In [194]:
def print_function():
    model = main_function_q9(train_f,"gibbs","football")
    pred_pred, random_pred = prediction_function(model,test_f,"gibbs","","football")
    print("case: football, gibbs, extension off")
    print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
    print("----------------------------------------------------")
    teams_df = pd.DataFrame(np.array([initial_skills_f,initial_var_f]), columns =teams_array)
    model = main_function_q9(train_f,"message_pass","football")
    pred_pred, random_pred = prediction_function(model,test_f,"message_pass","","football")
    print("case: football, message passing, extension off")
    print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
    print("----------------------------------------------------")
    teams_df = pd.DataFrame(np.array([initial_skills_f,initial_var_f]), columns =teams_array)
    model = main_function_q9(train_f,"message_pass","football")
    pred_pred, random_pred = prediction_function(model,test_f,"message_pass","extension","football")
    print("case: football, message passing, extension ON")
    print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
    print("----------------------------------------------------")
    teams_df = pd.DataFrame(np.array([initial_skills_f,initial_var_f]), columns =teams_array)
    model = main_function_q9(train_f,"gibbs","football")
    pred_pred, random_pred = prediction_function(model,test_f,"gibbs","extension","football")
    print("case: football, gibbs, extension ON")
    print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
    print("----------------------------------------------------")
    players_df = pd.DataFrame(np.array([initial_skills_t,initial_var_t]), columns =players_arr)
    model = main_function_q9(train_t,"gibbs","tennis")
    pred_pred, random_pred = prediction_function(model,test_t,"gibbs","","tennis")
    print("case: football, gibbs, extension off")
    print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
    print("----------------------------------------------------")
    players_df = pd.DataFrame(np.array([initial_skills_t,initial_var_t]), columns =players_arr)
    model = main_function_q9(train_t,"message_pass","tennis")
    pred_pred, random_pred = prediction_function(model,test_t,"message_pass","","tennis")
    print("case: tennis, message passing, extension off")
    print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
    print("----------------------------------------------------")
    players_df = pd.DataFrame(np.array([initial_skills_t,initial_var_t]), columns =players_arr)
    model = main_function_q9(train_t,"message_pass","tennis")
    pred_pred, random_pred = prediction_function(model,test_t,"message_pass","extension","tennis")
    print("case: tennis, message passing, extension ON")
    print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
    print("----------------------------------------------------")
    players_df = pd.DataFrame(np.array([initial_skills_t,initial_var_t]), columns =players_arr)
    model = main_function_q9(train_t,"gibbs","tennis")
    pred_pred, random_pred = prediction_function(model,test_t,"gibbs","extension","tennis")
    print("case: tennis, gibbs, extension ON")
    print("pred_rate: ", pred_pred , "random_rate: ", random_pred)
    print("----------------------------------------------------")


In [195]:
print_function()

case: football, gibbs, extension off
pred_rate:  0.7 random_rate:  0.5
----------------------------------------------------
case: football, message passing, extension off
pred_rate:  0.7333333333333333 random_rate:  0.5666666666666667
----------------------------------------------------
case: football, message passing, extension ON
pred_rate:  0.6666666666666666 random_rate:  0.45555555555555555
----------------------------------------------------
case: football, gibbs, extension ON
pred_rate:  0.6555555555555556 random_rate:  0.4888888888888889
----------------------------------------------------
case: football, gibbs, extension off
pred_rate:  0.6309523809523809 random_rate:  0.48214285714285715
----------------------------------------------------
case: football, message passing, extension off
pred_rate:  0.5952380952380952 random_rate:  0.47619047619047616
----------------------------------------------------


  """
  


case: football, message passing, extension ON
pred_rate:  0.6071428571428571 random_rate:  0.5535714285714286
----------------------------------------------------
case: football, gibbs, extension ON
pred_rate:  0.6607142857142857 random_rate:  0.4642857142857143
----------------------------------------------------
