This notebook, written by E. Karlé, contains the code necessary to reproduce Table 2 and Figure 6b from the article Dynamic Ranking and Translation Synchronization https://arxiv.org/pdf/2207.01455.pdf

In [None]:
import os
import random
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import pickle

from datetime import datetime
from itertools import combinations

import sys
sys.path.append('modules')

import real_data_module as rdata

In [None]:
# Use a larger default font size for every figure drawn in this notebook
plt.rcParams['font.size'] = 16

To use this notebook, the user can download the data from https://www.kaggle.com/datasets/saife245/english-premier-league, which is provided as csv files for each season. Note that this dataset is regularly kept up to date but this notebook only uses data from the seasons 2000-2001 to 2017-2018.

In [None]:
# Path to the folder where the data is saved.
# The data-preparation cell reads one CSV file per season from this folder,
# named '<year>.csv' (e.g. '2000.csv'); the trailing '/' is required because
# file names are appended by plain string concatenation.

dir_path = 'epl_data/'

# Preparation of the data

This section of the code needs to be run the first time one uses the notebook. The prepared data is then automatically saved in the data directory.

In [None]:
# Creation of one dataset with all the results from seasons between 2000 and 2017
all_seasons = np.arange(2000,2018)

# Columns kept from each season file ('MW' is renamed to 'Week' below)
kept_columns = ['Season','MW','HomeTeam','AwayTeam','FTHG','FTAG']

# Read every season the same way and collect the frames in a list, so that
# they are concatenated once at the end instead of re-copying the growing
# dataset on each iteration.
season_frames = []
for s in all_seasons:
    df = pd.read_csv(dir_path+str(s)+'.csv', sep=';',index_col=0)
    df['Season'] = s # Tag each game with the season it belongs to
    season_frames.append(df.loc[:,kept_columns].rename(columns={'MW':'Week'}))

# ignore_index=True gives the merged dataset a fresh 0..n-1 index
data = pd.concat(season_frames,ignore_index = True,sort = False)

In [None]:
# Score difference of each game: FTHG column minus FTAG column
data['Score'] = data['FTHG'] - data['FTAG']
data

In [None]:
# Names of all teams appearing in the dataset, either as home or as away team
home_names = data['HomeTeam'].unique().astype(str)
away_names = data['AwayTeam'].unique().astype(str)
teams = np.union1d(home_names, away_names)  # sorted union without duplicates
N = len(teams)

There are no subsets of successive seasons that form a connected graph. Hence we merge the data manually in groups of 2 seasons in order to denoise the observations. This results in a sequence of 9 observation graphs in our setting.

In [None]:
# Split the seasons into 9 consecutive groups; each group is merged into one graph
# (run only the first time to create data files)
merged_seasons = np.array_split(all_seasons, indices_or_sections=9)

In [None]:
# Turn the dataset into a sequence of score matrices Y and adjacency matrices A,
# one (N x N) matrix per group of merged seasons.

N = len(teams)           # number of teams
T = len(merged_seasons)  # number of graphs
A = np.zeros((T, N, N))  # adjacency matrices
Y = np.zeros((T, N, N))  # observation matrices

for graph_idx, season_group in enumerate(merged_seasons):
    score_mats = []  # one score matrix per week of the group
    game_mats = []   # one games-played matrix per week of the group
    for season in season_group:
        season_df = data.loc[data.Season == season]  # games of this season
        for week in season_df.Week.unique():
            week_df = season_df.loc[season_df.Week == week]
            # Scores and games played during this week
            scores, games = rdata.get_single_round_matrix(week_df, teams)
            score_mats.append(scores.to_numpy())
            game_mats.append(games.to_numpy())

    # Entry-wise mean over all the weeks of the group
    Y[graph_idx] = np.mean(score_mats, axis=0)  # observations of the graph_idx-th graph
    A[graph_idx] = np.mean(game_mats, axis=0)   # adjacency matrix of the graph_idx-th graph


In [None]:
# Save data
import pickle
with open('epl_data/Y_merged.txt','wb') as y:
    pickle.dump(Y,y)
with open('epl_data/A_merged.txt','wb') as a:
    pickle.dump(A,a)
# The analysis section also loads the team names from 'epl_data/teams.txt',
# so they have to be saved together with Y and A.
with open('epl_data/teams.txt','wb') as t:
    pickle.dump(teams,t)

# Analysis

Once the data has been prepared, this part of the code can be run directly. 

In [None]:
# Read back the arrays written by the data-preparation section

import pickle

# Observation matrices
with open('epl_data/Y_merged.txt', 'rb') as y_file:
    Y = pickle.load(y_file)

# Adjacency matrices
with open('epl_data/A_merged.txt', 'rb') as a_file:
    A = pickle.load(a_file)

# Team names
with open('epl_data/teams.txt', 'rb') as teams_file:
    teams = pickle.load(teams_file)

In [None]:
# The first two axes of Y give the number of graphs (T) and of teams (N)

T, N = Y.shape[:2]

As a sanity check before using our method, we verify that the union of all the data forms a connected graph.

In [None]:
# Connectivity checks
import graph_module as graph

# Union graph: the T adjacency matrices added together
union_adjacency = sum(A)
print(graph.connected(union_adjacency))

# Each of the T graphs taken on its own
for t in range(T):
    print(graph.connected(A[t]))

Here, the individual graphs are not connected because of the promotion/relegation system of the EPL, but the union of all the graphs is connected.

## Computation of optimal hyper parameters by cross validation

We run two cross-validation procedures in order to get optimal values for the hyperparameters $\lambda$ and $\tau$. The criteria for these procedures are the MSE and the mean number of upsets.

In [None]:
import loocv_module as cv
import tools_module as tools
import smoothness_module as smooth

In [None]:
# Cross-validation settings

num_loocv = 40                             # number of cross-validation runs
lambda_list = np.linspace(0, 100, num=50)  # grid of candidate values for lambda
tau_list = np.linspace(1e-6, 50, num=50)   # grid of candidate values for tau

# Smoothness operators
E = smooth.penalty_E(N, T)         # passed to the DLS cross-validation
eigs_E, V_E = smooth.eigs_E(N, T)  # passed to the DProj cross-validation

In [None]:
# Cross-validation driven by the number-of-upsets criterion

# Reset both random number generators so that the run is reproducible
seed = 0
random.seed(seed)
np.random.seed(seed)

# DLS method, with lambda taken from lambda_list
lam_up, z_up_dls = cv.cv_dls_up(Y, A, E, lambda_list, num_loocv)

# DProj method, with tau taken from tau_list
tau_up, z_up_dproj = cv.cv_dproj_up(Y, A, V_E, eigs_E, tau_list, num_loocv)

In [None]:
# Cross-validation driven by the MSE criterion

# Reset both random number generators so that the run is reproducible
seed = 0
random.seed(seed)
np.random.seed(seed)

# DLS method, with lambda taken from lambda_list
lam_mse, z_mse_dls = cv.cv_dls_mse(Y, A, E, lambda_list, num_loocv)

# DProj method, with tau taken from tau_list
tau_mse, z_mse_dproj = cv.cv_dproj_mse(Y, A, V_E, eigs_E, tau_list, num_loocv)

The cross-validation procedures give estimators for the optimal values of the hyperparameters in the DLS and DProj methods. Let us compute the naive LS estimator for the sake of comparison.

In [None]:
# Computation of the LS estimator

# `import scipy` alone does not guarantee that the `scipy.sparse.linalg`
# submodule is loaded, so import it explicitly before calling `lsqr`.
import scipy.sparse.linalg

Y_vec = tools.obs_transync(Y,A) # Vectorize the observations
Q = graph.diag_incidence(A)
Lv = Q@Q.T # Laplacian matrix
# Solve Lv z = Q Y_vec in the least-squares sense; lsqr returns a tuple whose
# first entry is the solution vector.
z_ls = scipy.sparse.linalg.lsqr(Lv,Q@Y_vec)[0] # LS estimator
z_ls = z_ls.reshape((T,N),order='F') # Column-major reshape into a (T,N) array

Let us now compute the error criterion for each estimator using the observations as ground truth. 
For estimators obtained by cross-validation with the Upsets criterion, we compute the mean number of upsets with respect to the observations.
For estimators obtained by cross-validation with the MSE criterion, we compute the MSE with respect to the observed score differences.

In [None]:
# Mean number of upsets of the LS, DLS and DProj estimators (in that order),
# the last two being the ones tuned with the upsets criterion
upsets_ls, upsets_dls, upsets_dproj = (
    rdata.get_mean_nb_upsets(Y, A, z_hat)
    for z_hat in (z_ls, z_up_dls, z_up_dproj)
)

In [None]:
# MSE of the LS, DLS and DProj estimators (in that order),
# the last two being the ones tuned with the MSE criterion
MSE_ls, MSE_dls, MSE_dproj = (
    rdata.get_mse(Y, A, z_hat)
    for z_hat in (z_ls, z_mse_dls, z_mse_dproj)
)

In [None]:
# One line per criterion (upsets, then MSE); columns are LS, DLS, DProj
results = (
    (upsets_ls, upsets_dls, upsets_dproj),
    (MSE_ls, MSE_dls, MSE_dproj),
)
for row in results:
    print(*row)

# Sanity check : smoothness of the data

Our analysis relies on an assumed smoothness of the data. Let us check that this dataset satisfies this criterion by defining a ground truth vector from the observations and plotting its evolution for some teams.

We define the ground truth vector $z^*$ such that
$$z^*_{t,i} = \frac{1}{|N_{t,i}|} \sum_{j \in N_{t,i}} y_{ij}(t),$$
where $N_{t,i}$ denotes the set of neighbours of $i$ at time $t$.

In [None]:
# Define ground truth
# games[t,i] is the sum of row i of A[t]; as in the formula above, it is the
# normalisation for team i at time t and is zero when the team has no game.
games = A.sum(axis=2)
# totals[t,i] is the sum of row i of Y[t]
totals = Y.sum(axis=2)

# z_star[t,i] = totals[t,i] / games[t,i] wherever the team has games;
# entries without games keep their initial value 0.
z_star = np.zeros((T,N))
np.divide(totals, games, out=z_star, where=games != 0)

We plot the evolution of teams that played at all times

In [None]:
# Select teams that played at all times
# A team played at time t when row i of A[t] is not identically zero. Testing
# z_star != 0 instead would wrongly discard a team whose average score
# difference happens to be exactly 0 during one period.
played = A.sum(axis=2) != 0 # played[t,i]: team i has games at time t
l_teams = [i for i in range(N) if played[:,i].all()]

In [None]:
# Plot for 5 teams that played at all times
fig,ax = plt.subplots(figsize=(8,6))

# First season of each graph: seasons start in 2000 and are merged in groups
# of 2. This is recomputed here from T rather than read from `all_seasons`,
# which only exists when the data-preparation section has been run in the
# same session.
season_starts = [2000 + 2*t for t in range(T)]

for i in l_teams[:5]:
    ax.plot(season_starts,z_star[:,i],label=teams[i])

ax.set_ylabel('Ground truth $z^{*,emp}_{t,i}$')
ax.set_xlabel('Seasons')
ax.xaxis.set_ticks(season_starts)
ax.set_xticklabels(season_starts)

ax.set_title('Evolution of the strength of the teams')
plt.legend(ncol=2,loc='lower right',frameon =False)

plt.show()