In [None]:
import os
import random
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import pickle

from datetime import datetime
from itertools import combinations

import sys
sys.path.append('modules')

import epl_module as epl

In [None]:
# Directory (relative to the notebook) holding the per-season CSV files read below
dir_path = 'epl_data/'

In [None]:
# Set the default font size of every matplotlib figure in this notebook to 16
plt.rcParams.update({'font.size': 16})

# Load the data

In [None]:
# Create one dataset with all the results from seasons between 2000 and 2017
all_seasons = np.arange(2000,2018)

# Read every season (2000 included) into its own frame and concatenate once at the end.
# The previous version concatenated onto `data` inside the loop, but `data` was never
# initialised (NameError on a fresh kernel) and the first season was skipped.
# NOTE(review): assumes '2000.csv' has the same layout as the other seasons - confirm.
season_frames = []
for s in all_seasons:
    season_df = pd.read_csv(dir_path+str(s)+'.csv', sep=';',index_col=0)
    season_df['Season'] = s
    # Keep only the columns used downstream; 'MW' is exposed as 'Week'
    season_df = season_df.loc[:,['Season','MW','HomeTeam','AwayTeam','FTHG','FTAG']]
    season_frames.append(season_df.rename(columns={'MW':'Week'}))

data = pd.concat(season_frames,ignore_index = True,sort = False)

In [None]:
# Score difference of every match: full-time home goals minus full-time away goals
data['Score'] = data['FTHG'] - data['FTAG']
data

In [None]:
# Sorted, duplicate-free names of every team appearing as home or away side
home_names = data.HomeTeam.unique().astype(str)
away_names = data.AwayTeam.unique().astype(str)
teams = np.union1d(home_names, away_names)
N = len(teams)

In [None]:
def get_single_round_matrix(data,teams):
    """
    Build the pairwise score-difference and adjacency matrices for a set of games
    (typically the games of a single week of a season).

    Parameters
    ----------
    data : pandas.DataFrame
        One row per game, with columns 'HomeTeam', 'AwayTeam' and 'Score'
        (home goals minus away goals).
    teams : sequence of str
        Team names; they label the rows and columns of both outputs.
        Every team occurring in `data` must be present, otherwise KeyError is raised.

    Returns
    -------
    Y : pandas.DataFrame
        Antisymmetric matrix: Y[home, away] accumulates the score of each game
        and Y[away, home] its negative. Entries of pairs that did not play are 0.
    A : pandas.DataFrame
        Symmetric 0/1 matrix: A[i, j] = A[j, i] = 1 when teams i and j played each other.
    """
    n_teams = len(teams)
    position = {team: k for k, team in enumerate(teams)}

    scores = np.zeros((n_teams, n_teams))
    played = np.zeros((n_teams, n_teams))

    # Walk over the games once instead of scanning every (home, away) combination
    for home, away, score in zip(data.HomeTeam, data.AwayTeam, data.Score):
        i, j = position[home], position[away]
        scores[i, j] += score
        scores[j, i] -= score
        # Mark the edge in both directions (the reverse direction used to be missing)
        played[i, j] = 1
        played[j, i] = 1

    Y = pd.DataFrame(scores, index=teams, columns=teams)
    A = pd.DataFrame(played, index=teams, columns=teams)
    return Y,A

There are no subsets of successive seasons that form a connected graph.
Hence we will manually merge the data in groups of 2-3 seasons in order to denoise the observations.

# Merge manually (run only the first time to create data files)

In [None]:
# Split the 18 seasons of all_seasons into 9 consecutive groups (two seasons each);
# each group becomes one time step of the merged data.
merged_seasons = np.array_split(all_seasons,9)
merged_seasons

In [None]:
# Execute the first time, otherwise, results are saved in Y_merged.txt,A_merged.txt files
# For every group of seasons, average the weekly score and adjacency matrices.
N, T = len(teams), len(merged_seasons)
Y = np.zeros((T,N,N))
A = np.zeros((T,N,N))

for period, seasons in enumerate(merged_seasons):
    weekly_scores, weekly_adjacency = [], []
    for season in seasons:
        season_games = data.loc[data.Season == season]
        for week in season_games.Week.unique():
            week_games = season_games.loc[season_games.Week == week]
            y_week, a_week = get_single_round_matrix(week_games,teams)
            weekly_scores.append(y_week.to_numpy())
            weekly_adjacency.append(a_week.to_numpy())
    # Element-wise mean over all the weeks of the group
    Y[period] = np.mean(weekly_scores,axis=0)
    A[period] = np.mean(weekly_adjacency,axis=0)



In [None]:
# Save data
import pickle
with open('epl_data/Y_merged.txt','wb') as y_file:
    pickle.dump(Y,y_file)
with open('epl_data/A_merged.txt','wb') as a_file:
    pickle.dump(A,a_file)
# The rows/columns of Y and A follow the ordering of `teams`, and the analysis
# section reloads 'epl_data/teams.txt': store the names together with the arrays.
with open('epl_data/teams.txt','wb') as teams_file:
    pickle.dump(teams,teams_file)

# Analysis

In [None]:
import pickle
# Reload the merged score array (Y), adjacency array (A) and team names from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code - only load files written by a trusted
# run of this notebook.
with open('epl_data/Y_merged.txt','rb') as y:
    Y = pickle.load(y)
with open('epl_data/A_merged.txt','rb') as a:
    A = pickle.load(a)
with open('epl_data/teams.txt','rb') as t:
    teams = pickle.load(t)

In [None]:
# Sizes of the first two axes of Y: number of time steps (T) and number of teams (N)
T, N = Y.shape[:2]
(T, N)

In [None]:
# Check connectivity
import graph_module as graph
# First the union graph (sum of the adjacency matrices over time) ...
print(graph.connected(sum(A)))
# ... then every time step on its own
for A_t in A:
    print(graph.connected(A_t))

Here, the individual graphs are not connected because of the promotion/relegation system of the EPL, but the union of all the graphs is connected.

# Cross validation

In [None]:
import loocv_module as cv
import ls_module as ls

In [None]:
# Penalty matrix for N teams and T time steps, plus its eigen-decomposition read from disk
E = ls.penalty_E(N,T)

eigvec_path = "eigenpairs_E/eigenvectors_E_N"+str(N)+"_T"+str(T)+".txt"
eigval_path = "eigenpairs_E/eigenvalues_E_N"+str(N)+"_T"+str(T)+".txt"

with open(eigvec_path, "rb") as vec_file:
    V_E = pickle.load(vec_file)
with open(eigval_path, "rb") as val_file:
    eigs_E = pickle.load(val_file)

In [None]:
# Seed both random number generators so the cross-validation below is repeatable
random.seed(0)
np.random.seed(0)

# Candidate hyper-parameters: 50 evenly spaced values for lambda and for tau
lambda_list_up = np.linspace(0,100,50)
tau_list_up = np.linspace(1e-6,50,50)

# Cross-validate both methods with the "_up" helpers of loocv_module, num_loocv = 40 each
# (presumably the upsets criterion - confirm in loocv_module).
# NOTE(review): if the helpers draw from the seeded RNGs, the order of these two calls
# affects the results - keep it unchanged.
result_dls_up = cv.cv_dls_transync_up(Y,A,E,lambda_list_up,num_loocv = 40)
result_dproj_up = cv.cv_dproj_transync_up(Y,A,E,V_E,eigs_E,tau_list_up,num_loocv = 40)
# Each result is unpacked into three values; judging by the names: selected parameter,
# estimate z and error - confirm against loocv_module.
lam_up_dls,z_up_dls,error_up_dls = result_dls_up
tau_up_dproj,z_up_dproj,error_up_dproj = result_dproj_up
print(lam_up_dls,tau_up_dproj)

In [None]:
# Seed both random number generators so the cross-validation below is repeatable
random.seed(0)
np.random.seed(0)

# Candidate hyper-parameters: 50 evenly spaced values for lambda and for tau
lambda_list_mse = np.linspace(0,100,50)
tau_list_mse = np.linspace(1e-6,50,50)

# Cross-validate both methods with the "_mse" helpers of loocv_module, num_loocv = 40 each
# (presumably the mean-squared-error criterion - confirm in loocv_module).
# NOTE(review): if the helpers draw from the seeded RNGs, the order of these two calls
# affects the results - keep it unchanged.
result_dls_mse = cv.cv_dls_transync_mse(Y,A,E,lambda_list_mse,num_loocv = 40)
result_dproj_mse = cv.cv_dproj_transync_mse(Y,A,E,V_E,eigs_E,tau_list_mse,num_loocv = 40)
# Each result is unpacked into three values; judging by the names: selected parameter,
# estimate z and error - confirm against loocv_module.
lam_mse_dls,z_mse_dls,error_mse_dls = result_dls_mse
tau_mse_dproj,z_mse_dproj,error_mse_dproj = result_dproj_mse
print(lam_mse_dls,tau_mse_dproj)

In [None]:
# Values obtained by cross validation
# NOTE(review): these hard-coded numbers overwrite the lam_*/tau_* variables assigned by the
# cross-validation cells, so the analysis below uses them even when the search is re-run.
# They look like rounded points of the linspace grids searched above.
lam_mse_dls = 12.24
tau_mse_dproj = 45.92

lam_up_dls = 16.33
tau_up_dproj = 39.8

In [None]:
# Run the three estimators (ls, dls, dproj) once per pair of selected parameters
rank_flags = dict(ls_flag=True, dls_flag=True, dproj_flag=True)

# Parameters selected with the MSE criterion
l_ls_mse, l_dls_mse, l_dproj_mse = epl.get_ranks_transync(
    lam_mse_dls, tau_mse_dproj, Y, A, teams, **rank_flags)

# Parameters selected with the upsets criterion
l_ls_up, l_dls_up, l_dproj_up = epl.get_ranks_transync(
    lam_up_dls, tau_up_dproj, Y, A, teams, **rank_flags)

In [None]:
# Unpack the results computed with the MSE-selected parameters
(z_dls_mse, df_dls_mse, lam_mse) = l_dls_mse
(z_ls_mse, df_ls_mse) = l_ls_mse
(z_dproj_mse, df_dproj_mse, tau_mse) = l_dproj_mse

# Unpack the results computed with the upsets-selected parameters
(z_dls_up, df_dls_up, lam_up) = l_dls_up
(z_ls_up, df_ls_up) = l_ls_up
(z_dproj_up, df_dproj_up, tau_up) = l_dproj_up

In [None]:
# Estimates obtained with the upsets-selected parameters, in the order ls, dls, dproj
estimates_up = (z_ls_up, z_dls_up, z_dproj_up)

# Number of upsets for each of them
upsets_ls_up, upsets_dls_up, upsets_dproj_up = [
    epl.get_mean_upsets_transync(Y, A, z_hat) for z_hat in estimates_up
]

# MSE for each of them
mse_ls_up, mse_dls_up, mse_dproj_up = [
    epl.get_mse_upsets_transync(Y, A, z_hat) for z_hat in estimates_up
]

In [None]:
# Estimates obtained with the MSE-selected parameters, in the order ls, dls, dproj
estimates_mse = (z_ls_mse, z_dls_mse, z_dproj_mse)

# Number of upsets for each of them
upsets_ls_mse, upsets_dls_mse, upsets_dproj_mse = [
    epl.get_mean_upsets_transync(Y, A, z_hat) for z_hat in estimates_mse
]

# MSE for each of them
mse_ls_mse, mse_dls_mse, mse_dproj_mse = [
    epl.get_mse_upsets_transync(Y, A, z_hat) for z_hat in estimates_mse
]

In [None]:
# Mean number of upsets for each method (ls, dls, dproj), using the upsets-selected parameters
np.mean(upsets_ls_up),np.mean(upsets_dls_up),np.mean(upsets_dproj_up)

In [None]:
# Mean MSE for each method (ls, dls, dproj), using the MSE-selected parameters
np.mean(mse_ls_mse),np.mean(mse_dls_mse),np.mean(mse_dproj_mse)

# Another performance criterion

## Check smoothness for "ground truth"

We define the ground truth vector $z^*$ such that 
$$z^*_{t,i} = \frac{1}{|N_{t,i}|} \sum_{j \in N_{t,i}} y_{ij}(t)$$
where $N_{t,i}$ denotes the set of neighbours of $i$ at time $t$ (and $z^*_{t,i} = 0$ when this set is empty).

In [None]:
# Define ground truth: row sum of Y divided by row sum of A, for every (time, team) pair
z_star = np.zeros((T,N))
for t, i in np.ndindex(T, N):
    games_played = A[t,i].sum() # Number of games played by team i at time t
    if games_played != 0:
        z_star[t,i] = Y[t,i].sum()/games_played
    # otherwise the entry keeps its initial value 0

In [None]:
# Select teams that played at all times
# Presence is read from the adjacency tensor A and not from z_star: z_star is 0 for a team
# that did not play, but also for a team that played and whose averaged score difference
# happens to be exactly 0, which would wrongly discard that team.
games_per_period = A.sum(axis=2)  # entry [t, i]: adjacency weight of team i at time t
l_teams = [i for i in range(N) if np.all(games_per_period[:,i] != 0)]

In [None]:
# Names of the teams whose indices were collected in l_teams
teams[l_teams]

In [None]:
# Text label of every group of merged seasons (array repr without the surrounding brackets)
[str(season_group)[1:-1] for season_group in merged_seasons]

In [None]:
# First season of each of the 9 two-season groups (indices 0, 2, ..., 16 of all_seasons)
list(all_seasons[:17:2])

In [None]:
# x positions: first season of each of the 9 merged groups
period_starts = [all_seasons[2*k] for k in range(9)]

fig, ax = plt.subplots(figsize=(8,6))

# One curve per team, restricted to the first five selected teams
for team_idx in l_teams[:5]:
    ax.plot(period_starts, z_star[:,team_idx], label=teams[team_idx])

ax.set_title('Evolution of the strength of the teams')
ax.set_xlabel('Seasons')
ax.set_ylabel('Ground truth $z^{*,emp}_{t,i}$')
ax.xaxis.set_ticks(period_starts)
ax.set_xticklabels(period_starts)
plt.legend(ncol=2,loc='lower right',frameon =False)

fig.savefig('smoothness_epl_5teams.png')
plt.show()