In [42]:
# preliminaries
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from scipy.stats import skew, kurtosis

In [1]:
# paths to dataframe csv files
DF_FULL_PATH = 'Data/df_full.csv'

In [3]:
# load dataframes for each position
df_full = pd.read_csv(DF_FULL_PATH, index_col=0)
cb_df = df_full[df_full.position=='CB']
fb_df = df_full[df_full.position=='FB']
dm_df = df_full[df_full.position=='DM']
m_df = df_full[df_full.position=='M']
w_df = df_full[df_full.position=='W']
cf_df = df_full[df_full.position=='CF']

In [4]:
RAW_TRAITS = ['goals', 'shots', 'conversion', 'positioning', 'assists', 'crossing', 'dribbling', 'carries',
              'involvement', 'accuracy', 'intent', 'receiving', 'aerial', 'on_ball', 'off_ball', 'fouls']

In [40]:
# get raw traits per position as numpy array
cb_raw_traits = cb_df[RAW_TRAITS].to_numpy()
fb_raw_traits = fb_df[RAW_TRAITS].to_numpy()
dm_raw_traits = dm_df[RAW_TRAITS].to_numpy()
m_raw_traits = m_df[RAW_TRAITS].to_numpy()
w_raw_traits = w_df[RAW_TRAITS].to_numpy()
cf_raw_traits = cf_df[RAW_TRAITS].to_numpy()

raw_traits_dict = {'CB': cb_raw_traits,
                   'FB': fb_raw_traits,
                   'DM': dm_raw_traits,
                   'M': m_raw_traits,
                   'W': w_raw_traits,
                   'CF': cf_raw_traits
                  }

### Gaussian Mixture Model

In [9]:
# create models
gmm_cb = GaussianMixture(n_components=cb_raw_traits.shape[0]//20)
gmm_fb = GaussianMixture(n_components=fb_raw_traits.shape[0]//20)
gmm_dm = GaussianMixture(n_components=dm_raw_traits.shape[0]//20)
gmm_m = GaussianMixture(n_components=m_raw_traits.shape[0]//20)
gmm_w = GaussianMixture(n_components=w_raw_traits.shape[0]//20)
gmm_cf = GaussianMixture(n_components=cf_raw_traits.shape[0]//20)

# fit models
gmm_cb = gmm_cb.fit(cb_raw_traits)
gmm_fb = gmm_fb.fit(fb_raw_traits)
gmm_dm = gmm_dm.fit(dm_raw_traits)
gmm_m = gmm_m.fit(m_raw_traits)
gmm_w = gmm_w.fit(w_raw_traits)
gmm_cf = gmm_cf.fit(cf_raw_traits)


In [26]:
# get clusters for each position
gmm_cb_cluster = gmm_cb.predict(cb_raw_traits)
gmm_fb_cluster = gmm_fb.predict(fb_raw_traits)
gmm_dm_cluster = gmm_dm.predict(dm_raw_traits)
gmm_m_cluster = gmm_m.predict(m_raw_traits)
gmm_w_cluster = gmm_w.predict(w_raw_traits)
gmm_cf_cluster = gmm_cf.predict(cf_raw_traits)

# get scores for each position
gmm_cb_cluster_probs = gmm_cb.predict_proba(cb_raw_traits)
gmm_fb_cluster_probs = gmm_fb.predict_proba(fb_raw_traits)
gmm_dm_cluster_probs = gmm_dm.predict_proba(dm_raw_traits)
gmm_m_cluster_probs = gmm_m.predict_proba(m_raw_traits)
gmm_w_cluster_probs = gmm_w.predict_proba(w_raw_traits)
gmm_cf_cluster_probs = gmm_cf.predict_proba(cf_raw_traits)

In [22]:
gmm_cb_cluster_probs[0][46]

0.9999998601739764

In [None]:
gmm_cb_cluster_probs

In [29]:
gmm_scores_cb = np.zeros((cb_raw_traits.shape[0], cb_raw_traits.shape[0]))
for i in range(cb_raw_traits.shape[0]):
    for j in range(cb_raw_traits.shape[0]):
        cluster = gmm_cb_cluster[j]
        gmm_cb_scores[i][j] = gmm_cb_cluster_probs[i][cluster]

gmm_scores_fb = np.zeros((fb_raw_traits.shape[0], fb_raw_traits.shape[0]))
for i in range(fb_raw_traits.shape[0]):
    for j in range(fb_raw_traits.shape[0]):
        cluster = gmm_fb_cluster[j]
        gmm_fb_scores[i][j] = gmm_fb_cluster_probs[i][cluster]

gmm_scores_dm = np.zeros((dm_raw_traits.shape[0], dm_raw_traits.shape[0]))
for i in range(dm_raw_traits.shape[0]):
    for j in range(dm_raw_traits.shape[0]):
        cluster = gmm_dm_cluster[j]
        gmm_dm_scores[i][j] = gmm_dm_cluster_probs[i][cluster]
        
gmm_scores_m = np.zeros((m_raw_traits.shape[0], m_raw_traits.shape[0]))
for i in range(m_raw_traits.shape[0]):
    for j in range(m_raw_traits.shape[0]):
        cluster = gmm_m_cluster[j]
        gmm_m_scores[i][j] = gmm_m_cluster_probs[i][cluster]
        
gmm_scores_w = np.zeros((w_raw_traits.shape[0], w_raw_traits.shape[0]))
for i in range(w_raw_traits.shape[0]):
    for j in range(w_raw_traits.shape[0]):
        cluster = gmm_w_cluster[j]
        gmm_w_scores[i][j] = gmm_w_cluster_probs[i][cluster]
        
gmm_scores_cf = np.zeros((cf_raw_traits.shape[0], cf_raw_traits.shape[0]))
for i in range(cf_raw_traits.shape[0]):
    for j in range(cf_raw_traits.shape[0]):
        cluster = gmm_cf_cluster[j]
        gmm_cf_scores[i][j] = gmm_cf_cluster_probs[i][cluster]

gmm_dict = {'CB': gmm_scores_cb,
            'FB': gmm_scores_fb,
            'DM': gmm_scores_dm,
            'M': gmm_scores_m,
            'W': gmm_scores_w,
            'CF': gmm_scores_cf
           }

### Evaluation

In [31]:
def eval_1(scores, raw_traits):
    
    # get list of predicted ranks of queried players
    queried_player_ranks = []
    for queried_player_index, scores in enumerate(scores):
        player_indices = np.argpartition(scores, -len(raw_traits))
        player_indices = np.flip(player_indices[np.argsort(scores[player_indices])])
        
        for rank, index in enumerate(player_indices):
            if index == queried_player_index:
                queried_player_ranks.append(rank+1)
    
    return np.average(queried_player_ranks)

In [32]:
def eval_2(scores, raw_traits, top_n=20):

    # get list of all averaged ratio stds for each queried player
    avg_ratio_stds = []
    for queried_player_index, scores in enumerate(scores):

        queried_traits = raw_traits[queried_player_index]
        queried_traits[np.where(queried_traits==0)] = 0.001 # small epsilon to avoid division by zero
        
        top_n_indices = np.argpartition(scores, -top_n-1)[-top_n-1:]
        top_n_indices = np.flip(top_n_indices[np.argsort(scores[top_n_indices])])
        
        # get list of ratio std's between queried player and ranked player
        ratio_stds = []
        for ind in top_n_indices:
            # stop loop when top_n players have been processed
            if len(ratio_stds) == top_n:
                break
            # skip queried player
            if ind == queried_player_index:
                continue
                
            player_traits = raw_traits[ind]
            ratios = player_traits / queried_traits
            ratio_std = ratios.std()
            ratio_stds.append(ratio_std)

        # compute average of ratio stds
        avg_ratio_stds.append(np.average(ratio_stds))

    return np.average(avg_ratio_stds)

In [33]:
def eval_3(scores):
    # compute variances of all queries
    all_variances = np.var(scores, axis=1)
    
    return np.average(all_variances)

In [34]:
def eval_4(scores):
    # compute skewness of all queries
    all_skews = skew(scores, axis=1)
    
    return np.average(all_skews)

In [35]:
def eval_5(scores):
    # compute kurtosis of all queries
    all_kurtosis = kurtosis(scores, axis=1)
    
    return np.average(all_kurtosis)

In [36]:
def eval_6(scores):
    # get min and max of all scores
    min_score = scores.min()
    max_score = scores.max()
    
    return [min_score, max_score]

In [38]:
methods_dict = {'Gaussian Mixture Model': gmm_dict}
dynamicity_dict = {'Gaussian Mixture Model': 'Y'}

In [43]:
results = {'Method': [],
           'Average rank of queried player': [],
           'Standard deviation of trait ratios of top 20 similar players': [],
           'Scores sparsity': [],
           'Skewness of scores': [],
           'Kurtosis of scores': [],
           'Range of scores': [],
           'Dynamicity': []
          }

for name, dictionary in methods_dict.items():
    results['Method'].append(name)
    results['Dynamicity'].append(dynamicity_dict[name])
    eval_1_list = []
    eval_2_list = []
    eval_3_list = []
    eval_4_list = []
    eval_5_list = []
    min_score = 99999
    max_score = -99999
    for pos, scores in dictionary.items():
        raw_traits = raw_traits_dict[pos]
        eval_1_list.append(eval_1(scores, raw_traits))
        eval_2_list.append(eval_2(scores, raw_traits))
        eval_3_list.append(eval_3(scores))
        eval_4_list.append(eval_4(scores))
        eval_5_list.append(eval_5(scores))
        temp_min_score, temp_max_score = eval_6(scores)
        min_score = min(min_score, temp_min_score)
        max_score = max(max_score, temp_max_score)
    results['Average rank of queried player'].append(np.average(eval_1_list))
    results['Standard deviation of trait ratios of top 20 similar players'].append(np.average(eval_2_list))
    results['Scores sparsity'].append(np.average(eval_3_list))
    results['Skewness of scores'].append(np.average(eval_4_list))
    results['Kurtosis of scores'].append(np.average(eval_5_list))
    results['Range of scores'].append([round(min_score, 2), round(max_score, 2)])

In [44]:
results_df = pd.DataFrame(results)

In [45]:
results_df

Unnamed: 0,Method,Average rank of queried player,Standard deviation of trait ratios of top 20 similar players,Scores sparsity,Skewness of scores,Kurtosis of scores,Range of scores,Dynamicity
0,Gaussian Mixture Model,13.291785,0.462423,0.024403,6.762616,48.512136,"[0.0, 1.0]",Y
