In [1]:
# preliminaries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity
from scipy.spatial.distance import pdist, squareform
from scipy.stats import skew, kurtosis

In [2]:
# paths to dataframe csv files
DF_FULL_PATH = 'Data/df_full.csv'

In [3]:
# load dataframes for each position
df_full = pd.read_csv(DF_FULL_PATH, index_col=0)
cb_df = df_full[df_full.position=='CB']
fb_df = df_full[df_full.position=='FB']
dm_df = df_full[df_full.position=='DM']
m_df = df_full[df_full.position=='M']
w_df = df_full[df_full.position=='W']
cf_df = df_full[df_full.position=='CF']

In [4]:
# constants
RAW_TRAITS = ['goals', 'shots', 'conversion', 'positioning', 'assists', 'crossing', 'dribbling', 'carries',
              'involvement', 'accuracy', 'intent', 'receiving', 'aerial', 'on_ball', 'off_ball', 'fouls']

In [5]:
# get raw traits per position as numpy array
cb_raw_traits = cb_df[RAW_TRAITS].to_numpy()
fb_raw_traits = fb_df[RAW_TRAITS].to_numpy()
dm_raw_traits = dm_df[RAW_TRAITS].to_numpy()
m_raw_traits = m_df[RAW_TRAITS].to_numpy()
w_raw_traits = w_df[RAW_TRAITS].to_numpy()
cf_raw_traits = cf_df[RAW_TRAITS].to_numpy()

raw_traits_dict = {'CB': cb_raw_traits,
                   'FB': fb_raw_traits,
                   'DM': dm_raw_traits,
                   'M': m_raw_traits,
                   'W': w_raw_traits,
                   'CF': cf_raw_traits
                  }

### Euclidean Similarity

In [6]:
def euclidean_similarity(raw_traits):
    distances = euclidean_distances(raw_traits)
    distances[distances==0] = 0.1
    scores = 1 / distances
    
    return scores

In [7]:
euclidean_scores_cb = euclidean_similarity(cb_raw_traits)
euclidean_scores_fb = euclidean_similarity(fb_raw_traits)
euclidean_scores_dm = euclidean_similarity(dm_raw_traits)
euclidean_scores_m = euclidean_similarity(m_raw_traits)
euclidean_scores_w = euclidean_similarity(w_raw_traits)
euclidean_scores_cf = euclidean_similarity(cf_raw_traits)

euclidean_scores_dict = {'CB': euclidean_scores_cb,
                         'FB': euclidean_scores_fb,
                         'DM': euclidean_scores_dm,
                         'M': euclidean_scores_m,
                         'W': euclidean_scores_w,
                         'CF': euclidean_scores_cf
                        }

### Manhattan Similarity

In [8]:
def manhattan_similarity(raw_traits):
    distances = manhattan_distances(raw_traits)
    distances[distances==0] = 0.1
    scores = 1 / distances
    
    return scores

In [9]:
manhattan_scores_cb = manhattan_similarity(cb_raw_traits)
manhattan_scores_fb = manhattan_similarity(fb_raw_traits)
manhattan_scores_dm = manhattan_similarity(dm_raw_traits)
manhattan_scores_m = manhattan_similarity(m_raw_traits)
manhattan_scores_w = manhattan_similarity(w_raw_traits)
manhattan_scores_cf = manhattan_similarity(cf_raw_traits)

manhattan_scores_dict = {'CB': manhattan_scores_cb,
                         'FB': manhattan_scores_fb,
                         'DM': manhattan_scores_dm,
                         'M': manhattan_scores_m,
                         'W': manhattan_scores_w,
                         'CF': manhattan_scores_cf
                        }

### Cosine Similarity

In [10]:
cosine_scores_cb = cosine_similarity(cb_raw_traits)
cosine_scores_fb = cosine_similarity(fb_raw_traits)
cosine_scores_dm = cosine_similarity(dm_raw_traits)
cosine_scores_m = cosine_similarity(m_raw_traits)
cosine_scores_w = cosine_similarity(w_raw_traits)
cosine_scores_cf = cosine_similarity(cf_raw_traits)

cosine_scores_dict = {'CB': cosine_scores_cb,
                      'FB': cosine_scores_fb,
                      'DM': cosine_scores_dm,
                      'M': cosine_scores_m,
                      'W': cosine_scores_w,
                      'CF': cosine_scores_cf
                     }

### Adjusted Cosine Similarity

In [28]:
def adjusted_cosine_similarity(raw_traits):
    reduced_u = raw_traits.mean(axis=1)
    reduced_sub = raw_traits - reduced_u[:, None]
    scores = 1 - squareform(pdist(reduced_sub, 'cosine'))
    
    return scores

In [29]:
adjusted_cosine_scores_cb = adjusted_cosine_similarity(cb_raw_traits)
adjusted_cosine_scores_fb = adjusted_cosine_similarity(fb_raw_traits)
adjusted_cosine_scores_dm = adjusted_cosine_similarity(dm_raw_traits)
adjusted_cosine_scores_m = adjusted_cosine_similarity(m_raw_traits)
adjusted_cosine_scores_w = adjusted_cosine_similarity(w_raw_traits)
adjusted_cosine_scores_cf = adjusted_cosine_similarity(cf_raw_traits)

adjusted_cosine_scores_dict = {'CB': adjusted_cosine_scores_cb,
                               'FB': adjusted_cosine_scores_fb,
                               'DM': adjusted_cosine_scores_dm,
                               'M': adjusted_cosine_scores_m,
                               'W': adjusted_cosine_scores_w,
                               'CF': adjusted_cosine_scores_cf
                              }

### Pearson Correlation

In [30]:
pearson_scores_cb = np.corrcoef(cb_raw_traits)
pearson_scores_fb = np.corrcoef(fb_raw_traits)
pearson_scores_dm = np.corrcoef(dm_raw_traits)
pearson_scores_m = np.corrcoef(m_raw_traits)
pearson_scores_w = np.corrcoef(w_raw_traits)
pearson_scores_cf = np.corrcoef(cf_raw_traits)

pearson_scores_dict = {'CB': pearson_scores_cb,
                       'FB': pearson_scores_fb,
                       'DM': pearson_scores_dm,
                       'M': pearson_scores_m,
                       'W': pearson_scores_w,
                       'CF': pearson_scores_cf
                      }

### Evaluation

In [14]:
def eval_1(scores, raw_traits):
    
    # get list of predicted ranks of queried players
    queried_player_ranks = []
    for queried_player_index, scores in enumerate(scores):
        player_indices = np.argpartition(scores, -len(raw_traits))
        player_indices = np.flip(player_indices[np.argsort(scores[player_indices])])
        
        for rank, index in enumerate(player_indices):
            if index == queried_player_index:
                queried_player_ranks.append(rank+1)
    
    return np.average(queried_player_ranks)

In [16]:
def eval_2(scores, raw_traits, top_n=20):

    # get list of all averaged ratio stds for each queried player
    avg_ratio_stds = []
    for queried_player_index, scores in enumerate(scores):

        queried_traits = raw_traits[queried_player_index]
        queried_traits[np.where(queried_traits==0)] = 0.001 # small epsilon to avoid division by zero
        
        top_n_indices = np.argpartition(scores, -top_n-1)[-top_n-1:]
        top_n_indices = np.flip(top_n_indices[np.argsort(scores[top_n_indices])])
        
        # get list of ratio std's between queried player and ranked player
        ratio_stds = []
        for ind in top_n_indices:
            # stop loop when top_n players have been processed
            if len(ratio_stds) == top_n:
                break
            # skip queried player
            if ind == queried_player_index:
                continue
                
            player_traits = raw_traits[ind]
            ratios = player_traits / queried_traits
            ratio_std = ratios.std()
            ratio_stds.append(ratio_std)

        # compute average of ratio stds
        avg_ratio_stds.append(np.average(ratio_stds))

    return np.average(avg_ratio_stds)

In [17]:
def eval_3(scores):
    # compute variances of all queries
    all_variances = np.var(scores, axis=1)
    
    return np.average(all_variances)

In [18]:
def eval_4(scores):
    # compute skewness of all queries
    all_skews = skew(scores, axis=1)
    
    return np.average(all_skews)

In [19]:
def eval_5(scores):
    # compute kurtosis of all queries
    all_kurtosis = kurtosis(scores, axis=1)
    
    return np.average(all_kurtosis)

In [20]:
def eval_6(scores):
    # get min and max of all scores
    min_score = scores.min()
    max_score = scores.max()
    
    return [min_score, max_score]

In [21]:
methods_dict = {'Euclidean similarity': euclidean_scores_dict,
                'Manhattan similarity': manhattan_scores_dict,
                'Cosine similarity': cosine_scores_dict,
                'Adjusted cosine similarity': adjusted_cosine_scores_dict,
                'Pearson correlation': pearson_scores_dict,
               }
dynamicity_dict = {'Euclidean similarity': 'Y',
                   'Manhattan similarity': 'Y',
                   'Cosine similarity': 'Y',
                   'Adjusted cosine similarity': 'Y',
                   'Pearson correlation': 'Y',
               }

In [22]:
results = {'Method': [],
           'Average rank of queried player': [],
           'Standard deviation of trait ratios of top 20 similar players': [],
           'Scores sparsity': [],
           'Skewness of scores': [],
           'Kurtosis of scores': [],
           'Range of scores': [],
           'Dynamicity': []
          }

for name, dictionary in methods_dict.items():
    results['Method'].append(name)
    results['Dynamicity'].append(dynamicity_dict[name])
    eval_1_list = []
    eval_2_list = []
    eval_3_list = []
    eval_4_list = []
    eval_5_list = []
    min_score = 99999
    max_score = -99999
    for pos, scores in dictionary.items():
        raw_traits = raw_traits_dict[pos]
        eval_1_list.append(eval_1(scores, raw_traits))
        eval_2_list.append(eval_2(scores, raw_traits))
        eval_3_list.append(eval_3(scores))
        eval_4_list.append(eval_4(scores))
        eval_5_list.append(eval_5(scores))
        temp_min_score, temp_max_score = eval_6(scores)
        min_score = min(min_score, temp_min_score)
        max_score = max(max_score, temp_max_score)
    results['Average rank of queried player'].append(np.average(eval_1_list))
    results['Standard deviation of trait ratios of top 20 similar players'].append(np.average(eval_2_list))
    results['Scores sparsity'].append(np.average(eval_3_list))
    results['Skewness of scores'].append(np.average(eval_4_list))
    results['Kurtosis of scores'].append(np.average(eval_5_list))
    results['Range of scores'].append([round(min_score, 2), round(max_score, 2)])

In [23]:
results_df = pd.DataFrame(results)

In [24]:
results_df

Unnamed: 0,Method,Average rank of queried player,Standard deviation of trait ratios of top 20 similar players,Scores sparsity,Skewness of scores,Kurtosis of scores,Range of scores,Dynamicity
0,Euclidean similarity,1.0,0.37131,0.100881,29.132255,920.561954,"[0.06, 10.0]",Y
1,Manhattan similarity,1.0,0.399121,0.09781,32.195386,1057.538564,"[0.02, 10.0]",Y
2,Cosine similarity,1.0,0.360389,0.001345,-1.354056,5.572576,"[0.28, 1.0]",Y
3,Adjusted cosine similarity,1.0,0.376132,0.13221,0.093123,-0.704145,"[-0.96, 1.0]",Y
4,Pearson correlation,1.0,0.376132,0.13221,0.093123,-0.704145,"[-0.96, 1.0]",Y
