# DATA IMPORT

In [1]:
import pandas as pd
import numpy as np
from functools import reduce
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pylab as pl
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import silhouette_score
from functools import reduce
import sqlite3
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-filtered player table; every later cell works on `combined`.
combined = pd.read_csv('00 combined_filtered.csv')
# Presumably the per-gender versions of the same table (currently unused) — TODO confirm.
# mens = pd.read_csv('01 men_filtered.csv')
# womens = pd.read_csv('01 women_filtered.csv')

In [3]:
# Column labels used throughout the notebook. They are applied by position,
# so this list must contain one entry per column of the loaded CSV, in the
# same order as the file.
renamed = [
    'Player', 'Pos', 'Squad', 'Gender', 'League', 'Gen+League', 'Season',
    'Age', 'Born', 'Nation',
    'ATT', 'MID', 'DEF',
    'MP', 'Min', 'Starts',
    'Ast', 'G+A/90', 'G+A-PK/90', 'G-PK/90', 'Gls/90', 'Ast/90',
    'CrdR', 'CrdY', 'Fls',
    'Gls', 'OG', 'PK', 'PKatt',
    'G/SoT', 'SoT', 'SoT/90',
    'Tkl+Int',
]
combined.columns = renamed
# mens.columns = renamed
# womens.columns = renamed

In [4]:
# Per-league maxima, each a one-column frame indexed by 'Gen+League'.
# Total_MP (the largest MP recorded in a league) serves below as the
# denominator for the %MP / %Starts / %Min shares.
total_matches = combined.groupby('Gen+League')[['MP']].max().rename(columns={'MP': 'Total_MP'})
top_fls = combined.groupby('Gen+League')[['Fls']].max().rename(columns={'Fls': 'Top_Fls'})
top_tkl = combined.groupby('Gen+League')[['Tkl+Int']].max().rename(columns={'Tkl+Int': 'Top_Tkl'})

# Broadcast each maximum back onto the player rows by looking up the league key.
# Assigning the columns (instead of merging) makes the cell safe to re-run:
# a second run overwrites Total_MP / Top_Fls / Top_Tkl rather than producing
# suffixed duplicates such as Total_MP_x / Total_MP_y.
combined['Total_MP'] = combined['Gen+League'].map(total_matches['Total_MP'])
combined['Top_Fls'] = combined['Gen+League'].map(top_fls['Top_Fls'])
combined['Top_Tkl'] = combined['Gen+League'].map(top_tkl['Top_Tkl'])


In [5]:
def distrib(df, attributes, category):
    """Summarise and plot the distribution of ``attributes`` within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; must contain every column in ``attributes`` and ``category``.
    attributes : list of str
        Columns to summarise. The list itself is not modified.
    category : str
        Column whose distinct values define the groups.

    Returns
    -------
    dict
        ``{group_name: {"averages": Series, "std dev": Series}}`` with the
        column-wise mean and standard deviation of each group.

    Notes
    -----
    As a side effect one histogram figure is drawn per group
    (``DataFrame.hist``) and titled with the group name.
    """
    selected = df[[*attributes, category]]
    outdict = {}
    for name, group in selected.groupby(category):
        # `group` still carries the grouping column. numeric_only=True keeps a
        # text category (e.g. a league name) out of the statistics instead of
        # raising TypeError on recent pandas versions.
        outdict[name] = {
            "averages": group.mean(axis=0, numeric_only=True),
            "std dev": group.std(axis=0, numeric_only=True),
        }
        group.hist(figsize=(50, 30))
        plt.suptitle(name, fontsize=40)
    return outdict

def by_stat(summary, statlist):
    """Pivot a per-league summary into a per-statistic lookup.

    Parameters
    ----------
    summary : dict
        ``{league: {"averages": mapping, "std dev": mapping}}`` — the shape
        returned by ``distrib``.
    statlist : iterable of str
        Statistics to fill in.

    Returns
    -------
    dict
        ``{stat: {league: (mean, std_dev)}}``. Every statistic present in the
        reference league's averages gets an entry; only those named in
        ``statlist`` are populated, the others stay ``{}``. An empty
        ``summary`` yields ``{}``.

    Raises
    ------
    KeyError
        If a statistic in ``statlist`` is missing from a league's averages or
        standard deviations, or from the reference league's averages.
    """
    if not summary:
        return {}

    # "A-League" remains the reference when present; otherwise fall back to
    # the first league so summaries of other competitions work too.
    reference = "A-League" if "A-League" in summary else next(iter(summary))
    statdict = {stat: {} for stat in summary[reference]['averages'].keys()}

    for stat in statlist:
        for lg, league_summary in summary.items():
            mean = league_summary['averages'][stat]
            stddev = league_summary['std dev'][stat]
            statdict[stat][lg] = (mean, stddev)
    return statdict

def normalize(df, attribute, group):
    """Add a per-group z-score column for ``attribute``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    attribute : str
        Numeric column to standardise.
    group : str
        Column whose values define the groups.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a fresh 0..n-1 index (row order unchanged) and an
        extra column ``'norm <attribute>'`` holding
        ``(value - group mean) / group standard deviation``. Rows whose group
        has an undefined standard deviation (e.g. a single member) get NaN.
    """
    col_name = 'norm ' + attribute
    # reset_index returns a new frame, so the caller's frame stays untouched.
    result = df.reset_index(drop=True)
    per_group = result.groupby(group)[attribute]
    # transform broadcasts each group's statistic back onto its own rows, so no
    # temporary 'mean'/'stdev' columns are merged in — those used to collide
    # with same-named columns already present in df.
    mu = per_group.transform('mean')
    sigma = per_group.transform('std')
    result[col_name] = (result[attribute] - mu) / sigma
    return result

def log_scale(df, attribute):
    """Add a ``'log <attribute>'`` column with the natural log of ``attribute``.

    Zeros are masked before the logarithm is taken, and every undefined result
    (zero, missing or negative input) is stored as 0. The column is written
    into ``df`` itself, which is also returned.
    """
    masked = df[[attribute]].replace(0, np.nan)
    logged = np.log(masked)
    df['log ' + attribute] = logged.fillna(0)
    return df

def rank_order(df, attribute, group):
    """Add a ``'rank <attribute>'`` column with the within-group percentile rank.

    Ranks are computed separately for each value of ``group`` using the dense
    method (tied values share a rank, no gaps) and expressed as a fraction.
    The column is written into ``df`` itself, which is also returned.
    """
    within_group = df.groupby(group)[attribute]
    df['rank ' + attribute] = within_group.rank(method="dense", pct=True)
    return df

In [6]:
# Matches, starts and minutes as a share of the league total
# (Total_MP matches of 90 minutes each).
combined['%MP'] = combined['MP'].div(combined['Total_MP'])
combined['%Starts'] = combined['Starts'].div(combined['Total_MP'])
combined['%Min'] = combined['Min'].div(combined['Total_MP'].mul(90))

# Age as a z-score within each gender+league group
combined = normalize(combined, 'Age', 'Gen+League')

# Card counts on a log scale
for card_stat in ('CrdY', 'CrdR'):
    combined = log_scale(combined, card_stat)

# Dense percentile rank within each gender+league group
for ranked_stat in ('MP', 'Gls', 'Ast', 'PK', 'SoT', 'CrdY', 'CrdR'):
    combined = rank_order(combined, ranked_stat, 'Gen+League')

# Fouls and tackles+interceptions relative to the league's highest value
combined['Fls/Top'] = combined['Fls'].div(combined['Top_Fls'])
combined['Tkl/Top'] = combined['Tkl+Int'].div(combined['Top_Tkl'])


combined.columns

Index(['Player', 'Pos', 'Squad', 'Gender', 'League', 'Gen+League', 'Season',
       'Age', 'Born', 'Nation', 'ATT', 'MID', 'DEF', 'MP', 'Min', 'Starts',
       'Ast', 'G+A/90', 'G+A-PK/90', 'G-PK/90', 'Gls/90', 'Ast/90', 'CrdR',
       'CrdY', 'Fls', 'Gls', 'OG', 'PK', 'PKatt', 'G/SoT', 'SoT', 'SoT/90',
       'Tkl+Int', 'Total_MP', 'Top_Fls', 'Top_Tkl', '%MP', '%Starts', '%Min',
       'norm Age', 'log CrdY', 'log CrdR', 'rank MP', 'rank Gls', 'rank Ast',
       'rank PK', 'rank SoT', 'rank CrdY', 'rank CrdR', 'Fls/Top', 'Tkl/Top'],
      dtype='object')

In [7]:
# Feature columns fed to the similarity search (standardised in find_similar).
# NOTE(review): 'log CrdY', 'log CrdR' and 'rank PK' are computed above but
# not listed here — confirm that is intentional.
col1 = [
    # age and playing time
    'Age', '%MP', '%Min', '%Starts',
    # per-90 rates
    'G+A/90', 'G+A-PK/90', 'G-PK/90', 'Gls/90', 'Ast/90', 'SoT/90',
    # raw counts and ratios
    'Tkl+Int', 'OG', 'Fls', 'PK', 'PKatt', 'G/SoT',
    # within-league normalised features
    'norm Age',
    'rank SoT', 'rank CrdY', 'rank CrdR', 'rank MP', 'rank Gls', 'rank Ast',
    # relative to the league's top value
    'Fls/Top', 'Tkl/Top',
]

## Finding Similar Players

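Finds the player most similar to a specified player: the nearest neighbour, in standardised feature space, of each of that player's rows in the data. The search can optionally be restricted to a specific league or gender.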

In [8]:
def find_similar(name, df, league = None, gender = None):
    """Return the nearest neighbour of each row belonging to player ``name``.

    The feature columns listed in the module-level ``col1`` are taken from
    ``df`` (missing values replaced by 0) and standardised over all rows of
    ``df``. For every row whose ``'Player'`` equals ``name`` the closest row of
    any *other* player is looked up with ``pairwise_distances_argmin_min``
    (scikit-learn's default Euclidean metric).

    Parameters
    ----------
    name : str
        Player to find matches for.
    df : pandas.DataFrame
        Player table containing ``col1`` plus the descriptive columns returned.
    league : str, optional
        Only consider candidates whose ``'League'`` equals this value.
    gender : str, optional
        Only consider candidates whose ``'Gender'`` equals this value.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``name`` in ``df`` describing the matched player.
        Empty (same columns) when ``name`` is not in ``df`` or no candidate
        passes the filters.
    """
    result_columns = ['Player', 'Pos', 'Squad', 'Gender', 'League', 'Season',
                      'Age', 'Min', 'Gls/90', 'Ast/90',
                      'SoT/90',
                      'Fls', 'Tkl+Int']

    scaled_features = StandardScaler().fit_transform(df[col1].fillna(0))

    # Plain boolean arrays keep the scaled matrix and the rows of `df` aligned
    # by position, whatever index `df` carries.
    is_target = (df['Player'] == name).to_numpy()
    is_candidate = ~is_target
    if league is not None:
        is_candidate &= (df['League'] == league).to_numpy()
    if gender is not None:
        is_candidate &= (df['Gender'] == gender).to_numpy()

    # The nearest-neighbour search rejects empty inputs; report "no match" instead.
    if not is_target.any() or not is_candidate.any():
        return df.iloc[0:0][result_columns].copy()

    player = scaled_features[is_target]
    others = scaled_features[is_candidate]

    # Positions refer to rows of `others`, i.e. of the filtered candidates.
    nearest_positions, _ = pairwise_distances_argmin_min(player, others)

    # Candidates come from `df` itself (not a global table) so positions match;
    # copy so callers can add columns to the result without pandas warnings.
    return df[is_candidate].iloc[nearest_positions][result_columns].copy()



In [9]:
# Distinct player names; each one is used as a query for find_similar below.
players = combined['Player'].unique()

In [10]:
# Number of distinct player names, i.e. how many similarity searches the loop below runs.
len(players)

24818

In [11]:
# Empty result table. 'Source' holds the player that was searched for; the
# remaining columns are the ones find_similar returns for the matched player.
output = pd.DataFrame(
    columns=[
        'Source',
        'Player', 'Pos', 'Squad', 'Gender', 'League', 'Season',
        'Age', 'Min',
        'Gls/90', 'Ast/90', 'SoT/90',
        'Fls', 'Tkl+Int',
    ]
)
output.head()

Unnamed: 0,Source,Player,Pos,Squad,Gender,League,Season,Age,Min,Gls/90,Ast/90,SoT/90,Fls,Tkl+Int


In [12]:

# Run the similarity search for every player and collect the per-player frames.
# The frames are concatenated once at the end: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop re-copies it on every iteration.
similar_frames = []
for p in tqdm(players):
    # assign() returns a new frame, so the search result is not mutated in place.
    rst = find_similar(p, combined).assign(Source=p)
    similar_frames.append(rst)

# Start from the empty template so the column order ('Source' first) is kept
# and re-running this cell does not duplicate earlier results.
output = pd.concat([output.iloc[0:0], *similar_frames])

HBox(children=(FloatProgress(value=0.0, max=24818.0), HTML(value='')))




In [16]:
# Persist the full match table (one row per search result) without the index column.
output.to_csv('similar_players.csv', index=False)

In [17]:
# Edge list for a player graph: one row per distinct (Source, Player) pair,
# where 'Source' is the queried player and 'Player' the match found for them.
edges = output.loc[:, ['Source', 'Player']].drop_duplicates()
edges.to_csv('player_edges.csv', index=False)