# Set Up

In [5]:
import pandas as pd
# Load the player-level stats table. Data comes from https://fbref.com/
# NOTE(review): the file name suggests the 2024-25 Premier League season — confirm.
# The path is relative, so the notebook must be run from the directory that contains `data/`.
df = pd.read_csv('data/24_25_EPL.csv')

In [7]:
# Display the column labels of the loaded table.
# NOTE(review): the recorded output already lists 'Gls per 90', 'Exp gls per 90' and 'Scoring',
# which a later cell also computes — either the CSV already contains them or this output came
# from out-of-order execution; confirm with Restart & Run All.
df.columns

Index(['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts',
       'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY',
       'CrdR', 'xG', 'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR',
       'Gls per 90', 'Exp gls per 90', 'Scoring'],
      dtype='object')

In [16]:
# Minimum playing time, in units of the '90s' column, for a player to be kept.
# NOTE(review): '90s' appears to be fbref's "minutes played / 90" (i.e. full-match
# equivalents, not a decade) — confirm against the fbref glossary.
MIN_90S = 3

# Keep outfield players with enough playing time. Both conditions are applied in a single
# mask and the result is copied, so the column assignments below write to an independent
# frame instead of a slice of the loaded table (which triggers SettingWithCopyWarning).
df = df[(df['90s'] >= MIN_90S) & (df['Pos'] != 'GK')].copy()

# Per-90 rates: season totals divided by the '90s' column.
df['Gls per 90'] = df['Gls'] / df['90s']
df['Exp gls per 90'] = df['xG'] / df['90s']
# Scoring = 2 * (goals per 90) - (xG per 90), so that total goals and finishing
# efficiency relative to expectation are both considered.
df['Scoring'] = (df['Gls per 90'] * 2) - df['Exp gls per 90']
df['Exp ast per 90'] = df['xAG'] / df['90s']
df['Pro Carries per 90'] = df['PrgC'] / df['90s']
df['Pro Passes per 90'] = df['PrgP'] / df['90s']

# Identity columns plus the engineered features used by the cells below.
# Copied so later cells can modify `filtered` without touching `df`.
filtered = df[
    ['Player', 'Squad', 'Pos', 'Scoring', 'Exp ast per 90',
     'Pro Carries per 90', 'Pro Passes per 90', '90s']
].copy()
filtered.head(2)

Unnamed: 0,Player,Squad,Pos,Scoring,Exp ast per 90,Pro Carries per 90,Pro Passes per 90,90s
2,Tyler Adams,Bournemouth,MF,-0.088889,0.05,0.666667,3.388889,18.0
3,Tosin Adarabioyo,Chelsea,DF,0.080292,0.014599,0.364964,2.70073,13.7


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise player similarity: standardise each feature (zero mean, unit variance),
# then take the cosine similarity between the standardised player vectors.
features = filtered[['Scoring','Exp ast per 90','Pro Carries per 90','Pro Passes per 90','90s']]
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)
similarity_matrix = cosine_similarity(normalized_features)

# Label rows/columns from the SAME frame the features came from, so labels and matrix
# rows cannot drift apart if `df` and `filtered` ever stop being row-aligned.
# NOTE(review): if a player appears in more than one row (e.g. listed under two squads),
# labels repeat and label-based lookups return several rows — confirm names are unique.
player_labels = filtered['Player']
similarity_df = pd.DataFrame(similarity_matrix, index=player_labels, columns=player_labels)
# Uncomment to persist the matrix:
#similarity_df.to_csv("data/similarity_matrix.csv")
similarity_df.head(2)

Player,Tyler Adams,Tosin Adarabioyo,Simon Adingra,Emmanuel Agbadou,Ola Aina,Rayan Aït-Nouri,Kristoffer Ajer,Manuel Akanji,Nathan Aké,Carlos Alcaraz,...,Nathan Wood-Gordon,Luke Woolfenden,Yehor Yarmoliuk,Ryan Yates,Leny Yoro,Ashley Young,Illia Zabarnyi,Oleksandr Zinchenko,Joshua Zirkzee,Martin Ødegaard
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Tyler Adams,1.0,0.763445,-0.530508,0.76559,0.374262,-0.21913,0.809926,0.273223,0.314807,-0.510881,...,0.713441,0.676817,0.799819,0.709362,0.695844,0.529752,0.685771,0.212349,0.714642,-0.230059
Tosin Adarabioyo,0.763445,1.0,-0.311705,0.945141,0.239924,-0.481008,0.517294,-0.052958,0.219452,-0.570053,...,0.927183,0.911228,0.761521,0.814728,0.6566,0.262801,0.349224,0.092726,0.815101,-0.642425


In [19]:
from sklearn.preprocessing import MinMaxScaler

# Stat columns that get rescaled for the radar chart.
stat_cols = ["Scoring", "Exp ast per 90", "Pro Carries per 90", "Pro Passes per 90", "90s"]

# Min-max scale every stat column to [0, 1] ...
scaler = MinMaxScaler()
unit_scaled = scaler.fit_transform(filtered[stat_cols])

# ... then squeeze into [0.5, 1.0]: the graph would look weird if some players had 0 values.
df_normalized = filtered.copy()
df_normalized[stat_cols] = 0.5 + 0.5 * unit_scaled

# Swap the raw stat names for chart-facing labels ("Scoring" keeps its name).
df_normalized = df_normalized.rename(
    columns={
        "Exp ast per 90": "Assisting",
        "Pro Carries per 90": "Dribbling",  # A bit misleading, but it is the only data available
        "Pro Passes per 90": "Passing",     # Also a bit misleading
        "90s": "Minutes Played",
    }
)

# Persist the plotting table (index column omitted) and preview it.
df_normalized.to_csv("data/normalized_stats.csv", index=False)
df_normalized.head(2)

Unnamed: 0,Player,Squad,Pos,Scoring,Assisting,Dribbling,Passing,Minutes Played
2,Tyler Adams,Bournemouth,MF,0.574911,0.553261,0.527462,0.647802,0.741935
3,Tosin Adarabioyo,Chelsea,DF,0.622156,0.515551,0.515034,0.612093,0.672581


In [20]:
import plotly.graph_objects as go

# Players to compare on the radar chart.
player1 = 'Tyler Adams'
player2 = 'Harvey Barnes'

# Axes of the radar chart; these are columns of `df_normalized`.
metrics = ["Scoring", "Assisting", "Dribbling", "Passing", "Minutes Played"]


def _first_row_for(frame, player):
    """Return the first row of `frame` whose 'Player' value equals `player`.

    Raises:
        ValueError: if no row matches. This replaces the opaque
            "single positional indexer is out-of-bounds" IndexError that
            `.iloc[0]` raises on an empty selection (e.g. a misspelt name or
            a player removed by the earlier filters).
    """
    matches = frame[frame['Player'] == player]
    if matches.empty:
        raise ValueError(f"Player {player!r} not found in the normalized stats table")
    return matches.iloc[0]


row1 = _first_row_for(df_normalized, player1)
row2 = _first_row_for(df_normalized, player2)

# Repeat the first metric at the end so each polygon closes on itself.
values1 = [row1[m] for m in metrics] + [row1[metrics[0]]]  # Loop closure
values2 = [row2[m] for m in metrics] + [row2[metrics[0]]]  # Loop closure
metrics += [metrics[0]]  # Loop closure for labels

fig = go.Figure()

# One filled trace per player: 30%-opacity fill with a solid outline of the same colour
# (blue for player 1, orange for player 2).
for trace_name, trace_values, rgb in (
    (player1, values1, '31, 119, 180'),
    (player2, values2, '255, 127, 14'),
):
    fig.add_trace(go.Scatterpolar(
        r=trace_values,
        theta=metrics,
        fill='toself',
        name=trace_name,
        fillcolor=f'rgba({rgb}, 0.3)',
        line=dict(color=f'rgba({rgb}, 1)')
    ))

# Layout: fixed radial range with tick labels hidden.
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 1],
            showticklabels=False
        )),
    showlegend=True,
    title=f"{player1} vs {player2} - Attribute Comparison"
)

fig.show()
