# FIFA2020 Player Clustering
[Instructions](https://www.kaggle.com/stefanoleone992/fifa-20-complete-player-dataset#players_20.csv)
##### Part A:
Cluster the players into three clusters, and evaluate the clustering without any real labels
##### Part B:
Same as part A but this time having 15% of the labels

## Download data
Run only if players_20.csv is not in folder

In [28]:
# Download the dataset archive with the Kaggle CLI, extract it into the current folder,
# then delete the archive so only the CSV files remain.
# NOTE(review): presumably requires the Kaggle CLI to be installed and authenticated — confirm before running.
!kaggle datasets download -d stefanoleone992/fifa-20-complete-player-dataset
!unzip fifa-20-complete-player-dataset.zip
!rm fifa-20-complete-player-dataset.zip

fifa-20-complete-player-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  fifa-20-complete-player-dataset.zip
  inflating: players_15.csv          
  inflating: players_16.csv          
  inflating: players_17.csv          
  inflating: players_18.csv          
  inflating: players_19.csv          
  inflating: players_20.csv          
  inflating: teams_and_leagues.csv   


## Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

import plotly.express as px

## Consts

In [2]:
# Input file and the column used as the DataFrame index when it is first read
INPUT_PATH = 'players_20.csv'
INDEX_COL = 'sofifa_id'

# Goalkeeper attributes (dropped as given in the instructions)
_GOALKEEPER_COLUMNS = [
    'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning',
    'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning',
    'goalkeeping_reflexes', 'goalkeeping_diving',
]
# Columns removed because they give the answer away too easily (the player's position)
_POSITION_LABEL_COLUMNS = ['team_position', 'player_positions', 'nation_position']
COLUMNS_TO_DROP = _GOALKEEPER_COLUMNS + _POSITION_LABEL_COLUMNS

# Columns that looked worthless or are already encapsulated in other columns
COLUMNS_TO_DROP_AFTER_INSPECTING = [
    'short_name', 'long_name', 'nationality', 'club', 'real_face', 'loaned_from',
    'joined', 'contract_valid_until', 'dob', 'international_reputation', 'potential',
]

# Columns holding strings of the form 'int+int' that must be evaluated to a number
COLUMNS_TO_EVAL = (
    'ls st rs lw lf cf rf rw lam cam ram lm lcm cm '
    'rcm rm lwb ldm cdm rdm rwb lb lcb cb rcb rb'
).split()

TARGET_COL = 'target'

# One cluster per role: attacking, midfield or defensive
CLUSTER_NUMBER = 3
RANDOM_SEED = 6

In [3]:
# Binary encoding of the preferred foot
preferred_foot_vals = dict(Left=0, Right=1)

# Body types collapsed to three ordinal sizes (1 = lean, 2 = normal, 3 = stocky);
# the player-specific labels are mapped onto the closest generic size.
body_type_vals = dict([
    ('Lean', 1),
    ('Normal', 2),
    ('Stocky', 3),
    ('Neymar', 1),
    ('C. Ronaldo', 2),
    ('Shaqiri', 3),
    ('Messi', 1),
    ('Akinfenwa', 3),  # suuuper big
    ('PLAYER_BODY_TYPE_25', 2),  # M. Salah
])

# Work rates as ordinal levels: Low=1, Medium=2, High=3
work_rate_vals = {
    level: rank for rank, level in enumerate(('Low', 'Medium', 'High'), start=1)
}

## Read and Parse Data

### Read

In [4]:
# Load the raw player table
df = pd.read_csv(INPUT_PATH, index_col=INDEX_COL)

# The first listed position is used as the player's target position
df[TARGET_COL] = df['player_positions'].map(lambda listed: listed.split(', ')[0])

# Drop goalkeepers (and rows without a target position)
goalkeeper_or_missing = (df[TARGET_COL] == 'GK') | df[TARGET_COL].isna()
df = df.drop(index=df.loc[goalkeeper_or_missing].index)

# Remove the configured columns, then the temporary target column
df = df.drop(columns=COLUMNS_TO_DROP).drop(columns=TARGET_COL)
df.head()

Unnamed: 0_level_0,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club,overall,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
sofifa_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona,94,...,68+2,66+2,66+2,66+2,68+2,63+2,52+2,52+2,52+2,63+2
20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus,93,...,65+3,61+3,61+3,61+3,65+3,61+3,53+3,53+3,53+3,61+3
190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain,92,...,66+3,61+3,61+3,61+3,66+3,61+3,46+3,46+3,46+3,61+3
183277,https://sofifa.com/player/183277/eden-hazard/2...,E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid,91,...,66+3,63+3,63+3,63+3,66+3,61+3,49+3,49+3,49+3,61+3
192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,28,1991-06-28,181,70,Belgium,Manchester City,91,...,77+3,77+3,77+3,77+3,77+3,73+3,66+3,66+3,66+3,73+3


In [5]:
# Currently a lot of columns exist that won't help us (call it domain knowledge...):
#   - columns whose data can be inferred from / is affected by other columns
#   - columns that probably have no special value
df = df.drop(columns=COLUMNS_TO_DROP_AFTER_INSPECTING)

### Parse

In [6]:
# Needs special processing (after searching online, looking at the values...)

def _sum_rating(expr):
    """Return the integer total of a rating string such as '68+2' (-> 70).

    Used instead of eval(): the strings come from an external CSV file and
    must not be executed as Python code. A '-' modifier ('68-1' -> 67) is
    supported as well; anything that is not a sum of integers raises ValueError.
    """
    signed_parts = str(expr).replace('-', '+-').split('+')
    return sum(int(part) for part in signed_parts if part.strip())


# Evaluate columns that have a base value and a reputation bonus
df[COLUMNS_TO_EVAL] = df[COLUMNS_TO_EVAL].apply(lambda col: col.map(_sum_rating))

# work_rate has the form attacking_work_rate/defensive_work_rate - split into two features
work_rates_df = df['work_rate'].str.split('/', expand=True)
df['attacking_work_rate'] = work_rates_df[0]
df['defensive_work_rate'] = work_rates_df[1]
df = df.drop('work_rate', axis=1)

# One indicator column per unique player tag (dribbler, long shooter...) and
# per unique player trait (leader, diver...). Players without any tag/trait yield
# the empty label '', which carries no information and is dropped.
mlb = MultiLabelBinarizer()
for label_col in ('player_tags', 'player_traits'):
    label_lists = df.pop(label_col).fillna("").str.split(", ")
    indicators = pd.DataFrame(mlb.fit_transform(label_lists),
                              columns=mlb.classes_,
                              index=df.index)
    df = df.join(indicators.drop(columns='', errors='ignore'))

# Turn categorical to numeric (when it is logical).
# Assign the result back instead of calling replace(..., inplace=True) on df.<column>:
# the chained in-place form does not update df when pandas Copy-on-Write is active.
categorical_mappings = {
    'body_type': body_type_vals,
    'preferred_foot': preferred_foot_vals,
    'attacking_work_rate': work_rate_vals,
    'defensive_work_rate': work_rate_vals,
}
for column, mapping in categorical_mappings.items():
    # astype(int) fails loudly if a category is missing from the mapping
    df[column] = df[column].replace(mapping).astype(int)

# Fill na with values that look fine to me
df['release_clause_eur'] = df['release_clause_eur'].fillna(0)  # No release clause -> can be signed by another team...
df['team_jersey_number'] = df['team_jersey_number'].fillna(df['nation_jersey_number'])  # No player with both numbers missing
df = df.drop('nation_jersey_number', axis=1)  # Most players aren't playing in their national team

# Turning floats to ints (after checking they are ints in the shape of floats... (X.0))
float_cols = ['release_clause_eur', 'team_jersey_number', 'pace',
              'shooting', 'passing', 'dribbling', 'defending', 'physic']

df[float_cols] = df[float_cols].astype(int)

# Set index to be the URL (more informative than the id and still unique)
df = df.set_index('player_url')

### Normalize 

In [7]:
# Express every position score as a fraction of the player's overall score
overall_score = df['overall']
df[COLUMNS_TO_EVAL] = df[COLUMNS_TO_EVAL].divide(overall_score, axis='index')


In [18]:
# Standardise every column to z-scores (values are not confined to (-1, 1)).
# Min-max scaling, (df - df.min()) / (df.max() - df.min()), was the alternative considered.
column_means = df.mean()
column_stds = df.std()
df = df.sub(column_means).div(column_stds)

## Creating Different datasets

### PCA

In [8]:
# Project the full feature table onto its first 20 principal components
pca = PCA(n_components=20, random_state=RANDOM_SEED)
pca_data = pca.fit_transform(df)
pca_df = pd.DataFrame(pca_data, index=df.index,
                      columns=[f'c{i}' for i in range(1, pca.n_components+1)])

# Scree plot: share of the variance explained by each component.
# (A bare `pca.explained_variance_ratio_` in the middle of a cell displays nothing,
# so the values are shown through the labelled figure, which is the cell's output.)
px.line(x=pca_df.columns, y=pca.explained_variance_ratio_,
        labels={'x': 'principal component', 'y': 'explained variance ratio'},
        title='Explained variance ratio per principal component')

### t-SNE

In [23]:
# Embed the full feature table into three t-SNE dimensions
tsne = TSNE(n_components=3, random_state=RANDOM_SEED)
tsne_data = tsne.fit_transform(df)

tsne_column_names = [f'c{axis}' for axis in range(1, tsne.n_components + 1)]
tsne_df = pd.DataFrame(data=tsne_data, columns=tsne_column_names, index=df.index)

### Sub Datasets

In [8]:
# Each sub-dataset is a column subset of df, grouped by the kind of information it holds.

# Indicator columns created from player_tags
tags = [
    '#Acrobat', '#Aerial Threat', '#Clinical Finisher',
    '#Complete Defender', '#Complete Forward', '#Complete Midfielder',
    '#Crosser', '#Distance Shooter', '#Dribbler',
    '#Engine', '#FK Specialist', '#Playmaker\xa0 ',
    '#Poacher', '#Speedster', '#Strength',
    '#Tackling\xa0', '#Tactician\xa0',
]
tags_df = df.loc[:, tags]

# Indicator columns created from player_traits
traits = [
    '1-on-1 Rush', 'Argues with Officials',
    'Avoids Using Weaker Foot', 'Beat Offside Trap',
    'Crowd Favourite', 'Diver',
    'Dives Into Tackles (CPU AI Only)', 'Early Crosser',
    'Finesse Shot', 'Giant Throw-in',
    'Inflexible', 'Injury Free',
    'Injury Prone', 'Leadership',
    'Long Passer (CPU AI Only)', 'Long Shot Taker (CPU AI Only)',
    'Long Throw-in', 'Outside Foot Shot',
    'Power Free-Kick', 'Selfish',
    'Skilled Dribbling', 'Speed Dribbler (CPU AI Only)',
]
traits_df = df.loc[:, traits]

# Per-position scores
positions_scores = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
    'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb',
]
positions_scores_df = df.loc[:, positions_scores]

# The six summary skill scores
general_player_scores = ['pace', 'shooting', 'passing',
                         'dribbling', 'defending', 'physic']
general_player_scores_df = df.loc[:, general_player_scores]

# Physical, contractual and other general details
general_player_details = [
    'age', 'height_cm', 'weight_kg', 'overall',
    'value_eur', 'wage_eur', 'preferred_foot', 'weak_foot',
    'skill_moves', 'body_type', 'release_clause_eur', 'team_jersey_number',
]
general_player_details_df = df.loc[:, general_player_details]

## Clustering

In [18]:
# Which dataset to cluster, and which reduction ("PCA" or "TSNE") to use when plotting the result
chosen_df, dim_reduction_for_plot = positions_scores_df, "PCA"

In [19]:
# Cluster the chosen dataset into CLUSTER_NUMBER groups.
# n_init is pinned explicitly: its default changed between scikit-learn releases
# (10 restarts -> 'auto'), so relying on the default makes the labels and the
# silhouette score depend on the installed version.
kmeans = KMeans(n_clusters=CLUSTER_NUMBER, n_init=10, random_state=RANDOM_SEED)
res = kmeans.fit_predict(chosen_df)

In [20]:
# Build the 2-D projection used only for visualising the clusters
if dim_reduction_for_plot == 'PCA':
    dim_red = PCA(n_components=2, random_state=RANDOM_SEED)
elif dim_reduction_for_plot == 'TSNE':
    dim_red = TSNE(n_components=2, random_state=RANDOM_SEED)
else:
    # Without this branch an unknown name would raise NameError on a fresh kernel,
    # or silently reuse the reducer left over from a previous run.
    raise ValueError(
        f"dim_reduction_for_plot must be 'PCA' or 'TSNE', got {dim_reduction_for_plot!r}")

dim_red_data = dim_red.fit_transform(chosen_df)
dim_red_df = pd.DataFrame(dim_red_data, index=chosen_df.index,
                          columns=[f'c{i}' for i in range(1, dim_red.n_components+1)])
# Attach the cluster label of every player so it can be used as the plot colour
dim_red_df['cluster'] = res

In [21]:
# Scatter plot of the two projected components, coloured by cluster; hovering shows the row index
plot_title = f"Data after dimention reduction ({dim_reduction_for_plot})"
fig = px.scatter(
    data_frame=dim_red_df,
    x='c1',
    y='c2',
    color='cluster',
    hover_name=dim_red_df.index,
)
fig.update_layout(title=plot_title)
fig.show()

In [22]:

# Label-free quality measure of the clustering
silhouette_score(chosen_df, labels=res)

0.5470520051223846

In [22]:
# Number of players per cluster.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
pd.Series(res).value_counts()

0    6497
1    5063
2    4682
dtype: int64

## Fifa 19 scores
Found the formulas used to calculate each position's overall score. The player's attributes and overall score can be used to see which formula was applied, and from that to infer the player's position.

In [164]:
# Re-read the untouched CSV, indexed by player URL; no rows are filtered in this cell
raw_df = pd.read_csv(INPUT_PATH, index_col='player_url')
reputations = raw_df.loc[:, 'international_reputation']

### Calculate Scores per Position

In [178]:
# Defensive positions
# Each score is a weighted sum of raw_df attribute columns; dtype=int converts the
# float totals to integers. The `[...][0]` wrapper only unwraps the single expression.

# Centre back (weights sum to 1.00)
cb_score = pd.Series(index=raw_df.index, 
                     data=[0.15*raw_df.defending_marking
                           +0.15*raw_df.defending_standing_tackle
                           +0.15*raw_df.defending_sliding_tackle
                           +0.1*raw_df.attacking_heading_accuracy
                           +0.1*raw_df.power_strength
                           +0.08*raw_df.mentality_aggression
                           +0.08*raw_df.mentality_interceptions
                           +0.05*raw_df.attacking_short_passing
                           +0.05*raw_df.skill_ball_control
                           +0.05*raw_df.movement_reactions
                           +0.04*raw_df.power_jumping][0],
                     dtype=int, name="cb")

# Right back (weights sum to 1.00)
# NOTE(review): power_stamina appears twice (0.1 and 0.08) — one of the two terms was
# probably meant to be a different attribute; verify against the formula source.
rb_score = pd.Series(index=raw_df.index, 
                      data=[0.13*raw_df.defending_sliding_tackle
                            +0.12*raw_df.defending_standing_tackle
                            +0.12*raw_df.mentality_interceptions
                            +0.1*raw_df.power_stamina  # duplicated attribute, see note above
                            +0.08*raw_df.power_stamina
                            +0.08*raw_df.movement_reactions
                            +0.07*raw_df.attacking_crossing
                            +0.07*raw_df.attacking_heading_accuracy
                            +0.07*raw_df.skill_ball_control
                            +0.06*raw_df.attacking_short_passing
                            +0.05*raw_df.movement_sprint_speed
                            +0.05*raw_df.mentality_aggression][0],
                      dtype=int, name="rb")

# Right wing back (weights sum to 1.00)
rwb_score = pd.Series(index=raw_df.index, 
                      data=[0.11*raw_df.defending_standing_tackle
                            +0.1*raw_df.defending_sliding_tackle
                            +0.1*raw_df.attacking_crossing
                            +0.1*raw_df.attacking_short_passing
                            +0.1*raw_df.skill_ball_control
                            +0.1*raw_df.mentality_interceptions
                            +0.09*raw_df.defending_marking
                            +0.08*raw_df.power_stamina
                            +0.08*raw_df.movement_reactions
                            +0.07*raw_df.skill_dribbling
                            +0.04*raw_df.movement_sprint_speed
                            +0.03*raw_df.mentality_aggression][0],
                      dtype=int, name="rwb")

In [179]:
# Midfield positions
# Each score is a weighted sum of raw_df attribute columns; dtype=int converts the
# float totals to integers.

# Central defensive midfielder
# NOTE(review): these weights sum to 0.98, not 1.00, so cdm is slightly under-weighted
# when compared with the other positions — verify against the formula source.
cdm_score = pd.Series(index=raw_df.index, 
                      data=[0.13*raw_df.attacking_short_passing
                            +0.12*raw_df.mentality_interceptions
                            +0.11*raw_df.skill_long_passing
                            +0.1*raw_df.defending_marking
                            +0.1*raw_df.defending_standing_tackle
                            +0.09*raw_df.skill_ball_control
                            +0.09*raw_df.movement_reactions
                            +0.08*raw_df.mentality_vision
                            +0.06*raw_df.power_stamina
                            +0.06*raw_df.power_strength
                            +0.04*raw_df.mentality_aggression][0],
                      dtype=int, name="cdm")

# Central midfielder (weights sum to 1.00)
cm_score = pd.Series(index=raw_df.index,
                     data=[0.15*raw_df.attacking_short_passing
                           +0.13*raw_df.skill_long_passing
                           +0.12*raw_df.mentality_vision
                           +0.1*raw_df.skill_ball_control
                           +0.09*raw_df.skill_dribbling
                           +0.08*raw_df.movement_reactions
                           +0.08*raw_df.mentality_interceptions
                           +0.08*raw_df.mentality_positioning
                           +0.06*raw_df.defending_standing_tackle
                           +0.06*raw_df.power_stamina
                           +0.05*raw_df.power_long_shots][0],
                      dtype=int, name="cm")

# Central attacking midfielder (weights sum to 1.00)
cam_score = pd.Series(index=raw_df.index,
                      data=[0.16*raw_df.attacking_short_passing
                            +0.16*raw_df.mentality_vision
                            +0.13*raw_df.skill_ball_control
                            +0.12*raw_df.mentality_positioning
                            +0.11*raw_df.skill_dribbling
                            +0.08*raw_df.movement_reactions
                            +0.06*raw_df.power_long_shots
                            +0.05*raw_df.attacking_finishing
                            +0.05*raw_df.power_shot_power
                            +0.04*raw_df.movement_acceleration
                            +0.04*raw_df.movement_agility][0],
                      dtype=int, name="cam")

In [180]:
# Wingers
# Each score is a weighted sum of raw_df attribute columns; dtype=int converts the
# float totals to integers.

# Right midfielder (weights sum to 1.00)
rm_score = pd.Series(index=raw_df.index,
                     data=[0.14*raw_df.attacking_crossing
                           +0.14*raw_df.skill_dribbling
                           +0.12*raw_df.attacking_short_passing
                           +0.12*raw_df.skill_ball_control
                           +0.08*raw_df.skill_long_passing
                           +0.08*raw_df.mentality_vision
                           +0.07*raw_df.movement_reactions
                           +0.07*raw_df.mentality_positioning
                           +0.05*raw_df.power_stamina
                           +0.05*raw_df.movement_acceleration
                           +0.05*raw_df.movement_sprint_speed
                           +0.03*raw_df.movement_agility][0],
                      dtype=int, name="rm")

# Right winger
# NOTE(review): these weights sum to 1.04, so rw is inflated relative to every other
# position when the best position is picked by maximum score. attacking_short_passing
# appears twice (0.12 and 0.06), and every term after the first is identical to cf_score,
# which looks like a copy-paste — verify against the formula source.
rw_score = pd.Series(index=raw_df.index,
                     data=[0.16*raw_df.attacking_crossing
                           +0.12*raw_df.attacking_short_passing
                           +0.11*raw_df.skill_dribbling
                           +0.11*raw_df.skill_ball_control
                           +0.1*raw_df.power_shot_power
                           +0.1*raw_df.power_long_shots
                           +0.1*raw_df.movement_reactions
                           +0.06*raw_df.attacking_short_passing  # duplicated attribute, see note above
                           +0.05*raw_df.attacking_heading_accuracy
                           +0.05*raw_df.mentality_vision
                           +0.04*raw_df.movement_acceleration
                           +0.04*raw_df.movement_sprint_speed][0],
                      dtype=int, name="rw")

In [181]:
# Strikers
# Each score is a weighted sum of raw_df attribute columns; dtype=int converts the
# float totals to integers.

# Centre forward (weights sum to 1.00)
# NOTE(review): attacking_short_passing appears twice (0.12 and 0.06) — one of the two
# terms was probably meant to be a different attribute; verify against the formula source.
cf_score = pd.Series(index=raw_df.index,
                     data=[0.12*raw_df.attacking_finishing
                           +0.12*raw_df.attacking_short_passing
                           +0.11*raw_df.skill_dribbling
                           +0.11*raw_df.skill_ball_control
                           +0.1*raw_df.power_shot_power
                           +0.1*raw_df.power_long_shots
                           +0.1*raw_df.movement_reactions
                           +0.06*raw_df.attacking_short_passing  # duplicated attribute, see note above
                           +0.05*raw_df.attacking_heading_accuracy
                           +0.05*raw_df.mentality_vision
                           +0.04*raw_df.movement_acceleration
                           +0.04*raw_df.movement_sprint_speed][0],
                      dtype=int, name="cf")

# Striker (weights sum to 1.00)
st_score = pd.Series(index=raw_df.index,
                     data=[0.2*raw_df.attacking_finishing
                           +0.12*raw_df.mentality_positioning
                           +0.1*raw_df.attacking_heading_accuracy
                           +0.1*raw_df.power_shot_power
                           +0.1*raw_df.movement_reactions
                           +0.08*raw_df.skill_dribbling
                           +0.08*raw_df.skill_ball_control
                           +0.05*raw_df.attacking_volleys
                           +0.05*raw_df.power_long_shots
                           +0.05*raw_df.movement_acceleration
                           +0.04*raw_df.movement_sprint_speed
                           +0.03*raw_df.power_strength][0],
                      dtype=int, name="st")

In [196]:
# One column per position score (named after each Series), one row per player.
# Since international reputation adds to all position scores, it is ignored here.
all_position_scores = [
    cb_score, rb_score, rwb_score,
    cdm_score, cm_score, cam_score,
    rm_score, rw_score, cf_score, st_score,
]
position_scores_df = pd.concat(all_position_scores, axis=1)
position_scores_df.head()

Unnamed: 0_level_0,cb,rb,rwb,cdm,cm,cam,rm,rw,cf,st
player_url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
https://sofifa.com/player/158023/lionel-messi/20/159586,52,63,66,69,85,93,91,94,91,89
https://sofifa.com/player/20801/c-ronaldo-dos-santos-aveiro/20/159586,54,64,63,64,78,89,87,92,90,92
https://sofifa.com/player/190871/neymar-da-silva-santos-jr/20/159586,46,62,64,63,80,90,89,91,87,84
https://sofifa.com/player/200389/jan-oblak/20/159586,33,31,30,40,36,39,36,38,37,31
https://sofifa.com/player/183277/eden-hazard/20/159586,48,62,64,66,80,89,88,89,86,83


### Use Scores to get Natural Position

In [198]:
# Take the FIRST best position: idxmax returns the first column holding the row maximum,
# so ties are resolved by column order.
# Any existing 'natural_pos' column is excluded first so the cell can be re-run safely;
# otherwise a second run would feed the string labels into idxmax.
numeric_scores_df = position_scores_df.drop(columns='natural_pos', errors='ignore')
position_scores_df['natural_pos'] = numeric_scores_df.idxmax(axis=1)

# Series.value_counts replaces the deprecated top-level pd.value_counts
position_scores_df['natural_pos'].value_counts()

rw     4986
cb     4442
rb     2309
st     1852
rm     1746
cam    1225
cdm    1026
cm      366
rwb     238
cf       88
Name: natural_pos, dtype: int64