# Importar librerías

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap

from itertools import combinations
from sklearn.cluster import KMeans,MeanShift
from sklearn.experimental import enable_iterative_imputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA

# Cargar dataset

In [None]:
# Read the players table from the CSV file that sits next to the notebook.
DATASET_PATH = "players_22.csv"
df = pd.read_csv(DATASET_PATH)

# Análisis exploratorio de la base

In [None]:
# Print a summary of the raw table (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preview the first rows of the raw table.
df.head()

In [None]:
# Bucket the column names by dtype, then key the buckets by the dtype's name
# (e.g. "float64", "int64", "object") so they can be looked up with a string.
col_groups = df.columns.to_series().groupby(df.dtypes).groups
group_types = {
    dtype.name: list(column_names)
    for dtype, column_names in col_groups.items()
}

group_types

In [None]:
# Columns whose dtype is float64.
group_types["float64"]

In [None]:
# Columns whose dtype is object.
group_types["object"]

In [None]:
# Lower-case position codes, one row per line of the pitch.
# NOTE(review): presumably these match the per-position columns of the dataset — confirm.
positions = [
    "ls", "st", "rs",
    "lw", "lf", "cf", "rf", "rw",
    "lam", "cam", "ram",
    "lm", "lcm", "cm", "rcm", "rm",
    "lwb", "ldm", "cdm", "rdm", "rwb",
    "lb", "lcb", "cb", "rcb", "rb",
    "gk",
]

In [None]:
# One row per (player, position). Blanks are removed before splitting on ","
# so that a value such as "RW, ST" yields "RW" and "ST" rather than "RW" and
# " ST"; this is the same normalisation is_expected_group applies to the column.
df_splitted = (
    df.assign(
        player_positions=df["player_positions"]
        .str.replace(" ", "", regex=False)
        .str.split(",")
    )
    .explode("player_positions")
    .reset_index(drop=True)
)

In [None]:
# Distinct position values found after the split.
df_splitted.player_positions.unique()

In [None]:
# Players listed as RWB at Manchester City. Both conditions are combined into a
# single mask: chaining df[mask_a][mask_b] applies a full-length mask to an
# already-filtered frame, which pandas has to re-align (and warns about).
is_rwb = df["club_position"] == "RWB"
is_manchester_city = df["club_name"] == "Manchester City"
df.loc[
    is_rwb & is_manchester_city,
    ["long_name", "club_position", "nation_position", "player_positions"],
]

In [None]:
# Every player whose club position is RWB, with their name and position columns.
df.loc[
    df["club_position"].eq("RWB"),
    ["long_name", "club_position", "nation_position", "player_positions"],
]

In [None]:
# All numeric columns: the float64 ones first, followed by the int64 ones.
numerical_cols = [*group_types["float64"], *group_types["int64"]]

In [None]:
# Individual skill-score columns, grouped by the category prefix of their name.
_score_attributes_by_category = {
    "attacking": ["crossing", "finishing", "heading_accuracy", "short_passing", "volleys"],
    "skill": ["dribbling", "curve", "fk_accuracy", "long_passing", "ball_control"],
    "movement": ["acceleration", "sprint_speed", "agility", "reactions", "balance"],
    "power": ["shot_power", "jumping", "stamina", "strength", "long_shots"],
    "mentality": ["aggression", "interceptions", "positioning", "vision", "penalties", "composure"],
    "defending": ["marking_awareness", "standing_tackle", "sliding_tackle"],
    "goalkeeping": ["diving", "handling", "kicking", "positioning", "reflexes"],
}

# Flatten to "<category>_<attribute>" names, preserving the order above.
score_cols = [
    f"{category}_{attribute}"
    for category, attributes in _score_attributes_by_category.items()
    for attribute in attributes
]

In [None]:
# Print the observed minimum and maximum of every score column.
for score_name in score_cols:
    lowest, highest = df[score_name].min(), df[score_name].max()
    print(f"{score_name} - Minimo: {lowest} Máximo {highest}")

In [None]:
# Distinct values of the club_position column.
df["club_position"].unique()

In [None]:
# Boolean flag: True for players whose club position is goalkeeper ("GK").
df["is_gk"] = df["club_position"].eq("GK")

# Diagramas de dispersión

In [None]:
# Columns used for the pair plot below: the float64 columns plus the is_gk flag.
group_types["float64"]+["is_gk"]

In [None]:
# Pairwise scatter plots of the float64 columns, coloured by the goalkeeper flag.
pairplot_columns = group_types["float64"] + ["is_gk"]
sns.pairplot(df[pairplot_columns], hue="is_gk")

In [None]:
# Correlation matrix between every pair of numeric columns.
correlations = df.loc[:, numerical_cols].corr()

In [None]:
class Correlation:
    """Correlation coefficient between two named variables.

    Attributes:
        name_var_1: Name of the first variable.
        name_var_2: Name of the second variable.
        correlation: Correlation coefficient between both variables.
    """

    def __init__(self, name_var_1, name_var_2, correlation):
        self.name_var_1, self.name_var_2 = name_var_1, name_var_2
        self.correlation = correlation

    def is_very_high(self):
        """Return whether the absolute correlation is at least 0.8."""
        magnitude = abs(self.correlation)
        return magnitude >= 0.8

    def is_high(self):
        """Return whether the absolute correlation is at least 0.6 and below 0.8."""
        magnitude = abs(self.correlation)
        return 0.6 <= magnitude < 0.8

    def print(self):
        """Write a one-line summary of the pair and its coefficient to stdout."""
        summary = f"Las variables {self.name_var_1} y {self.name_var_2} tienen una correlación de {self.correlation}"
        print(summary)

In [None]:
# Every unordered pair of numeric column names.
combinations_columns = list(combinations(numerical_cols, 2))

In [None]:
# Preview only the first pairs: the full list holds one tuple per pair of
# numeric columns and would flood the cell output.
combinations_columns[:10]

In [None]:

# Collect the column pairs whose absolute correlation is very high (>= 0.8).
high_correlations = []

for first_var, second_var in combinations_columns:
    # Skip a column paired with itself.
    if first_var == second_var:
        continue
    correlation = Correlation(first_var, second_var, correlations[first_var][second_var])
    if correlation.is_very_high():
        high_correlations.append(correlation)

# Preprocesamiento

## Imputación de valores faltantes

In [None]:

# Fill missing numeric values with an iterative imputer that uses a
# k-nearest-neighbours regressor as its estimator; df is updated in place.
knn_imputer = IterativeImputer(estimator=KNeighborsRegressor(), random_state=0)
imputed_values = knn_imputer.fit_transform(df[numerical_cols])
df[numerical_cols] = imputed_values

## Estandarización de variables numéricas

Escalamos los datos. Aplicaremos KMeans sobre las columnas score_cols, que como vimos anteriormente no tienen exactamente el mismo rango. 

In [None]:
# Rescale every numeric column with min-max scaling, working on a copy so the
# imputed frame `df` is left untouched.
scaler = MinMaxScaler()

standarized_df = df.copy()

# Assign the scaled array directly (positional, like the imputation step).
# Wrapping it in a new DataFrame would give it a fresh 0..n-1 index and the
# assignment would then align by label, silently producing NaN whenever df's
# index is not the default one.
standarized_df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


# Clustering

## K-Means

In [None]:
# Número de clusters buscado
n_clust = 5

km = KMeans(n_clusters=n_clust, random_state=565)
km.fit(standarized_df[score_cols])

# Etiquetas asignadas por el algoritmo
standarized_df["km_cluster"] = km.labels_


In [None]:
# Club name of every player.
standarized_df["club_name"]

In [None]:
# Cluster assigned to each FC Bayern München player.
standarized_df.loc[
    standarized_df["club_name"].eq("FC Bayern München"),
    ["short_name", "km_cluster"],
]

Según un análisis superficial de equipos como FC Bayern München, Paris Saint-Germain, Manchester United, Manchester City y Boca Juniors, al parecer los agrupamientos son los siguientes:

0. Defensores
1. Arqueros
2. Mediocampista ofensivo / Delantero 
3. Mediocampista ofensivo / Delantero 
4. Mediocampista / Lateral

La diferencia entre el 2 y el 3 pasa por la calidad del jugador: el grupo 2 reúne a los jugadores de menor calidad, que podríamos identificar como aquellos con un overall (ya escalado) menor a 0.5.

In [None]:
# NOTE(review): is_expected_group is defined in a later cell of this notebook;
# on a fresh kernel that cell has to be executed before this one, otherwise
# this line raises NameError.
# Flag (1/0) whether each player's cluster matches the one expected for their
# positions and overall.
standarized_df["is_expected_group"] = standarized_df.apply(
    lambda row: is_expected_group(row["player_positions"], row["overall"], row["km_cluster"]),
    axis=1,
)

# Share of players for which the hypothesis holds.
standarized_df["is_expected_group"].sum() / len(standarized_df)

La hipótesis se cumple en aproximadamente el 80% de los jugadores.

In [None]:
def contains(a, b):
    """Return True when at least one element of ``a`` is also found in ``b``."""
    return any(element in b for element in a)

In [None]:
# Position codes, grouped by the role they are taken to represent.
goalkeeper = ["GK"]
backs = ["CB"]
lateral_backs = ["LB","RB","RWB","LWB"]
offensive_midfielders = ["CF","CAM","LM","RM"]
midfielders = ["CDM", "CM"]
forwards = ["RW","ST","LW"]

# K-Means cluster label hypothesised for each role.
BACK = 0
GOALKEEPER = 1
BAD_OFFENSIVE_MIDFIELDER_FORWARD = 2
OFFENSIVE_MIDFIELDER_FORWARD = 3
MIDFIELDER_LATERAL_BACK = 4


def is_expected_group(player_positions, overall, group):
    """Tell whether a player's cluster matches the one expected for their role.

    Args:
        player_positions: Comma-separated position codes, e.g. "RW, ST".
            Blanks are ignored. A non-string value (such as a missing value)
            is treated as "no known position".
        overall: Overall rating compared against 0.5 to separate the two
            offensive clusters. NOTE(review): presumably the min-max scaled
            value — confirm against the caller.
        group: Cluster label assigned to the player.

    Returns:
        1 if any of the player's positions is compatible with ``group``
        under the hypothesis encoded by the constants above, 0 otherwise.
    """
    # A missing or non-text position cannot match any role.
    if not isinstance(player_positions, str):
        return 0

    listed = set(player_positions.replace(" ", "").split(","))

    def plays_any(codes):
        # True when the player lists at least one of the given codes.
        return not listed.isdisjoint(codes)

    if plays_any(goalkeeper) and group == GOALKEEPER:
        return 1

    if plays_any(backs) and group == BACK:
        return 1

    if plays_any(lateral_backs + midfielders) and group == MIDFIELDER_LATERAL_BACK:
        return 1

    if plays_any(offensive_midfielders + forwards):
        # Offensive players are split into two clusters by rating.
        if overall < 0.5 and group == BAD_OFFENSIVE_MIDFIELDER_FORWARD:
            return 1
        if overall >= 0.5 and group == OFFENSIVE_MIDFIELDER_FORWARD:
            return 1

    return 0

        

# Embedding

## PCA

In [None]:
# Fit a 4-component PCA on the columns that were int64 in the raw table.
# random_state is fixed so the projection is reproducible if scikit-learn
# selects a randomised SVD solver for this input size.
pca = PCA(n_components=4, random_state=0)
pca.fit(standarized_df[group_types["int64"]])

In [None]:
# Fraction of the variance explained by each fitted component.
pca.explained_variance_ratio_

In [None]:
# Project the players onto the fitted principal components.
pca.transform(standarized_df[group_types["int64"]])

In [None]:
# Principal-component scores of every player, one named column per component.
component_names = ["pc1", "pc2", "pc3", "pc4"]
component_scores = pca.transform(standarized_df[group_types["int64"]])
pca_df = pd.DataFrame(data=component_scores, columns=component_names)

In [None]:
# Copy each principal component into the scaled frame.
for component in ("pc1", "pc2", "pc3", "pc4"):
    standarized_df[component] = pca_df[component]

In [None]:

sns.set(rc={'figure.figsize':(15,15)})

# Plot a fixed random subset so the figure is the same on every run; the
# sample size is capped at the number of rows so a smaller table does not
# make sample() raise.
plot_sample = standarized_df.sample(n=min(8000, len(standarized_df)), random_state=0)
sns.scatterplot(data=plot_sample, x="pc1", y="pc2", hue="km_cluster")

Si bien las dos componentes principales conservan aproximadamente solo el 65% de la información, es suficiente para visualizar en el plano la separación entre ambos grupos.
Se observa que la PC1 se encuentra significativamente relacionada con la capacidad del jugador para atajar (valores altos en los scores que comienzan con "goalkeeping_").