# Notebook Final del PFM
## Título: Comparativa de algoritmos de clustering sobre datos de futbolistas de las grandes ligas europeas.
## Tutor: Miguel Camacho

Los datos se han obtenido de fbref.com. Están compuestos de 128 jugadores ofensivos que han participado en las últimas cinco temporadas de forma ininterrumpida en las cinco grandes ligas europeas de fútbol (Premier League, LaLiga, Serie A, Bundesliga y Ligue 1). 



# Importar Librerías

In [None]:
!pip install pandas-profiling==2.7.1
!pip install pandas==1.2.4
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import sys, getopt
import csv
import pandas_profiling
import numpy as np
from sklearn import preprocessing
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Definir constantes y funciones auxiliares

In [None]:
class Summary:
  """Layout constants and column names for the 'summary' match-log table."""
  # Position, among the table's <th> cells, of the first column-header cell.
  START_HEADER = 9
  # Cells per data row once the date cell has been prepended.
  N_COLUMNS = 38
  # One past the last column-header <th>; the per-row date cells start here.
  END_HEADER = START_HEADER + N_COLUMNS
  # Names given to the scraped columns, in table order: the match-context
  # columns, then the statistics, then 'Match Report' and the 'Player' column
  # that scrape() appends.
  FIELDS_SUMMARY = (
      ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad',
       'Opponent', 'Start', 'Pos', 'Min']
      + ['Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
         'Touches', 'Press', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xA',
         'SCA', 'GCA', 'Cmp', 'Att', 'CmpPerc', 'Prog', 'Carries', 'Prog',
         'Succ', 'Att']
      + ['Match Report', 'Player']
  )
  # Statistic columns of this table that are carried into the merged frame.
  FIELDS_MERGE_SUMMARY = [
      'Gls',
      'PK',
      'PKatt',
      'Sh',
      'SoT',
      'xG',
      'npxG',
  ]

class Passing:
  """Layout constants and column names for the 'passing' match-log table."""
  # Position, among the table's <th> cells, of the first column-header cell.
  START_HEADER = 8
  # Cells per data row once the date cell has been prepended.
  N_COLUMNS = 33
  # One past the last column-header <th>; the per-row date cells start here.
  END_HEADER = START_HEADER + N_COLUMNS
  # Names given to the scraped columns, in table order: the match-context
  # columns, then the statistics, then 'Match Report' and the 'Player' column
  # that scrape() appends.
  FIELDS_PASSING = (
      ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad',
       'Opponent', 'Start', 'Pos', 'Min']
      + ['TotCmp', 'TotAtt', 'TotCmpPerc', 'TotDist', 'PrgDist']
      + ['ShCmp', 'ShAtt', 'ShCmpPerc']
      + ['MedCmp', 'MedAtt', 'MedCmpPerc']
      + ['LongCmp', 'LongAtt', 'LongCmpPerc']
      + ['Ast', 'xA', 'KP', '1/3', 'PPA', 'CrsPA', 'Prog']
      + ['Match Report', 'Player']
  )

class Pass_Types:
  """Layout constants and column names for the 'passing_types' match-log table."""
  # Position, among the table's <th> cells, of the first column-header cell.
  START_HEADER = 9
  # Cells per data row once the date cell has been prepended.
  N_COLUMNS = 37
  # One past the last column-header <th>; the per-row date cells start here.
  END_HEADER = START_HEADER + N_COLUMNS
  # Names given to the scraped columns, in table order: the match-context
  # columns, then the statistics, then 'Match Report' and the 'Player' column
  # that scrape() appends.
  FIELDS_PASS_TYPES = (
      ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad',
       'Opponent', 'Start', 'Pos', 'Min']
      + ['Att', 'Live', 'Dead', 'FK', 'TB', 'Press', 'Sw', 'Crs']
      + ['CK', 'InCK', 'OutCK', 'StrCK']
      + ['GroundPass', 'LowPass', 'HighPass']
      + ['LeftPass', 'RightPass', 'HeadPass', 'TI', 'Other']
      + ['CmpPass', 'OffSide', 'Out', 'IntPass', 'BlockPass']
      + ['Match Report', 'Player']
  )

class GCA:
  """Layout constants and column names for the 'gca' match-log table."""
  # Position, among the table's <th> cells, of the first column-header cell.
  START_HEADER = 5
  # Cells per data row once the date cell has been prepended.
  N_COLUMNS = 26
  # One past the last column-header <th>; the per-row date cells start here.
  END_HEADER = START_HEADER + N_COLUMNS
  # Names given to the scraped columns, in table order: the match-context
  # columns, then the statistics, then 'Match Report' and the 'Player' column
  # that scrape() appends.
  FIELDS_GCA = (
      ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad',
       'Opponent', 'Start', 'Pos', 'Min']
      + ['SCA', 'PassLiveSCA', 'PassDeadSCA', 'DribSCA', 'ShSCA', 'FldSCA',
         'DefSCA']
      + ['GCA', 'PassLiveGCA', 'PassDeadGCA', 'DribGCA', 'ShGCA', 'FldGCA',
         'DefGCA']
      + ['Match Report', 'Player']
  )

class Defensive:
  """Layout constants and column names for the 'defense' match-log table."""
  # Position, among the table's <th> cells, of the first column-header cell.
  START_HEADER = 7
  # Cells per data row once the date cell has been prepended.
  N_COLUMNS = 35
  # One past the last column-header <th>; the per-row date cells start here.
  END_HEADER = START_HEADER + N_COLUMNS
  # Names given to the scraped columns, in table order: the match-context
  # columns, then the statistics, then 'Match Report' and the 'Player' column
  # that scrape() appends.
  FIELDS_DEFENSIVE = (
      ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad',
       'Opponent', 'Start', 'Pos', 'Min']
      + ['Tkl', 'TklW', 'Def3rdTkl', 'Mid3rdTkl', 'Att3rdTkl']
      + ['TklDrib', 'TklDribAtt', 'TklDribPerc', 'TklDribPast']
      + ['PressAtt', 'PressSucc', 'PressPerc', 'Def3rdPress', 'Mid3rdPress',
         'Att3rdPress']
      + ['Blocks', 'ShBlock', 'ShSvBlock', 'PassBlock']
      + ['Int', 'Tkl+Int', 'Clr', 'Err']
      + ['Match Report', 'Player']
  )

class Possession:
  """Layout constants and column names for the 'possession' match-log table."""
  # Position, among the table's <th> cells, of the first column-header cell.
  START_HEADER = 7
  # Cells per data row once the date cell has been prepended.
  N_COLUMNS = 36
  # One past the last column-header <th>; the per-row date cells start here.
  END_HEADER = START_HEADER + N_COLUMNS
  # Names given to the scraped columns, in table order: the match-context
  # columns, then the statistics, then 'Match Report' and the 'Player' column
  # that scrape() appends.
  FIELDS_POSSESSION = (
      ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad',
       'Opponent', 'Start', 'Pos', 'Min']
      + ['Touches', 'DefPenTouches', 'Def3rdTouches', 'Mid3rdTouches',
         'Att3rdTouches', 'AttPenTouches', 'LiveTouches']
      + ['SuccDrib', 'AttDrib', 'SuccDribPerc', 'NuPlDrib', 'Nutmegs']
      + ['Carries', 'TotDistCarries', 'PrgDistCarries', 'ProgCarries',
         '1/3Carries', 'CPA', 'MisCarries', 'DisCarries']
      + ['TargPass', 'RecPass', 'RecPassPerc', 'ProgPassRec']
      + ['Match Report', 'Player']
  )

class Miscellaneous:
  """Layout constants and column names for the 'misc' match-log table."""
  # Position, among the table's <th> cells, of the first column-header cell.
  START_HEADER = 5
  # Cells per data row once the date cell has been prepended.
  N_COLUMNS = 28
  # One past the last column-header <th>; the per-row date cells start here.
  END_HEADER = START_HEADER + N_COLUMNS
  # Names given to the scraped columns, in table order: the match-context
  # columns, then the statistics, then 'Match Report' and the 'Player' column
  # that scrape() appends.
  FIELDS_MISC = (
      ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad',
       'Opponent', 'Start', 'Pos', 'Min']
      + ['CrdY', 'CrdR', '2CrdY', 'FlsComm', 'FlsDrawn', 'Offsides']
      + ['Crs', 'Int', 'TklW']
      + ['PKwon', 'PKcon', 'OG', 'Recov']
      + ['AerialDuelsWon', 'AerialDuelsLost', 'AerialDuelsWinPerc']
      + ['Match Report', 'Player']
  )
  # Columns of this table carried into the merged frame ('Crs', 'Int' and
  # 'TklW' are left out of this list).
  FIELDS_MERGE_MISC = (
      ['CrdY', 'CrdR', '2CrdY', 'FlsComm', 'FlsDrawn', 'Offsides']
      + ['PKwon', 'PKcon', 'OG', 'Recov']
      + ['AerialDuelsWon', 'AerialDuelsLost', 'AerialDuelsWinPerc']
      + ['Match Report']
  )


class Global:
  """Constants shared by the scraping helpers."""
  # Index of the first <tr> of a match-log table that scrape() treats as data.
  START_DATA = 2
  # Columns present in every scraped table; used as the merge key.
  KEY = ['Player', 'Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad', 
       'Opponent', 'Start', 'Pos', 'Min']
  # Page-type path segments, in the order scrape_all() processes them.
  TYPES = ['summary', 'passing', 'passing_types', 'gca', 'defense', 
           'possession', 'misc']
  # Page type -> class holding that table's layout constants.
  DICT_TYPES = {'summary': Summary, 'passing': Passing, 
                'passing_types': Pass_Types, 'gca': GCA, 
                'defense': Defensive, 'possession': Possession, 
                'misc': Miscellaneous}
  SEASONS = ['2021-2022', '2020-2021', '2019-2020', '2018-2019', '2017-2018']
  # Player name -> base match-log URL. The name is also turned into the URL
  # slug by scrape_all(), so keys must carry no stray whitespace. Each player
  # appears exactly once (the original repeated three entries verbatim).
  DICT_PLAYERS = {
      'Karim Benzema': 'https://fbref.com/en/players/70d74ece/matchlogs/',
      'Lionel Messi': 'https://fbref.com/en/players/d70ce98e/matchlogs/',
      'Kylian Mbappe': 'https://fbref.com/en/players/42fd9c7f/matchlogs/',
      'Memphis Depay': 'https://fbref.com/en/players/8f696594/matchlogs/',
      'Duvan Zapata': 'https://fbref.com/en/players/d3de9af0/matchlogs/',
      'Luis Muriel': 'https://fbref.com/en/players/eb2fe5b6/matchlogs/',
      'Robert Lewandowski': 'https://fbref.com/en/players/8d78e732/matchlogs/',
      'Gerard Moreno': 'https://fbref.com/en/players/81f0781e/matchlogs/',
      'Cristiano Ronaldo': 'https://fbref.com/en/players/dea698d9/matchlogs/',
      'Harry Kane': 'https://fbref.com/en/players/21a66f6a/matchlogs/',
      'Paulo Dybala': 'https://fbref.com/en/players/e0921a4f/matchlogs/',
      'Antoine Griezmann': 'https://fbref.com/en/players/df69b544/matchlogs/',
      'Roberto Firmino': 'https://fbref.com/en/players/4c370d81/matchlogs/',
      'Dimitri Payet': 'https://fbref.com/en/players/58ae47b2/matchlogs/',
      'Iago Aspas': 'https://fbref.com/en/players/7dcf86f6/matchlogs/',
      'Marco Reus': 'https://fbref.com/en/players/36a3ff67/matchlogs/',
      'Kingsley Coman': 'https://fbref.com/en/players/042e8a49/matchlogs/',
      'Angel Di Maria': 'https://fbref.com/en/players/19cda00b/matchlogs/',
      'Leroy Sane': 'https://fbref.com/en/players/2b114be3/matchlogs/',
      'Neymar': 'https://fbref.com/en/players/69384e5d/matchlogs/',
      'Riyad Mahrez': 'https://fbref.com/en/players/892d5bb1/matchlogs/',
      'Nabil Fekir': 'https://fbref.com/en/players/bece776f/matchlogs/',
      'Serge Gnabry': 'https://fbref.com/en/players/88e357ef/matchlogs/',
      'Erik Lamela': 'https://fbref.com/en/players/abe66106/matchlogs/',
      'Nicolas Pepe': 'https://fbref.com/en/players/57e3f0c7/matchlogs/',
      'Christopher Nkunku': 'https://fbref.com/en/players/7c56da38/matchlogs/',
      'Phil Foden': 'https://fbref.com/en/players/ed1e53f3/matchlogs/',
      'Angel Correa': 'https://fbref.com/en/players/01eb744d/matchlogs/',
      'Federico Chiesa': 'https://fbref.com/en/players/b0f7e36c/matchlogs/',
      'Sadio Mane': 'https://fbref.com/en/players/c691bfe2/matchlogs/',
      'Wissam Ben Yedder': 'https://fbref.com/en/players/942b4f90/matchlogs/',
      'Breel Embolo': 'https://fbref.com/en/players/0b4f388a/matchlogs/',
      'Alexandre Lacazette': 'https://fbref.com/en/players/9dbb75ca/matchlogs/',
      'Alvaro Morata': 'https://fbref.com/en/players/129af0db/matchlogs/',
      'Gabriel Jesus': 'https://fbref.com/en/players/b66315ae/matchlogs/',
      'Ciro Immobile': 'https://fbref.com/en/players/4431aed2/matchlogs/',
      'Romelu Lukaku': 'https://fbref.com/en/players/5eae500a/matchlogs/',
      'Edin Dzeko': 'https://fbref.com/en/players/3bb7f478/matchlogs/',
      'Andre Silva': 'https://fbref.com/en/players/3effaa34/matchlogs/',
      'Timo Werner': 'https://fbref.com/en/players/49fe9070/matchlogs/',
      'Patrik Schick': 'https://fbref.com/en/players/5d4f7d61/matchlogs/',
      'Andy Delort': 'https://fbref.com/en/players/d33c706e/matchlogs/',
      'Luis Suarez': 'https://fbref.com/en/players/a6154613/matchlogs/',
      'Carlos Bacca': 'https://fbref.com/en/players/09a9e921/matchlogs/',
      'Rodrigo': 'https://fbref.com/en/players/1fb1c435/matchlogs/',
      'Gareth Bale': 'https://fbref.com/en/players/a58bb1e1/matchlogs/',
      'Antonio Sanabria': 'https://fbref.com/en/players/0a447501/matchlogs/',
      'Kike': 'https://fbref.com/en/players/e897d8ba/matchlogs/',
      'Inaki Williams': 'https://fbref.com/en/players/6a99e0b1/matchlogs/',
      'Jamie Vardy': 'https://fbref.com/en/players/45963054/matchlogs/',
      'Paco Alcacer': 'https://fbref.com/en/players/a7a9d95a/matchlogs/',
      'Edinson Cavani': 'https://fbref.com/en/players/527f063d/matchlogs/',
      'Willian Jose': 'https://fbref.com/en/players/d87e2cae/matchlogs/',
      'Joao Pedro': 'https://fbref.com/en/players/81255c03/matchlogs/',
      'Kevin Lasagna': 'https://fbref.com/en/players/09538fdb/matchlogs/',
      'Fabio Quagliarella': 'https://fbref.com/en/players/ee4f2f3b/matchlogs/',
      'Lucas Perez': 'https://fbref.com/en/players/a300ac7e/matchlogs/',
      'Manolo Gabbiadini': 'https://fbref.com/en/players/8f866fe8/matchlogs/',
      'Roger Marti': 'https://fbref.com/en/players/0ae4e09a/matchlogs/',
      'Danny Ings': 'https://fbref.com/en/players/07802f7f/matchlogs/',
      'Ruben Sobrino': 'https://fbref.com/en/players/19b776e9/matchlogs/',
      'Raul Garcia': 'https://fbref.com/en/players/b418dbd4/matchlogs/',
      'Richarlison': 'https://fbref.com/en/players/fa031b34/matchlogs/',
      'Maxi Gomez': 'https://fbref.com/en/players/4c2e9442/matchlogs/',
      'Jorge Molina': 'https://fbref.com/en/players/43f71e77/matchlogs/',
      'Joselu': 'https://fbref.com/en/players/6265208f/matchlogs/',
      'Mattia Destro': 'https://fbref.com/en/players/d7d32194/matchlogs/',
      'Youssef En-Nesyri': 'https://fbref.com/en/players/04e17fd5/matchlogs/',
      'Olivier Giroud': 'https://fbref.com/en/players/16ceb862/matchlogs/',
      'Santi Mina': 'https://fbref.com/en/players/0b90bb97/matchlogs/',
      'Chris Wood': 'https://fbref.com/en/players/4e9a0555/matchlogs/',
      'Enes Unal': 'https://fbref.com/en/players/f8eca1b6/matchlogs/',
      'Leonardo Pavoletti': 'https://fbref.com/en/players/d37b0350/matchlogs/',
      'Giovanni Simeone': 'https://fbref.com/en/players/343c0d52/matchlogs/',
      'Christian Benteke': 'https://fbref.com/en/players/ab070c55/matchlogs/',
      'Dominic Calvert Lewin': 'https://fbref.com/en/players/59e6e5bf/matchlogs/',
      'Callum Wilson': 'https://fbref.com/en/players/c596fcb0/matchlogs/',
      'Keita Balde': 'https://fbref.com/en/players/509a4ccb/matchlogs/',
      'Mauro Icardi': 'https://fbref.com/en/players/43b78598/matchlogs/',
      'Arkadiusz Milik': 'https://fbref.com/en/players/85613cf0/matchlogs/',
      'Sehrou Guirassy': 'https://fbref.com/en/players/923f4dda/matchlogs/',
      'Angel Rodriguez': 'https://fbref.com/en/players/8cfc2f69/matchlogs/',
      'Munir-El-Haddadi': 'https://fbref.com/en/players/8696bc90/matchlogs/',
      'Mikel Oyarzabal': 'https://fbref.com/en/players/8c3c640c/matchlogs/',
      'Alex Berenguer': 'https://fbref.com/en/players/dc1c2fce/matchlogs/',
      'Henrikh Mkhitaryan': 'https://fbref.com/en/players/dd0daf32/matchlogs/',
      'Ivan Perisic': 'https://fbref.com/en/players/6fe90922/matchlogs/',
      'Pedro': 'https://fbref.com/en/players/3ca7254a/matchlogs/',
      'Raheem Sterling': 'https://fbref.com/en/players/b400bde0/matchlogs/',
      'Federico Bernardeschi': 'https://fbref.com/en/players/ee93c1a9/matchlogs/',
      'Antonio Candreva': 'https://fbref.com/en/players/356c9002/matchlogs/',
      'Hakan Calhanoglu': 'https://fbref.com/en/players/cd0fa27b/matchlogs/',
      'Jadon Sancho': 'https://fbref.com/en/players/dbf053da/matchlogs/',
      'Lorenzo Insigne': 'https://fbref.com/en/players/2f557579/matchlogs/',
      'Adnan Januzaj': 'https://fbref.com/en/players/4737cebe/matchlogs/',
      'Nathan Redmond': 'https://fbref.com/en/players/ab651565/matchlogs/',
      'Vincenzo Grifo': 'https://fbref.com/en/players/54e4866f/matchlogs/',
      'Iker Muniain': 'https://fbref.com/en/players/c05dfb74/matchlogs/',
      'Jordan Ayew': 'https://fbref.com/en/players/da052c14/matchlogs/',
      'Lucas Ocampos': 'https://fbref.com/en/players/a08b974a/matchlogs/',
      'Ludovic Blas': 'https://fbref.com/en/players/6191093d/matchlogs/',
      'Wilfried Zaha': 'https://fbref.com/en/players/b2bc3b1f/matchlogs/',
      'Lucas Moura': 'https://fbref.com/en/players/2b622f01/matchlogs/',
      'Papu Gomez': 'https://fbref.com/en/players/6e4df551/matchlogs/',
      'Suso': 'https://fbref.com/en/players/4e219ad2/matchlogs/',
      'Dele Alli': 'https://fbref.com/en/players/cea4ee8f/matchlogs/',
      'Xherdan Shaqiri': 'https://fbref.com/en/players/6421ec64/matchlogs/',
      'Son Heung-min': 'https://fbref.com/en/players/92e7e919/matchlogs/',
      'Mohamed Salah': 'https://fbref.com/en/players/e342ad68/matchlogs/',
      'Marcus Rashford': 'https://fbref.com/en/players/a1d5bd30/matchlogs/',
      'Ferran Torres': 'https://fbref.com/en/players/9e1035f8/matchlogs/',
      'Juanmi': 'https://fbref.com/en/players/84399660/matchlogs/',
      'Jose Luis Morales': 'https://fbref.com/en/players/4a478107/matchlogs/',
      'Kevin Volland': 'https://fbref.com/en/players/64f69877/matchlogs/',
      'Portu': 'https://fbref.com/en/players/1bda5842/matchlogs/',
      'Felipe Caicedo': 'https://fbref.com/en/players/93b891d1/matchlogs/',
      'Kelechi Iheanacho': 'https://fbref.com/en/players/c92e1a31/matchlogs/',
      'Simone Zaza': 'https://fbref.com/en/players/9592289a/matchlogs/',
      'Pierre-Emerick Aubameyang': 'https://fbref.com/en/players/d5dd5f1f/matchlogs/',
      'Borja Mayoral': 'https://fbref.com/en/players/64e8ed6d/matchlogs/',
      'Joaquin Correa': 'https://fbref.com/en/players/45b9b619/matchlogs/',
      'Andrej Kramaric': 'https://fbref.com/en/players/603cb947/matchlogs/',
      'Sandro Ramirez': 'https://fbref.com/en/players/833fb62e/matchlogs/',
      'Martin Braithwaite': 'https://fbref.com/en/players/fd771f95/matchlogs/',
      'Mariano': 'https://fbref.com/en/players/5c4dc0ff/matchlogs/',
      'Stevan Jovetic': 'https://fbref.com/en/players/f36c432f/matchlogs/',
      # Was 'Eden Hazard ' (trailing space), which produced the slug
      # 'Eden-Hazard--Match-Logs' and a 'Player' value ending in a space.
      'Eden Hazard': 'https://fbref.com/en/players/a39bb753/matchlogs/',
      'Anthony Martial': 'https://fbref.com/en/players/8b788c01/matchlogs/',
  }

In [None]:
## Scrape one type
def scrape(url, page_type):
  """Download one match-log page and return its stats table as a DataFrame.

  Args:
    url: Full page URL. Its last path segment, minus '-Match-Logs' and with
      '-' turned into spaces, becomes the 'Player' column.
    page_type: Object exposing START_HEADER, END_HEADER and N_COLUMNS for
      the table on that page (one of the classes in Global.DICT_TYPES).

  Returns:
    A DataFrame of strings with one row per data row whose cell count
    (date included) equals page_type.N_COLUMNS, the scraped header labels as
    column names, and a trailing 'Player' column.

  Raises:
    AttributeError: if the page holds no matching table. The exception type
      is the one the original code raised implicitly, kept so existing
      callers behave the same; only the message is new.
  """
  # A timeout keeps one unresponsive request from hanging the whole run.
  res = requests.get(url, timeout=30).text
  soup = BeautifulSoup(res, 'html.parser')
  table = soup.find('table', class_='min_width sortable stats_table min_width shade_zero')
  if table is None or table.tbody is None:
    raise AttributeError(f'No match-log table found at {url}')
  n_games = len(table.tbody.find_all('tr'))

  # <th> cells in document order: first the column headers, then one cell per
  # data row, which holds that row's date.
  header_texts = [header.text for header in table.find_all('th')]
  columns = header_texts[page_type.START_HEADER:page_type.END_HEADER]
  dates = header_texts[page_type.END_HEADER:page_type.END_HEADER + n_games]

  data_rows = table.find_all('tr')[Global.START_DATA:Global.START_DATA + n_games]
  rows = []
  for k, data in enumerate(data_rows):
    row = [dates[k]] + [cell.text for cell in data.find_all('td')]
    # Rows with an unexpected number of cells are skipped.
    if len(row) == page_type.N_COLUMNS:
      rows.append(row)

  # Build the frame once: DataFrame.append copied the frame on every row and
  # no longer exists in pandas >= 2.0.
  df = pd.DataFrame(rows, columns=columns)
  df['Player'] = url.split('/')[-1].replace('-Match-Logs', '').replace('-',' ')
  return df

def get_result(result):
  """Split a result string into [outcome, score].

  A string made of exactly two space-separated pieces is returned as those
  two pieces. Anything else is split after its first character, and every
  space is removed from the remainder.
  """
  pieces = result.split(' ')
  if len(pieces) != 2:
    outcome, remainder = result[0], result[1:]
    return [outcome, remainder.replace(' ', '')]
  return pieces

## Scrape all types
def scrape_all(player, season, global_url, types):
  """Scrape every match-log table of one player-season and merge them.

  Args:
    player: Player name; spaces become '-' to form the URL slug.
    season: Season path segment, e.g. one of Global.SEASONS.
    global_url: Base match-log URL of the player (a Global.DICT_PLAYERS value).
    types: Page types to scrape; must contain all seven keys of
      Global.DICT_TYPES (their order no longer matters).

  Returns:
    One DataFrame with a row per match, keyed by Global.KEY, holding the
    selected summary columns, all passing / pass-type / GCA / defensive /
    possession columns and the selected miscellaneous columns, plus
    'Squad_Goals' and 'Opponent_Goals'. No 'Match Report' column is kept.

  Raises:
    ValueError: if a required page type is missing from `types`, or a
      scraped table does not have the expected number of columns.
  """
  # Column names per page type, scraped in this order.
  field_names = {
      'summary': Summary.FIELDS_SUMMARY,
      'passing': Passing.FIELDS_PASSING,
      'passing_types': Pass_Types.FIELDS_PASS_TYPES,
      'gca': GCA.FIELDS_GCA,
      'defense': Defensive.FIELDS_DEFENSIVE,
      'possession': Possession.FIELDS_POSSESSION,
      'misc': Miscellaneous.FIELDS_MISC,
  }
  missing = [typ for typ in field_names if typ not in types]
  if missing:
    raise ValueError(f'scrape_all needs every page type; missing: {missing}')

  slug = player.replace(' ', '-')
  frames = {}
  for typ, fields in field_names.items():
    url = global_url + season + '/' + typ + '/' + slug + '-Match-Logs'
    frame = scrape(url, Global.DICT_TYPES[typ])
    if len(frame.columns) != len(fields):
      raise ValueError(
          f'{url}: expected {len(fields)} columns, got {len(frame.columns)}')
    # Name columns by POSITION. The previous rename(columns=dict(zip(...)))
    # mapped by label, so scraped headers sharing one label all received the
    # same (last) new name; FIELDS_SUMMARY itself lists 'Prog' and 'Att'
    # twice, which shows the source tables do repeat labels.
    frame.columns = list(fields)
    # 'Match Report' is never kept; dropping it here avoids the repeated
    # '_x'/'_y' suffix collisions it caused across successive merges.
    frames[typ] = frame.drop(columns='Match Report')

  df = frames['summary'][Global.KEY + Summary.FIELDS_MERGE_SUMMARY]
  for typ in ('passing', 'passing_types', 'gca', 'defense', 'possession'):
    df = df.merge(frames[typ], on=Global.KEY)
  misc_fields = [field for field in Miscellaneous.FIELDS_MERGE_MISC
                 if field != 'Match Report']
  df = df.merge(frames['misc'][Global.KEY + misc_fields], on=Global.KEY)

  df = df.fillna(0)
  df['Start'] = df['Start'].apply(lambda x: 1 if x in ('Y', 'Y*') else 0)
  # Names starting with a lowercase letter lose their first two characters.
  # NOTE(review): presumably this strips a country-code prefix; if the prefix
  # is "<code><space>" a leading space or letter may remain - confirm on data.
  df['Opponent'] = df['Opponent'].apply(lambda x: x[2:] if x[0].islower() else x)
  df['Squad'] = df['Squad'].apply(lambda x: x[2:] if x[0].islower() else x)
  # Split 'Result' into the outcome and the score, then the score into the
  # two goal counts (the separator is an en dash, not a hyphen).
  df['Result'] = df['Result'].apply(lambda x: get_result(x))
  df['Score'] = df['Result'].apply(lambda x: x[1])
  df['Result'] = df['Result'].apply(lambda x: x[0])
  df['Score'] = df['Score'].apply(lambda x: x.split('–'))
  df['Squad_Goals'] = df['Score'].apply(lambda x: x[0])
  df['Opponent_Goals'] = df['Score'].apply(lambda x: x[1])
  df = df.drop(columns=['Score'])
  return df

In [None]:
# RESULT
def result(df_points, i):
  """Return the score adjustment for the 'Result' value of row `i`.

  A value of 1 yields 0.3, a value of 3 yields -0.3 and anything else 0.
  NOTE(review): presumably 1 encodes a win and 3 a loss - confirm against the
  code that encodes 'Result'.
  """
  outcome = df_points.loc[i, 'Result']
  if outcome == 1:
    return 0.3
  if outcome == 3:
    return -0.3
  return 0

# TOTAL PASSES
def tot_pass(df_points, i):
  """Return the score adjustment for overall passing in row `i`.

  Up to 20 attempts ('TotAtt') score 0. Above that, the completion
  percentage ('TotCmpPerc') is placed in one of six bands with upper bounds
  50, 60, 70, 80, 90 and "anything else", and the band's score grows with
  the volume tier (21-40, 41-60, more than 60 attempts).
  """
  attempts = df_points.loc[i, 'TotAtt']
  if attempts <= 20:
    return 0

  # One score per accuracy band, for the volume tier the row falls in.
  if attempts <= 40:
    band_scores = (-0.1, -0.05, 0, 0.025, 0.05, 0.1)
  elif attempts <= 60:
    band_scores = (-0.2, -0.1, 0, 0.05, 0.1, 0.2)
  else:
    band_scores = (-0.3, -0.15, 0, 0.075, 0.15, 0.3)

  accuracy = df_points.loc[i, 'TotCmpPerc']
  for upper_bound, score in zip((50, 60, 70, 80, 90), band_scores):
    if accuracy <= upper_bound:
      return score
  # Above every bound (or not comparable, e.g. NaN): the top band.
  return band_scores[-1]

# SHORT PASSES
def short_pass(df_points, i):
  """Return the score adjustment for short passing in row `i`.

  Up to 10 attempts ('ShortAtt') score 0. Above that, the completion
  percentage ('ShortCmpPerc') is placed in one of five bands with upper
  bounds 50, 70, 80, 90 and "anything else", and the band's score grows with
  the volume tier (11-25, 26-50, more than 50 attempts).
  """
  attempts = df_points.loc[i, 'ShortAtt']
  if attempts <= 10:
    return 0

  # One score per accuracy band, for the volume tier the row falls in.
  if attempts <= 25:
    band_scores = (-0.05, -0.025, 0, 0.025, 0.05)
  elif attempts <= 50:
    band_scores = (-0.1, -0.05, 0, 0.05, 0.1)
  else:
    band_scores = (-0.2, -0.1, 0, 0.1, 0.2)

  accuracy = df_points.loc[i, 'ShortCmpPerc']
  for upper_bound, score in zip((50, 70, 80, 90), band_scores):
    if accuracy <= upper_bound:
      return score
  # Above every bound (or not comparable, e.g. NaN): the top band.
  return band_scores[-1]

# MEDIUM-DISTANCE PASSES
def mid_pass(df_points, i):
  """Return the score adjustment for medium-distance passing in row `i`.

  Up to 10 attempts ('MedAtt') score 0. Above that, the completion
  percentage ('MedCmpPerc') is placed in one of five bands with upper bounds
  50, 65, 75, 90 and "anything else"; rows with more than 25 attempts get
  the larger set of scores.
  """
  attempts = df_points.loc[i, 'MedAtt']
  if attempts <= 10:
    return 0

  # One score per accuracy band, for the volume tier the row falls in.
  if attempts <= 25:
    band_scores = (-0.05, -0.025, 0, 0.025, 0.05)
  else:
    band_scores = (-0.1, -0.05, 0, 0.05, 0.1)

  accuracy = df_points.loc[i, 'MedCmpPerc']
  for upper_bound, score in zip((50, 65, 75, 90), band_scores):
    if accuracy <= upper_bound:
      return score
  # Above every bound (or not comparable, e.g. NaN): the top band.
  return band_scores[-1]

# LONG PASSES
def long_pass(df_points, i):
  """Return the score adjustment for long passing in row `i`.

  Up to 10 attempts ('LongAtt') score 0. Above that, the completion
  percentage ('LongCmpPerc') maps to:
    <= 50 -> -0.1, (50, 65] -> -0.05, (65, 75] -> 0, (75, 90] -> 0.05,
    anything else -> 0.1.

  The (50, 65] band was missing, so such rows fell through to the final
  branch and received the top score of 0.1. It now scores -0.05, matching
  the corresponding band of mid_pass for its higher-volume tier.
  """
  attempts = df_points.loc[i, 'LongAtt']
  if attempts <= 10:
    return 0

  accuracy = df_points.loc[i, 'LongCmpPerc']
  if accuracy <= 50:
    return -0.1
  elif accuracy <= 65:
    return -0.05
  elif accuracy <= 75:
    return 0
  elif accuracy <= 90:
    return 0.05
  else:
    return 0.1

## CALCULATE POINTS
def calculate_points(df_points, i, df_att_values):
  """Add row `i`'s points to its 'Puntuacion' score and return the new score.

  Two amounts are added in turn: the sum of the row's attribute values each
  multiplied by its entry in df_att_values.Values, and the sum of the five
  banded adjustments (result and total / short / medium / long passing).

  NOTE: the score is read from and written to the module-level `df_new`
  frame, not to an argument; `df_new` must exist and hold a 'Puntuacion'
  column before this is called.
  NOTE(review): the row is fetched by position (`iloc[i]`) while the helpers
  and `df_new.at` use the label `i`; this presumably relies on a default
  0..n-1 index - confirm.
  """
  adjustments = [
      result(df_points, i),
      tot_pass(df_points, i),
      short_pass(df_points, i),
      mid_pass(df_points, i),
      long_pass(df_points, i),
  ]

  # Attribute values of the row, picked in the order of df_att_values.index.
  row_values = list(df_points.iloc[i][df_att_values.index])
  weights = list(df_att_values.Values)
  df_new.at[i, 'Puntuacion'] += np.sum(np.multiply(row_values, weights))

  df_new.at[i, 'Puntuacion'] += (
      adjustments[0] + adjustments[1] + adjustments[2] + adjustments[3]
      + adjustments[4])
  return df_new.at[i, 'Puntuacion']

In [None]:
# Build the score ("Puntuacion") variable
def create_var_points(df_new):
  """Compute the per-match 'Puntuacion' score for every row of ``df_new``.

  Each row starts at 5.0; calculate_points then adds the weighted sum of the
  counting stats listed in the weight table below plus its threshold-based
  adjustments. The match result and the pass attempt/accuracy columns get no
  linear weight: they are selected only because the helpers called by
  calculate_points read them.

  Side effect: the 'Puntuacion' column of ``df_new`` is created or
  overwritten in place.
  NOTE(review): rows are addressed with ``df_new.at[i, ...]`` for
  i = 0..len-1, so ``df_new`` is assumed to carry a default RangeIndex —
  confirm at call sites.

  Returns the 'Puntuacion' column of ``df_new``.
  """
  score_inputs = [
      'Player', 'Date', 'Day', 'Comp', 'Round', 'Venue', 'Squad',
      'Opponent', 'Start', 'Pos', 'Min', 'Result', 'nPG', 'PK',
      'Ast', 'PKMissed', 'Sh', 'SoT', 'TotAtt', 'TotCmpPerc',
      'PrgDist', 'ShortAtt', 'ShortCmpPerc', 'MedAtt',
      'MedCmpPerc', 'LongAtt', 'LongCmpPerc', 'KP', 'SCA', 'GCA',
      'TklW', 'PressSucc', 'Blocks', 'Int', 'Clr', 'Err',
      'SuccDrib', 'PrgDistCarries', 'ProgPassRec', 'CrdY', 'CrdR',
      'FlsComm', 'FlsDrawn', 'Offsides', 'PKwon', 'PKcon', 'OG',
      'Recov', 'AerialDuelsWon', 'AerialDuelsLost']
  df_points = df_new[score_inputs]
  df_new.loc[:, 'Puntuacion'] = 5.0

  # Points awarded per unit of each stat, listed in the same order as the
  # columns appear in score_inputs.
  weight_by_stat = {
      'nPG': 0.75,
      'PK': 0.3,
      'Ast': 0.6,
      'PKMissed': -0.6,
      'Sh': 0.1,
      'SoT': 0.05,
      'PrgDist': 0.001,
      'KP': 0.05,
      'SCA': 0.075,
      'GCA': 0.175,
      'TklW': 0.05,
      'PressSucc': 0.025,
      'Blocks': 0.05,
      'Int': 0.05,
      'Clr': 0.05,
      'Err': -1.0,
      'SuccDrib': 0.1,
      'PrgDistCarries': 0.001,
      'ProgPassRec': 0.01,
      'CrdY': -0.3,
      'CrdR': -0.5,
      'FlsComm': -0.1,
      'FlsDrawn': 0.1,
      'Offsides': -0.05,
      'PKwon': 0.5,
      'PKcon': -0.5,
      'OG': -1.0,
      'Recov': 0.05,
      'AerialDuelsWon': 0.05,
      'AerialDuelsLost': -0.03,
  }
  # Same layout calculate_points expects: stat names as the index
  # ('Attributes') and a single 'Values' column holding the weights.
  df_att_values = pd.DataFrame(
      {'Values': list(weight_by_stat.values())},
      index=pd.Index(list(weight_by_stat), name='Attributes'))

  df_points = df_points.reset_index(drop=True)
  for row_pos in range(len(df_points)):
    df_new.at[row_pos, 'Puntuacion'] = calculate_points(
        df_points, row_pos, df_att_values)
  return df_new['Puntuacion']

# Obtención del dato

In [None]:
# One-off scraping step, kept for reference only. It built df.csv by calling
# scrape_all for every player in Global.DICT_PLAYERS and every season in
# Global.SEASONS, concatenating the results and filling missing values with 0.
# It is commented out so the notebook loads the stored CSV instead.
# NOTE(review): the stored file still contains missing values (a later cell
# drops rows where xG is NaN), so it may not have been written by exactly
# this code — confirm.
# df = pd.DataFrame()
# for player in Global.DICT_PLAYERS:
#   print(player)
#   for season in Global.SEASONS:
#     df_new = scrape_all(player, season, Global.DICT_PLAYERS[player], Global.TYPES)
#     df = pd.concat([df, df_new], ignore_index=True)
# df = df.fillna(0)
# df.to_csv('df.csv')
# Load the per-match dataset from the mounted Google Drive folder.
df = pd.read_csv('/content/drive/My Drive/MasterIADeporte/TFM/DatosFinales/df.csv')
df

Unnamed: 0,Player,Date,Day,Comp,Round,Venue,Result,Squad,Opponent,Start,Pos,Min,Gls,PK,PKatt,Sh,SoT,xG,npxG,TotCmp,TotAtt,TotCmpPerc,TotDist,PrgDist,ShortCmp,ShortAtt,ShortCmpPerc,MedCmp,MedAtt,MedCmpPerc,LongCmp,LongAtt,LongCmpPerc,Ast,xA,KP,1/3,PPA,CrsPA,Prog,...,Err,Touches,DefPenTouches,Def3rdTouches,Mid3rdTouches,Att3rdTouches,AttPenTouches,LiveTouches,SuccDrib,AttDrib,SuccDribPerc,NuPlDrib,Nutmegs,Carries,TotDistCarries,PrgDistCarries,ProgCarries,1/3Carries,CPA,MisCarries,DisCarries,TargPass,RecPass,RecPassPerc,ProgPassRec,CrdY,CrdR,2CrdY,FlsComm,FlsDrawn,Offsides,PKwon,PKcon,OG,Recov,AerialDuelsWon,AerialDuelsLost,AerialDuelsWinPerc,Squad_Goals,Opponent_Goals
0,Karim Benzema,2021-08-14,Sat,La Liga,Matchweek 1,Away,W,Real Madrid,Alavés,1,FW,88.0,2,0,0,6.0,3.0,0.7,0.7,39.0,45.0,86.7,720.0,130.0,39.0,45.0,86.7,39.0,45.0,86.7,39.0,45.0,86.7,0.0,0.0,0.0,2.0,1.0,0.0,4.0,...,0.0,55.0,0.0,3.0,25.0,30.0,10.0,53.0,2.0,2.0,100.0,2.0,0.0,36.0,194.0,78.0,5.0,2.0,2.0,0.0,1.0,58.0,48.0,82.8,5.0,0,0,0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,1.0,0.0,100.0,4,1
1,Karim Benzema,2021-08-22,Sun,La Liga,Matchweek 2,Away,D,Real Madrid,Levante,1,FW,90.0,0,0,0,1.0,0.0,0.1,0.1,27.0,32.0,84.4,365.0,67.0,27.0,32.0,84.4,27.0,32.0,84.4,27.0,32.0,84.4,2.0,0.2,2.0,0.0,3.0,0.0,3.0,...,0.0,39.0,0.0,0.0,17.0,23.0,8.0,35.0,0.0,0.0,,0.0,0.0,25.0,126.0,44.0,3.0,1.0,0.0,1.0,2.0,44.0,29.0,65.9,3.0,0,0,0,1.0,1.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,,3,3
2,Karim Benzema,2021-08-28,Sat,La Liga,Matchweek 3,Away,W,Real Madrid,Betis,1,FW,90.0,0,0,0,2.0,0.0,0.1,0.1,31.0,37.0,83.8,389.0,92.0,31.0,37.0,83.8,31.0,37.0,83.8,31.0,37.0,83.8,1.0,0.3,3.0,0.0,4.0,1.0,5.0,...,0.0,48.0,0.0,4.0,21.0,27.0,4.0,47.0,2.0,2.0,100.0,2.0,1.0,30.0,112.0,74.0,5.0,2.0,2.0,5.0,0.0,48.0,37.0,77.1,5.0,0,0,0,0.0,1.0,3.0,0.0,0.0,0.0,6.0,1.0,2.0,33.3,1,0
3,Karim Benzema,2021-09-01,Wed,WCQ,First round,Home,D,France,Bosnia and Herzegovina,1,FW,75.0,0,0,0,2.0,1.0,,,,,,,,,,,,,,,,,0.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0.0,1.0,1.0,,,0.0,,,,,1,1
4,Karim Benzema,2021-09-04,Sat,WCQ,First round,Away,D,France,Ukraine,0,,27.0,0,0,0,0.0,0.0,,,,,,,,,,,,,,,,,0.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0.0,0.0,1.0,,,0.0,,,,,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24459,Anthony Martial,2018-04-15,Sun,Premier League,Matchweek 34,Home,L,Manchester Utd,West Brom,0,"LB,LW",33.0,0,0,0,1.0,0.0,0.1,0.1,19.0,29.0,65.5,358.0,114.0,19.0,29.0,65.5,19.0,29.0,65.5,19.0,29.0,65.5,0.0,0.0,0.0,3.0,0.0,0.0,4.0,...,0.0,34.0,0.0,0.0,19.0,21.0,2.0,31.0,2.0,2.0,100.0,2.0,0.0,28.0,145.0,97.0,7.0,3.0,0.0,0.0,2.0,30.0,26.0,86.7,7.0,0,0,0,0.0,2.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,0.0,0,1
24460,Anthony Martial,2018-04-18,Wed,Premier League,Matchweek 35,Away,W,Manchester Utd,Bournemouth,1,LW,90.0,0,0,0,2.0,0.0,0.1,0.1,26.0,38.0,68.4,450.0,93.0,26.0,38.0,68.4,26.0,38.0,68.4,26.0,38.0,68.4,0.0,0.1,2.0,5.0,0.0,0.0,2.0,...,0.0,47.0,1.0,7.0,23.0,22.0,13.0,47.0,3.0,4.0,75.0,3.0,1.0,42.0,309.0,215.0,13.0,4.0,6.0,4.0,1.0,57.0,38.0,66.7,13.0,0,0,0,2.0,0.0,1.0,0.0,0.0,0.0,7.0,1.0,2.0,33.3,2,0
24461,Anthony Martial,2018-04-29,Sun,Premier League,Matchweek 36,Home,W,Manchester Utd,Arsenal,0,"LW,CM",27.0,0,0,0,1.0,0.0,0.0,0.0,13.0,20.0,65.0,233.0,47.0,13.0,20.0,65.0,13.0,20.0,65.0,13.0,20.0,65.0,0.0,0.3,1.0,0.0,1.0,1.0,1.0,...,0.0,25.0,0.0,1.0,8.0,18.0,4.0,25.0,2.0,5.0,40.0,2.0,0.0,25.0,145.0,106.0,7.0,1.0,2.0,1.0,0.0,24.0,21.0,87.5,7.0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,,2,1
24462,Anthony Martial,2018-05-04,Fri,Premier League,Matchweek 37,Away,L,Manchester Utd,Brighton,1,LW,90.0,0,0,0,3.0,0.0,0.1,0.1,48.0,55.0,87.3,643.0,192.0,48.0,55.0,87.3,48.0,55.0,87.3,48.0,55.0,87.3,0.0,0.1,2.0,3.0,0.0,0.0,1.0,...,0.0,69.0,1.0,7.0,24.0,42.0,8.0,67.0,1.0,5.0,20.0,2.0,0.0,56.0,231.0,141.0,11.0,3.0,1.0,3.0,3.0,73.0,59.0,80.8,11.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,1.0,50.0,0,1


# Procesamiento del dato

## Primer análisis exploratorio de datos

In [None]:
# Profile the raw per-match frame; minimal=True is passed to ProfileReport
# (presumably pandas-profiling's reduced report configuration — confirm).
profile = pandas_profiling.ProfileReport(df, minimal=True)
profile

## Ingeniería de variables

Definir nuevas variables, seleccionar entre las existentes y borrar campos con muchos valores vacíos.

In [None]:
# Keep only the rows that have an xG value and renumber them from 0, which
# create_var_points relies on when it addresses rows by position.
df_new = df.dropna(subset=['xG']).reset_index(drop=True)

# Derived per-match variables.
df_new['nPG'] = df_new['Gls'] - df_new['PK']          # Gls minus PK
df_new['PKMissed'] = df_new['PKatt'] - df_new['PK']   # PKatt minus PK
df_new['xPK'] = df_new['xG'] - df_new['npxG']         # xG minus npxG
df_new['Puntuacion'] = create_var_points(df_new)

# One-hot encode the match result (Result_D / Result_L / Result_W).
df_result = pd.get_dummies(df_new.Result, prefix='Result')
df_result[['Player', 'Date']] = df_new[['Player', 'Date']]

# df_result shares df_new's index, so the dummy columns are attached by
# index. Merging on (Player, Date) instead would multiply rows whenever a
# player has more than one row for the same date.
df_final = df_new.join(df_result.drop(columns=['Player', 'Date'])).drop(
    columns = ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Squad', 'Opponent',
               'Pos', 'TotCmpPerc', 'ShortCmpPerc', 'MedCmpPerc', 'LongCmpPerc',
               'TklDrib', 'Squad_Goals', 'Opponent_Goals',
               'Gls', 'PKatt', 'xG', 'Att', 'InCK', 'OutCK', 'StrCK', 'Other',
               'CmpPass', 'SCA', 'GCA', 'TklDribPerc', 'PressPerc',
               'PassBlock', 'Tkl+Int', 'Touches', 'SuccDribPerc', 'RecPassPerc',
               'AerialDuelsWinPerc', 'Puntuacion', 'Result', 'Result_D']).fillna(0)
df_final

Unnamed: 0,Player,Start,Min,PK,Sh,SoT,npxG,TotCmp,TotAtt,TotDist,PrgDist,ShortCmp,ShortAtt,MedCmp,MedAtt,LongCmp,LongAtt,Ast,xA,KP,1/3,PPA,CrsPA,Prog,Live,Dead,FK,TB,Press,Sw,Crs,CK,GroundPass,LowPass,HighPass,LeftPass,RightPass,HeadPass,TI,OffSide,...,Clr,Err,DefPenTouches,Def3rdTouches,Mid3rdTouches,Att3rdTouches,AttPenTouches,LiveTouches,SuccDrib,AttDrib,NuPlDrib,Nutmegs,Carries,TotDistCarries,PrgDistCarries,ProgCarries,1/3Carries,CPA,MisCarries,DisCarries,TargPass,RecPass,ProgPassRec,CrdY,CrdR,2CrdY,FlsComm,FlsDrawn,Offsides,PKwon,PKcon,OG,Recov,AerialDuelsWon,AerialDuelsLost,nPG,PKMissed,xPK,Result_L,Result_W
0,Karim Benzema,1,88.0,0,6.0,3.0,0.7,39.0,45.0,720.0,130.0,39.0,45.0,39.0,45.0,39.0,45.0,0.0,0.0,0.0,2.0,1.0,0.0,4.0,43.0,2.0,0.0,0.0,9.0,1.0,0.0,0.0,35.0,7.0,3.0,3.0,39.0,3.0,0.0,0.0,...,0.0,0.0,0.0,3.0,25.0,30.0,10.0,53.0,2.0,2.0,2.0,0.0,36.0,194.0,78.0,5.0,2.0,2.0,0.0,1.0,58.0,48.0,5.0,0,0,0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,1.0,0.0,2,0,0.0,0,1
1,Karim Benzema,1,90.0,0,1.0,0.0,0.1,27.0,32.0,365.0,67.0,27.0,32.0,27.0,32.0,27.0,32.0,2.0,0.2,2.0,0.0,3.0,0.0,3.0,28.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,23.0,7.0,2.0,3.0,28.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,17.0,23.0,8.0,35.0,0.0,0.0,0.0,0.0,25.0,126.0,44.0,3.0,1.0,0.0,1.0,2.0,44.0,29.0,3.0,0,0,0,1.0,1.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,0,0,0.0,0,0
2,Karim Benzema,1,90.0,0,2.0,0.0,0.1,31.0,37.0,389.0,92.0,31.0,37.0,31.0,37.0,31.0,37.0,1.0,0.3,3.0,0.0,4.0,1.0,5.0,36.0,1.0,0.0,0.0,7.0,0.0,1.0,0.0,28.0,3.0,6.0,4.0,30.0,2.0,0.0,0.0,...,0.0,0.0,0.0,4.0,21.0,27.0,4.0,47.0,2.0,2.0,2.0,1.0,30.0,112.0,74.0,5.0,2.0,2.0,5.0,0.0,48.0,37.0,5.0,0,0,0,0.0,1.0,3.0,0.0,0.0,0.0,6.0,1.0,2.0,0,0,0.0,0,1
3,Karim Benzema,1,90.0,1,4.0,2.0,0.3,39.0,44.0,601.0,140.0,39.0,44.0,39.0,44.0,39.0,44.0,1.0,1.3,4.0,3.0,1.0,0.0,5.0,39.0,5.0,0.0,0.0,6.0,0.0,1.0,2.0,39.0,4.0,1.0,13.0,30.0,0.0,1.0,0.0,...,0.0,0.0,0.0,5.0,20.0,31.0,9.0,49.0,1.0,1.0,1.0,0.0,36.0,160.0,89.0,5.0,1.0,0.0,2.0,1.0,67.0,44.0,5.0,0,0,0,2.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,2,0,0.8,0,1
4,Karim Benzema,1,90.0,0,2.0,0.0,0.1,33.0,43.0,575.0,97.0,33.0,43.0,33.0,43.0,33.0,43.0,0.0,0.3,2.0,3.0,3.0,0.0,4.0,43.0,0.0,0.0,0.0,9.0,2.0,0.0,0.0,34.0,7.0,2.0,9.0,33.0,0.0,0.0,0.0,...,0.0,0.0,1.0,4.0,22.0,29.0,6.0,51.0,1.0,1.0,1.0,0.0,41.0,196.0,118.0,6.0,3.0,1.0,3.0,2.0,59.0,45.0,6.0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,2.0,1.0,0,0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19534,Anthony Martial,1,63.0,0,1.0,0.0,0.0,22.0,33.0,290.0,48.0,22.0,33.0,22.0,33.0,22.0,33.0,1.0,0.8,2.0,1.0,1.0,0.0,1.0,33.0,0.0,0.0,0.0,5.0,0.0,3.0,0.0,28.0,3.0,2.0,1.0,30.0,2.0,0.0,0.0,...,0.0,0.0,1.0,1.0,17.0,25.0,5.0,41.0,0.0,0.0,0.0,0.0,37.0,141.0,85.0,6.0,0.0,3.0,0.0,5.0,49.0,36.0,6.0,0,0,0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0,0,0.0,0,1
19535,Anthony Martial,0,33.0,0,1.0,0.0,0.1,19.0,29.0,358.0,114.0,19.0,29.0,19.0,29.0,19.0,29.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,26.0,3.0,0.0,0.0,3.0,1.0,3.0,0.0,19.0,5.0,5.0,5.0,21.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,19.0,21.0,2.0,31.0,2.0,2.0,2.0,0.0,28.0,145.0,97.0,7.0,3.0,0.0,0.0,2.0,30.0,26.0,7.0,0,0,0,0.0,2.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,0,0,0.0,1,0
19536,Anthony Martial,1,90.0,0,2.0,0.0,0.1,26.0,38.0,450.0,93.0,26.0,38.0,26.0,38.0,26.0,38.0,0.0,0.1,2.0,5.0,0.0,0.0,2.0,38.0,0.0,0.0,0.0,9.0,0.0,3.0,0.0,31.0,3.0,4.0,6.0,30.0,1.0,0.0,0.0,...,0.0,0.0,1.0,7.0,23.0,22.0,13.0,47.0,3.0,4.0,3.0,1.0,42.0,309.0,215.0,13.0,4.0,6.0,4.0,1.0,57.0,38.0,13.0,0,0,0,2.0,0.0,1.0,0.0,0.0,0.0,7.0,1.0,2.0,0,0,0.0,0,1
19537,Anthony Martial,0,27.0,0,1.0,0.0,0.0,13.0,20.0,233.0,47.0,13.0,20.0,13.0,20.0,13.0,20.0,0.0,0.3,1.0,0.0,1.0,1.0,1.0,20.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,17.0,1.0,2.0,2.0,17.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,8.0,18.0,4.0,25.0,2.0,5.0,2.0,0.0,25.0,145.0,106.0,7.0,1.0,2.0,1.0,0.0,24.0,21.0,7.0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0,0,0.0,0,1


## Agregar datos por jugador
Generar variables de porcentaje. Borrar variables de intentos relacionadas con dichos porcentajes (estarían correlacionadas).

In [None]:
# Average every per-match column for each player. The means of the 0/1
# columns Start, Result_L and Result_W are the fraction of matches started,
# lost and won, hence the renames.
df_agg = df_final.groupby(['Player']).mean().rename(
                   columns={'Result_L': 'LosePerc', 'Result_W': 'WinPerc',
                            'Start': 'StartPerc'})

# Success rates as successes / attempts, computed from the per-player means
# (values are fractions, not percentages).
df_agg['TotCmpPerc'] = df_agg['TotCmp']/df_agg['TotAtt']
df_agg['ShortCmpPerc'] = df_agg['ShortCmp']/df_agg['ShortAtt']
df_agg['MedCmpPerc'] = df_agg['MedCmp']/df_agg['MedAtt']
df_agg['LongCmpPerc'] = df_agg['LongCmp']/df_agg['LongAtt']
df_agg['TklWPerc'] = df_agg['TklW']/df_agg['Tkl']
# NOTE(review): this divides 'TklDribPast' by 'TklDribAtt'; confirm that the
# "past" count (rather than the successful tackles) is the intended numerator.
df_agg['TklDribPerc'] = df_agg['TklDribPast']/df_agg['TklDribAtt']
df_agg['PressPerc'] = df_agg['PressSucc']/df_agg['PressAtt']
df_agg['SuccDribPerc'] = df_agg['SuccDrib']/df_agg['AttDrib']
df_agg['RecPassPerc'] = df_agg['RecPass']/df_agg['TargPass']
# Aerial duels have no attempts column, so the denominator is won + lost.
# Dividing by the lost duels alone gives an unbounded won/lost ratio (and a
# division by zero for players who lost none), not a success rate.
df_agg['AerialDuelsPerc'] = df_agg['AerialDuelsWon']/(
    df_agg['AerialDuelsWon'] + df_agg['AerialDuelsLost'])

# Drop the denominators now that the rates carry that information.
df_agg = df_agg.drop(columns = ['TotAtt', 'ShortAtt', 'MedAtt', 'LongAtt',
                                'Tkl', 'TklDribAtt', 'PressAtt', 'AttDrib',
                                'TargPass', 'AerialDuelsLost'])
df_agg

Unnamed: 0_level_0,StartPerc,Min,PK,Sh,SoT,npxG,TotCmp,TotDist,PrgDist,ShortCmp,MedCmp,LongCmp,Ast,xA,KP,1/3,PPA,CrsPA,Prog,Live,Dead,FK,TB,Press,Sw,Crs,CK,GroundPass,LowPass,HighPass,LeftPass,RightPass,HeadPass,TI,OffSide,Out,IntPass,BlockPass,PassLiveSCA,PassDeadSCA,...,LiveTouches,SuccDrib,NuPlDrib,Nutmegs,Carries,TotDistCarries,PrgDistCarries,ProgCarries,1/3Carries,CPA,MisCarries,DisCarries,RecPass,ProgPassRec,CrdY,CrdR,2CrdY,FlsComm,FlsDrawn,Offsides,PKwon,PKcon,OG,Recov,AerialDuelsWon,nPG,PKMissed,xPK,LosePerc,WinPerc,TotCmpPerc,ShortCmpPerc,MedCmpPerc,LongCmpPerc,TklWPerc,TklDribPerc,PressPerc,SuccDribPerc,RecPassPerc,AerialDuelsPerc
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Adnan Januzaj,0.592593,52.444444,0.029630,1.488889,0.377778,0.111111,15.622222,302.674074,106.651852,15.622222,15.622222,15.622222,0.133333,0.137037,1.192593,1.355556,1.311111,0.362963,2.422222,19.496296,3.429630,1.022222,0.140741,4.162963,1.874074,1.948148,2.074074,14.637037,2.703704,5.585185,19.000000,3.148148,0.222222,0.303704,0.200000,0.548148,0.622222,1.200000,1.333333,0.548148,...,29.925926,2.066667,2.259259,0.148148,27.318519,141.296296,86.037037,4.948148,1.340741,0.837037,2.022222,1.962963,24.651852,4.948148,0.111111,0.000000,0.000000,0.822222,2.000000,0.133333,0.014815,0.000000,0.00000,2.918519,0.266667,0.081481,0.000000,0.022222,0.318519,0.392593,0.681422,0.681422,0.681422,0.681422,0.608696,0.821918,0.251381,0.574074,0.801348,0.529412
Alex Berenguer,0.650000,57.214286,0.000000,1.092857,0.371429,0.104286,17.385714,280.064286,85.992857,17.385714,17.385714,17.385714,0.100000,0.110000,1.035714,0.892857,0.850000,0.178571,1.864286,21.621429,3.300000,0.735714,0.142857,3.957143,0.892857,1.614286,1.578571,15.014286,4.214286,5.692857,2.950000,19.485714,0.928571,0.935714,0.128571,0.378571,0.657143,1.192857,1.285714,0.400000,...,31.442857,1.578571,1.678571,0.164286,23.700000,127.642857,78.307143,3.785714,1.364286,0.542857,1.450000,1.707143,23.221429,3.785714,0.114286,0.000000,0.000000,1.285714,1.264286,0.314286,0.014286,0.000000,0.00000,4.085714,0.671429,0.121429,0.007143,0.005714,0.328571,0.364286,0.697621,0.697621,0.697621,0.697621,0.634615,0.754098,0.244573,0.571059,0.757809,0.559524
Alexandre Lacazette,0.748503,65.197605,0.041916,1.808383,0.814371,0.300000,14.760479,211.724551,45.077844,14.760479,14.760479,14.760479,0.137725,0.096407,0.826347,0.898204,0.886228,0.095808,1.568862,18.221557,1.263473,0.035928,0.155689,4.485030,0.281437,0.532934,0.000000,14.053892,3.359281,2.071856,2.461078,15.688623,0.520958,0.041916,0.101796,0.000000,0.718563,0.832335,1.383234,0.000000,...,27.365269,1.000000,1.065868,0.077844,19.796407,72.443114,33.095808,1.826347,0.508982,0.389222,1.736527,1.383234,21.095808,1.826347,0.107784,0.005988,0.000000,1.497006,1.712575,0.508982,0.047904,0.000000,0.00000,3.245509,0.832335,0.335329,0.005988,0.035329,0.287425,0.491018,0.757529,0.757529,0.757529,0.757529,0.744828,0.784314,0.274236,0.611722,0.610995,0.373656
Alvaro Morata,0.744565,64.864130,0.016304,2.043478,0.891304,0.347283,12.570652,195.510870,29.510870,12.570652,12.570652,12.570652,0.114130,0.106522,0.891304,0.771739,0.440217,0.065217,0.929348,16.750000,0.679348,0.027174,0.108696,4.608696,0.423913,0.608696,0.000000,10.603261,4.054348,2.771739,1.994565,12.510870,1.065217,0.027174,0.081522,0.000000,0.543478,0.717391,1.456522,0.005435,...,26.815217,0.804348,0.869565,0.097826,17.744565,85.353261,48.119565,2.391304,0.961957,0.586957,2.228261,1.500000,21.217391,2.391304,0.173913,0.010870,0.005435,1.516304,1.793478,0.913043,0.027174,0.000000,0.00000,2.809783,1.875000,0.342391,0.010870,0.022283,0.190217,0.559783,0.721235,0.721235,0.721235,0.721235,0.738318,0.853659,0.268699,0.578125,0.581992,0.871212
Andre Silva,0.732484,66.331210,0.070064,2.222930,0.910828,0.357962,13.707006,192.789809,38.910828,13.707006,13.707006,13.707006,0.095541,0.115924,0.872611,0.783439,0.579618,0.076433,1.375796,17.191083,1.471338,0.050955,0.076433,5.605096,0.146497,0.439490,0.006369,11.853503,4.248408,2.560510,1.898089,13.433121,1.649682,0.178344,0.089172,0.000000,0.598726,0.885350,1.464968,0.006369,...,26.388535,0.764331,0.847134,0.031847,18.369427,79.222930,42.363057,2.426752,0.878981,0.649682,2.089172,1.554140,20.942675,2.426752,0.050955,0.000000,0.000000,0.987261,1.140127,0.643312,0.044586,0.000000,0.00000,3.496815,1.624204,0.343949,0.006369,0.054140,0.343949,0.414013,0.734471,0.734471,0.734471,0.734471,0.660377,0.826923,0.289855,0.468750,0.523984,0.631188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wilfried Zaha,0.972973,86.945946,0.027027,1.945946,0.662162,0.203378,21.878378,340.587838,96.554054,21.878378,21.878378,21.878378,0.094595,0.134459,1.256757,1.398649,1.777027,0.317568,3.391892,27.891892,0.932432,0.094595,0.162162,6.797297,0.601351,2.067568,0.033784,22.858108,3.709459,2.256757,3.756757,23.770270,0.317568,0.567568,0.121622,0.000000,1.236486,1.702703,2.067568,0.000000,...,45.662162,3.364865,3.628378,0.229730,41.195946,214.141892,138.121622,7.527027,2.067568,1.945946,3.358108,4.209459,35.817568,7.527027,0.202703,0.013514,0.013514,1.432432,3.513514,0.567568,0.074324,0.000000,0.00000,6.108108,0.317568,0.236486,0.000000,0.020946,0.391892,0.331081,0.759025,0.759025,0.759025,0.759025,0.711111,0.701987,0.271114,0.553949,0.679615,0.251337
Willian Jose,0.739130,64.223602,0.062112,1.614907,0.559006,0.218634,15.248447,262.211180,43.670807,15.248447,15.248447,15.248447,0.099379,0.094410,0.813665,0.956522,0.465839,0.049689,1.173913,18.776398,1.341615,0.062112,0.055901,4.782609,0.937888,0.372671,0.012422,13.397516,3.521739,3.198758,2.024845,16.124224,0.975155,0.031056,0.111801,0.000000,0.559006,0.720497,1.273292,0.006211,...,25.447205,0.496894,0.515528,0.037267,18.701863,76.236025,26.509317,1.403727,0.484472,0.180124,1.304348,1.136646,20.776398,1.403727,0.093168,0.006211,0.000000,0.739130,1.043478,0.403727,0.006211,0.006211,0.00000,2.776398,0.900621,0.242236,0.024845,0.065839,0.378882,0.409938,0.757950,0.757950,0.757950,0.757950,0.827586,0.756757,0.304348,0.625000,0.635690,0.703883
Wissam Ben Yedder,0.827160,72.574074,0.141975,1.901235,0.895062,0.356790,16.913580,246.419753,51.635802,16.913580,16.913580,16.913580,0.172840,0.166667,1.123457,1.191358,0.895062,0.086420,1.814815,21.172840,1.432099,0.067901,0.209877,5.629630,0.246914,0.759259,0.006173,15.580247,3.919753,3.104938,10.530864,9.993827,1.141975,0.148148,0.135802,0.000000,0.660494,0.956790,1.777778,0.000000,...,29.240741,1.067901,1.172840,0.037037,22.277778,106.055556,62.753086,3.777778,1.209877,0.907407,1.691358,1.401235,23.271605,3.777778,0.061728,0.006173,0.006173,0.567901,0.956790,0.629630,0.024691,0.000000,0.00000,3.672840,0.716049,0.407407,0.012346,0.117284,0.314815,0.487654,0.748225,0.748225,0.748225,0.748225,0.738095,0.770270,0.277475,0.600694,0.592488,0.348348
Xherdan Shaqiri,0.681034,59.370690,0.000000,1.362069,0.474138,0.114655,23.301724,437.758621,129.146552,23.301724,23.301724,23.301724,0.163793,0.154310,1.413793,2.163793,1.405172,0.267241,3.051724,27.163793,3.775862,1.077586,0.189655,5.344828,1.629310,1.663793,1.767241,19.810345,3.836207,7.293103,23.982759,5.370690,0.775862,0.491379,0.146552,0.491379,0.974138,0.991379,1.517241,0.534483,...,34.043103,0.672414,0.715517,0.068966,27.586207,139.215517,63.939655,3.250000,1.241379,0.301724,1.241379,1.181034,27.129310,3.250000,0.086207,0.000000,0.000000,0.689655,0.879310,0.224138,0.000000,0.000000,0.00000,4.465517,0.189655,0.181034,0.008621,0.006897,0.275862,0.465517,0.753135,0.753135,0.753135,0.753135,0.594203,0.662338,0.265748,0.503226,0.788722,0.379310


## Segundo análisis exploratorio de datos

In [None]:
# Profile the per-player aggregated frame (same ProfileReport settings as
# the first exploratory analysis).
profile = pandas_profiling.ProfileReport(df_agg, minimal=True)
profile

## Normalizar datos

In [None]:
# Rescale every aggregated column with scikit-learn's MinMaxScaler using its
# default settings; the scaler is fitted on the same data it transforms.
x_scaled = preprocessing.MinMaxScaler().fit_transform(df_agg.values)
# Wrap the scaled array back into a frame with the original player index and
# column labels.
df_norm = pd.DataFrame(x_scaled, index=df_agg.index, columns=df_agg.columns)
df_norm

Unnamed: 0_level_0,StartPerc,Min,PK,Sh,SoT,npxG,TotCmp,TotDist,PrgDist,ShortCmp,MedCmp,LongCmp,Ast,xA,KP,1/3,PPA,CrsPA,Prog,Live,Dead,FK,TB,Press,Sw,Crs,CK,GroundPass,LowPass,HighPass,LeftPass,RightPass,HeadPass,TI,OffSide,Out,IntPass,BlockPass,PassLiveSCA,PassDeadSCA,...,LiveTouches,SuccDrib,NuPlDrib,Nutmegs,Carries,TotDistCarries,PrgDistCarries,ProgCarries,1/3Carries,CPA,MisCarries,DisCarries,RecPass,ProgPassRec,CrdY,CrdR,2CrdY,FlsComm,FlsDrawn,Offsides,PKwon,PKcon,OG,Recov,AerialDuelsWon,nPG,PKMissed,xPK,LosePerc,WinPerc,TotCmpPerc,ShortCmpPerc,MedCmpPerc,LongCmpPerc,TklWPerc,TklDribPerc,PressPerc,SuccDribPerc,RecPassPerc,AerialDuelsPerc
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Adnan Januzaj,0.372087,0.235824,0.145809,0.173468,0.055595,0.044764,0.223608,0.294515,0.355655,0.223608,0.223608,0.223608,0.283724,0.305484,0.316317,0.181028,0.313958,0.506238,0.231925,0.234878,0.459228,0.426298,0.160967,0.216604,0.569089,0.478769,0.520230,0.223566,0.215265,0.377753,0.345878,0.042155,0.000000,0.127411,0.309360,0.538961,0.179536,0.556851,0.252275,0.368666,...,0.273127,0.379377,0.376506,0.240741,0.315583,0.333951,0.368438,0.367335,0.261895,0.407027,0.453834,0.387662,0.260520,0.367335,0.379669,0.000000,0.000000,0.295544,0.383607,0.049724,0.163786,0.000000,0.000000,0.249166,0.037137,0.027667,0.000000,0.119070,0.539397,0.264318,0.374164,0.374164,0.374164,0.374164,0.251459,0.673638,0.400519,0.524062,0.870836,0.258512
Alex Berenguer,0.466853,0.341472,0.000000,0.084331,0.052235,0.035207,0.265275,0.263476,0.274486,0.265275,0.265275,0.265275,0.194760,0.233373,0.262105,0.100168,0.185832,0.249060,0.164188,0.280414,0.440751,0.306815,0.163388,0.195096,0.259234,0.393697,0.395945,0.232772,0.470279,0.387240,0.045467,0.382432,0.216426,0.392556,0.188251,0.372227,0.196709,0.552432,0.238435,0.269027,...,0.298543,0.282329,0.272084,0.266964,0.252008,0.296520,0.331975,0.273622,0.266933,0.249719,0.235400,0.317932,0.233926,0.273622,0.392260,0.000000,0.000000,0.522456,0.215701,0.237931,0.157937,0.000000,0.000000,0.463166,0.099031,0.076445,0.166964,0.030618,0.563959,0.211847,0.437817,0.437817,0.437817,0.437817,0.333166,0.488045,0.350357,0.513738,0.779385,0.278341
Alexandre Lacazette,0.629456,0.518295,0.206272,0.245379,0.286648,0.309243,0.203247,0.169659,0.113732,0.203247,0.203247,0.203247,0.295444,0.197120,0.189756,0.101103,0.195899,0.133627,0.128322,0.207564,0.150482,0.014983,0.178063,0.250260,0.066156,0.118157,0.000000,0.209336,0.325938,0.068214,0.036316,0.303346,0.091533,0.017585,0.142853,0.000000,0.226914,0.329368,0.266778,0.000000,...,0.230222,0.167292,0.161905,0.126497,0.183424,0.145190,0.118706,0.115661,0.083937,0.167564,0.344775,0.229642,0.194408,0.115661,0.366475,0.175150,0.000000,0.625899,0.318010,0.440433,0.529607,0.000000,0.000000,0.309118,0.123636,0.337633,0.139970,0.189300,0.463430,0.446765,0.673215,0.673215,0.673215,0.673215,0.680589,0.570732,0.568911,0.652996,0.471016,0.155945
Alvaro Morata,0.622956,0.510909,0.080235,0.298293,0.327363,0.375447,0.151506,0.147401,0.052569,0.151506,0.151506,0.151506,0.232473,0.224096,0.212202,0.079002,0.071969,0.090961,0.050680,0.176032,0.067226,0.011332,0.124317,0.263183,0.111148,0.137461,0.000000,0.125134,0.443278,0.129876,0.027584,0.237161,0.258295,0.011400,0.108476,0.000000,0.140812,0.258250,0.288079,0.003655,...,0.221006,0.128390,0.126605,0.158967,0.147375,0.180583,0.189576,0.161207,0.180853,0.273300,0.532485,0.261470,0.196669,0.161207,0.628753,0.317935,0.211957,0.635347,0.336474,0.860694,0.300423,0.000000,0.000000,0.229230,0.283075,0.346256,0.254076,0.119394,0.225931,0.574231,0.530603,0.530603,0.530603,0.530603,0.660068,0.760500,0.528118,0.537936,0.410096,0.483591
Andre Silva,0.603013,0.543403,0.344787,0.338684,0.337695,0.390400,0.178356,0.143665,0.089502,0.178356,0.178356,0.178356,0.182860,0.249172,0.205743,0.081047,0.110703,0.106604,0.104882,0.185484,0.180109,0.021250,0.087418,0.367305,0.023543,0.094346,0.001598,0.155642,0.476039,0.111266,0.025778,0.256369,0.437375,0.074820,0.121448,0.000000,0.167982,0.362170,0.290534,0.004284,...,0.213857,0.120434,0.122571,0.051752,0.158353,0.163777,0.162421,0.164065,0.163100,0.306842,0.479390,0.276227,0.191561,0.164065,0.141080,0.000000,0.000000,0.376342,0.187365,0.580148,0.492923,0.000000,0.000000,0.355194,0.244725,0.348158,0.148885,0.290092,0.601529,0.304024,0.582612,0.582612,0.582612,0.582612,0.414376,0.687336,0.683993,0.163352,0.288257,0.325533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wilfried Zaha,1.000000,1.000000,0.133001,0.276341,0.206096,0.173955,0.371426,0.346563,0.315981,0.371426,0.371426,0.371426,0.180333,0.298610,0.338489,0.188559,0.443419,0.442923,0.349649,0.414772,0.103298,0.039449,0.185467,0.491888,0.167180,0.509198,0.008474,0.424177,0.385055,0.084504,0.060567,0.471670,0.029214,0.238109,0.176467,0.000000,0.481615,0.867885,0.465676,0.000000,...,0.536792,0.637499,0.622707,0.373311,0.559402,0.533656,0.614130,0.575240,0.417402,1.000000,0.963780,1.000000,0.468106,0.575240,0.742939,0.395270,0.527027,0.594285,0.729023,0.501367,0.821697,0.000000,0.000000,0.833964,0.044920,0.216939,0.000000,0.112232,0.718664,0.150297,0.679091,0.679091,0.679091,0.679091,0.574304,0.345437,0.545911,0.455138,0.615146,0.075396
Willian Jose,0.613985,0.496722,0.305655,0.201832,0.151504,0.195315,0.214776,0.238967,0.108203,0.214776,0.214776,0.214776,0.193102,0.191793,0.185373,0.111294,0.079088,0.069304,0.080372,0.219452,0.161620,0.025903,0.063934,0.281356,0.273454,0.077320,0.003116,0.193319,0.353364,0.167499,0.028151,0.312419,0.230700,0.013029,0.159816,0.000000,0.148448,0.260171,0.234824,0.004177,...,0.198084,0.067259,0.062940,0.060559,0.164194,0.155589,0.087637,0.081590,0.078693,0.055753,0.179800,0.162429,0.188470,0.081590,0.308502,0.181677,0.000000,0.254865,0.165308,0.330958,0.068668,0.354037,0.000000,0.223109,0.134078,0.223959,0.580745,0.352774,0.686878,0.296470,0.674868,0.674868,0.674868,0.674868,0.941469,0.495320,0.790775,0.698472,0.522885,0.373404
Wissam Ben Yedder,0.759300,0.681677,0.698668,0.266278,0.329351,0.388760,0.254119,0.217289,0.139498,0.254119,0.254119,0.254119,0.389163,0.384510,0.292426,0.152334,0.198353,0.120533,0.158182,0.270802,0.174517,0.028317,0.240038,0.369869,0.055254,0.175827,0.001548,0.246582,0.420556,0.159233,0.187360,0.184736,0.281814,0.062152,0.200511,0.000000,0.198357,0.406372,0.381450,0.000000,...,0.261646,0.180793,0.181141,0.060185,0.227021,0.237338,0.258604,0.272982,0.233896,0.444657,0.327533,0.234549,0.234859,0.272982,0.183808,0.180556,0.240741,0.171036,0.145524,0.565917,0.272977,0.000000,0.000000,0.387468,0.105854,0.425645,0.288580,0.628427,0.530348,0.440530,0.636656,0.636656,0.636656,0.636656,0.659366,0.532301,0.592777,0.615231,0.432143,0.139279
Xherdan Shaqiri,0.518083,0.389234,0.000000,0.144924,0.106590,0.049726,0.405056,0.479959,0.444036,0.405056,0.405056,0.405056,0.365019,0.351554,0.392755,0.322274,0.340094,0.372731,0.308351,0.399171,0.508577,0.449386,0.216911,0.340107,0.491796,0.406312,0.443268,0.349805,0.406452,0.528228,0.439141,0.088446,0.169636,0.206146,0.218737,0.483144,0.352599,0.427773,0.305727,0.359475,...,0.342111,0.102158,0.098903,0.112069,0.320287,0.328246,0.264202,0.230434,0.240636,0.120776,0.155763,0.174528,0.306579,0.230434,0.280894,0.000000,0.000000,0.230643,0.127841,0.144169,0.000000,0.000000,0.000000,0.532802,0.025360,0.149228,0.201509,0.036953,0.435179,0.399495,0.655947,0.655947,0.655947,0.655947,0.205774,0.236934,0.506373,0.281424,0.844315,0.159668


## Tercer análisis exploratorio de datos

In [None]:
# Build the pandas-profiling report for df_norm. minimal=True asks the
# library for its reduced configuration, which keeps the report tractable
# for a wide table.
profile = pandas_profiling.ProfileReport(
    df_norm,
    minimal=True,
)
# Bare expression as the cell's last statement so the notebook renders it.
profile

## Selección final de variables 
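Tras observar la correlación entre las distintas variables, realizamos una selección final de las mismas: se descarta toda variable cuya correlación absoluta con alguna de las variables anteriores supere 0,75.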

In [None]:
# Absolute pairwise correlation between the columns of df_norm.
df_corr = df_norm.corr().abs()
# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# pair of columns is inspected exactly once. The builtin ``bool`` is used
# because the ``np.bool`` alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(df_corr.shape, dtype=bool), k=1)
upper_tri = df_corr.where(upper_mask)
# A column is dropped when its absolute correlation with any column that
# precedes it exceeds 0.75; NaN cells (masked out above) compare as False.
to_drop = [column for column in upper_tri.columns
           if (upper_tri[column] > 0.75).any()]
# ``columns=`` already fixes the axis, so no separate ``axis`` is needed.
df_sel = df_norm.drop(columns=to_drop)
# Bare expression as the cell's last statement so the notebook renders it.
df_sel

# Modelización del dato

## Primer algoritmo: clustering jerárquico aglomerativo (enlace de Ward)

In [None]:
# Hierarchical clustering of the rows of df_sel using Ward linkage.
linkage_matrix = sch.linkage(df_sel.values, method='ward')

# Wide figure so the leaf labels (the index of df_sel) stay legible.
plt.figure(figsize=(30, 8))
plt.ylabel('Distancias Euclidianas')
plt.xlabel('Jugadores')
plt.title('Dendograma')

# color_threshold=4 sets the linkage distance below which each subtree is
# drawn in its own colour.
dendrogram = sch.dendrogram(
    linkage_matrix,
    color_threshold=4,
    leaf_font_size=12.,
    labels=df_sel.index,
)
plt.show()

## Segundo algoritmo: clustering jerárquico aglomerativo (enlace completo)

In [None]:
# Hierarchical clustering of the rows of df_sel using complete linkage.
linkage_matrix = sch.linkage(df_sel.values, method='complete')

# Wide figure so the leaf labels (the index of df_sel) stay legible.
plt.figure(figsize=(30, 8))
plt.ylabel('Distancias Euclidianas')
plt.xlabel('Jugadores')
plt.title('Dendograma')

# color_threshold=2.5 sets the linkage distance below which each subtree is
# drawn in its own colour.
dendrogram = sch.dendrogram(
    linkage_matrix,
    color_threshold=2.5,
    leaf_font_size=12.,
    labels=df_sel.index,
)
plt.show()

## Tercer algoritmo: K-Means

In [None]:
# --- K-means: 3 clusters fitted on the selected features, fixed seed ---
clustering = KMeans(n_clusters=3, random_state=1234)
clustering.fit(df_sel)

# --- PCA: 2 components, used only to draw the clusters in a plane ---
pca = PCA(n_components=2, random_state=1234)
pca_vals = pca.fit(df_sel.values).transform(df_sel.values)
df_pca = pd.DataFrame(pca_vals, columns=['PC1', 'PC2'])

# --- Scatter plot: one point per row of df_sel, coloured by cluster ---
plt.figure(figsize=(20, 10))
plt.scatter(
    df_pca['PC1'],
    df_pca['PC2'],
    c=clustering.labels_,
    cmap='rainbow',
    alpha=0.7,
    edgecolors='b',
)
# Label each point with the matching index entry of df_sel; the rows of
# pca_vals are in the same order as df_sel.
for name, (pc1, pc2) in zip(df_sel.index, pca_vals):
    plt.annotate(name, (pc1, pc2))
plt.show()

# Share of variance captured by each of the two components.
print(pca.explained_variance_ratio_)
# Loadings: weight of every selected feature in each component.
df_comp = pd.DataFrame(
    pca.components_,
    index=['PC1', 'PC2'],
    columns=df_sel.columns,
)
# Bare expression as the cell's last statement so the notebook renders it.
df_comp

## Cuarto algoritmo: modelo de mezcla de gaussianas (GMM)

In [None]:
# --- Gaussian mixture: 3 components fitted on the selected features ---
gmm = GaussianMixture(n_components=3, random_state=1234)
gmm.fit(df_sel.values)
# Hard assignment of every row to a mixture component.
labels = gmm.predict(df_sel.values)

# --- PCA: 2 components, used only to draw the clusters in a plane ---
pca = PCA(n_components=2, random_state=1234)
X_pca_array = pca.fit(df_sel.values).transform(df_sel.values)
X_pca = pd.DataFrame(X_pca_array, columns=['PC1', 'PC2'])

# --- Scatter plot: one point per row of df_sel, coloured by component ---
plt.figure(figsize=(20, 10))
plt.scatter(
    X_pca['PC1'],
    X_pca['PC2'],
    c=labels,
    cmap='rainbow',
    alpha=0.7,
    edgecolors='b',
)
# Label each point with the matching index entry of df_sel; the rows of
# X_pca_array are in the same order as df_sel.
for name, (pc1, pc2) in zip(df_sel.index, X_pca_array):
    plt.annotate(name, (pc1, pc2))
plt.show()

# Share of variance captured by each of the two components.
print(pca.explained_variance_ratio_)
# Loadings: weight of every selected feature in each component.
df_comp = pd.DataFrame(
    pca.components_,
    index=['PC1', 'PC2'],
    columns=df_sel.columns,
)
# Bare expression as the cell's last statement so the notebook renders it.
df_comp