# Аналитика параметров спортсменов с помощью методов Big Data

In [45]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

In [46]:
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

### Подгружаем датасет профессиональных спортсменов и обрабатываем

In [47]:
# Relative path of the CSV file holding the player statistics.
FILE_PATH = './player_stats.csv'

# Load the CSV, decoding its bytes as ISO-8859-1.
# NOTE(review): the preview shows both "Thomas DÃ¤hne" (mojibake) and
# "Sadio Mané" (correct), so the file may mix encodings — confirm before
# relying on player/club names.
df = pd.read_csv(FILE_PATH, encoding='ISO-8859-1')

# Bare expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,player,country,height,weight,age,club,ball_control,dribbling,marking,slide_tackle,...,curve,fk_acc,penalties,volleys,gk_positioning,gk_diving,gk_handling,gk_kicking,gk_reflexes,value
0,Cristian Castro Devenish,Colombia,192,84,22,Atl. Nacional,55,43,,68,...,32,34,41,33,10,11,6,7,9,$1.400.000
1,Silaldo Taffarel,Brazil,181,80,31,Corinthians,69,70,,56,...,65,62,48,46,12,15,14,8,14,$975.00
2,Thomas DÃ¤hne,Germany,193,84,29,Holstein Kiel,25,12,,13,...,20,15,26,16,64,74,65,68,74,$1.100.000
3,Michael Sollbauer,Austria,187,86,33,SK Rapid Wien,46,48,,66,...,25,13,22,19,10,10,8,14,9,$650.00
4,Diego Segovia,Uruguay,191,80,23,Independiente,14,8,,14,...,9,10,16,5,61,59,62,64,64,$300.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5677,Mohamed Salah,Egypt,175,71,31,Liverpool,86,88,,41,...,84,69,86,83,11,14,14,9,14,$99.500.000
5678,Sadio Mané,Senegal,174,69,31,FC Bayern München,86,88,,38,...,77,64,75,75,7,10,10,15,14,$101.000.000
5679,Neymar Jr,Brazil,175,68,31,Paris SG,94,95,,29,...,88,87,91,86,15,9,9,15,11,$99.500.000
5680,Cristiano Ronaldo,Portugal,187,83,38,Al Nassr,87,81,,24,...,79,75,90,85,14,7,11,15,11,$31.000.000


In [48]:
# Display the column labels of the loaded frame.
df.columns

Index(['player', 'country', 'height', 'weight', 'age', 'club', 'ball_control',
       'dribbling', 'marking', 'slide_tackle', 'stand_tackle', 'aggression',
       'reactions', 'att_position', 'interceptions', 'vision', 'composure',
       'crossing', 'short_pass', 'long_pass', 'acceleration', 'stamina',
       'strength', 'balance', 'sprint_speed', 'agility', 'jumping', 'heading',
       'shot_power', 'finishing', 'long_shots', 'curve', 'fk_acc', 'penalties',
       'volleys', 'gk_positioning', 'gk_diving', 'gk_handling', 'gk_kicking',
       'gk_reflexes', 'value'],
      dtype='object')

#### Описание столбцов

|Наименование столбца        |Описание                                      |
|:--------------------------|:----------------------------------------------|
|`player`                    | Имя игрока                                   |
|`country`                   | Страна игрока                                |
|`height`                    | Рост в см                                    |
|`weight`                    | Вес в кг                                     |
|`age`                       | Возраст                                      |
|`club`                      | Клуб за который играет                       |
|`ball_control`              | Умение владеть мячом                         |
|`dribbling`                 | Способность к дриблингу                      |
|`marking`                   | Способность опекать соперника *(опека)*      |
|`slide_tackle`              | Способность выполнять скользящий подкат      |
|`stand_tackle`              | Способность выполнять подкат стоя            |
|`aggression`                | Уровень агрессии                             |
|`reactions`                 | Время реакции игрока                         |
|`att_position`              | Позиция для атакующих действий               |
|`interceptions`             | Умение перехватывать передачи                |
|`vision`                    | Уровень видения игрока на поле               |
|`composure`                 | Уровень хладнокровия *(в момент давления)*   |
|`crossing`                  | Способность выполнять кроссы                 |
|`short_pass`                | Точность коротких передач                    |
|`long_pass`                 | Точность дальних передач                     |
|`acceleration`              | Ускорение                                    |
|`stamina`                   | Уровень выносливости                         |
|`strength`                  | Физическая сила                              |
|`balance`                   | Равновесие игрока                            |
|`sprint_speed`              | Скорость спринта                             |
|`agility`                   | Ловкость в маневрировании                    |
|`jumping`                   | Способность к прыжкам                        |
|`heading`                   | Уровень игры головой                         |
|`shot_power`                | Сила удара                                   |
|`finishing`                 | Навык завершения атак *(реализация)*         |
|`long_shots`                | Навык наносить дальние удары                 |
|`curve`                     | Способность отправлять мяч по кривой         |
|`fk_acc`                    | Точность штрафного удара                     |
|`penalties`                 | Навык исполнения пенальти                    |
|`volleys`                   | Навык выполнения ударов с лёта               |
|`gk_positioning`            | Атрибут расположения вратаря **(специфично для вратаря)**                  |
|`gk_diving`                 | Способность к прыжкам вратаря **(специфично для вратаря)**                 |
|`gk_handling`               | Умение вратаря обращаться с мячом **(специфично для вратаря)**             |
|`gk_kicking`                | Способность вратаря наносить удары **(специфично для вратаря)**            |
|`gk_reflexes`               | Уровень рефлексов вратаря **(специфично для вратаря)**                     |
|`value`                     | Ценность/стоимость игрока *($)*                                            |

### Этап EDA

In [66]:
# Display the (rows, columns) tuple of the frame.
# NOTE(review): this cell is numbered In [66] and prints (5679, 40), while the
# info() output that follows reports 5682 rows and 41 columns — it appears to
# have been executed after the cleaning cells; re-run the notebook top to bottom.
df.shape

(5679, 40)

In [49]:
# Print the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5682 entries, 0 to 5681
Data columns (total 41 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   player          5682 non-null   object 
 1   country         5682 non-null   object 
 2   height          5682 non-null   int64  
 3   weight          5682 non-null   int64  
 4   age             5682 non-null   int64  
 5   club            5682 non-null   object 
 6   ball_control    5682 non-null   int64  
 7   dribbling       5682 non-null   int64  
 8   marking         0 non-null      float64
 9   slide_tackle    5682 non-null   int64  
 10  stand_tackle    5682 non-null   int64  
 11  aggression      5682 non-null   int64  
 12  reactions       5682 non-null   int64  
 13  att_position    5682 non-null   int64  
 14  interceptions   5682 non-null   int64  
 15  vision          5682 non-null   int64  
 16  composure       5682 non-null   int64  
 17  crossing        5682 non-null   i

In [50]:
# Remove fully duplicated rows, modifying df in place (no new frame is returned).
df.drop_duplicates(inplace=True)

In [51]:
def player_value_to_num(value):
    """Convert a formatted price such as ``'$1.400.000'`` into an integer.

    Args:
        value: Price text with an optional leading ``$`` and ``.`` used as
            the digit-grouping separator.

    Returns:
        The price as an ``int``, e.g. ``'$1.400.000'`` -> ``1400000``.

    Raises:
        ValueError: If the text left after removing ``$`` and ``.`` is not a
            valid integer (for example an empty string).
    """
    # Strip only a leading currency sign instead of blindly dropping the first
    # character, so an input without '$' does not lose its first digit.
    digits = value.strip().lstrip('$').replace('.', '')
    # NOTE(review): inputs like '$975.00' become 97500; whether the source
    # meant 975000 (a trailing zero lost upstream) cannot be told from here —
    # confirm against the raw data.
    return int(digits)

# The value column (player price) is read as object dtype, with entries that
# look like "$1.400.000". Pass every entry through player_value_to_num to
# strip that formatting.
df['value'] = df['value'].apply(player_value_to_num)

In [52]:
# Percentage of missing values per column.
df.isna().mean() * 100

player              0.0
country             0.0
height              0.0
weight              0.0
age                 0.0
club                0.0
ball_control        0.0
dribbling           0.0
marking           100.0
slide_tackle        0.0
stand_tackle        0.0
aggression          0.0
reactions           0.0
att_position        0.0
interceptions       0.0
vision              0.0
composure           0.0
crossing            0.0
short_pass          0.0
long_pass           0.0
acceleration        0.0
stamina             0.0
strength            0.0
balance             0.0
sprint_speed        0.0
agility             0.0
jumping             0.0
heading             0.0
shot_power          0.0
finishing           0.0
long_shots          0.0
curve               0.0
fk_acc              0.0
penalties           0.0
volleys             0.0
gk_positioning      0.0
gk_diving           0.0
gk_handling         0.0
gk_kicking          0.0
gk_reflexes         0.0
value               0.0
dtype: float64

In [53]:
# Absolute number of missing values per column.
df.isna().sum()

player               0
country              0
height               0
weight               0
age                  0
club                 0
ball_control         0
dribbling            0
marking           5679
slide_tackle         0
stand_tackle         0
aggression           0
reactions            0
att_position         0
interceptions        0
vision               0
composure            0
crossing             0
short_pass           0
long_pass            0
acceleration         0
stamina              0
strength             0
balance              0
sprint_speed         0
agility              0
jumping              0
heading              0
shot_power           0
finishing            0
long_shots           0
curve                0
fk_acc               0
penalties            0
volleys              0
gk_positioning       0
gk_diving            0
gk_handling          0
gk_kicking           0
gk_reflexes          0
value                0
dtype: int64

In [54]:
# Every value in the marking column is missing, so the column carries no information.
df.drop(columns='marking', inplace=True)

In [55]:
# Display the (rows, columns) tuple again after the cleaning steps.
df.shape

(5679, 40)

In [67]:
# df_nums: working copy of df intended to hold only numeric features.
# The free-text identifiers (player, club) are removed in one step; country
# is deliberately kept because it is label-encoded further down.
df_nums = df.drop(columns=['player', 'club'])

#### Проверять датасет на аномалии не будем, поскольку все данные - параметры спортсменов, характеризующие их силовые показатели  

In [57]:
# Share of players per country, in percent (the output lists 135 distinct countries).
df['country'].value_counts(normalize=True).mul(100)

country
England                 9.068498
Germany                 6.849797
Spain                   6.268709
France                  5.441099
Argentina               5.317838
                          ...   
Mauritania              0.017609
Central African Rep.    0.017609
Guyana                  0.017609
Antigua & Barbuda       0.017609
Dominican Republic      0.017609
Name: proportion, Length: 135, dtype: float64

In [58]:
# LabelEncoder assigns an integer code to each unique value of the feature.
# NOTE(review): the codes are arbitrary identifiers for a nominal feature, so
# correlations computed against the encoded country column are presumably not
# meaningful — confirm whether country should be part of the matrix at all.
encoder = LabelEncoder()
df_nums['country'] = encoder.fit_transform(df_nums['country'])

In [59]:
# Preview the first five rows of the numeric working frame.
df_nums.head(5)

Unnamed: 0,country,height,weight,age,ball_control,dribbling,slide_tackle,stand_tackle,aggression,reactions,...,curve,fk_acc,penalties,volleys,gk_positioning,gk_diving,gk_handling,gk_kicking,gk_reflexes,value
0,24,192,84,22,55,43,68,73,72,68,...,32,34,41,33,10,11,6,7,9,1400000
1,14,181,80,31,69,70,56,58,62,70,...,65,62,48,46,12,15,14,8,14,97500
2,47,193,84,29,25,12,13,16,27,65,...,20,15,26,16,64,74,65,68,74,1100000
3,8,187,86,33,46,48,66,69,71,64,...,25,13,22,19,10,10,8,14,9,65000
4,130,191,80,23,14,8,14,16,28,50,...,9,10,16,5,61,59,62,64,64,30000


In [60]:
# Pairwise correlation matrix of the df_nums columns (pandas default method).
corr = df_nums.corr()

# Render the matrix with a colour gradient so strong relations stand out.
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,country,height,weight,age,ball_control,dribbling,slide_tackle,stand_tackle,aggression,reactions,att_position,interceptions,vision,composure,crossing,short_pass,long_pass,acceleration,stamina,strength,balance,sprint_speed,agility,jumping,heading,shot_power,finishing,long_shots,curve,fk_acc,penalties,volleys,gk_positioning,gk_diving,gk_handling,gk_kicking,gk_reflexes,value
country,1.0,0.000415,-0.042527,-0.024059,-0.02409,-0.021857,-0.019191,-0.013748,-0.036609,-0.0379,-0.005135,-0.005645,-0.012488,-0.036702,-0.011998,-0.014773,-0.010712,-0.024997,-0.028946,-0.058838,-0.012031,-0.029892,-0.022319,-0.043406,-0.036081,-0.018857,-0.002071,-0.014595,0.007262,-0.013686,-0.026169,-0.014448,0.013854,0.01444,0.015432,0.011592,0.012173,-0.011924
height,0.000415,1.0,0.758482,0.077817,-0.410968,-0.488841,-0.127468,-0.124719,-0.092996,-0.005857,-0.443762,-0.111714,-0.364935,-0.189543,-0.516881,-0.348977,-0.32125,-0.55163,-0.307312,0.528468,-0.768037,-0.452734,-0.645372,0.038296,0.000551,-0.171695,-0.381036,-0.391582,-0.469235,-0.424115,-0.325945,-0.354664,0.361496,0.368764,0.367475,0.363156,0.370085,-0.014166
weight,-0.042527,0.758482,1.0,0.214683,-0.334754,-0.404859,-0.107964,-0.103263,-0.017832,0.081932,-0.353098,-0.081381,-0.281985,-0.092072,-0.414504,-0.274267,-0.249243,-0.474077,-0.234633,0.591936,-0.63439,-0.387934,-0.543512,0.107552,0.030477,-0.075776,-0.302407,-0.294764,-0.375185,-0.327091,-0.239325,-0.264813,0.330812,0.334247,0.333112,0.330928,0.335554,0.008124
age,-0.024059,0.077817,0.214683,1.0,0.091833,0.03689,0.089874,0.102422,0.240099,0.407265,0.106898,0.171277,0.200688,0.356812,0.130823,0.141986,0.185459,-0.135033,0.127153,0.349079,-0.046371,-0.124212,-0.017405,0.241094,0.159391,0.273609,0.094827,0.166352,0.147298,0.185162,0.160479,0.155398,0.101409,0.087679,0.09513,0.093489,0.093897,0.079491
ball_control,-0.02409,-0.410968,-0.334754,0.091833,1.0,0.946902,0.447432,0.481412,0.614991,0.47804,0.875352,0.473138,0.70945,0.761637,0.841835,0.92607,0.809897,0.700787,0.752235,0.130475,0.641623,0.683807,0.716963,0.223442,0.708649,0.610369,0.810097,0.838089,0.839117,0.772914,0.781658,0.800099,-0.800514,-0.807923,-0.806373,-0.799788,-0.806453,0.276531
dribbling,-0.021857,-0.488841,-0.404859,0.03689,0.946902,1.0,0.35233,0.382864,0.52226,0.408217,0.913825,0.37158,0.730208,0.70123,0.870092,0.85953,0.744795,0.759706,0.716712,0.01757,0.694915,0.731767,0.778467,0.15431,0.61048,0.614685,0.846194,0.855499,0.862399,0.783748,0.785357,0.818627,-0.775359,-0.781598,-0.779948,-0.776691,-0.781232,0.252159
slide_tackle,-0.019191,-0.127468,-0.107964,0.089874,0.447432,0.35233,1.0,0.977909,0.733617,0.244573,0.195502,0.947442,0.140089,0.362678,0.449315,0.55957,0.606641,0.234684,0.58744,0.275101,0.235802,0.248088,0.190895,0.30596,0.543449,-0.021079,0.01389,0.17233,0.293901,0.253189,0.105451,0.071752,-0.547843,-0.549259,-0.54762,-0.546357,-0.547501,0.074831
stand_tackle,-0.013748,-0.124719,-0.103263,0.102422,0.481412,0.382864,0.977909,1.0,0.751273,0.273263,0.232497,0.955764,0.177452,0.400252,0.469928,0.593446,0.632486,0.246409,0.60938,0.295019,0.24794,0.260701,0.210164,0.306536,0.566278,0.016863,0.054892,0.21293,0.327121,0.286758,0.142678,0.113108,-0.570099,-0.570141,-0.56878,-0.567455,-0.569028,0.094232
aggression,-0.036609,-0.092996,-0.017832,0.240099,0.614991,0.52226,0.733617,0.751273,1.0,0.425708,0.45397,0.753376,0.331361,0.574807,0.50943,0.670473,0.642228,0.317808,0.679119,0.470723,0.282022,0.347192,0.302757,0.436239,0.726747,0.30308,0.324422,0.433237,0.453207,0.416146,0.388798,0.371963,-0.615062,-0.617517,-0.616398,-0.613896,-0.614358,0.18402
reactions,-0.0379,-0.005857,0.081932,0.407265,0.47804,0.408217,0.244573,0.273263,0.425708,1.0,0.411027,0.328697,0.538407,0.69589,0.388055,0.536761,0.512644,0.21312,0.412007,0.335534,0.187751,0.219597,0.280356,0.301707,0.380769,0.522502,0.373311,0.431836,0.425192,0.394106,0.381831,0.409543,-0.087849,-0.097191,-0.095169,-0.093717,-0.09091,0.498378


#### Выделим наиболее интересующие нас признаки для анализа
* `shot_power`
* `ball_control`
* `dribbling`
* `stamina`
* `sprint_speed`
* `finishing`

In [68]:
# Scratch note listing candidate features; as a bare string expression it has
# no effect beyond being echoed as the cell output.
# NOTE(review): 'long_shot' does not match the column name 'long_shots', and
# 'dribbling' is listed twice — confirm the intended feature list.
''' 
shot_power
ball_control 
dribbling
stamina
sprint_speed
finishing
short_pass
composure
long_shot
att_position
dribbling
'''

' \nball_control \nshort_pass\ndribbling\ncomposure\nlong_shot\nshot_power\nstamina\natt_position\nsprint_speed\ndribbling\n'