In [9]:
import numpy as np
import pandas as pd
# Never truncate wide frames, and render floats in fixed-point notation with
# six decimals instead of scientific notation.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:f}'.format

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.formula.api import ols

In [2]:
# Read fifa_preprocessed.csv (path is relative to the working directory) into a DataFrame.
fifa_preprocessed = pd.read_csv('fifa_preprocessed.csv')

In [3]:
# Work on a copy so the frame as loaded from disk stays unmodified.
fifa = fifa_preprocessed.copy()

In [4]:
fifa.columns

Index(['ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall',
       'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
       'Preferred Foot', 'International Reputation', 'Weak Foot',
       'Skill Moves', 'Work Rate', 'Body Type', 'Real Face', 'Position',
       'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
       'Height', 'Weight', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
       'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'Crossing',
       'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
       'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiv

# 변수선택 1. 각 포지션별로 점수 중간값이 Top 5인 세부기술들을 넣는다.

In [6]:
# Keep the position label together with the 34 individual skill ratings.
skills = fifa.loc[:, [
    'Position',
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing',
    'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
    'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed',
    'Agility', 'Reactions', 'Balance', 'ShotPower',
    'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision',
    'Penalties', 'Composure', 'Marking', 'StandingTackle',
    'SlidingTackle', 'GKDiving', 'GKHandling', 'GKKicking',
    'GKPositioning', 'GKReflexes',
]]

### 포지션별 세부기술 점수 중간값

In [7]:
# Median of every skill within each position, transposed so that rows are
# skills and columns are position codes.
# (The former `.rename(columns={'index': 'skill'})` was dropped: after the
# transpose the columns are position codes, so no column is called 'index'
# and the rename never changed anything.)
skill_med = skills.groupby('Position').median().T
skill_med

Position,CAM,CB,CDM,CF,CM,GK,LAM,LB,LCB,LCM,LDM,LF,LM,LS,LW,LWB,RAM,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
Crossing,61.0,36.0,52.0,55.0,55.0,13.0,71.0,64.0,43.0,62.0,58.0,65.0,63.0,55.0,62.0,64.0,71.0,63.0,40.0,60.0,57.5,67.0,63.0,54.0,63.0,62.0,46.0
Finishing,60.0,27.0,44.0,64.5,50.0,12.0,61.0,39.0,31.0,56.0,52.0,69.0,60.0,69.0,62.0,42.0,65.0,39.0,31.0,57.0,52.0,67.5,60.0,69.0,62.0,44.0,66.0
HeadingAccuracy,48.0,64.0,57.0,54.5,51.0,14.0,56.0,56.0,68.0,55.0,58.0,58.0,48.0,65.0,49.0,54.5,49.0,57.0,69.0,56.0,59.0,54.5,49.0,66.0,49.0,53.0,64.0
ShortPassing,68.0,57.0,68.0,62.0,67.0,27.0,71.0,63.0,61.0,70.0,70.0,73.0,64.0,64.0,64.0,61.5,72.0,63.0,60.0,71.0,71.0,70.5,64.0,63.0,64.0,62.0,58.0
Volleys,56.0,30.0,42.0,58.0,46.0,12.0,62.0,37.0,31.0,52.0,47.0,64.0,54.0,63.0,55.0,39.5,65.0,37.0,32.0,53.0,48.0,63.5,53.0,63.0,55.0,40.0,58.0
Dribbling,68.0,42.0,60.0,68.0,63.0,13.0,74.0,62.0,48.0,67.0,64.0,75.0,68.0,68.0,70.0,62.0,73.0,62.0,47.5,66.0,64.0,69.0,68.0,68.0,70.0,62.0,63.0
Curve,63.0,32.0,48.0,58.5,54.0,14.0,68.0,54.0,36.0,61.0,55.0,70.0,60.0,57.0,59.0,59.0,67.0,50.0,35.0,61.0,54.0,70.5,59.0,57.0,59.0,51.0,50.0
FKAccuracy,59.0,31.0,46.0,51.0,49.0,13.0,63.0,42.0,32.0,59.0,54.0,60.0,52.0,48.0,50.0,50.5,71.0,38.0,32.0,57.0,52.0,66.0,50.0,47.0,49.0,37.0,41.0
LongPassing,63.0,51.0,64.0,55.0,64.0,24.0,66.0,57.0,57.0,67.0,66.0,68.0,58.0,52.0,56.0,58.0,63.0,57.0,57.0,67.0,68.0,66.5,58.0,50.0,56.0,58.0,44.0
BallControl,69.0,52.0,65.0,67.0,65.0,20.0,73.0,63.0,57.0,69.0,68.0,74.0,67.0,68.0,68.0,64.0,74.0,63.0,56.0,69.0,68.0,70.5,66.0,69.0,68.0,62.0,64.0


In [8]:
# Skills ordered from highest to lowest median among players listed at CB.
skill_med.loc[:, 'CB'].sort_values(ascending=False)

Strength          75.000000
Jumping           71.000000
StandingTackle    66.000000
Aggression        65.000000
HeadingAccuracy   64.000000
SlidingTackle     63.000000
Marking           63.000000
Stamina           63.000000
Interceptions     62.000000
SprintSpeed       60.000000
Reactions         59.000000
Acceleration      58.000000
ShortPassing      57.000000
Composure         56.000000
Balance           56.000000
Agility           52.000000
BallControl       52.000000
LongPassing       51.000000
ShotPower         46.000000
Dribbling         42.000000
Penalties         40.000000
Vision            38.000000
Crossing          36.000000
Curve             32.000000
Positioning       31.000000
FKAccuracy        31.000000
LongShots         30.000000
Volleys           30.000000
Finishing         27.000000
GKPositioning     11.000000
GKKicking         11.000000
GKReflexes        11.000000
GKDiving          10.000000
GKHandling        10.000000
Name: CB, dtype: float64

In [10]:
# OLS of the `CB` column of `fifa` on the five skills with the highest CB
# median. The fit uses every row of `fifa`, not only players whose Position
# is CB.
res = ols(
    formula='CB ~ Strength + Jumping + StandingTackle + Aggression + HeadingAccuracy',
    data=fifa,
).fit()
res.summary()

0,1,2,3
Dep. Variable:,CB,R-squared:,0.938
Model:,OLS,Adj. R-squared:,0.938
Method:,Least Squares,F-statistic:,55210.0
Date:,"Tue, 14 Jan 2020",Prob (F-statistic):,0.0
Time:,20:37:54,Log-Likelihood:,-55453.0
No. Observations:,18147,AIC:,110900.0
Df Residuals:,18141,BIC:,111000.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.8118,0.259,-6.985,0.000,-2.320,-1.303
Strength,-0.1718,0.004,-47.925,0.000,-0.179,-0.165
Jumping,-0.0175,0.004,-4.910,0.000,-0.024,-0.010
StandingTackle,0.5245,0.003,197.518,0.000,0.519,0.530
Aggression,0.1989,0.004,50.765,0.000,0.191,0.207
HeadingAccuracy,0.5248,0.003,163.880,0.000,0.519,0.531

0,1,2,3
Omnibus:,196.916,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,332.999
Skew:,0.048,Prob(JB):,4.9e-73
Kurtosis:,3.657,Cond. No.,896.0


TODO: AIC, BIC, Cond. No. 값이 커 보이는데, 크고 작음을 판단하는 기준을 확인할 것. (AIC/BIC는 절대적인 크기가 아니라, 같은 데이터에 적합한 모형끼리 비교할 때 의미가 있는 상대 지표이다.)

In [11]:
# Pairwise correlations among the five predictors used in the CB model.
fifa.loc[:, [
    'Strength',
    'Jumping',
    'StandingTackle',
    'Aggression',
    'HeadingAccuracy',
]].corr()

Unnamed: 0,Strength,Jumping,StandingTackle,Aggression,HeadingAccuracy
Strength,1.0,0.284188,0.33199,0.473874,0.486947
Jumping,0.284188,1.0,0.260769,0.373426,0.380146
StandingTackle,0.33199,0.260769,1.0,0.744142,0.561103
Aggression,0.473874,0.373426,0.744142,1.0,0.692888
HeadingAccuracy,0.486947,0.380146,0.561103,0.692888,1.0


StandingTackle과 Aggression은 상관계수가 0.74로 높으므로 둘 중 하나만 넣는다 (아래 모형에서는 Aggression을 제외).

In [12]:
# Same CB model with Aggression removed from the predictors.
res = ols(
    formula='CB ~ Strength + Jumping + StandingTackle + HeadingAccuracy',
    data=fifa,
).fit()
res.summary()

0,1,2,3
Dep. Variable:,CB,R-squared:,0.93
Model:,OLS,Adj. R-squared:,0.93
Method:,Least Squares,F-statistic:,59870.0
Date:,"Tue, 14 Jan 2020",Prob (F-statistic):,0.0
Time:,20:41:51,Log-Likelihood:,-56659.0
No. Observations:,18147,AIC:,113300.0
Df Residuals:,18142,BIC:,113400.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.7499,0.277,-6.313,0.000,-2.293,-1.207
Strength,-0.1363,0.004,-36.273,0.000,-0.144,-0.129
Jumping,0.0076,0.004,2.024,0.043,0.000,0.015
StandingTackle,0.6045,0.002,264.591,0.000,0.600,0.609
HeadingAccuracy,0.5876,0.003,186.127,0.000,0.581,0.594

0,1,2,3
Omnibus:,131.781,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,181.456
Skew:,0.095,Prob(JB):,3.96e-40
Kurtosis:,3.451,Cond. No.,805.0


5개 변수를 하나씩 넣어본다.

In [14]:
# Single-predictor fit: CB on Strength only.
res = ols(formula='CB ~ Strength', data=fifa).fit()
res.summary()

0,1,2,3
Dep. Variable:,CB,R-squared:,0.136
Model:,OLS,Adj. R-squared:,0.136
Method:,Least Squares,F-statistic:,2859.0
Date:,"Tue, 14 Jan 2020",Prob (F-statistic):,0.0
Time:,20:42:18,Log-Likelihood:,-79405.0
No. Observations:,18147,AIC:,158800.0
Df Residuals:,18145,BIC:,158800.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,9.6765,0.757,12.788,0.000,8.193,11.160
Strength,0.6083,0.011,53.469,0.000,0.586,0.631

0,1,2,3
Omnibus:,4373.442,Durbin-Watson:,1.915
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8533.847
Skew:,-1.491,Prob(JB):,0.0
Kurtosis:,4.549,Cond. No.,353.0


In [15]:
# Single-predictor fit: CB on Jumping only.
res = ols(formula='CB ~ Jumping', data=fifa).fit()
res.summary()

0,1,2,3
Dep. Variable:,CB,R-squared:,0.111
Model:,OLS,Adj. R-squared:,0.111
Method:,Least Squares,F-statistic:,2270.0
Date:,"Tue, 14 Jan 2020",Prob (F-statistic):,0.0
Time:,20:42:25,Log-Likelihood:,-79664.0
No. Observations:,18147,AIC:,159300.0
Df Residuals:,18145,BIC:,159300.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,11.4124,0.811,14.080,0.000,9.824,13.001
Jumping,0.5837,0.012,47.645,0.000,0.560,0.608

0,1,2,3
Omnibus:,3191.855,Durbin-Watson:,1.883
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5206.621
Skew:,-1.209,Prob(JB):,0.0
Kurtosis:,4.019,Cond. No.,370.0


In [16]:
# Single-predictor fit: CB on StandingTackle only.
res = ols(formula='CB ~ StandingTackle', data=fifa).fit()
res.summary()

0,1,2,3
Dep. Variable:,CB,R-squared:,0.78
Model:,OLS,Adj. R-squared:,0.78
Method:,Least Squares,F-statistic:,64440.0
Date:,"Tue, 14 Jan 2020",Prob (F-statistic):,0.0
Time:,20:42:35,Log-Likelihood:,-66983.0
No. Observations:,18147,AIC:,134000.0
Df Residuals:,18145,BIC:,134000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,9.1544,0.174,52.562,0.000,8.813,9.496
StandingTackle,0.8439,0.003,253.847,0.000,0.837,0.850

0,1,2,3
Omnibus:,1274.357,Durbin-Watson:,1.944
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1716.403
Skew:,-0.621,Prob(JB):,0.0
Kurtosis:,3.853,Cond. No.,127.0


In [17]:
# Single-predictor fit: CB on Aggression only.
res = ols(formula='CB ~ Aggression', data=fifa).fit()
res.summary()

0,1,2,3
Dep. Variable:,CB,R-squared:,0.685
Model:,OLS,Adj. R-squared:,0.685
Method:,Least Squares,F-statistic:,39480.0
Date:,"Tue, 14 Jan 2020",Prob (F-statistic):,0.0
Time:,20:42:42,Log-Likelihood:,-70247.0
No. Observations:,18147,AIC:,140500.0
Df Residuals:,18145,BIC:,140500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-5.7083,0.290,-19.652,0.000,-6.278,-5.139
Aggression,0.9864,0.005,198.707,0.000,0.977,0.996

0,1,2,3
Omnibus:,1350.541,Durbin-Watson:,1.984
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1688.701
Skew:,-0.7,Prob(JB):,0.0
Kurtosis:,3.524,Cond. No.,197.0


In [18]:
# Single-predictor fit: CB on HeadingAccuracy only.
res = ols(formula='CB ~ HeadingAccuracy', data=fifa).fit()
res.summary()

0,1,2,3
Dep. Variable:,CB,R-squared:,0.656
Model:,OLS,Adj. R-squared:,0.656
Method:,Least Squares,F-statistic:,34600.0
Date:,"Tue, 14 Jan 2020",Prob (F-statistic):,0.0
Time:,20:42:52,Log-Likelihood:,-71051.0
No. Observations:,18147,AIC:,142100.0
Df Residuals:,18145,BIC:,142100.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.0279,0.286,-3.598,0.000,-1.588,-0.468
HeadingAccuracy,0.9644,0.005,186.014,0.000,0.954,0.975

0,1,2,3
Omnibus:,569.696,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,539.538
Skew:,-0.378,Prob(JB):,6.930000000000001e-118
Kurtosis:,2.624,Cond. No.,175.0


# 변수선택 2. 중간값이 다른 포지션 전부와 10점 이상 차이나는 세부기술들을 넣는다.

In [19]:
skill_med

Position,CAM,CB,CDM,CF,CM,GK,LAM,LB,LCB,LCM,LDM,LF,LM,LS,LW,LWB,RAM,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
Crossing,61.0,36.0,52.0,55.0,55.0,13.0,71.0,64.0,43.0,62.0,58.0,65.0,63.0,55.0,62.0,64.0,71.0,63.0,40.0,60.0,57.5,67.0,63.0,54.0,63.0,62.0,46.0
Finishing,60.0,27.0,44.0,64.5,50.0,12.0,61.0,39.0,31.0,56.0,52.0,69.0,60.0,69.0,62.0,42.0,65.0,39.0,31.0,57.0,52.0,67.5,60.0,69.0,62.0,44.0,66.0
HeadingAccuracy,48.0,64.0,57.0,54.5,51.0,14.0,56.0,56.0,68.0,55.0,58.0,58.0,48.0,65.0,49.0,54.5,49.0,57.0,69.0,56.0,59.0,54.5,49.0,66.0,49.0,53.0,64.0
ShortPassing,68.0,57.0,68.0,62.0,67.0,27.0,71.0,63.0,61.0,70.0,70.0,73.0,64.0,64.0,64.0,61.5,72.0,63.0,60.0,71.0,71.0,70.5,64.0,63.0,64.0,62.0,58.0
Volleys,56.0,30.0,42.0,58.0,46.0,12.0,62.0,37.0,31.0,52.0,47.0,64.0,54.0,63.0,55.0,39.5,65.0,37.0,32.0,53.0,48.0,63.5,53.0,63.0,55.0,40.0,58.0
Dribbling,68.0,42.0,60.0,68.0,63.0,13.0,74.0,62.0,48.0,67.0,64.0,75.0,68.0,68.0,70.0,62.0,73.0,62.0,47.5,66.0,64.0,69.0,68.0,68.0,70.0,62.0,63.0
Curve,63.0,32.0,48.0,58.5,54.0,14.0,68.0,54.0,36.0,61.0,55.0,70.0,60.0,57.0,59.0,59.0,67.0,50.0,35.0,61.0,54.0,70.5,59.0,57.0,59.0,51.0,50.0
FKAccuracy,59.0,31.0,46.0,51.0,49.0,13.0,63.0,42.0,32.0,59.0,54.0,60.0,52.0,48.0,50.0,50.5,71.0,38.0,32.0,57.0,52.0,66.0,50.0,47.0,49.0,37.0,41.0
LongPassing,63.0,51.0,64.0,55.0,64.0,24.0,66.0,57.0,57.0,67.0,66.0,68.0,58.0,52.0,56.0,58.0,63.0,57.0,57.0,67.0,68.0,66.5,58.0,50.0,56.0,58.0,44.0
BallControl,69.0,52.0,65.0,67.0,65.0,20.0,73.0,63.0,57.0,69.0,68.0,74.0,67.0,68.0,68.0,64.0,74.0,63.0,56.0,69.0,68.0,70.5,66.0,69.0,68.0,62.0,64.0


In [20]:
skill_med.columns

Index(['CAM', 'CB', 'CDM', 'CF', 'CM', 'GK', 'LAM', 'LB', 'LCB', 'LCM', 'LDM',
       'LF', 'LM', 'LS', 'LW', 'LWB', 'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF',
       'RM', 'RS', 'RW', 'RWB', 'ST'],
      dtype='object', name='Position')

In [21]:
def skills_by_pos(pos, medians=None, threshold=10):
    """Return the skills whose median for `pos` stands apart from every other position.

    A skill is kept only when the absolute difference between its median for
    `pos` and its median for each of the other positions is at least
    `threshold`.

    Parameters
    ----------
    pos : column label
        Position to examine; must be a column of the medians table.
    medians : pandas.DataFrame, optional
        Table with one row per skill and one column per position. Defaults to
        the notebook-level `skill_med`.
    threshold : number, optional
        Minimum absolute gap required against every other position
        (default 10).

    Returns
    -------
    set
        Index labels (skills) that satisfy the gap against all other positions.

    Raises
    ------
    KeyError
        If `pos` is not a column of the medians table.
    """
    if medians is None:
        medians = skill_med

    # Compare against the *other* positions only. The previous `0 < diff`
    # guard was meant to skip the self-comparison, but it also skipped any
    # other position with an identical median, so such skills were wrongly
    # kept as distinctive.
    others = medians.drop(columns=pos)
    gaps = others.sub(medians[pos], axis=0).abs()

    # A missing median compares False here, so it never disqualifies a skill.
    too_close = (gaps < threshold).any(axis=1)
    return set(medians.index[(~too_close).to_numpy()])

In [22]:
skills_by_pos('ST')

set()

GK를 제외하면 조건을 만족하는 세부기술이 없으므로, 포지션을 그룹으로 묶어서 비교한다.

### 크게 6그룹으로 나눠서 그룹별 중간값 확인
* 공격수 LS, ST, RS, LF, CF, RF
* 공격형 미드필더 LAM, CAM, RAM
* 중앙 미드필더 LW, LM, LCM, CM, RCM, RM, RW
* 수비형 미드필더 LDM, CDM, RDM
* 수비수 LWB, LCB, RCB, RWB, LB, CB, RB
* 골키퍼 GK

In [25]:
# Forwards: per skill, the median of the six forward positions' medians,
# highest first.
forward = (
    skill_med[['LS', 'ST', 'RS', 'LF', 'CF', 'RF']]
    .median(axis=1)
    .reset_index(name='forward')
    .sort_values(by='forward', ascending=False)
)
forward

Unnamed: 0,index,forward
11,SprintSpeed,74.0
10,Acceleration,73.5
12,Agility,73.25
14,Balance,70.5
15,ShotPower,70.0
16,Jumping,69.0
9,BallControl,68.5
1,Finishing,68.25
5,Dribbling,68.0
22,Positioning,67.5


In [26]:
# Attacking midfielders: per skill, the median of the LAM/CAM/RAM medians,
# highest first.
a_mid = (
    skill_med[['LAM', 'CAM', 'RAM']]
    .median(axis=1)
    .reset_index(name='a_mid')
    .sort_values(by='a_mid', ascending=False)
)
a_mid

Unnamed: 0,index,a_mid
12,Agility,78.0
14,Balance,76.0
10,Acceleration,75.0
5,Dribbling,73.0
9,BallControl,73.0
23,Vision,72.0
11,SprintSpeed,72.0
0,Crossing,71.0
3,ShortPassing,71.0
25,Composure,69.0


In [27]:
# Central midfielders (this group also holds the wide LW/LM/RM/RW positions):
# per skill, the median of the seven positions' medians, highest first.
c_mid = (
    skill_med[['LW', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'RW']]
    .median(axis=1)
    .reset_index(name='c_mid')
    .sort_values(by='c_mid', ascending=False)
)
c_mid

Unnamed: 0,index,c_mid
10,Acceleration,77.0
11,SprintSpeed,76.0
12,Agility,75.0
14,Balance,73.0
17,Stamina,68.0
5,Dribbling,68.0
9,BallControl,68.0
15,ShotPower,65.0
3,ShortPassing,64.0
16,Jumping,63.0


In [28]:
# Defensive midfielders: per skill, the median of the LDM/CDM/RDM medians,
# highest first.
d_mid = (
    skill_med[['LDM', 'CDM', 'RDM']]
    .median(axis=1)
    .reset_index(name='d_mid')
    .sort_values(by='d_mid', ascending=False)
)
d_mid

Unnamed: 0,index,d_mid
17,Stamina,75.0
3,ShortPassing,70.0
20,Aggression,70.0
18,Strength,69.0
9,BallControl,68.0
14,Balance,67.0
13,Reactions,67.0
12,Agility,66.0
8,LongPassing,66.0
27,StandingTackle,66.0


In [29]:
# Defenders: per skill, the median of the seven defensive positions' medians,
# highest first.
defender = (
    skill_med[['LWB', 'LCB', 'RCB', 'RWB', 'LB', 'CB', 'RB']]
    .median(axis=1)
    .reset_index(name='defender')
    .sort_values(by='defender', ascending=False)
)
defender

Unnamed: 0,index,defender
17,Stamina,72.0
10,Acceleration,72.0
11,SprintSpeed,72.0
16,Jumping,69.0
14,Balance,69.0
12,Agility,69.0
27,StandingTackle,66.0
18,Strength,66.0
28,SlidingTackle,65.0
20,Aggression,65.0


In [30]:
# Goalkeepers: a single position, so its medians are used as they are,
# highest first.
gk = (
    skill_med.loc[:, 'GK']
    .reset_index()
    .sort_values(by='GK', ascending=False)
)
gk

Unnamed: 0,index,GK
33,GKReflexes,66.0
29,GKDiving,65.0
32,GKPositioning,63.0
30,GKHandling,63.0
18,Strength,62.0
31,GKKicking,61.0
13,Reactions,60.0
16,Jumping,59.0
25,Composure,46.0
14,Balance,43.0


### 수비수의 점수가 다른 포지션 그룹의 점수보다 10점 이상 높거나 낮은 세부기술이 무엇인지 확인

In [32]:
# One row per skill with the group medians side by side. Rows follow the
# defender ranking because `defender` is the left-most frame of the joins.
medians = (
    defender
    .merge(forward, on='index', how='inner')
    .merge(a_mid, on='index', how='inner')
    .merge(c_mid, on='index', how='inner')
    .merge(d_mid, on='index', how='inner')
    .merge(gk, on='index', how='inner')
)
medians

Unnamed: 0,index,defender,forward,a_mid,c_mid,d_mid,GK
0,Stamina,72.0,67.5,67.0,68.0,75.0,30.0
1,Acceleration,72.0,73.5,75.0,77.0,65.0,39.0
2,SprintSpeed,72.0,74.0,72.0,76.0,64.0,40.0
3,Jumping,69.0,69.0,61.0,63.0,66.0,59.0
4,Balance,69.0,70.5,76.0,73.0,67.0,43.0
5,Agility,69.0,73.25,78.0,75.0,66.0,38.0
6,StandingTackle,66.0,28.0,41.0,37.0,66.0,14.0
7,Strength,66.0,67.0,57.0,60.0,69.0,62.0
8,SlidingTackle,65.0,23.75,36.0,35.0,63.0,13.0
9,Aggression,65.0,53.0,48.0,51.0,70.0,25.0


In [33]:
# Goalkeepers are a special case, so leave them out and look for the skills
# whose defender median is at least 10 points away from every other outfield
# group.

def_skill = set(medians['index'])

# Absolute gap between the defender median and each other outfield group.
outfield_gaps = (
    medians[['forward', 'a_mid', 'c_mid', 'd_mid']]
    .sub(medians['defender'], axis=0)
    .abs()
)

# A skill is dropped as soon as one group lies within 10 points.
exclusion = set(medians.loc[(outfield_gaps < 10).any(axis=1), 'index'])

print(def_skill - exclusion)

{'Volleys', 'LongShots', 'FKAccuracy', 'Penalties', 'Finishing'}


# 변수 선택 3. (왼쪽, 오른쪽만 다른 포지션 등) 비슷한 포지션끼리 그룹 지어서 그 안에서 공통적으로 높은 중간값을 가지는 세부기술들을 넣는다.

### 수비수 포지션 별 테이블 만들기

In [35]:
# One frame per defensive position (plus goalkeepers), selected by exact
# match on the Position label.
LWB = fifa.loc[fifa['Position'].eq('LWB')]
RWB = fifa.loc[fifa['Position'].eq('RWB')]
LCB = fifa.loc[fifa['Position'].eq('LCB')]
RCB = fifa.loc[fifa['Position'].eq('RCB')]
CB = fifa.loc[fifa['Position'].eq('CB')]
LB = fifa.loc[fifa['Position'].eq('LB')]
RB = fifa.loc[fifa['Position'].eq('RB')]
GK = fifa.loc[fifa['Position'].eq('GK')]

# Row counts, in the same order as the assignments above.
print(*map(len, (LWB, RWB, LCB, RCB, CB, LB, RB, GK)))

78 87 648 662 1778 1322 1291 2025


### 포지션 별 세부기술 점수 중간값

In [36]:
# Column names of the 34 individual skill ratings.
# NOTE(review): this rebinds `skills`, which earlier in the notebook named a
# DataFrame that `skills.groupby('Position')` is called on; re-running that
# earlier cell after this one would fail because a list has no `groupby`.
skills = [
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing',
    'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
    'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed',
    'Agility', 'Reactions', 'Balance', 'ShotPower',
    'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision',
    'Penalties', 'Composure', 'Marking', 'StandingTackle',
    'SlidingTackle', 'GKDiving', 'GKHandling', 'GKKicking',
    'GKPositioning', 'GKReflexes',
]

In [37]:
# Median of each skill among CB players, highest first.
CB.loc[:, skills].median().sort_values(ascending=False)

Strength          75.000000
Jumping           71.000000
StandingTackle    66.000000
Aggression        65.000000
HeadingAccuracy   64.000000
SlidingTackle     63.000000
Marking           63.000000
Stamina           63.000000
Interceptions     62.000000
SprintSpeed       60.000000
Reactions         59.000000
Acceleration      58.000000
ShortPassing      57.000000
Composure         56.000000
Balance           56.000000
Agility           52.000000
BallControl       52.000000
LongPassing       51.000000
ShotPower         46.000000
Dribbling         42.000000
Penalties         40.000000
Vision            38.000000
Crossing          36.000000
Curve             32.000000
Positioning       31.000000
FKAccuracy        31.000000
LongShots         30.000000
Volleys           30.000000
Finishing         27.000000
GKPositioning     11.000000
GKKicking         11.000000
GKReflexes        11.000000
GKDiving          10.000000
GKHandling        10.000000
dtype: float64

In [38]:
# Median of each skill among LCB players, highest first.
LCB.loc[:, skills].median().sort_values(ascending=False)

Strength          78.000000
Jumping           73.000000
StandingTackle    70.000000
Aggression        70.000000
HeadingAccuracy   68.000000
Stamina           68.000000
Marking           68.000000
Interceptions     67.000000
SlidingTackle     67.000000
Reactions         63.000000
Composure         62.000000
SprintSpeed       62.000000
ShortPassing      61.000000
Acceleration      59.000000
BallControl       57.000000
LongPassing       57.000000
Balance           57.000000
Agility           56.000000
ShotPower         51.000000
Dribbling         48.000000
Vision            44.000000
Crossing          43.000000
Penalties         41.000000
Curve             36.000000
LongShots         35.000000
Positioning       35.000000
FKAccuracy        32.000000
Volleys           31.000000
Finishing         31.000000
GKPositioning     11.000000
GKDiving          11.000000
GKHandling        11.000000
GKKicking         10.000000
GKReflexes        10.000000
dtype: float64

In [39]:
# Median of each skill among RCB players, highest first.
RCB.loc[:, skills].median().sort_values(ascending=False)

Strength          78.000000
Jumping           72.000000
Aggression        70.000000
HeadingAccuracy   69.000000
StandingTackle    69.000000
Marking           68.000000
Stamina           68.000000
SlidingTackle     67.000000
Interceptions     67.000000
Reactions         64.000000
Composure         62.000000
SprintSpeed       61.000000
ShortPassing      60.000000
Acceleration      58.000000
LongPassing       57.000000
BallControl       56.000000
Balance           56.000000
Agility           55.000000
ShotPower         50.000000
Dribbling         47.500000
Vision            43.000000
Penalties         41.000000
Crossing          40.000000
Curve             35.000000
LongShots         34.000000
Positioning       34.000000
FKAccuracy        32.000000
Volleys           32.000000
Finishing         31.000000
GKPositioning     11.000000
GKKicking         11.000000
GKReflexes        11.000000
GKDiving          10.000000
GKHandling        10.000000
dtype: float64

In [40]:
# Median of each skill among LB players, highest first.
LB.loc[:, skills].median().sort_values(ascending=False)

Acceleration      72.000000
SprintSpeed       72.000000
Stamina           72.000000
Balance           70.000000
Agility           69.000000
Jumping           68.000000
SlidingTackle     65.000000
StandingTackle    65.000000
Strength          65.000000
Aggression        64.000000
Crossing          64.000000
Marking           63.000000
BallControl       63.000000
ShortPassing      63.000000
Interceptions     63.000000
Reactions         62.000000
Dribbling         62.000000
Composure         59.000000
LongPassing       57.000000
ShotPower         56.000000
HeadingAccuracy   56.000000
Curve             54.000000
Positioning       54.000000
Vision            52.000000
Penalties         44.000000
LongShots         44.000000
FKAccuracy        42.000000
Finishing         39.000000
Volleys           37.000000
GKReflexes        11.000000
GKDiving          11.000000
GKKicking         10.000000
GKPositioning     10.000000
GKHandling        10.000000
dtype: float64

In [41]:
# Median of each skill among RB players, highest first.
RB.loc[:, skills].median().sort_values(ascending=False)

Acceleration      73.000000
SprintSpeed       73.000000
Stamina           73.000000
Jumping           69.000000
Agility           69.000000
Balance           69.000000
StandingTackle    66.000000
Strength          66.000000
Aggression        65.000000
SlidingTackle     65.000000
ShortPassing      63.000000
BallControl       63.000000
Crossing          63.000000
Interceptions     63.000000
Marking           63.000000
Reactions         62.000000
Dribbling         62.000000
Composure         60.000000
HeadingAccuracy   57.000000
LongPassing       57.000000
ShotPower         55.000000
Positioning       54.000000
Vision            51.000000
Curve             50.000000
Penalties         44.000000
LongShots         44.000000
Finishing         39.000000
FKAccuracy        38.000000
Volleys           37.000000
GKHandling        11.000000
GKDiving          10.000000
GKKicking         10.000000
GKPositioning     10.000000
GKReflexes        10.000000
dtype: float64

In [42]:
# Median of each skill among LWB players, highest first.
LWB.loc[:, skills].median().sort_values(ascending=False)

Stamina           76.000000
SprintSpeed       75.000000
Acceleration      74.000000
Agility           71.000000
Balance           69.500000
Jumping           69.000000
Strength          66.000000
BallControl       64.000000
Crossing          64.000000
StandingTackle    64.000000
Aggression        63.500000
SlidingTackle     62.000000
Dribbling         62.000000
Marking           62.000000
Reactions         62.000000
ShortPassing      61.500000
Interceptions     61.000000
Composure         59.500000
Curve             59.000000
LongPassing       58.000000
ShotPower         58.000000
Positioning       57.000000
Vision            56.500000
HeadingAccuracy   54.500000
FKAccuracy        50.500000
LongShots         48.000000
Penalties         47.000000
Finishing         42.000000
Volleys           39.500000
GKHandling        11.000000
GKDiving          11.000000
GKReflexes        11.000000
GKKicking          9.000000
GKPositioning      9.000000
dtype: float64

In [43]:
# Median of each skill among RWB players, highest first.
RWB.loc[:, skills].median().sort_values(ascending=False)

Acceleration      75.000000
SprintSpeed       75.000000
Stamina           74.000000
Agility           71.000000
Balance           70.000000
Jumping           67.000000
Strength          65.000000
StandingTackle    63.000000
Aggression        62.000000
ShortPassing      62.000000
Dribbling         62.000000
BallControl       62.000000
Crossing          62.000000
SlidingTackle     61.000000
Reactions         60.000000
Marking           60.000000
LongPassing       58.000000
Interceptions     58.000000
ShotPower         57.000000
Composure         56.000000
Vision            53.000000
Positioning       53.000000
HeadingAccuracy   53.000000
Curve             51.000000
LongShots         45.000000
Penalties         44.000000
Finishing         44.000000
Volleys           40.000000
FKAccuracy        37.000000
GKDiving          11.000000
GKKicking         11.000000
GKPositioning     10.000000
GKHandling        10.000000
GKReflexes        10.000000
dtype: float64

In [44]:
# Median of each skill among GK players, highest first.
GK.loc[:, skills].median().sort_values(ascending=False)

GKReflexes        66.000000
GKDiving          65.000000
GKHandling        63.000000
GKPositioning     63.000000
Strength          62.000000
GKKicking         61.000000
Reactions         60.000000
Jumping           59.000000
Composure         46.000000
Balance           43.000000
SprintSpeed       40.000000
Acceleration      39.000000
Agility           38.000000
Vision            36.000000
Stamina           30.000000
ShortPassing      27.000000
Aggression        25.000000
LongPassing       24.000000
ShotPower         22.000000
Penalties         20.000000
BallControl       20.000000
Interceptions     17.000000
HeadingAccuracy   14.000000
Marking           14.000000
Curve             14.000000
StandingTackle    14.000000
FKAccuracy        13.000000
LongShots         13.000000
Dribbling         13.000000
SlidingTackle     13.000000
Crossing          13.000000
Volleys           12.000000
Finishing         12.000000
Positioning       11.000000
dtype: float64

### 중간값 점수가 높은 세부기술 항목을 기준 삼아 4그룹으로 나눈다.  
* 센터백(CB, LCB, RCB)
* 사이드백(LB, RB)
* 윙백 (LWB, RWB)
* 골키퍼(GK) 

### 센터백 (CB, LCB, RCB)

In [45]:
# Skill medians for CB, ordered from highest to lowest; this ordering drives
# the row order of the merged table below.
CB_skills_median = (
    CB.loc[:, skills]
    .median()
    .sort_values(ascending=False)
    .reset_index(name='CB_median')
)

# The merge follows CB's ordering, so each of LCB and RCB also carries a
# '<position>_rank' column recording where the skill sits in that position's
# own descending ranking (0 = highest median).
LCB_skills_median = (
    LCB.loc[:, skills]
    .median()
    .sort_values(ascending=False)
    .reset_index(name='LCB_median')
    .assign(LCB_rank=lambda ranked: ranked.index)
)
RCB_skills_median = (
    RCB.loc[:, skills]
    .median()
    .sort_values(ascending=False)
    .reset_index(name='RCB_median')
    .assign(RCB_rank=lambda ranked: ranked.index)
)

(
    CB_skills_median
    .merge(LCB_skills_median, on='index', how='inner')
    .merge(RCB_skills_median, on='index', how='inner')
)

Unnamed: 0,index,CB_median,LCB_median,LCB_rank,RCB_median,RCB_rank
0,Strength,75.0,78.0,0,78.0,0
1,Jumping,71.0,73.0,1,72.0,1
2,StandingTackle,66.0,70.0,2,69.0,4
3,Aggression,65.0,70.0,3,70.0,2
4,HeadingAccuracy,64.0,68.0,4,69.0,3
5,SlidingTackle,63.0,67.0,8,67.0,7
6,Marking,63.0,68.0,6,68.0,5
7,Stamina,63.0,68.0,5,68.0,6
8,Interceptions,62.0,67.0,7,67.0,8
9,SprintSpeed,60.0,62.0,11,61.0,11


센터백 선수들의 점수가 높은 세부기술 10가지:  
'Strength', 'Jumping', 'StandingTackle', 'Aggression', 'HeadingAccuracy', 'SlidingTackle', 'Marking', 'Stamina', 'Interceptions', 'Reactions'

(05% Short Passing 05% Ball Control)

### 사이드백(LB, RB)

In [46]:
# Skill medians for LB in descending order; the merged table keeps this order.
LB_skills_median = (
    LB.loc[:, skills]
    .median()
    .sort_values(ascending=False)
    .reset_index(name='LB_median')
)

# RB medians plus the position of each skill in RB's own descending ranking.
RB_skills_median = (
    RB.loc[:, skills]
    .median()
    .sort_values(ascending=False)
    .reset_index(name='RB_median')
    .assign(RB_rank=lambda ranked: ranked.index)
)

LB_skills_median.merge(RB_skills_median, on='index', how='inner')

Unnamed: 0,index,LB_median,RB_median,RB_rank
0,Acceleration,72.0,73.0,0
1,SprintSpeed,72.0,73.0,1
2,Stamina,72.0,73.0,2
3,Balance,70.0,69.0,5
4,Agility,69.0,69.0,4
5,Jumping,68.0,69.0,3
6,SlidingTackle,65.0,65.0,9
7,StandingTackle,65.0,66.0,6
8,Strength,65.0,66.0,7
9,Aggression,64.0,65.0,8


사이드백 선수들의 점수가 높은 세부기술 10가지:  
'Acceleration', 'SprintSpeed', 'Stamina', 'Balance', 'Agility', 'Jumping', 'SlidingTackle', 'StandingTackle', 'Strength', 'Aggression'

(12% Interceptions 10% Marking 08% Reactions 07% Crossing 07% Heading 07% Ball Control 06% Short Passing)

### 윙백 (LWB, RWB)

In [47]:
# Skill medians for LWB in descending order; the merged table keeps this order.
LWB_skills_median = (
    LWB.loc[:, skills]
    .median()
    .sort_values(ascending=False)
    .reset_index(name='LWB_median')
)

# RWB medians plus the position of each skill in RWB's own descending ranking.
RWB_skills_median = (
    RWB.loc[:, skills]
    .median()
    .sort_values(ascending=False)
    .reset_index(name='RWB_median')
    .assign(RWB_rank=lambda ranked: ranked.index)
)

LWB_skills_median.merge(RWB_skills_median, on='index', how='inner')

Unnamed: 0,index,LWB_median,RWB_median,RWB_rank
0,Stamina,76.0,74.0,2
1,SprintSpeed,75.0,75.0,1
2,Acceleration,74.0,75.0,0
3,Agility,71.0,71.0,3
4,Balance,69.5,70.0,4
5,Jumping,69.0,67.0,5
6,Strength,66.0,65.0,6
7,BallControl,64.0,62.0,11
8,Crossing,64.0,62.0,12
9,StandingTackle,64.0,63.0,7


윙백 선수들의 점수가 높은 세부기술 10가지:  
'Stamina', 'SprintSpeed', 'Acceleration', 'Agility', 'Balance', 'Jumping', 'Strength', 'StandingTackle', 'Aggression', 'BallControl'

(10% Sliding Tackle 10% Crossing 10% Short Passing 10% Interceptions 09% Marking 08% Reactions 07% Dribbling)

### 센터백 (CB, LCB, RCB) 회귀분석

In [48]:
# All players whose Position is one of the three centre-back labels.
centerback = fifa[fifa['Position'].isin(['CB', 'LCB', 'RCB'])]

In [49]:
# Check the pairwise correlations among Overall and the candidate skills
# for centre-backs.
centerback.loc[:, [
    'Overall',
    'Strength',
    'Jumping',
    'StandingTackle',
    'Aggression',
    'HeadingAccuracy',
    'SlidingTackle',
    'Marking',
    'Stamina',
    'Interceptions',
    'Reactions',
]].corr()

Unnamed: 0,Overall,Strength,Jumping,StandingTackle,Aggression,HeadingAccuracy,SlidingTackle,Marking,Stamina,Interceptions,Reactions
Overall,1.0,0.536391,0.204343,0.909005,0.729125,0.832236,0.871751,0.872058,0.294131,0.901862,0.861155
Strength,0.536391,1.0,-0.081127,0.393806,0.507898,0.52334,0.326763,0.397088,0.149739,0.382744,0.37799
Jumping,0.204343,-0.081127,1.0,0.158986,0.120677,0.091747,0.205431,0.136175,0.240183,0.157806,0.147914
StandingTackle,0.909005,0.393806,0.158986,1.0,0.570414,0.739718,0.902561,0.797342,0.228056,0.825012,0.755838
Aggression,0.729125,0.507898,0.120677,0.570414,1.0,0.59855,0.555742,0.564818,0.208716,0.614192,0.612036
HeadingAccuracy,0.832236,0.52334,0.091747,0.739718,0.59855,1.0,0.689522,0.692215,0.159638,0.713908,0.670698
SlidingTackle,0.871751,0.326763,0.205431,0.902561,0.555742,0.689522,1.0,0.763557,0.25116,0.792523,0.72989
Marking,0.872058,0.397088,0.136175,0.797342,0.564818,0.692215,0.763557,1.0,0.181316,0.782801,0.735601
Stamina,0.294131,0.149739,0.240183,0.228056,0.208716,0.159638,0.25116,0.181316,1.0,0.245701,0.273985
Interceptions,0.901862,0.382744,0.157806,0.825012,0.614192,0.713908,0.792523,0.782801,0.245701,1.0,0.814377


상관관계가 0.9 이상인 변수: StandingTackle, SlidingTackle   
-> 다중공선성 방지를 위해 둘 중 'StandingTackle'만 변수로 넣음

In [50]:
from statsmodels.formula.api import ols

In [51]:
# OLS of Overall on nine centre-back skills (SlidingTackle is left out),
# fitted on the centre-back rows only.
res = ols(
    formula='Overall ~ Strength + Jumping + StandingTackle + Aggression + HeadingAccuracy + Marking + Stamina + Interceptions + Reactions',
    data=centerback,
).fit()
res.summary()

0,1,2,3
Dep. Variable:,Overall,R-squared:,0.979
Model:,OLS,Adj. R-squared:,0.979
Method:,Least Squares,F-statistic:,15960.0
Date:,"Tue, 14 Jan 2020",Prob (F-statistic):,0.0
Time:,20:55:56,Log-Likelihood:,-4214.4
No. Observations:,3088,AIC:,8449.0
Df Residuals:,3078,BIC:,8509.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.7865,0.228,7.846,0.000,1.340,2.233
Strength,0.0742,0.003,28.735,0.000,0.069,0.079
Jumping,0.0332,0.002,20.377,0.000,0.030,0.036
StandingTackle,0.2512,0.005,49.413,0.000,0.241,0.261
Aggression,0.0751,0.002,32.112,0.000,0.070,0.080
HeadingAccuracy,0.1090,0.003,31.756,0.000,0.102,0.116
Marking,0.1509,0.004,39.144,0.000,0.143,0.158
Stamina,0.0250,0.002,13.535,0.000,0.021,0.029
Interceptions,0.1466,0.004,32.867,0.000,0.138,0.155

0,1,2,3
Omnibus:,2447.494,Durbin-Watson:,1.894
Prob(Omnibus):,0.0,Jarque-Bera (JB):,297114.633
Skew:,3.032,Prob(JB):,0.0
Kurtosis:,50.67,Cond. No.,2690.0


다중공선성이 존재하므로 상관관계가 0.8 이상인 변수도 정리
* StandingTackle, Interceptions  
* Interceptions, Reactions  
-> 'Interceptions' 변수 제외

In [52]:
# Same model with Interceptions removed from the predictors.
res = ols(
    formula='Overall ~ Strength + Jumping + StandingTackle + Aggression + HeadingAccuracy + Marking + Stamina + Reactions',
    data=centerback,
).fit()
res.summary()

0,1,2,3
Dep. Variable:,Overall,R-squared:,0.972
Model:,OLS,Adj. R-squared:,0.972
Method:,Least Squares,F-statistic:,13190.0
Date:,"Tue, 14 Jan 2020",Prob (F-statistic):,0.0
Time:,20:55:58,Log-Likelihood:,-4678.8
No. Observations:,3088,AIC:,9376.0
Df Residuals:,3079,BIC:,9430.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.4675,0.264,5.551,0.000,0.949,1.986
Strength,0.0692,0.003,23.112,0.000,0.063,0.075
Jumping,0.0340,0.002,17.938,0.000,0.030,0.038
StandingTackle,0.3079,0.006,55.387,0.000,0.297,0.319
Aggression,0.0847,0.003,31.421,0.000,0.079,0.090
HeadingAccuracy,0.1224,0.004,30.896,0.000,0.115,0.130
Marking,0.1775,0.004,40.539,0.000,0.169,0.186
Stamina,0.0275,0.002,12.795,0.000,0.023,0.032
Reactions,0.1600,0.004,40.013,0.000,0.152,0.168

0,1,2,3
Omnibus:,1131.393,Durbin-Watson:,1.919
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34928.458
Skew:,1.109,Prob(JB):,0.0
Kurtosis:,19.326,Cond. No.,2550.0


여전히 다중공선성 존재. 상관관계 0.7 이상인 변수 정리.
* StandingTackle, Marking
* StandingTackle, Reactions 
* StandingTackle, HeadingAccuracy
-> 'Marking', 'Reactions', 'HeadingAccuracy' 제외

In [53]:
# Reduced model: Marking, Reactions and HeadingAccuracy are also removed,
# leaving five predictors.
res = ols(
    formula='Overall ~ Strength + Jumping + StandingTackle + Aggression + Stamina',
    data=centerback,
).fit()
res.summary()

0,1,2,3
Dep. Variable:,Overall,R-squared:,0.911
Model:,OLS,Adj. R-squared:,0.91
Method:,Least Squares,F-statistic:,6272.0
Date:,"Tue, 14 Jan 2020",Prob (F-statistic):,0.0
Time:,20:56:00,Log-Likelihood:,-6453.5
No. Observations:,3088,AIC:,12920.0
Df Residuals:,3082,BIC:,12960.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.6756,0.466,-1.450,0.147,-1.589,0.238
Strength,0.1069,0.005,21.046,0.000,0.097,0.117
Jumping,0.0375,0.003,11.162,0.000,0.031,0.044
StandingTackle,0.6523,0.006,103.036,0.000,0.640,0.665
Aggression,0.1556,0.004,34.599,0.000,0.147,0.164
Stamina,0.0322,0.004,8.568,0.000,0.025,0.040

0,1,2,3
Omnibus:,397.958,Durbin-Watson:,1.774
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2976.213
Skew:,0.367,Prob(JB):,0.0
Kurtosis:,7.753,Cond. No.,2050.0


로지스틱 회귀분석