# Midfielder OLS

In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import os

import MySQLdb
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# on newer versions cross_val_score is imported from sklearn.model_selection.
from sklearn.cross_validation import cross_val_score

### Connect DB & Get Midfielder Data

In [2]:
# Connection settings are read from the environment so that credentials are
# not stored in the notebook. Host, user and schema fall back to the values
# this notebook was written against; the password deliberately has no
# fallback, so a missing FOOTBALL_DB_PASSWORD raises KeyError here.
db = MySQLdb.connect(
    os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    os.environ.get("FOOTBALL_DB_USER", "root"),
    os.environ["FOOTBALL_DB_PASSWORD"],
    os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)

def make_query(position):
    """Build the SELECT statement that loads player stats for one position.

    The query returns the stat columns plus ``rating`` (selected last) for
    rows of the ``player`` table whose ``position`` contains the given code
    and whose ``mins`` is greater than 270.

    parameter------------
    position : one of "M", "D", "F", "G"

    return---------------
    SQL_QUERY String

    raise----------------
    ValueError : if position is not one of the documented codes. The value
        is interpolated directly into the SQL text, so anything else is
        rejected instead of producing a malformed or injected query.
    """
    if position not in ("M", "D", "F", "G"):
        raise ValueError(
            "position must be one of 'M', 'D', 'F', 'G'; got {!r}".format(position)
        )

    SQL_QUERY = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%" and mins > 270
        ;
    """.format(position=position)

    return SQL_QUERY

# midfielder: build the query for position code "M" and load the matching rows
SQL_QUERY = make_query("M")
midfielder_df = pd.read_sql(SQL_QUERY, db)

len(midfielder_df)

1582

### Scaling

In [4]:
# Features are every column except the last one (the query selects `rating` last).
# .iloc replaces the removed .ix indexer; the selection is purely positional.
X = midfielder_df.iloc[:, :-1]
# with_mean=False: the features are scaled without being centred.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
# Add the `const` column for the regression intercept.
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(midfielder_df.iloc[:, -1], columns=["rating"])
# Final modelling frame: const + scaled features, with the target as the last column.
m_df = pd.concat([dfX, dfy], axis=1)
m_df.head()

Unnamed: 0,const,age,tall,weight,apps_start,apps_sub,mins,goals,assists,yel,...,blocks,owng,keyp_x,fouled,off,disp,unstch,avgp,ps_y,rating
0,1,7.318647,14.8008,10.561215,2.780837,0.704054,2.879113,1.876429,2.456496,3.478088,...,0.639768,0.0,1.436757,4.101984,1.808584,2.168207,3.033856,2.165255,9.363235,7.25
1,1,6.813913,14.961678,10.039673,3.089819,0.938738,3.262186,0.536123,1.228248,2.086853,...,2.559071,0.0,0.718378,0.984476,0.0,1.000711,1.179833,3.107282,11.568079,7.05
2,1,7.318647,14.076848,9.387747,3.295807,0.938738,3.515548,0.268061,2.04708,2.78247,...,0.639768,0.0,3.232703,2.46119,0.0,1.834636,1.011285,3.704836,11.950252,7.03
3,1,6.813913,14.479043,9.909288,3.398801,0.234685,3.498576,0.268061,1.228248,2.086853,...,0.639768,0.0,1.257162,0.984476,0.452146,1.167496,1.34838,2.404277,10.377463,7.02
4,1,7.06628,14.398604,8.73582,2.265867,1.408107,2.32996,0.804184,3.275329,1.043426,...,0.0,0.0,2.693919,1.804873,1.356438,2.501777,2.022571,1.638001,11.303498,6.96


### Summary OLS

In [5]:
# Regress the last column of m_df (rating) on all preceding columns
# (const + scaled features). .iloc replaces the removed .ix indexer.
model = sm.OLS(m_df.iloc[:, -1], m_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.857
Method:                 Least Squares   F-statistic:                     340.7
Date:                Mon, 27 Jun 2016   Prob (F-statistic):               0.00
Time:                        11:38:29   Log-Likelihood:                 1058.1
No. Observations:                1582   AIC:                            -2058.
Df Residuals:                    1553   BIC:                            -1903.
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6114      0.075     74.415      0.0

### Find Proper Model

In [6]:
# delete features based on t-value
remove_column_list = [
    "tall", "weight", "apps_start", "offsides", "blocks", "owng", "off", "unstch"
]
removed_m_df = m_df.drop(remove_column_list, axis=1)

# Refit on the reduced frame: last column is the target, the rest are regressors.
# .iloc replaces the removed .ix indexer.
model = sm.OLS(removed_m_df.iloc[:, -1], removed_m_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.859
Model:                            OLS   Adj. R-squared:                  0.857
Method:                 Least Squares   F-statistic:                     476.4
Date:                Mon, 27 Jun 2016   Prob (F-statistic):               0.00
Time:                        11:39:08   Log-Likelihood:                 1053.8
No. Observations:                1582   AIC:                            -2066.
Df Residuals:                    1561   BIC:                            -1953.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6644      0.060     94.990      0.0

In [7]:
# check f-value using anova table
# NOTE(review): anova_lm is called without `typ`; per the statsmodels docs the
# default is a Type I (sequential) table, so each term's F depends on the order
# of the terms in the formula below. Confirm that is intended before dropping
# features based on these values.
formula_str = """
rating ~ age + apps_sub + mins + goals + assists + yel + red
+ spg + ps_x + motm + aw + tackles + inter + fouls + clear + drb
+ keyp_x + fouled + disp + avgp + ps_y
"""

# `const` is not listed in the formula; presumably the formula interface adds
# its own intercept term — TODO confirm.
model = sm.OLS.from_formula(formula_str, data=removed_m_df)
result = model.fit()
table_anova = sm.stats.anova_lm(result)
table_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
age,1.0,0.011797,0.011797,0.753402,0.3855346
apps_sub,1.0,39.412044,39.412044,2517.095061,0.0
mins,1.0,38.137891,38.137891,2435.719829,5.415e-321
goals,1.0,15.192936,15.192936,970.314153,4.4907199999999993e-166
assists,1.0,10.301919,10.301919,657.943753,2.250478e-121
yel,1.0,0.420752,0.420752,26.871815,2.457993e-07
red,1.0,0.000933,0.000933,0.059596,0.8071673
spg,1.0,0.645137,0.645137,41.202407,1.816193e-10
ps_x,1.0,5.430199,5.430199,346.805845,4.657934e-70
motm,1.0,7.640912,7.640912,487.995563,2.564771e-94


In [8]:
# delete features based on F-value
remove_column_list = [
    "age", "apps_sub", "red", "drb", "disp", "ps_y"
]
removed2_m_df = removed_m_df.drop(remove_column_list, axis=1)

# Refit on the further reduced frame: last column is the target, the rest are
# regressors. .iloc replaces the removed .ix indexer.
model = sm.OLS(removed2_m_df.iloc[:, -1], removed2_m_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.854
Model:                            OLS   Adj. R-squared:                  0.852
Method:                 Least Squares   F-statistic:                     609.4
Date:                Mon, 27 Jun 2016   Prob (F-statistic):               0.00
Time:                        11:40:18   Log-Likelihood:                 1023.7
No. Observations:                1582   AIC:                            -2015.
Df Residuals:                    1566   BIC:                            -1929.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.5553      0.053    105.524      0.0

### 결론
- 주요 변수 : goals, tackles, key passes(keyp_x), interceptions(inter)
- 미드필더의 경우 공격수 평점 모델과 큰 차이는 보이지 않았다. 그러나 세부적인 포지션을 나눠서 모델을 설계한다면 더 좋은 평점 모델이 나올 수 있을 것이라 확신한다.