# Defender OLS analysis

In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import os

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

### Connect DB & Get Defender Player

In [2]:
# Connection settings are read from the environment so that no credentials
# are stored in the notebook. Host, user and schema fall back to the values
# this analysis was written against; the password deliberately has no
# default, so a missing FOOTBALL_DB_PASSWORD fails fast with a KeyError.
db = MySQLdb.connect(
    os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    os.environ.get("FOOTBALL_DB_USER", "root"),
    os.environ["FOOTBALL_DB_PASSWORD"],
    os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)

def make_query(position, min_minutes=270):
    """Build the SQL query that selects player statistics for one position.

    Parameters
    ----------
    position : str
        Position code; it is matched as a substring of ``player.position``
        (``LIKE "%<position>%"``). The notebook uses M, D, F, G.
    min_minutes : int, optional
        Only players with strictly more than this many minutes played are
        selected. Defaults to 270, the threshold originally hard-coded here.

    Returns
    -------
    str
        The SQL query string.

    Raises
    ------
    ValueError
        If ``position`` contains a double quote or a backslash, either of
        which would break out of the quoted SQL literal.
    """
    position = str(position)
    # The value is interpolated directly into a double-quoted SQL literal,
    # so refuse the characters that could terminate or escape that literal.
    if '"' in position or "\\" in position:
        raise ValueError(
            "position must not contain quotes or backslashes: {!r}".format(position)
        )

    query = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%" and mins > {min_minutes}
        ;
    """.format(position=position, min_minutes=int(min_minutes))  # int() keeps the threshold numeric

    return query

# defender ("D"): build the position query, then load the matching rows
SQL_QUERY = make_query(position="D")
defender_df = pd.read_sql(sql=SQL_QUERY, con=db)

# number of defenders returned by the query
defender_df.shape[0]

1285

### Scaling

In [3]:
# Features are every column except the last one (rating).
# .iloc replaces .ix, which was deprecated in pandas 0.20 and removed in 1.0;
# with string column labels and integer slices .ix was positional anyway.
X = defender_df.iloc[:, :-1]

# with_mean=False: each feature is divided by its standard deviation but is
# not centered, so the scaled values stay on their original side of zero.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# Rebuild a labelled frame, prepend the constant column for the intercept,
# and append the target so that rating is again the last column.
dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(defender_df.iloc[:, -1], columns=["rating"])
d_df = pd.concat([dfX, dfy], axis=1)
d_df.head()

Unnamed: 0,const,age,tall,weight,apps_start,apps_sub,mins,goals,assists,yel,...,blocks,owng,keyp_x,fouled,off,disp,unstch,avgp,ps_y,rating
0,1,6.780152,15.780678,10.251944,3.160504,1.459488,3.239903,1.646978,2.210363,2.156231,...,1.409258,0.0,1.081647,1.277396,0.0,1.59796,1.954485,3.659975,11.330165,7.05
1,1,6.780152,15.271624,10.118801,3.476554,0.364872,3.474678,0.823489,2.210363,2.156231,...,0.352315,0.0,1.892883,1.277396,1.212081,1.864287,2.233697,2.831926,10.164036,7.02
2,1,5.524568,16.035205,11.317081,2.633753,0.364872,2.665606,2.470467,0.736788,2.515603,...,2.113887,2.756952,0.540824,2.128993,0.0,0.532653,0.837636,4.123682,11.546115,6.93
3,1,7.031268,15.017097,9.985659,3.476554,0.364872,3.498758,0.0,2.210363,2.156231,...,0.352315,0.0,1.892883,1.490295,0.0,1.59796,1.954485,2.591792,11.171802,6.9
4,1,6.780152,15.356466,10.251944,1.580252,2.918976,1.760215,0.0,0.0,0.359372,...,0.352315,0.0,1.081647,0.851597,1.212081,0.79898,1.396061,2.939572,11.099818,6.89


### Summary OLS

In [4]:
# Regress rating (last column) on the constant plus every scaled feature.
# .iloc replaces .ix, which was deprecated in pandas 0.20 and removed in 1.0.
model = sm.OLS(d_df.iloc[:, -1], d_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.785
Model:                            OLS   Adj. R-squared:                  0.780
Method:                 Least Squares   F-statistic:                     163.7
Date:                Mon, 27 Jun 2016   Prob (F-statistic):               0.00
Time:                        11:45:25   Log-Likelihood:                 844.16
No. Observations:                1285   AIC:                            -1630.
Df Residuals:                    1256   BIC:                            -1481.
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.5272      0.086     64.180      0.0

### Find Proper Model

In [5]:
# delete features based on t-value
remove_column_list = [
    "age", "tall", "weight", "apps_start", "apps_sub", "mins", "yel", "off"
]
removed_d_df = d_df.drop(remove_column_list, axis=1) 

# Refit on the reduced design matrix; rating is still the last column.
# .iloc replaces .ix, which was deprecated in pandas 0.20 and removed in 1.0.
model = sm.OLS(removed_d_df.iloc[:, -1], removed_d_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.782
Model:                            OLS   Adj. R-squared:                  0.778
Method:                 Least Squares   F-statistic:                     226.2
Date:                Mon, 27 Jun 2016   Prob (F-statistic):               0.00
Time:                        11:46:04   Log-Likelihood:                 834.33
No. Observations:                1285   AIC:                            -1627.
Df Residuals:                    1264   BIC:                            -1518.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.5216      0.059     93.627      0.0

In [6]:
# check F-value using anova table
# The formula lists every feature that is still present in removed_d_df.
formula_str = """
rating ~ goals + assists + red + spg + ps_x + motm + aw
+ tackles + inter + fouls + offsides + clear + drb + blocks
+ owng + keyp_x + fouled + disp + unstch + avgp + ps_y
"""

model = sm.OLS.from_formula(formula=formula_str, data=removed_d_df)
result = model.fit()

# typ=1 is anova_lm's default (sequential sums of squares), spelled out here
# because each term's F-value then depends on the order of terms in the formula.
table_anova = sm.stats.anova_lm(result, typ=1)
table_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
goals,1.0,12.920812,12.920812,795.37297,3.807543e-136
assists,1.0,7.241352,7.241352,445.75954,5.384417e-85
red,1.0,6.4e-05,6.4e-05,0.003919,0.9500957
spg,1.0,0.236036,0.236036,14.529804,0.0001446024
ps_x,1.0,3.679938,3.679938,226.527782,3.2465310000000003e-47
motm,1.0,13.291143,13.291143,818.169608,3.591387e-139
aw,1.0,10.477143,10.477143,644.946792,2.681638e-115
tackles,1.0,12.941306,12.941306,796.634483,2.5843479999999997e-136
inter,1.0,5.739425,5.739425,353.304675,1.0660800000000001e-69
fouls,1.0,0.772133,0.772133,47.530589,8.528686e-12


In [7]:
# delete features based on F-value
remove_column_list = [
    "red", "offsides", "drb", "blocks", "disp", "ps_y"
]
removed2_d_df = removed_d_df.drop(remove_column_list, axis=1) 

# Final refit on the features that survived both pruning steps.
# .iloc replaces .ix, which was deprecated in pandas 0.20 and removed in 1.0.
model = sm.OLS(removed2_d_df.iloc[:, -1], removed2_d_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.768
Model:                            OLS   Adj. R-squared:                  0.765
Method:                 Least Squares   F-statistic:                     280.3
Date:                Mon, 27 Jun 2016   Prob (F-statistic):               0.00
Time:                        11:47:07   Log-Likelihood:                 795.93
No. Observations:                1285   AIC:                            -1560.
Df Residuals:                    1269   BIC:                            -1477.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.4988      0.060     91.338      0.0

### 결론
- 주요 변수 : tackles, interceptions (inter, 가로채기), aerial won (공중볼 경합 승리), keypass, pass success percentage