# Forward OLS analysis

In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import os
from getpass import getpass

import MySQLdb
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

### Connect DB & Get Forward Data

In [4]:
# Open the MySQL connection used by every query in this notebook.
# Connection settings are read from environment variables so that the
# password is never stored in (or committed with) the notebook. Host, user
# and schema fall back to the values this analysis was written against; the
# password has no fallback and is prompted for interactively when unset.
db = MySQLdb.connect(
    os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    os.environ.get("FOOTBALL_DB_USER", "root"),
    os.environ.get("FOOTBALL_DB_PASSWORD") or getpass("MySQL password: "),
    os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)

def make_query(position):
    """Build the SELECT statement that fetches player statistics for one position.

    Parameters
    ----------
    position : str
        Position code to filter on; must be one of "M", "D", "F", "G".
        The code is matched as a substring of the `position` column.

    Returns
    -------
    str
        SQL query string selecting the stat columns (with `rating` last) from
        the `player` table for rows whose `position` contains the code and
        whose `mins` value is greater than 270.

    Raises
    ------
    ValueError
        If `position` is not one of the supported codes. Because the value is
        interpolated directly into the SQL text, anything outside the
        whitelist is rejected instead of being passed through.
    """
    valid_positions = ("M", "D", "F", "G")
    if position not in valid_positions:
        raise ValueError(
            "position must be one of {valid}, got {got!r}".format(
                valid=", ".join(valid_positions), got=position
            )
        )

    # Rows whose position string also contains "D" are excluded for the
    # non-defender queries. Applying the same exclusion to the "D" query would
    # make the WHERE clause contradict itself and always return zero rows, so
    # it is skipped in that case.
    if position == "D":
        exclude_defenders = ""
    else:
        exclude_defenders = ' and position not like "%D%"'

    SQL_QUERY = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%"{exclude_defenders} and mins > 270
        ;
    """.format(position=position, exclude_defenders=exclude_defenders)

    return SQL_QUERY

# forward: build the query for position code "F" and load the result set
# into a DataFrame through the open `db` connection
SQL_QUERY = make_query("F")
forward_df = pd.read_sql(sql=SQL_QUERY, con=db)

# number of rows returned by the query
forward_df.shape[0]

611

### Scaling

In [5]:
# Features are every column except the last one; the last column is the
# target (`rating` is the final column in the SELECT list of the query).
X = forward_df.iloc[:, :-1]

# with_mean=False: columns are scaled without being centered.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# Rebuild a labelled frame from the scaled array and prepend the intercept column.
dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)

# The target is left unscaled.
dfy = pd.DataFrame(forward_df.iloc[:, -1], columns=["rating"])

# Final modelling frame: const + scaled features, with `rating` as the last column.
f_df = pd.concat([dfX, dfy], axis=1)
f_df.head()

Unnamed: 0,const,age,tall,weight,apps_start,apps_sub,mins,goals,assists,yel,...,blocks,owng,keyp_x,fouled,off,disp,unstch,avgp,ps_y,rating
0,1,8.382932,9.541107,6.649772,2.985339,0.820926,3.27687,3.344336,2.398933,3.995343,...,1.313059,0.0,2.015096,3.009618,3.162057,3.701969,3.353454,2.286773,8.088244,7.37
1,1,6.945858,9.699247,7.181753,2.686805,0.615694,2.803518,1.170518,2.398933,4.43927,...,1.313059,0.0,1.612077,3.762023,1.149839,2.005233,2.624442,3.353934,8.501999,7.25
2,1,6.945858,9.699247,7.09309,0.298534,1.231389,0.341144,0.0,0.0,0.0,...,2.626118,0.0,0.403019,0.451443,0.0,1.388238,0.874814,1.08894,9.783304,6.22
3,1,5.987809,9.224827,6.383781,3.582407,0.410463,3.609751,3.678769,1.999111,2.219635,...,1.313059,0.0,2.821134,1.504809,1.149839,2.005233,2.624442,3.887515,10.450651,7.5
4,1,7.664395,9.75196,7.004426,1.691692,2.668009,1.997285,1.839385,1.599289,1.331781,...,1.313059,0.0,1.410567,1.805771,1.437298,2.159482,2.332838,1.295838,8.795631,6.98


### Summary OLS

In [6]:
# Fit OLS on the full feature set: the last column of f_df is the response,
# every column before it (including `const`) is a regressor.
model = sm.OLS(f_df.iloc[:, -1], f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.919
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     234.8
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          5.79e-296
Time:                        11:29:21   Log-Likelihood:                 461.08
No. Observations:                 611   AIC:                            -864.2
Df Residuals:                     582   BIC:                            -736.1
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6388      0.091     62.193      0.0

### Find Proper Model

In [7]:
# delete feature based on t-value
remove_column_list = [
    "age", "tall", "weight", "apps_start", "apps_sub", "red", "clear", "blocks", "owng", "unstch", "offsides", "disp", "off"
]
# drop() returns a new frame, so f_df itself is left untouched.
removed_f_df = f_df.drop(remove_column_list, axis=1)

# Refit on the reduced frame: last column is the response, the rest are regressors.
model = sm.OLS(removed_f_df.iloc[:, -1], removed_f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.913
Method:                 Least Squares   F-statistic:                     429.4
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          2.74e-307
Time:                        11:30:44   Log-Likelihood:                 449.12
No. Observations:                 611   AIC:                            -866.2
Df Residuals:                     595   BIC:                            -795.6
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6896      0.067     84.360      0.0

In [9]:
# check F-value using anova table
# The formula lists every regressor still present in removed_f_df; the
# intercept is added by the formula interface rather than taken from `const`.
formula_str = """
rating ~ mins + goals + assists + yel + spg + ps_x
+ motm + aw + tackles + inter + fouls + drb + keyp_x
+ fouled + avgp + ps_y
"""

model = sm.OLS.from_formula(formula=formula_str, data=removed_f_df)
result = model.fit()

# NOTE(review): anova_lm is called with its defaults, which appears to be a
# sequential (Type I) table, so each row would depend on the order of the
# terms in the formula - confirm before reading rows in isolation.
table_anova = sm.stats.anova_lm(result)
table_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
mins,1.0,44.227823,44.227823,3199.652899,1.467266e-241
goals,1.0,14.209164,14.209164,1027.959101,9.23716e-132
assists,1.0,7.687434,7.687434,556.145853,2.536368e-87
yel,1.0,0.116096,0.116096,8.398942,0.003892666
spg,1.0,5.702216,5.702216,412.525675,4.544223e-70
ps_x,1.0,0.71686,0.71686,51.861086,1.807262e-12
motm,1.0,2.835558,2.835558,205.13789,3.429509e-40
aw,1.0,2.351298,2.351298,170.104197,2.240872e-34
tackles,1.0,5.572391,5.572391,403.13349,7.421342000000001e-69
inter,1.0,1.060663,1.060663,76.73345,2.031578e-17


In [10]:
# delete features based on F-value
remove_column_list = [
    "fouls", "drb", "ps_y"
]
# drop() returns a new frame, so removed_f_df itself is left untouched.
removed2_f_df = removed_f_df.drop(remove_column_list, axis=1)

# Refit on the further reduced frame: last column is the response, the rest are regressors.
model = sm.OLS(removed2_f_df.iloc[:, -1], removed2_f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.912
Method:                 Least Squares   F-statistic:                     486.5
Date:                Mon, 27 Jun 2016   Prob (F-statistic):          1.93e-307
Time:                        11:32:34   Log-Likelihood:                 443.11
No. Observations:                 611   AIC:                            -858.2
Df Residuals:                     597   BIC:                            -796.4
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6694      0.068     83.946      0.0

### 결론
- 주요변수 : goals, keypass, shots per game, aerial won, assists