# Forward OLS
- Import Package
- Connect DB & get Forward Player Data
- Scaling
- Summary OLS
- Remove Feature

### Import Package

In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import os

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# newer environments provide cross_val_score in sklearn.model_selection.
from sklearn.cross_validation import cross_val_score

### Connect DB & get Forward Player Data

In [2]:
# Open the MySQL connection used by the queries in this notebook.
# Credentials are read from the environment instead of being committed with
# the notebook: FOOTBALL_DB_USER and FOOTBALL_DB_PASSWORD must be set (a
# missing variable raises KeyError before any connection is attempted).
# Host and schema fall back to the values this notebook has always used.
db = MySQLdb.connect(
    host=os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    user=os.environ["FOOTBALL_DB_USER"],
    passwd=os.environ["FOOTBALL_DB_PASSWORD"],
    db=os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)

def make_query(position):
    """
    Build the SELECT statement that fetches the player stat columns for
    every row whose position matches LIKE "%<position>%".

    parameter------------
    position : M, D, F, G

    return---------------
    SQL_QUERY String

    """
    # NOTE: `position` is substituted straight into the SQL text, so only
    # trusted, hard-coded position codes should be passed in.
    query_template = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%"
        ;
    """
    return query_template.format(position=position)

# Forward players: rows whose position string contains "F"
SQL_QUERY = make_query("F")
forword_df = pd.read_sql(sql=SQL_QUERY, con=db)

# Number of rows loaded
forword_df.shape[0]

1106

##### Scaling

In [3]:
# Features are every column except the last one (the `rating` target).
# `.iloc` replaces the `.ix` indexer, which was deprecated in pandas 0.20
# and removed in pandas 1.0; the positional slices select the same columns.
X = forword_df.iloc[:, :-1]

# Scale the features without centering them (with_mean=False).
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# Rebuild a labelled frame on X's own index so the concat below lines up
# row-for-row with the target column.
dfX0 = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
dfX = sm.add_constant(dfX0)  # adds the `const` column for the intercept
dfy = pd.DataFrame(forword_df.iloc[:, -1], columns=["rating"])
f_df = pd.concat([dfX, dfy], axis=1)
f_df.head()

Unnamed: 0,const,age,tall,weight,apps_start,apps_sub,mins,goals,assists,yel,...,blocks,owng,keyp_x,fouled,off,disp,unstch,avgp,ps_y,rating
0,1,7.753974,4.572042,3.70725,2.920678,0.893045,3.124043,3.872875,2.810126,4.50275,...,0.465,0.0,1.841614,2.744057,3.057305,3.281157,2.735755,1.926532,3.573768,7.37
1,1,6.424721,4.647822,4.00383,2.62861,0.669784,2.672768,1.355506,2.810126,5.003056,...,0.465,0.0,1.473291,3.430071,1.111747,1.777293,2.141026,2.82558,3.756585,7.25
2,1,4.873926,4.395223,3.41067,0.194712,0.223261,0.20932,0.0,0.0,0.0,...,0.0,0.0,1.28913,0.96042,0.83381,5.468595,0.0,2.321012,4.652975,6.78
3,1,6.424721,4.647822,3.9544,0.292068,1.339568,0.325234,0.0,0.0,0.0,...,0.929999,0.0,0.368323,0.411609,0.0,1.230434,0.713675,0.917396,4.322726,6.22
4,1,6.646263,4.673082,4.05326,0.097356,0.893045,0.085529,0.0,0.0,0.0,...,0.0,0.0,0.0,1.646434,0.0,0.27343,0.237892,0.201827,4.287342,6.09


##### Summary OLS

In [4]:
# Fit OLS with the last column of f_df (`rating`) as the response and all
# other columns, including `const`, as regressors.
# `.iloc` replaces the `.ix` indexer, which was removed in pandas 1.0.
model = sm.OLS(f_df.iloc[:, -1], f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.873
Model:                            OLS   Adj. R-squared:                  0.869
Method:                 Least Squares   F-statistic:                     263.4
Date:                Sat, 25 Jun 2016   Prob (F-statistic):               0.00
Time:                        19:35:09   Log-Likelihood:                 523.52
No. Observations:                1106   AIC:                            -989.0
Df Residuals:                    1077   BIC:                            -843.8
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          6.0044      0.039    155.680      0.0

##### Remove Some Feature

In [5]:
# Features excluded from the reduced model.
remove_column_list = [
    "age", "tall", "weight", "apps_start", "red", "ps_x", "fouls", "offsides", "owng", "off", "disp", "clear", "ps_y"
]
# Rebuild the frame from dfX/dfy (the inputs f_df was first assembled from)
# before dropping, instead of dropping from f_df itself. Dropping from the
# already-reduced f_df raises KeyError when this cell is run a second time.
f_df = pd.concat([dfX, dfy], axis=1).drop(remove_column_list, axis=1)

# Refit OLS on the remaining columns: last column (`rating`) is the response,
# everything before it is a regressor. `.iloc` replaces the removed `.ix`.
model = sm.OLS(f_df.iloc[:, -1], f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.866
Model:                            OLS   Adj. R-squared:                  0.864
Method:                 Least Squares   F-statistic:                     440.2
Date:                Sat, 25 Jun 2016   Prob (F-statistic):               0.00
Time:                        19:35:09   Log-Likelihood:                 496.06
No. Observations:                1106   AIC:                            -958.1
Df Residuals:                    1089   BIC:                            -873.0
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.9590      0.012    505.668      0.0

In [7]:
# Show the column labels currently held by f_df.
f_df.columns

Index(['const', 'apps_sub', 'mins', 'goals', 'assists', 'yel', 'spg', 'motm',
       'aw', 'tackles', 'inter', 'drb', 'blocks', 'keyp_x', 'fouled', 'unstch',
       'avgp', 'rating'],
      dtype='object')