In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import pickle
import copy

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

In [2]:
# Load the pickled dataset used by every cell below.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# (the bare open(...) call previously left it open for the garbage collector).
with open("../data/qbs_senior.pkl", "rb") as pkl_file:
    wrs = pickle.load(pkl_file)

In [3]:
# Target column and the predictor block (label slice "Cmp_y" through "Conf", inclusive).
y = wrs["Rate_x"]
X = wrs.loc[:, "Cmp_y":"Conf"]

# CAUTION: train_test_split returns (X_train, X_test, y_train, y_test), but the
# names are unpacked in swapped order here. Combined with test_size=0.8, the
# larger (80%) partition ends up in X_train / y_train and the smaller (20%)
# partition in X_test / y_test. Do not "fix" the order without also changing
# test_size, or the models below will be fitted on the small partition.
X_test, X_train, y_test, y_train = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=29,
)

# Formula-API frame: training predictors joined to the target on the row index,
# with any row containing a missing value removed.
data = pd.merge(
    X_train,
    pd.DataFrame(y_train),
    left_index=True,
    right_index=True,
).dropna()

In [4]:
# OLS of Rate_x on nine predictors, fitted on the NaN-free training frame.
full_formula = "Rate_x ~ Rate_y + Yds_y + TD_y + Pct + Att_y + Cmp_y + Int_y + Year + Conf"
lm = smf.ols(formula=full_formula, data=data)
most_naive_model = lm.fit()
most_naive_model.summary()

0,1,2,3
Dep. Variable:,Rate_x,R-squared:,0.186
Model:,OLS,Adj. R-squared:,0.154
Method:,Least Squares,F-statistic:,5.772
Date:,"Fri, 03 Aug 2018",Prob (F-statistic):,3.19e-07
Time:,11:54:33,Log-Likelihood:,-930.63
No. Observations:,237,AIC:,1881.0
Df Residuals:,227,BIC:,1916.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-500.5603,141.779,-3.531,0.001,-779.932,-221.188
Conf[T.True],-1.5659,2.406,-0.651,0.516,-6.308,3.176
Rate_y,-0.0273,0.162,-0.169,0.866,-0.346,0.292
Yds_y,-0.0043,0.006,-0.727,0.468,-0.016,0.007
TD_y,0.0527,0.250,0.211,0.833,-0.439,0.544
Pct,-0.2061,0.418,-0.493,0.623,-1.030,0.618
Att_y,-0.1014,0.047,-2.167,0.031,-0.194,-0.009
Cmp_y,0.2194,0.103,2.139,0.034,0.017,0.421
Int_y,-0.0954,0.276,-0.345,0.730,-0.640,0.449

0,1,2,3
Omnibus:,16.787,Durbin-Watson:,2.088
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19.78
Skew:,-0.56,Prob(JB):,5.07e-05
Kurtosis:,3.864,Cond. No.,543000.0


In [5]:
# Reduced OLS: only Pct, Att_y, Cmp_y and Year are kept as predictors.
# Note that `lm` and `most_naive_model` are rebound here, replacing any earlier fit.
reduced_formula = "Rate_x ~  Pct + Att_y + Cmp_y + Year"
lm = smf.ols(formula=reduced_formula, data=data)
most_naive_model = lm.fit()
most_naive_model.summary()

0,1,2,3
Dep. Variable:,Rate_x,R-squared:,0.177
Model:,OLS,Adj. R-squared:,0.162
Method:,Least Squares,F-statistic:,12.44
Date:,"Fri, 03 Aug 2018",Prob (F-statistic):,3.51e-09
Time:,11:54:33,Log-Likelihood:,-932.03
No. Observations:,237,AIC:,1874.0
Df Residuals:,232,BIC:,1891.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-500.8457,126.741,-3.952,0.000,-750.555,-251.136
Pct,-0.2978,0.173,-1.723,0.086,-0.638,0.043
Att_y,-0.1202,0.039,-3.049,0.003,-0.198,-0.043
Cmp_y,0.1992,0.068,2.928,0.004,0.065,0.333
Year,0.2953,0.064,4.628,0.000,0.170,0.421

0,1,2,3
Omnibus:,17.656,Durbin-Watson:,2.102
Prob(Omnibus):,0.0,Jarque-Bera (JB):,20.986
Skew:,-0.579,Prob(JB):,2.77e-05
Kurtosis:,3.885,Cond. No.,315000.0


In [6]:
# Reduced OLS plus a Pct-by-Year interaction term (reported as "Pct:Year" in
# the summary). `lm` and `most_naive_model` are rebound again by this cell.
interaction_formula = "Rate_x ~  Pct + Att_y + Cmp_y + Year + Pct * Year"
lm = smf.ols(formula=interaction_formula, data=data)
most_naive_model = lm.fit()
most_naive_model.summary()

0,1,2,3
Dep. Variable:,Rate_x,R-squared:,0.197
Model:,OLS,Adj. R-squared:,0.18
Method:,Least Squares,F-statistic:,11.33
Date:,"Fri, 03 Aug 2018",Prob (F-statistic):,8.75e-10
Time:,11:54:33,Log-Likelihood:,-929.07
No. Observations:,237,AIC:,1870.0
Df Residuals:,231,BIC:,1891.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2242.6270,730.839,-3.069,0.002,-3682.589,-802.665
Pct,30.9414,12.914,2.396,0.017,5.496,56.386
Att_y,-0.1321,0.039,-3.358,0.001,-0.210,-0.055
Cmp_y,0.2200,0.068,3.240,0.001,0.086,0.354
Year,1.1724,0.368,3.186,0.002,0.447,1.898
Pct:Year,-0.0157,0.007,-2.419,0.016,-0.029,-0.003

0,1,2,3
Omnibus:,21.132,Durbin-Watson:,2.136
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.82
Skew:,-0.656,Prob(JB):,2.47e-06
Kurtosis:,3.945,Cond. No.,104000000.0


In [7]:
# Plain least-squares fit on every predictor column in X_train.
lr = LinearRegression().fit(X_train, y_train)

# Mean squared error: first on the data the model was fitted to, then on the
# other partition from the split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(mean_squared_error(target, lr.predict(features)))

144.29430044037005
265.66342056034057


In [8]:
# Lasso with library-default hyperparameters on every predictor column.
# `lr` is rebound, replacing the previously fitted estimator.
lr = Lasso().fit(X_train, y_train)

# Mean squared error: first on the data the model was fitted to, then on the
# other partition from the split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(mean_squared_error(target, lr.predict(features)))

151.2039960795743
259.8830500799048


In [9]:
# Ridge with library-default hyperparameters on every predictor column.
# `lr` is rebound, replacing the previously fitted estimator.
lr = Ridge().fit(X_train, y_train)

# Mean squared error: first on the data the model was fitted to, then on the
# other partition from the split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(mean_squared_error(target, lr.predict(features)))

144.3307757012772
262.8042744457822


In [10]:
# ElasticNet with library-default hyperparameters on every predictor column.
# `lr` is rebound, replacing the previously fitted estimator.
lr = ElasticNet().fit(X_train, y_train)

# Mean squared error: first on the data the model was fitted to, then on the
# other partition from the split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(mean_squared_error(target, lr.predict(features)))

150.74336137226283
259.45025331380185


In [13]:
# Linear regression on a hand-picked feature subset plus a Pct x Year
# interaction column.
interaction_cols = ["Pct", "Year", "Att_y", "Cmp_y"]

# .copy() makes X2 an independent frame. Adding columns to the bare
# X_train[[...]] selection is what triggered pandas' SettingWithCopyWarning.
X2 = X_train[interaction_cols].copy()
# Constant column of ones; kept so the fitted model (pickled in a later cell)
# keeps the same six-column input layout.
X2["int"] = 1
X2["Pct_Year"] = X2["Pct"] * X2["Year"]

lr = LinearRegression()
lr.fit(X2, y_train)
print(mean_squared_error(y_train, lr.predict(X2)))

# Build the identical column layout for the other partition before scoring.
X3 = X_test[interaction_cols].copy()
X3["int"] = 1
X3["Pct_Year"] = X3["Pct"] * X3["Year"]
print(mean_squared_error(y_test, lr.predict(X3)))

148.76152019625687
258.9416265071606


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://panda

In [14]:
# Persist whichever estimator `lr` currently refers to. Because `lr` is rebound
# by several cells, the saved model depends on which cell ran last.
# The context manager guarantees the file is flushed and closed; the bare
# open(...) call previously left that to the garbage collector.
with open("model.pkl", "wb") as model_file:
    pickle.dump(lr, model_file)

In [12]:
# Lasso (library-default hyperparameters) on the hand-picked feature subset,
# without the interaction column.
subset_cols = ["Pct", "Year", "Att_y", "Cmp_y"]

# .copy() makes X2 an independent frame. Adding a column to the bare
# X_train[[...]] selection is what triggered pandas' SettingWithCopyWarning.
X2 = X_train[subset_cols].copy()
X2["int"] = 1  # constant column of ones

# NOTE: this rebinds `lr`, replacing the previously fitted estimator.
lr = Lasso()
lr.fit(X2, y_train)
print(mean_squared_error(y_train, lr.predict(X2)))

# Build the identical column layout for the other partition before scoring.
X3 = X_test[subset_cols].copy()
X3["int"] = 1
print(mean_squared_error(y_test, lr.predict(X3)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


152.6298090632096
256.5510829999185


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
