In [1]:
# In this notebook we're just going to verify the results of Going For Three, which trained a logistic regression model on kicks from 2000 to 2011.
# Initially we use all kickers, not just those that had had at least 50 kicks at the time of the attempt.

In [27]:
import pandas as pd
from pandas.plotting import scatter_matrix
import statsmodels.api as sm
from statsmodels.formula.api import glm as glm_sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import numpy as np
import mysql.connector
import itertools
from scipy.stats import chi2


In [3]:
cnx = mysql.connector.connect(user='root', password='mOntie20!mysql', host='127.0.0.1', database='nfl')

In [4]:
base_query = '''select
p.pid,fg.good,fg.dist, 
g.seas as year, k.seas as seasons,
case when g.temp<50 then 1 else 0 end as cold,
case when g.stad like "%Mile High%" then 1 else 0 end as altitude,
case when g.humd>=60 then 1 else 0 end as humid,
case when g.wspd>=10 then 1 else 0 end as windy,
case when g.v=p.off then 1 else 0 end as away_game,
case when g.wk>=10 then 1 else 0 end as postseason,
case when (pp.qtr=p.qtr) and ((pp.timd-p.timd)>0 or (pp.timo-p.timo)>0) then 1 else 0 end as iced,
case g.surf when 'Grass' then 0 else 1 end as turf,
case when g.cond like "%Snow%" then 1 when g.cond like "%Rain%" and not "Chance Rain" then 1 else 0 end as precipitation,
case when p.qtr=4 and ABS(p.ptso - p.ptsd)>21 then 0
when p.qtr=4 and p.min<2 and ABS(p.ptso - p.ptsd)>8 then 0
when p.qtr=4 and p.min<2 and p.ptso-p.ptsd < -7 then 0
when p.qtr<=3 then 0
when p.qtr=4 and p.min>=2 and ABS(p.ptso - p.ptsd)<21 then 0
when p.qtr=4 and p.min<2 and p.ptso-p.ptsd >=5 and p.ptso-p.ptsd <=8 then 0
when p.qtr=4 and p.min<2 and p.ptso-p.ptsd >=-4 and p.ptso-p.ptsd <=-6 then 0
else 1 end as pressure'''

In [5]:
# pre-2011, include kickers with <50 kicks at time of attempt
query = base_query+'''
from FGXP fg
left join PLAY p on fg.pid=p.pid
left join game g on p.gid=g.gid
join kicker k on k.player = fg.fkicker and g.gid=k.gid
join PLAY pp on pp.pid=p.pid-1 and pp.gid=p.gid
where fg.fgxp='FG' -- not an xp
and g.seas <= 2011
order by p.pid
'''

df = pd.read_sql(query, cnx, index_col = 'pid')
df.head(10)

Unnamed: 0_level_0,good,dist,year,seasons,cold,altitude,humid,windy,away_game,postseason,iced,turf,precipitation,pressure
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
17,1,43,2000,19,0,0,0,0,0,0,0,1,0,0
34,1,44,2000,19,0,0,0,0,0,0,0,1,0,0
52,1,24,2000,19,0,0,0,0,0,0,0,1,0,0
64,1,44,2000,19,0,0,0,0,0,0,0,1,0,0
95,1,48,2000,19,0,0,0,0,0,0,0,1,0,0
241,1,50,2000,6,0,0,1,0,1,0,1,0,0,0
277,1,25,2000,6,0,0,1,0,1,0,0,0,0,0
375,1,33,2000,3,0,0,0,0,1,0,0,1,0,0
387,1,34,2000,1,0,0,0,0,0,0,0,1,0,0
401,1,38,2000,1,0,0,0,0,0,0,0,1,0,0


In [19]:
# windy vs precip tetrchoric
a = len(df[(df['precipitation']==1) & (df['windy']==0)])
b = len(df[(df['precipitation']==1) & (df['windy']==1)])
c = len(df[(df['precipitation']==0) & (df['windy']==0)])
d = len(df[(df['precipitation']==0) & (df['windy']==1)])
tetra_precip_wind = np.cos(np.radians(180/(1+np.sqrt(b*c/a/d))))
print(tetra_precip_wind)

0.26363806684953395


In [26]:
# Using the glm API (Intercept added automatically)
# Lets drop the year, seasons for now
model = glm_sm('good ~ ' + '+'.join(df.drop(['year','seasons','good'], axis=1).columns.values), df, family=sm.families.Binomial())
result = model.fit(method='newton')
print(result.summary())
base_ll = pd.read_html(result.summary().tables[0].as_html())[0].iloc[4,3]

Generalized Linear Model Regression Results                  
Dep. Variable:                   good   No. Observations:                11901
Model:                            GLM   Df Residuals:                    11889
Model Family:                Binomial   Df Model:                           11
Link Function:                  logit   Scale:                          1.0000
Method:                        newton   Log-Likelihood:                -5001.9
Date:                Fri, 14 Feb 2020   Deviance:                       10004.
Time:                        15:51:42   Pearson chi2:                 1.14e+04
No. Iterations:                     3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         5.6933      0.139     40.916      0.000      

In [None]:
# We can see the results are similar to those achieved by the authors of Going For Three
# What about interactions? Lets include all possible 2 covariate interactions.
def get_interactions(covariates):
    interactions = []
    for i,j in itertools.product(covariates,covariates):
        if i>j:
            interactions.append(i+'*'+j)

    return interactions

def LR_test(l1,l2):
    LR = -2*(l1) - (-2*(l2))
    p = chi2.sf(LR, 1)
    return p

def pd_summary(result):
    return pd.read_html(result.summary().tables[1].as_html(), header=0, index_col=0)[0]


covariates = df.drop(['year','seasons','good'], axis=1).columns.values
sig_interactions = []

for interaction in get_interactions(covariates):
    formula = 'good ~ ' + '+'.join(covariates) + '+' + interaction

    model = glm_sm(formula, df, family=sm.families.Binomial())
    result = model.fit(method='irls')
    log_likelihood = pd.read_html(result.summary().tables[0].as_html())[0].iloc[4,3]
    p = LR_test(base_ll, log_likelihood)
    if p<0.1:
        sig_interactions.append({'interaction':interaction,'ll':log_likelihood, 'summary':pd_summary(result), 'P':p})

# wary of multiple tests -- bonferroni correction


In [None]:
df_interactions = pd.DataFrame.from_dict(sig_interactions, orient='columns')
df_interactions

In [None]:
# If we now add back in the year and seasons of experience and control for kickers that dont make it in the NFL (so >50 kicks)
# We see that both year and seasons are significant covariates

In [None]:
# pre-2011, remove kickers with <50 kicks at time of attempt
query = base_query+'''
from FGXP fg
left join PLAY p on fg.pid=p.pid
left join game g on p.gid=g.gid
join kicker k on k.player = fg.fkicker and g.gid=k.gid
join PLAY pp on pp.pid=p.pid-1 and pp.gid=p.gid
where fg.fgxp='FG' -- not an xp
and fg.fkicker in (
select fkicker
from fifty) -- has had at least 50 attempts overall
and fg.pid > (
select pid
from fifty
where fg.fkicker = fkicker) -- this kick came after the 50th attempt
and g.seas <= 2011
order by p.pid
'''

df = pd.read_sql(query, cnx, index_col = 'pid')
df.head(10)

In [None]:
model = glm_sm('good ~ '+'+'.join(df.drop(['good'], axis=1).columns.values), df, family=sm.families.Binomial())
result = model.fit(method='newton')
print(result.summary())

In [None]:
#And with interactions


In [None]:
# In Choking Under the Pressure, they used similar data now from 2000-2017.
# Lets repeat the modelling with this data, again leaving out the seasons and year covariates and not controlling for >50 kicks

In [None]:
# pre-2017, include kickers with <50 kicks at time of attempt
query = base_query+'''
from FGXP fg
left join PLAY p on fg.pid=p.pid
left join game g on p.gid=g.gid
join kicker k on k.player = fg.fkicker and g.gid=k.gid
join PLAY pp on pp.pid=p.pid-1 and pp.gid=p.gid
where fg.fgxp='FG' -- not an xp
and g.seas <= 2017
order by p.pid
'''

df = pd.read_sql(query, cnx, index_col = 'pid')
df.head(10)

In [None]:
model = glm_sm('good ~ '+'+'.join(df.drop(['good','seasons','year'], axis=1).columns.values), df, family=sm.families.Binomial())
result = model.fit(method='newton')
print(result.summary())

In [None]:
# Now that we've verified the results of the previous studies, we turn out attention to the most recent seasons and employ a bayesian framework for improving the estimates.

In [None]:
# Below we continue the evaluation by assessing the predictive power of the model. Which isnt our main goal, but here it is anyway.

In [None]:
y_pred = result.predict(exog=x_test)
y_logits = -np.log((1/y_pred) -1)

In [None]:
loss = log_loss(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(loss,auc)

In [None]:
df_results = x_test.copy()
df_results['good'] = y_test
df_results['prob'] = y_pred
df_results['logits'] = y_logits
df_results

In [None]:
ax = sns.regplot(x='logits', y='prob', data=df_results, lowess=True)
sns.scatterplot(x='logits', y='good', data=df_results, ax=ax, color='r')

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
ax = sns.lineplot(x=fpr, y=tpr, ci=0, markers=True)
sns.lineplot(x=np.arange(0,1,0.1), y=np.arange(0,1,0.1), dashes=True, ax=ax)