# Bayesian Regression

In this notebook we are going to continue our modelling of field goals, but instead of using a frequentist maximum likelihood approach we shall shift to a Bayesian statistical framework. The data we will be training on are from the 2018 and 2019 seasons of the NFL. We previously trained on pre-2017 data to line up with the existing studies, and in doing so showed that there are interactions happening in those years that had not been accounted for. Let's get started on updating our model for the latest seasons.

In [1]:
import os

import mysql.connector
import pandas as pd
import pymc3 as pm
from pymc3.distributions.continuous import Normal



In [28]:
# Open the MySQL connection used to pull the kick-level data.
# Credentials are read from the environment so that no password is stored in
# the notebook: NFL_DB_PASSWORD must be set before running this cell (a
# missing value raises KeyError rather than attempting a blank login).  The
# remaining settings fall back to the values this notebook has always used.
cnx = mysql.connector.connect(
    user=os.environ.get('NFL_DB_USER', 'root'),
    password=os.environ['NFL_DB_PASSWORD'],
    host=os.environ.get('NFL_DB_HOST', '127.0.0.1'),
    database=os.environ.get('NFL_DB_NAME', 'nfl'),
)

# Select-list for the kick query: one row per attempt with the outcome
# (`good`), the distance, the game season (`year`), k.seas exposed as
# `seasons`, and a set of 0/1 situational indicators.  The FROM/JOIN/WHERE
# clauses are appended separately.
#
# Two conditions are written so that they actually test what they name:
#   * precipitation compares g.cond against "Chance Rain" with an explicit
#     `not like`; a bare `not "Chance Rain"` negates a string constant and
#     therefore filters nothing, so forecast-only conditions counted as rain.
#   * the last `then 0` branch of pressure uses the range -6..-4; written as
#     `>=-4 and <=-6` the range is empty and the branch can never match.
#     (Presumably p.ptso - p.ptsd is the kicking team's margin, making this
#     "trailing by 4 to 6" -- confirm against the schema.)
#
# NOTE(review): `postseason` is flagged as g.wk>=10 -- confirm that this is
# the intended week cutoff.
# NOTE(review): a fourth-quarter margin of exactly 21 with p.min>=2 matches
# none of the `then 0` branches (>21 / <21) and falls through to pressure=1
# -- confirm whether one of those bounds should be inclusive.
base_query = '''select
p.pid,fg.good,fg.dist, 
g.seas as year, k.seas as seasons,
case when g.temp<50 then 1 else 0 end as cold,
case when g.stad like "%Mile High%" then 1 else 0 end as altitude,
case when g.humd>=60 then 1 else 0 end as humid,
case when g.wspd>=10 then 1 else 0 end as windy,
case when g.v=p.off then 1 else 0 end as away_game,
case when g.wk>=10 then 1 else 0 end as postseason,
case when (pp.qtr=p.qtr) and ((pp.timd-p.timd)>0 or (pp.timo-p.timo)>0) then 1 else 0 end as iced,
case g.surf when 'Grass' then 0 else 1 end as turf,
case when g.cond like "%Snow%" then 1 when g.cond like "%Rain%" and g.cond not like "%Chance Rain%" then 1 else 0 end as precipitation,
case when p.qtr=4 and ABS(p.ptso - p.ptsd)>21 then 0
when p.qtr=4 and p.min<2 and ABS(p.ptso - p.ptsd)>8 then 0
when p.qtr=4 and p.min<2 and p.ptso-p.ptsd < -7 then 0
when p.qtr<=3 then 0
when p.qtr=4 and p.min>=2 and ABS(p.ptso - p.ptsd)<21 then 0
when p.qtr=4 and p.min<2 and p.ptso-p.ptsd >=5 and p.ptso-p.ptsd <=8 then 0
when p.qtr=4 and p.min<2 and p.ptso-p.ptsd >=-6 and p.ptso-p.ptsd <=-4 then 0
else 1 end as pressure'''

# Full statement: the select-list followed by the joins and filters.  Rows
# come from FGXP joined to its play, that play's game, the kicker's row for
# the game, and the immediately preceding play (pid - 1) of the same game;
# only field goals from seasons after 2017 are kept, ordered by play id.
query = ''.join([
    base_query,
    '''
from FGXP fg
left join PLAY p on fg.pid=p.pid
left join game g on p.gid=g.gid
join kicker k on k.player = fg.fkicker and g.gid=k.gid
join PLAY pp on pp.pid=p.pid-1 and pp.gid=p.gid
where fg.fgxp='FG' -- not an xp
and g.seas >2017
order by p.pid
''',
])

# Run the query and index the resulting frame by play id.
df = pd.read_sql(query, cnx, index_col='pid')

# Constant column.  The frame displayed below contains it and the model cell
# drops it by name when building its formula, so it has to be created here
# for the notebook to run from top to bottom.
df['Intercept'] = 1

# Interaction indicators, named to match the renamed index of exp_results.
df['cold*windy'] = df['cold'] * df['windy']
df['postseason*away_game'] = df['postseason'] * df['away_game']
df.head(10)

Unnamed: 0_level_0,good,dist,year,seasons,cold,altitude,humid,windy,away_game,postseason,iced,turf,precipitation,pressure,Intercept,cold*windy,postseason*away_game
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
785567,1,21,2018,17,0,0,1,0,1,0,0,1,0,0,1,0,0
785596,1,26,2018,2,0,0,1,0,0,0,0,1,0,0,1,0,0
785605,1,52,2018,17,0,0,1,0,1,0,0,1,0,0,1,0,0
785764,1,41,2018,7,0,0,1,1,0,0,0,0,1,0,1,0,0
785780,0,52,2018,11,0,0,1,1,1,0,0,0,1,0,1,0,0
785794,1,39,2018,7,0,0,1,1,0,0,0,0,1,0,1,0,0
785851,1,35,2018,11,0,0,1,1,1,0,0,0,1,0,1,0,0
786101,0,42,2018,5,0,0,1,1,1,0,1,0,1,1,1,0,0
786111,0,43,2018,2,0,0,1,1,0,0,0,0,1,1,1,0,0
786128,1,42,2018,7,1,0,0,0,1,0,0,1,0,0,1,0,0


In [33]:
# Count the prior distributions held in `priors`.  That dict is assigned
# inside the `with pm.Model()` cell further down, so this cell only works
# once that cell has been executed.
len(priors)

16

In [29]:
# Load the saved coefficient table (presumably the earlier maximum-likelihood
# fit -- confirm), indexed by term name, and relabel the two interaction rows
# from 'a:b' to 'a*b' so they can be looked up by DataFrame column name.
interaction_labels = {
    'cold:windy': 'cold*windy',
    'postseason:away_game': 'postseason*away_game',
}
exp_results = pd.read_csv('expanded_results.csv', index_col=0)
exp_results = exp_results.rename(index=interaction_labels)
exp_results

Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-88.3109,8.221,-10.741,0.0,-104.425,-72.197
dist,-0.1054,0.002,-42.272,0.0,-0.11,-0.101
year,0.0468,0.004,11.401,0.0,0.039,0.055
seasons,0.0173,0.004,4.163,0.0,0.009,0.025
cold,-0.1208,0.055,-2.2,0.028,-0.228,-0.013
altitude,0.3854,0.125,3.072,0.002,0.14,0.631
humid,-0.0751,0.046,-1.621,0.105,-0.166,0.016
windy,-0.0667,0.058,-1.152,0.249,-0.18,0.047
away_game,0.0664,0.06,1.11,0.267,-0.051,0.184
postseason,0.0672,0.062,1.092,0.275,-0.053,0.188


In [34]:
# Bayesian logistic regression of `good` on the covariates in `df`, with a
# Normal prior per coefficient centred on the estimate in `exp_results` and
# scaled by its standard error.
#
# NOTE(review): the recorded run of this cell advanced at roughly 22 s/draw
# and was stopped before a trace could be built.  Presumably the unscaled
# `year` column (values around 2018) against a large negative intercept is
# what makes the posterior hard to sample -- consider centring `year` and
# adjusting the Intercept prior to match before re-running.
with pm.Model() as logistic_model:
    # Every column except the response and the constant column is a
    # covariate; patsy supplies its own Intercept term.
    covariates = [col for col in df.columns if col not in ('good', 'Intercept')]

    # The DataFrame (and the exp_results index) label interactions 'a*b',
    # but patsy names the term 'a:b' and GLM matches priors by that name.
    # Keying a prior as 'a*b' leaves it unused and the interaction silently
    # falls back to the default prior, so translate the labels once here.
    term_names = {col: col.replace('*', ':') for col in covariates}

    # priors
    priors = {}
    for col in ['Intercept'] + covariates:
        priors[term_names.get(col, col)] = Normal.dist(
            mu=exp_results.loc[col, 'coef'],
            sigma=exp_results.loc[col, 'std err'],
        )

    # formula -- interactions are written 'a:b'; both main effects are
    # already listed, so this is the same design as writing 'a*b'.
    formula = 'good ~ ' + '+'.join(term_names[col] for col in covariates)
    print(formula)

    # model
    pm.glm.GLM.from_formula(formula,
                            df,
                            priors=priors,
                            family=pm.glm.families.Binomial())

    # trace
    trace = pm.sample(1000, tune=1000, init='adapt_diag')


good ~ dist+year+seasons+cold+altitude+humid+windy+away_game+postseason+iced+turf+precipitation+pressure+cold*windy+postseason*away_game
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [postseason:away_game, cold:windy, pressure, precipitation, turf, iced, postseason, away_game, windy, humid, altitude, cold, seasons, year, dist, Intercept]
Sampling 2 chains, 0 divergences:   1%|          | 21/4000 [07:48<24:39:02, 22.30s/draws]


ValueError: Not enough samples to build a trace.