In [None]:
import itertools
import os
from datetime import datetime as dt
from getpass import getpass

import dtale
import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from pandas.plotting import scatter_matrix
from scipy.stats import chi2, pointbiserialr, pearsonr
from statsmodels.formula.api import glm as glm_sm


import sys
sys.path.append('..')
from util.metrics import tetrachoric
from util.graphics.factorplots import interaction_plot
from scripts.train import get_data
from util.summary import Summary

In [None]:
# Open the MySQL connection used by every query in this notebook.
# Connection settings are read from environment variables so that no secret is
# stored in the notebook itself; user, host and database fall back to the local
# development values, while the password is prompted for interactively when
# NFL_DB_PASSWORD is unset or empty.
# NOTE(review): the password that was previously committed on this line should be rotated.
cnx = mysql.connector.connect(
    user=os.environ.get('NFL_DB_USER', 'root'),
    password=os.environ.get('NFL_DB_PASSWORD') or getpass('MySQL password: '),
    host=os.environ.get('NFL_DB_HOST', '127.0.0.1'),
    database=os.environ.get('NFL_DB_NAME', 'nfl'),
)

In [None]:
# Load the kicks selected by the '<= 2019' filter and discard the kicker
# identifier column, which is not used as a covariate.
df = get_data(cnx, '<= 2019').drop(columns='fkicker')

# Optional interactive inspection of the frame: dtale.show(df)
n_rows = len(df)
print(n_rows, 'rows.')
df.head(n=10)

## Going For Three

Query the database for the data used by Going For Three, i.e. the pre-2011 data (selected below with the `<= 2011` filter).
Going For Three didn't use the year and seasons variables in their tabulated results, so our first model won't either.

In [None]:
# Going For Three sample: same query with the '<= 2011' filter, minus the
# covariates the paper's tabulated results leave out and the kicker identifier.
df_gft = get_data(cnx, '<= 2011').drop(columns=['seasons', 'year', 'fkicker'])
print(len(df_gft))
df_gft.head(n=10)

In [None]:
# Binomial GLM of the kick outcome (`good`) on every other column of the
# Going For Three frame; the formula is built from the frame's column names.
model = glm_sm(
    'good ~ ' + '+'.join(df_gft.drop('good', axis=1).columns.values),
    df_gft,
    family=sm.families.Binomial(),
)
result = model.fit(method='newton')
print(result.summary())

# Log-likelihood of this baseline fit. Read it straight from the results object
# rather than scraping the rendered summary table: the HTML round-trip depended
# on the table layout (a fixed row/column position) and on an HTML parser being
# installed, and it only returned the value rounded for display.
base_ll = result.llf

We now add back in the year and seasons of experience, and control for kickers that don't make it in the NFL by keeping only kicks from kickers with >=50 kicks overall or >=3 seasons at the time of the kick. Notice this is a little different from the Going For Three paper: their criteria threw out a few valid data points from experienced kickers at the start and end of the dataset.

In [None]:
# Extra SQL predicate appended to the query: keep a kick when its kicker has at
# least 50 attempts overall, or had played at least 3 seasons at the time of the kick.
where = '''\nand (
(
    fg.fkicker in (select fkicker from fifty) -- has had at least 50 attempts overall (this keeps only kickers that would end up making it in the NFL)
) or    
(
    k.seas>=3  -- or they had played 3 seasons up to the kick (stops removal of kicks from experienced kickers' kicks from early or late in the dataset)
)
)'''

# Same '<= 2011' sample as before, now restricted by the predicate above and
# keeping the year / seasons covariates; only the kicker identifier is dropped.
df_gft_ext = get_data(cnx, '<= 2011', where).drop(columns='fkicker')
print(len(df_gft_ext), 'valid kicks.')
df_gft_ext.head(n=5)

In [None]:
# Refit the binomial GLM on the extended frame: `good` against all remaining columns.
model = glm_sm(
    'good ~ ' + '+'.join(df_gft_ext.columns.drop('good')),
    data=df_gft_ext,
    family=sm.families.Binomial(),
)
result = model.fit(method='newton')
print(result.summary())

Again, we see that we've arrived at similar results. The psychological variables seem less influential than the environmental and physical factors.

## Choking Under The Pressure

In Choking Under the Pressure, they used similar data, now from 2000-2017.
Let's repeat the modelling with this data, again leaving out the seasons and year covariates and not controlling for >=50 kicks.

In [None]:
# Choking Under the Pressure sample: query with the '<= 2017' filter, without
# the seasons / year covariates and without the kicker identifier.
df_cup = get_data(cnx, '<= 2017').drop(columns=['seasons', 'year', 'fkicker'])
print(len(df_cup), 'valid kicks.')
df_cup.head(n=5)

In [None]:
# Binomial GLM on the Choking Under the Pressure frame: `good` against all remaining columns.
model = glm_sm(
    'good ~ ' + '+'.join(df_cup.columns.drop('good')),
    data=df_cup,
    family=sm.families.Binomial(),
)
result = model.fit(method='newton')
print(result.summary())

Again, we see similar results. Icing is significant at the 0.1 level, as per the paper.