In [None]:
#imports
import sqlite3
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols, logit

In [None]:
# Load every punt that has a known kicker and complete snap / operation /
# hang-time / kick-length measurements, joined to its stats row, the punter's
# name and the game weather.
query = """
select Plays.*, S.*, P.name, G.weather
from Plays join Stats S on Plays.gameId = S.gameId and Plays.playId = S.playId join Players P on Plays.kickerId = P.id join Games G on Plays.gameId = G.id
where Plays.specialTeamsPlayType = 'Punt' and kickerId != 'NA' and
snapTime != 'NA' and operationTime != 'NA' and hangTime != 'NA' and
kickLength != 'NA';
"""

# The cursor is kept open on purpose: a later cell reuses `cur` for the tracking query.
conn = sqlite3.connect('NFL_ST_data')
cur = conn.cursor()
cur.execute(query)
rows = cur.fetchall()
out = list(rows)

# Labels applied to the result columns by position (the query returns them unnamed).
punt_columns = [
    'gameId', 'playId', 'Description', 'qtr', 'down', 'yardsToGo', 'posTeam', 'playType',
    'STResult', 'kickerId', 'kickBlockerId', 'yardlineSide', 'yardline', 'gameClock',
    'penaltyYards', 'HomeScore', 'VisitorScore', 'passResult', 'kickLength',
    'kickReturnYardage', 'playResult', 'absoluteYardlineNumber', 's.gameID', 's.playId',
    'snapDetail', 'snapTime', 'operationTime', 'hangTime', 'kickType',
    'kickDirectionIntended', 'kickDirectionActual', 'returnDirectionIntended',
    'returnDirectionActual', 'tackler', 'kickoffReturnFormation', 'kickContactType',
    'punter', 'weather',
]
data = pd.DataFrame(out).rename(columns=dict(enumerate(punt_columns)))

# NOTE(review): when the possession team matches yardlineSide this adds 50 to the
# yardline. If `yardline` counts yards from the nearer goal line, the distance to the
# far end zone would be 100 - yardline instead — confirm before relying on this feature.
same_side = data['posTeam'] == data['yardlineSide']
data['endzoneDistance'] = np.where(same_side, data['yardline']+50, data['yardline'])

# Flag punts whose actual direction differs from the intended direction.
direction_differs = data['kickDirectionIntended'] != data['kickDirectionActual']
data['kickShank'] = np.where(direction_differs, 1, 0)

feature_columns = ['endzoneDistance', 'snapDetail','snapTime','operationTime','hangTime','kickType', 'kickShank', 'kickDirectionActual']
X = data[feature_columns]
Y = data['kickLength']

In [None]:
#Get the weather data
# Collapse the raw weather description into three buckets:
#   'elements' - any description listed in bad_weather
#   'dome'     - games whose weather value is exactly 'dome'
#   'clear'    - everything else (including missing values)
bad_weather = {'Rain', 'Light Rain', 'Drizzle', 'Light Snow', 'Light Sleet', 'Light Rain and Windy', 'Snow'}

# isin() classifies the whole column in one pass instead of a per-row .loc loop.
data['weather_simp'] = np.where(data['weather'].isin(bad_weather), 'elements', 'clear')
# 'dome' is applied second so it overrides the default 'clear' label.
data['weather_simp'] = np.where(data['weather'] == 'dome', 'dome', data['weather_simp'])

In [None]:
# Convert the measurement columns used by the models to numeric dtypes.
numeric_cols = ['kickLength', 'snapTime', 'operationTime', 'hangTime']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric)

In [None]:
# Model gross punt distance.
# Each punter enters only as a categorical level (a fixed effect); nothing else about
# the punter is used. A mixed-effects model would be a possible extension.
# NOTE(review): this formula uses the raw C(weather) column, while the later models use
# the simplified C(weather_simp) — confirm that the difference is intentional.
punt_formula = (
    'kickLength ~ bs(endzoneDistance, knots = (55,90), degree= 3, include_intercept = False)'
    ' + C(punter) + C(snapDetail) + snapTime + operationTime'
    ' + C(kickType) + kickShank + C(kickDirectionActual) + C(weather)- 1'
)
punt_model = ols(punt_formula, data=data)
punt_fit = punt_model.fit()
print('Punt length:')
print(punt_fit.summary())

In [None]:
#Look at coefficients for punters and also the standardized coefficient
coef_table = punt_fit.params.reset_index().rename(columns = {0:'coef'})

# Pick the punter terms by name rather than by position. The model is fit without an
# intercept, so the punter factor carries one coefficient per punter; slicing the
# first (n_punters - 1) rows would silently drop the last punter.
is_punter_term = coef_table['index'].str.startswith('C(punter)[')
punter_coef = coef_table.loc[is_punter_term].copy()

# 'C(punter)[Some Name]' -> 'Some Name'
punter_coef['index'] = punter_coef['index'].str.split('[').str[1].str[:-1]

# Standardise each coefficient against the mean / std across punters.
punt_avg, punt_sd = (punter_coef['coef'].mean(), punter_coef['coef'].std())
punter_coef['Adj. coef'] = (punter_coef['coef']-punt_avg)/punt_sd
print(punter_coef)

In [None]:
# do cleaning to prepare for modeling the return
# An 'NA' penalty means no penalty yardage; coerce to numeric so the subtraction
# below cannot hit leftover string values.
data['penaltyYards'] = pd.to_numeric(np.where(data['penaltyYards'] == 'NA', 0, data['penaltyYards']))
data['returnLength'] = pd.to_numeric(data['playResult']) - data['kickLength'] - data['penaltyYards']
# Only plays that ended in a return keep a return length; everything else is 0.
data['returnLength'] = pd.to_numeric(np.where(data['STResult'] == 'Return', data['returnLength'], 0))

# Returned punts with a recorded return yardage. Taken as an explicit copy so adding
# or converting columns does not write into a slice of `data`.
is_measured_return = (data['STResult'] == "Return") & (data['kickReturnYardage'] != "NA")
returns = data[is_measured_return].copy()

returns['kickReturnYardage'] = pd.to_numeric(returns['kickReturnYardage'])

In [None]:
# Linear model of return yardage on the returned punts only (no intercept; each
# punter is a categorical level).
return_formula = (
    'kickReturnYardage ~ kickLength + operationTime + hangTime'
    ' + C(punter) + C(kickType) + C(kickDirectionActual)'
    ' + kickShank + C(weather_simp) -1'
)
return_model = ols(return_formula, data=returns)
return_fit = return_model.fit()
# Blank lines separate this summary from the previous model's output.
print("\n\n\n")
print('Return Length: ')
print(return_fit.summary())

In [None]:
#Look at how different factors impact whether a fielded punt is returned
# Keep only punts whose result is Return, Fair Catch or Muffed. The explicit copy
# lets the new column be added without writing into a slice of `data`.
caught = data[data['STResult'].isin(['Return', 'Fair Catch', 'Muffed'])].copy()
# Binary target: 1 when the result is 'Return', 0 otherwise.
caught['returnAttempted'] = np.where(caught['STResult'] == 'Return', 1, 0)
return_prob = logit("returnAttempted ~ kickLength + operationTime + hangTime + C(kickType) + C(kickDirectionActual) + kickShank+ C(weather_simp)", data = caught)
prob_fit = return_prob.fit()
print("\n\n\n")
print("Return Chance: ")
print(prob_fit.summary())

In [None]:
# Pull the tracking rows for punts, joined to their play and game records.
# NOTE(review): the filter references a column named `even`; if the tracking table's
# column is actually `event` this query will fail — confirm against the schema.
tracking_query = """
select *
from Tracking t join Plays P on P.playId = t.playId and P.gameId = t.gameId join Games g on g.Id = t.gameId
where p.specialTeamsPlayType = 'Punt' and even = 'punt_received'
"""
cur.execute(tracking_query)
# Column names come straight from the cursor metadata of the query just executed.
column_names = [col_info[0] for col_info in cur.description]
rows = cur.fetchall()
tracking = pd.DataFrame(list(rows), columns=column_names)

# Drop two columns by position.
# NOTE(review): presumably positions 15 and 16 are duplicated key columns produced by
# the `select *` join — verify against the actual column order.
columns_to_drop = [15, 16]
kept_positions = [pos for pos in range(tracking.shape[1]) if pos not in columns_to_drop]
tracking = tracking.iloc[:, kept_positions]

In [None]:
# Keep only tracking rows whose play has no recorded kick blocker.
no_blocker = tracking['kickBlockerId'] == 'NA'
tracking = tracking[no_blocker]

In [None]:
#get list of all punts
# Each entry is a distinct (gameId, playId) pair present in the tracking rows.
uniques = list(set(zip(tracking['gameId'], tracking['playId'])))
# remove a misclassified blocked punt
# NOTE(review): the play is removed by list position, and the list order comes from
# iterating a set. Index 1261 only refers to the intended play while the data and the
# set's iteration order stay exactly the same — consider removing it by its
# (gameId, playId) key instead.
uniques.pop(1261)

In [None]:
# Repair missing movement direction: fall back to the 'o' column where 'dir' is 'NA',
# then to 0 where that is 'NA' as well, and finally store the column as float.
dir_missing = tracking['dir'] == 'NA'
tracking['dir'] = np.where(dir_missing, tracking['o'], tracking['dir'])
still_missing = tracking['dir'] == 'NA'
tracking['dir'] = np.where(still_missing, 0, tracking['dir'])
dir_filled = tracking['dir'].fillna(0)
tracking['dir'] = dir_filled.astype(float)

In [None]:
#get list of all returners
# The order of this list fixes the order of the one-hot returner columns built later,
# so entries must not be reordered. The literal 'football' entry is part of the list.
all_returners = [
    'Bryce Callahan', 'Tarik Cohen', 'Marvin Hall', 'Marquez Callaway', 'T.J. Jones',
    'Chris Claybrooks', 'Brandon Powell', 'Randall Cobb', 'Keke Coutee', 'Hunter Renfrow',
    'Rashad Greene', 'Janarion Grant', 'Alex Erickson', 'Tyler Ervin', 'Tyler Boyd',
    'Patrick Peterson', 'Julian Edelman', 'Tyler Lockett', 'Chris Hogan', 'Corey Coleman',
    'Darrius Shepherd', 'DeAndre Carter', 'Diontae Johnson', 'Jalen Richard', 'Cameron Batson',
    'Jarvis Landry', 'Darius Phillips', 'Dontrell Hilliard', 'Tavon Austin', 'Jamison Crowder',
    'DeSean Jackson', 'Kaelin Clay', 'Kenny Moore', 'Anthony Miller', 'K.J. Osborn',
    'Matthew Slater', 'Golden Tate', 'Dede Westbrook', 'Jalen Reagor', 'Isaiah Rodgers',
    'K.J. Hamler', 'Cyrus Jones', 'Chad Beebe', 'Donovan Peoples-Jones', 'Brandon Zylstra',
    'C.J. Board', 'Quadree Henderson', 'Jabrill Peppers', 'Jonathan Jones', 'Charvarius Ward',
    'River Cracraft', 'Jeremy Kerley', 'James Proche', 'Andre Roberts', 'T.Y. Hilton',
    'Tim White', 'Damiere Byrd', 'Jamal Agnew', 'Mohamed Sanu', 'Austin Carr',
    'Quandre Diggs', 'Preston Williams', 'Marcus Murphy', 'Jakeem Grant', 'Michael Walker',
    'Troymaine Pope', 'Dwayne Harris', 'Tyreek Hill', 'J.J. Taylor', 'Stacy Coley',
    'D.J. Reed', 'Deionte Thompson', 'Ted Ginn', 'Greg Ward', 'Brandon Aiyuk',
    'Tremon Smith', 'D.J. Moore', 'Spencer Schnell', 'Christian Kirk', 'Demarcus Robinson',
    'Mecole Hardman', 'Antonio Hamilton', 'Dante Pettis', 'Odell Beckham', 'Marcus Sherels',
    'Antonio Callaway', 'Ced Wilson', 'Jaydon Mickens', 'Greg Dortch', 'Jamie Collins',
    'Diontae Spencer', 'Antonio Brown', 'Keelan Cole', 'Taysom Hill', 'Bisi Johnson',
    'Kenjon Barner', 'Ryan Switzer', 'Danny Amendola', 'Russell Gage', 'Justin Hardy',
    'DaeSean Hamilton', 'Tyrann Mathieu', 'Gunner Olszewski', 'Nyheim Hines', 'Rashard Davis',
    'football', 'Brandon Tate', 'Nelson Agholor', 'Janoris Jenkins', 'Trent Taylor',
    'Bobo Wilson', 'Javien Elliott', 'LeShaun Sims', 'Boston Scott', 'Maurice Harris',
    'Cole Beasley', 'Richie James', 'Trevor Davis', 'Rishard Matthews', 'Alvin Kamara',
    'Steven Sims', 'Chester Rogers', 'Darren Sproles', 'Isaiah McKenzie', "Adoree' Jackson",
    'Charone Peake', 'Justin Watson', 'Kalif Raymond', "Da'Mari Scott", 'Isaiah Wright',
    'Nsimba Webster', 'Josh Jackson', 'Myles Hartsfield', 'Nick Williams', 'Torry McTyer',
    'J.J. Jones', 'Mike Hughes', 'Jaire Alexander', 'Phillip Lindsay', 'Andy Isabella',
    'Marqui Christian', 'Tramon Williams', 'Byron Murphy', 'Greg Stroman', 'Quenton Meeks',
    'Riley McCarron', 'Corey Clement', 'Travis Benjamin', 'Cedrick Wilson', 'Tommylee Lewis',
    'Deonte Harris', 'Devin Duvernay', 'Cooper Kupp', 'Will Fuller', 'Desmond King',
    'Pharoh Cooper', 'Jawill Davis', 'CeeDee Lamb', 'Kenny Stills', 'Jordy Nelson',
    'Adam Humphries', 'Braxton Berrios', 'Brandon Rusnak', 'Dominique Rodgers-Cromartie', 'Mike Thomas',
    'Micah Hyde', 'Trey Quinn', 'K.J. Hill', "De'Anthony Thomas", 'Danny Johnson',
    'David Moore', 'JoJo Natson', 'T.J. Logan', 'Adam Jones', 'Avonte Maddox',
    "D'Ernest Johnson", 'Ray-Ray McCloud',
]

In [None]:
# Arrays of every punter and returner, used to build the one-hot columns.
# Punters are the distinct names on tracking rows whose position is 'P'.
is_punter_row = tracking['position'] == 'P'
all_punters = tracking[is_punter_row]['name'].unique()
all_returners = np.array(all_returners)

In [None]:
#prepare data for model
#
# Builds one feature row per punt in `ready_data`. Column layout (by position):
#   0 .. 125                      six values (x, y, s, a, dir, dot) for each of the
#                                 21 tracked rows left after dropping ball and returner
#   126 .. 126+len(all_returners)-1   one-hot returner
#   next column                   starting yardline (absoluteYardlineNumber - 10)
#   next len(all_punters) columns one-hot punter
#   last three columns            hang time, punt distance, return yardage (target)

#make empty data frame
ready_data = pd.DataFrame(columns = [x for x in range(0,127+len(all_returners)+3+len(all_punters))])
#iterate through all punts
for i in range(len(uniques)):
    #get the current punt
    play_one = tracking[(tracking['gameId'] == uniques[i][0]) & (tracking['playId'] == uniques[i][1])].reset_index(drop=True)

    # adjust all positions to be relative to the football
    # (scalar ball coordinates, subtracted from the whole column at once)
    football = play_one[play_one['name'] == 'football']
    ball_x = football['x'].iloc[0]
    ball_y = football['y'].iloc[0]
    play_one['x'] = play_one['x'] - ball_x
    play_one['y'] = play_one['y'] - ball_y

    #get distance between each player and the football
    # dist only needed for sorting so we can skip square root since it is also always increasing
    play_one.loc[:,'dist'] = play_one['x']**2 + play_one['y']**2

    #switch teams from home and away to recv and kick
    # (any row that is not on the kicking side, including the football, becomes 'recv')
    if play_one.loc[0, 'possessionTeam'] == play_one.loc[0, 'home']:
        play_one['team'] = np.where(play_one['team'] == 'home', 'kick', 'recv')
    else:
        play_one['team'] = np.where(play_one['team'] == 'away', 'kick', 'recv')

    #calculate speed at ballcarrier using a dot product
    theta = np.radians((play_one['dir']+90)%360)
    # mag1 is the length of (cos, sin) and therefore 1; kept so results are unchanged
    mag1 = np.sqrt(np.cos(theta)**2 + np.sin(theta)**2)
    mag2 = np.sqrt(play_one['x']**2 + play_one['y']**2)
    play_one['dot'] = play_one['s'] * (-np.cos(theta) * -play_one['x'] + np.sin(theta) * -play_one['y'])/(mag1*mag2)

    #sort by team and distance so it is always in a consistent order for the model
    # (named `ordered` so the builtin sorted() is not shadowed)
    ordered = play_one.sort_values(by=['team', 'dist']).reset_index(drop=True)
    #drop football and returner (11 is ball, 12 is returner)
    returner = ordered.loc[12,'name']
    ordered = ordered.drop([11,12])

    # flatten the remaining rows into one long row so each play is one row
    # (row-major ravel gives x, y, s, a, dir, dot for the first row, then the next, ...)
    all_values = ordered[['x', 'y', 's', 'a', 'dir', 'dot']].to_numpy().ravel()
    aggregated_row = pd.Series(all_values).to_frame().T

    #add returner as one-hot columns (all zeros when the name is not in all_returners)
    returner_ind = np.flatnonzero(np.asarray(all_returners) == returner)
    returner_cols = pd.DataFrame(np.zeros((1, len(all_returners))))
    returner_cols.iloc[0, returner_ind] = 1
    returner_cols.columns = range(126, 126+len(all_returners))
    aggregated_row = pd.concat([aggregated_row, returner_cols],axis=1)

    #add initial yardline to model (subtracting 10 to get rid of endzone)
    aggregated_row[aggregated_row.shape[1]] = play_one.loc[0,'absoluteYardlineNumber'] - 10

    #add punter as one-hot columns
    # Use the row with position 'P'; fall back to position 'K' when there is none.
    # The 'name' column is used because all_punters is built from it.
    try:
        punter = ordered[ordered['position'] == 'P']['name'].iloc[0]
    except IndexError:
        punter = ordered[ordered['position'] == 'K']['name'].iloc[0]
    punter_ind = np.flatnonzero(all_punters == punter)
    punter_cols = pd.DataFrame(np.zeros((1, len(all_punters))))
    punter_cols.iloc[0, punter_ind] = 1
    first_punter_col = aggregated_row.shape[1]
    punter_cols.columns = range(first_punter_col, first_punter_col+len(all_punters))
    aggregated_row = pd.concat([aggregated_row, punter_cols],axis=1)

    #add hangtime as a column
    # Punts missing from `data` get NaN and their loop index is printed.
    punt_stats = data[(data['gameId'] == uniques[i][0]) & (data['playId'] == uniques[i][1])]
    try:
        hang_time = punt_stats['hangTime'].iloc[0]
    except IndexError:
        hang_time = np.nan  # np.NAN was removed in NumPy 2.0
        print(i)
    aggregated_row[aggregated_row.shape[1]] = hang_time

    #add punt distance as a column
    aggregated_row[aggregated_row.shape[1]] = play_one.loc[0,'kickLength']

    #add return yardage
    aggregated_row[aggregated_row.shape[1]] = play_one['kickReturnYardage'][0]

    ready_data.loc[i] = aggregated_row.loc[0]

    #track progress
    if i%500 == 0:
        print(f"{round((i/len(uniques))*100,2)}% complete")


In [None]:
# Keep only plays whose return yardage (the last column) is not the 'NA' placeholder.
last_col = len(ready_data.iloc[0,:]) - 1
has_return_yardage = ready_data[last_col] != 'NA'
ready_data = ready_data[has_return_yardage]

In [None]:
# get dependent variable (return yardage, the last column)
Y = ready_data.iloc[:,-1].astype('float32')
# get independent variables, one collection of all, one collection excluding the punter
# and punt characteristics (the trailing one-hot punter, hang time and punt distance).
# Both are cast to float32 like Y so the networks receive purely numeric input
# regardless of the dtype the row-by-row build left behind.
X1 = ready_data.iloc[:,:-1].astype('float32')
X2 = ready_data.iloc[:,:-(3+len(all_punters))].astype('float32')

In [None]:
# more imports
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import KFold

In [None]:
#clear keras just in case
# Resets Keras' global state before any model in this notebook is built.
tf.keras.backend.clear_session()

In [None]:
#set up cross-validation to find best model structure
# 8-fold CV over the full feature set; the validation loss of each fold is collected
# in VALIDATION_LOSS.
kf = KFold(n_splits = 8)
VALIDATION_LOSS = []

for train_index, val_index in kf.split(X1, Y):
    # set train and validation sets
    # The tensors are assigned back; calling convert_to_tensor without keeping the
    # result leaves the pandas objects unchanged.
    train_x = tf.convert_to_tensor(X1.iloc[train_index].to_numpy(dtype="float32"), dtype=tf.float32)
    valid_x = tf.convert_to_tensor(X1.iloc[val_index].to_numpy(dtype="float32"), dtype=tf.float32)
    train_y = tf.convert_to_tensor(Y.iloc[train_index].to_numpy(dtype="float32"), dtype=tf.float32)
    valid_y = tf.convert_to_tensor(Y.iloc[val_index].to_numpy(dtype="float32"), dtype=tf.float32)

    #set seed to make reproducible (for random starting weights)
    keras.utils.set_random_seed(14)
    #build model
    network = keras.models.Sequential([
        keras.layers.Dense(1024, activation = 'elu'),
        keras.layers.Dense(2048, activation = "elu"),
        keras.layers.Dense(4096, activation = 'elu'),
        keras.layers.Dense(4096, activation="elu"),
        keras.layers.Dense(4096, activation="elu"),
        keras.layers.Dense(2048, activation="elu"),
        keras.layers.Dense(1, activation = 'linear')
    ])

    network.compile(loss="mean_squared_error",
                    optimizer=keras.optimizers.Adam(learning_rate=0.001),
                    metrics = ["mean_squared_error", "mean_absolute_error"])

    history = network.fit(train_x, train_y,
                          epochs=12, batch_size=128,
                          validation_data=(valid_x, valid_y), verbose = 0)

    # evaluate() returns [loss, *metrics]; keep the loss for this fold
    results = network.evaluate(valid_x, valid_y)
    VALIDATION_LOSS.append(results[0])

    # reset Keras' global state between folds
    tf.keras.backend.clear_session()
  

In [None]:
# Build the two final networks from scratch with the same architecture:
#   network  - later fit on all features
#   network2 - later fit on the features without the punter or punt characteristics
#
# `network` is rebuilt here on purpose. Otherwise the final fit would continue training
# the model left over from the last cross-validation fold, while network2 starts
# untrained, and the two would no longer be comparable.
def build_return_network():
    """Return a freshly initialised, compiled dense network for return yardage.

    Six hidden ELU layers (1024, 2048, 4096, 4096, 4096, 2048 units) feed a single
    linear output; compiled with MSE loss and Adam at a learning rate of 0.001.
    """
    model = keras.models.Sequential([
        keras.layers.Dense(1024, activation = 'elu'),
        keras.layers.Dense(2048, activation = "elu"),
        keras.layers.Dense(4096, activation = 'elu'),
        keras.layers.Dense(4096, activation="elu"),
        keras.layers.Dense(4096, activation="elu"),
        keras.layers.Dense(2048, activation="elu"),
        keras.layers.Dense(1, activation = 'linear')
    ])
    model.compile(loss="mean_squared_error",
                  optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  metrics = ["mean_squared_error", "mean_absolute_error"])
    return model

# Same seed as the cross-validation folds, set before each build for reproducibility.
keras.utils.set_random_seed(14)
network = build_return_network()
keras.utils.set_random_seed(14)
network2 = build_return_network()
  
  

In [None]:
# Fit both networks on every play with identical training settings:
# `network` on the full feature set, `network2` on the reduced one.
fit_options = dict(epochs=12, batch_size=128, verbose=0)
full_model = network.fit(X1, Y, **fit_options)
limited_model = network2.fit(X2, Y, **fit_options)

In [None]:
# Find average punter attributed distance for each punter
# Columns of `final`: punter name, distance coefficient from the punt model, mean
# difference between the two networks' predicted returns, and their sum.
final = pd.DataFrame(columns = ['Name', 'Distance', 'Return', 'total'])
first_punter_col = 127+len(all_returners)
for offset, punter_name in enumerate(all_punters):
    # rows of ready_data whose one-hot column for this punter is set
    mask = np.where(ready_data.iloc[:,first_punter_col+offset] == 1)
    # predicted return for those punts with and without punter/punt information
    full = network.predict(X1.iloc[mask], verbose=0)
    limited = network2.predict(X2.iloc[mask], verbose=0)
    # the difference is taken as the punter's contribution to the expected return
    diff = (limited-full).mean()
    coef_rows = punter_coef[punter_coef['index'] == punter_name]
    try:
        # punt-model coefficient plus the mitigated return distance
        dist = coef_rows.iloc[0,1] + diff
    except IndexError:
        print(punter_name)
        # Catch Lachlan Edwards also going as Lac Edwards
        # NOTE(review): this fallback applies to ANY punter missing from punter_coef,
        # not only that alias — check the names printed above.
        dist = punter_coef[punter_coef['index'] == "Lachlan Edwards"].iloc[0,1] + diff
    # collect all into one dataframe
    final.loc[len(final)] = [punter_name,dist-diff, diff, dist]
    print(punter_name, dist)

In [None]:
#add a combined name column, e.g. "J. Smith"
# n=1 splits on the first space only, so a name with more than two parts still yields
# exactly two columns (the rest stays in 'last') instead of raising on assignment.
final[['first', 'last']] = final['Name'].str.split(' ', n=1, expand=True)
final['short'] = final['first'].str[0] + '. ' + final['last']

Make Graphics

In [None]:
# import plotnine to graph results
from plotnine import ggplot, geom_point, geom_label, aes, labs, theme_grey, xlim, ggsave, geom_col, coord_flip, scale_x_continuous, scale_color_manual, scale_y_continuous, geom_segment,theme, element_text, geom_vline, geom_line, geom_smooth

In [None]:
# Scatter of each punter's impact on gross punt distance against their impact on the
# expected return, labelled with the short name.

# Label-adjustment settings (defined here; not passed to any layer in this cell).
adjust_text_dict = dict(
    force_text=(0.45,0.45),
    force_static=(0.3, 0.3),
    force_pull=(0.4,0.4),
    pull_threshold=5,
    expand_axes=True,
    arrowprops={'arrowstyle': '->', 'color': 'red'},
)

p = (
    ggplot(final, aes("Distance","Return",label = "short", size = 11))
    + geom_point()
    + geom_label(size=16)
    + labs(
        x = "Calculated Average Impact on Gross Punt Distance (Yards)",
        y = "Calculated Average Impact on Return (Yards)",
        title = "Calculated Punter Impact",
    )
    + theme_grey()
    + xlim(25,35.5)
    + theme(
        axis_title=element_text(size=24),
        plot_title=element_text(size=32,face="bold"),
        axis_text=element_text(size=24),
    )
)

In [None]:
# Write the punter-impact scatter plot to disk (14 x 8 in at 1000 dpi).
p.save(filename="filename.png", dpi=1000, height=8, width=14, units='in')

In [None]:
# Display the punter-impact scatter plot inline.
p

In [None]:
#Look at results
# One row per punter: name, distance coefficient, return difference, total, and the
# name columns added above.
final

In [None]:
# Spread (max minus min) across punters, first for the distance component and then
# for the return component.
for component in ('Distance', 'Return'):
    print(final[component].max() - final[component].min())

In [None]:
# First five and last five punters when ordered by ascending 'total'.
ranked = final.sort_values('total')
pd.concat([ranked.iloc[:5], ranked.iloc[len(final)-5:]])

In [None]:
# Lollipop chart of the five lowest and five highest punters by average punter
# attributed distance; the dashed red line separates the two groups.
ranked_by_total = final.sort_values('total')
extremes = pd.concat([ranked_by_total.iloc[:5], ranked_by_total.iloc[len(final)-5:]])

p2 = (
    ggplot(extremes, aes("reorder(Name,total)","total"))
    + geom_point(size = 5)
    + geom_segment(aes(x='Name',xend='Name',y=0, yend='total'), size =3)
    + coord_flip()
    + labs(
        y="Average Yards Added per Punt",
        x= "Punter",
        title= "Average Punter Attributed Distance",
        subtitle = "Top Five vs. Bottom Five",
    )
    + theme(
        axis_title=element_text(size=28),
        plot_title=element_text(size=32,face="bold", hjust=0.5),
        plot_subtitle=element_text(size = 32, hjust=0.5),
        axis_text=element_text(size=24),
    )
    + geom_vline(xintercept=5.5, linetype='dashed', size =2, color = 'red')
    + scale_y_continuous(breaks=[0,5,10,15,20,25,30,35])
)

In [None]:
# Write the top-five / bottom-five chart to disk (14 x 8 in at 1000 dpi).
p2.save(filename="filename2.png", dpi=1000, height=8, width=14, units='in')

In [None]:
# Punt length against the end-zone distance feature, coloured by simplified weather,
# with a loess smoother per weather group.
p3 = (
    ggplot(data, aes(x='endzoneDistance', y = 'kickLength', color = 'weather_simp'))
    + geom_point(size=0.3)
    + geom_smooth(method= 'loess', size = 1.5)
    + theme(
        axis_title=element_text(size=24),
        plot_title=element_text(size=25,face="bold", hjust=1),
        axis_text=element_text(size=18),
        legend_position=(0.01,0.99),
        legend_title=element_text(size=24),
        legend_text=element_text(size=20),
    )
    + labs(
        y="Punt Length (Yards)",
        x= "Distance to End Zone (Yards)",
        title= "Punt Length vs. End Zone Distance",
        color = "Weather",
    )
    + scale_color_manual(labels = ("Clear", "Domed", 'Elements'), values = ("red", 'green', 'blue'))
)

In [None]:
# Write the punt-length vs. end-zone-distance plot to disk (7.5 x 6 in at 750 dpi).
p3.save(filename="filename3.png", dpi=750, height=6, width=7.5, units='in')

In [None]:
# Punt length against hang time with a loess smoother; only punts with a hang time
# above 2 are plotted.
long_hang = data[data['hangTime']>2]
p4 = (
    ggplot(long_hang, aes(x='hangTime', y = 'kickLength'))
    + geom_point(size=0.3)
    + geom_smooth(method= 'loess', size = 1.5, color = 'blue')
    + theme(
        axis_title=element_text(size=24),
        plot_title=element_text(size=25,face="bold"),
        axis_text=element_text(size=18),
    )
    + labs(
        y="Punt Length (Yards)",
        x= "Hang Time (Seconds)",
        title= "Punt Length vs. Hang Time",
    )
)

In [None]:
# Write the punt-length vs. hang-time plot to disk (7.5 x 6 in at 750 dpi).
p4.save(filename="filename4.png", dpi=750, height=6, width=7.5, units='in')