# Model Notebook

This notebook is only used to help speed up the data cleaning/transformation and model creation process.

In [1]:
import math
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from matplotlib.animation import FuncAnimation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Field extent along the tracking `y` axis; used below to mirror `y` for left-moving plays.
field_height = 53.3
# Field extent along the tracking `x` axis; used below to mirror `x` for left-moving plays.
field_length = 120
# Depth of one end zone (same units as the field dimensions above).
endzone_length = 10
# 15 minutes expressed in seconds.
# NOTE(review): despite the name, later cells add this value as a one-quarter
# offset and use `seconds_in_a_half * 2` as the full length of a half, so it
# behaves as "seconds in a quarter". Consider renaming.
seconds_in_a_half = 15 * 60
# 10 minutes expressed in seconds; used as the length of quarter 5.
seconds_in_overtime = 10 * 60

In [2]:
# Read data/games.csv into `games` and print the wall-clock seconds the read took.
start = time.time()
games = pd.read_csv('data/games.csv')
print("Done in:", time.time() - start)

Done in: 0.007201194763183594


In [3]:
# Read data/players.csv into `players` and print the wall-clock seconds the read took.
start = time.time()
players = pd.read_csv('data/players.csv')
print("Done in:", time.time() - start)

Done in: 0.009339094161987305


In [4]:
# Read data/plays.csv into `plays` and print the wall-clock seconds the read took.
start = time.time()
plays = pd.read_csv('data/plays.csv')
print("Done in:", time.time() - start)

Done in: 0.07805180549621582


In [5]:
# Attach the columns of `games` to every play through the shared `gameId` key.
# NOTE: this cell rebinds `plays`, so it must be run exactly once after the
# cell that loads plays.csv.
start = time.time()
plays = pd.merge(plays, games, on='gameId')
print("Done in:", time.time() - start)

# True where the team in possession is the home team. The timer is not
# restarted, so the second figure printed is cumulative for the whole cell.
plays['is_home_offense'] = plays['possessionTeam'].eq(plays['homeTeamAbbr'])
print("Done in:", time.time() - start)

Done in: 0.03026294708251953
Done in: 0.03308892250061035


In [6]:
# Read data/PFFScoutingData.csv into `scout` and print the wall-clock seconds the read took.
start = time.time()
scout = pd.read_csv('data/PFFScoutingData.csv')
print("Done in:", time.time() - start)

Done in: 0.04794907569885254


In [7]:
# Load the three per-season tracking files and stack them into one frame.
#
# `pd.concat` replaces `DataFrame.append`, which was deprecated in pandas 1.4
# and removed in pandas 2.0. `ignore_index=True` gives the combined frame a
# fresh 0..n-1 index instead of three overlapping per-file ranges, so later
# label-based operations on `tracking` never encounter duplicate index labels.
start = time.time()
tracking_2018 = pd.read_csv('data/tracking2018.csv')
tracking_2019 = pd.read_csv('data/tracking2019.csv')
tracking_2020 = pd.read_csv('data/tracking2020.csv')
tracking = pd.concat([tracking_2018, tracking_2019, tracking_2020], ignore_index=True)
print("Done in:", time.time() - start)

Done in: 73.86847996711731


In [8]:
# Standardise tracking coordinates so every play is expressed as if it moved
# to the right, then derive polar coordinates (stdR, stdTheta) from them.
# Rows whose `playDirection` is neither 'left' nor 'right' keep NaN in the
# std* columns.
start = time.time()
left_mask = tracking['playDirection'] == 'left'
right_mask = tracking['playDirection'] == 'right'

# Left-moving plays: mirror both axes, i.e. rotate the field by 180 degrees.
tracking.loc[left_mask, 'stdX'] = field_length - tracking.loc[left_mask, 'x']
tracking.loc[left_mask, 'stdY'] = field_height - tracking.loc[left_mask, 'y']
# Because x AND y are both flipped, the heading must rotate by 180 degrees as
# well. The previous `(360 - dir) % 360` only mirrored the heading about its
# reference axis, which is inconsistent with the coordinate transform above.
tracking.loc[left_mask, 'stdDir'] = (tracking.loc[left_mask, 'dir'] + 180) % 360

# Right-moving plays are already in the standard orientation.
tracking.loc[right_mask, 'stdX'] = tracking.loc[right_mask, 'x']
tracking.loc[right_mask, 'stdY'] = tracking.loc[right_mask, 'y']
tracking.loc[right_mask, 'stdDir'] = tracking.loc[right_mask, 'dir']

# Polar form. Plain arrays are used so the computation is purely positional.
# `np.arctan2` replaces `np.arctan(y / x)`: it does not divide by zero when
# stdX == 0 and it keeps the correct quadrant if a coordinate is negative.
std_x = tracking['stdX'].to_numpy()
std_y = tracking['stdY'].to_numpy()
tracking['stdR'] = np.hypot(std_x, std_y)
tracking['stdTheta'] = np.arctan2(std_y, std_x)

# Rows without a player id get the sentinel -1 so they survive later
# index-based sorting and grouping.
tracking['nflId'] = tracking['nflId'].fillna(-1)
print("Done in:", time.time() - start)

Done in: 21.714648246765137


In [9]:
# Build three versions of a per-play "excitement score" on `plays`.
# Every step below adds columns to `plays` in place and later steps read the
# columns added by earlier ones, so the statements must run in this order.

# Number of distinct games present in `plays`.
num_games = plays['gameId'].unique().size

start = time.time()
# Count plays per (play type, result) and pivot so each column is a play type.
play_results = plays.groupby(['specialTeamsPlayType', 'specialTeamsResult']).size().unstack('specialTeamsPlayType')
# Per-game rate. This scaling cancels out in the column normalisation below.
play_results = play_results.divide(num_games)
# Normalise each play-type column so it sums to 1 (share of each result within that play type).
play_results_probs = play_results.div(play_results.sum(axis=0), axis=1)
# v1 = 1 - share, so rarer results score higher; reshaped to a Series keyed by (play type, result).
excitement_scores = play_results_probs.subtract(1).multiply(-1).unstack().dropna().rename('excitement_score_v1')
plays = plays.join(excitement_scores, on=['specialTeamsPlayType', 'specialTeamsResult'])
print("Done in:", time.time() - start)

start = time.time()
# Pre-snap score difference from the possession team's point of view.
plays.loc[plays['is_home_offense'], 'score_diff'] = plays['preSnapHomeScore'] - plays['preSnapVisitorScore']
plays.loc[~plays['is_home_offense'], 'score_diff'] = plays['preSnapVisitorScore'] - plays['preSnapHomeScore']
# Map the absolute difference onto a score: (-1, 0] -> .9, (0, 1] -> 1, (1, 3] -> .8, ... (26, 31] -> .1.
# NOTE(review): differences above 31 fall outside the bins, so `pd.cut` yields
# NaN for them and that NaN propagates into excitement_score_v2 / v3.
plays['score_diff_ex'] = pd.cut(plays['score_diff'].abs(), 
    bins=[-1, 0, 1, 3, 5, 8, 11, 16, 21, 26, 31], 
    labels=[.9, 1, .8, .7, .6, .5, .4, .3, .2, .1]
).astype(float)
# v2 = equal-weight blend of the result rarity and the score-closeness terms.
plays['excitement_score_v2'] = 0.5 * plays['excitement_score_v1'] + 0.5 * plays['score_diff_ex']
print("Done in:", time.time() - start)

start = time.time()
# `gameClock` is split on ':'; field 0 is taken as minutes and field 1 as seconds.
plays['gameClockMins'] = plays['gameClock'].str.split(':').str[0]
plays['gameClockSecs'] = plays['gameClock'].str.split(':').str[1]
plays['seconds_left_in_quarter'] = plays['gameClockMins'].astype(int) * 60 + plays['gameClockSecs'].astype(int)
# Quarters 2, 4 and 5: time left in the half equals time left in the quarter.
plays.loc[plays['quarter'].isin([2, 4, 5]), 'seconds_left_in_half'] = plays['seconds_left_in_quarter']
# Quarters 1 and 3: add `seconds_in_a_half` (15 * 60), used here as one further quarter still to play.
plays.loc[plays['quarter'].isin([1, 3]), 'seconds_left_in_half'] = plays['seconds_left_in_quarter'] + seconds_in_a_half
print("Done in:", time.time() - start)

start = time.time()
# Fraction of the half already played; quarter 5 is measured against `seconds_in_overtime` instead.
plays.loc[plays['quarter'] != 5, 'time_left_ex'] = (1 - plays['seconds_left_in_half'] / (seconds_in_a_half * 2))
plays.loc[plays['quarter'] == 5, 'time_left_ex'] = (1 - plays['seconds_left_in_half'] / seconds_in_overtime)
# Later quarters score higher: quarter / 5.
plays['quarter_ex'] = plays['quarter'] / 5
# v3 = unweighted mean of the four component scores.
plays['excitement_score_v3'] = 0.25 * (plays['excitement_score_v1'] + plays['score_diff_ex'] + plays['quarter_ex'] + plays['time_left_ex'])
print("Done in:", time.time() - start)

Done in: 0.0620427131652832
Done in: 0.012898921966552734
Done in: 0.14750099182128906
Done in: 0.007073879241943359


In [10]:
# Keep only kickoff plays. The index is reset so it lines up with the default
# RangeIndex of the one-hot frame that is concatenated below.
kickoffs = plays[plays['specialTeamsPlayType']=='Kickoff'].reset_index(drop=True)

start = time.time()
# One-hot encode the kickoff result; these columns become the model targets.
# The dense array is obtained with `.toarray()` rather than the `sparse=`
# constructor argument (renamed to `sparse_output` in newer scikit-learn), and
# the column names come from `categories_` rather than `get_feature_names()`
# (removed in newer scikit-learn). Reading `categories_` also keeps result
# labels intact if they contain an underscore, which the previous
# `split('_')[1]` would have truncated.
encoder = OneHotEncoder(handle_unknown='ignore')
transformed = encoder.fit_transform(kickoffs[['specialTeamsResult']]).toarray()
output_cols = [str(category) for category in encoder.categories_[0]]
ohe_df = pd.DataFrame(transformed, columns=output_cols)
kickoffs = pd.concat([kickoffs, ohe_df], axis=1)
print("Done in:", time.time() - start)

start = time.time()
# Excitement known before the play's result: mean of the three context scores.
kickoffs['initial_excitement'] = 1/3 * (kickoffs['score_diff_ex'] + kickoffs['quarter_ex'] + kickoffs['time_left_ex'])
# One row per tracked entity per frame of every kickoff play.
kickoff_tracking = kickoffs.merge(tracking, on=['gameId', 'playId'])
print("Done in:", time.time() - start)

start = time.time()
# A row belongs to the possession side when its `team` matches the side in
# possession; rows for the football are grouped with the possession side too.
kickoff_tracking['is_off'] = (
    (kickoff_tracking['is_home_offense'] & (kickoff_tracking['team'] == 'home'))
    | (~kickoff_tracking['is_home_offense'] & (kickoff_tracking['team'] == 'away'))
    | (kickoff_tracking['team'] == 'football')
)
print("Done in:", time.time() - start)

Done in: 0.009643077850341797
Done in: 112.90142703056335
Done in: 2.022387981414795


In [11]:
# kickoff_tracking = kickoff_tracking[(kickoff_tracking['gameId']==2018090600) & (kickoff_tracking['playId'].isin([37, 677]))]
# kickoff_tracking.head()

In [12]:
# Preview the first rows of the merged kickoff/tracking frame.
kickoff_tracking.head()

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,specialTeamsPlayType,specialTeamsResult,kickerId,...,position,team,frameId,playDirection,stdX,stdY,stdDir,stdR,stdTheta,is_off
0,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,1,right,43.76,8.1,53.28,44.503344,0.183029,True
1,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,2,right,43.77,8.1,73.05,44.513177,0.182988,True
2,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,3,right,43.78,8.1,70.39,44.52301,0.182947,True
3,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,4,right,43.8,8.11,71.53,44.544496,0.183086,True
4,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,5,right,43.84,8.12,75.17,44.585648,0.183143,True


The only features we're interested in right now are the locations of the players and the ball, so we'll select those columns and standardize (scale) them.

In [13]:
# Model inputs: the polar position (radius, angle) of every tracked entity.
features = ['stdR', 'stdTheta']

sc = MinMaxScaler()

# Fit the scaler on the whole frame and overwrite the feature columns in place
# with their rescaled values.
# NOTE(review): the scaler sees every row, including rows that later land in
# cross-validation test folds.
start = time.time()
kickoff_tracking[features] = sc.fit(kickoff_tracking[features]).transform(kickoff_tracking[features])
print("Done in:", time.time() - start)

Done in: 37.24310326576233


In [14]:
# Reshape the long tracking frame (one row per entity per frame) into a wide
# frame with one row per (gameId, playId, frameId) and one column per
# (feature, is_off, playerNum).
start = time.time()
sort_cols = ['gameId', 'playId', 'frameId', 'is_off', 'nflId']
# Index by the sort keys and sort, so rows inside each frame are ordered by
# is_off and then nflId; nflId is then dropped because only the order matters.
df = kickoff_tracking[[*sort_cols, *features]].set_index(sort_cols).sort_index(level=sort_cols).droplevel('nflId')
# Number the rows inside each frame 0, 1, 2, ... and append that as the `playerNum` index level.
df = df.set_index(df.groupby(['gameId', 'playId', 'frameId']).cumcount().rename('playerNum'), append=True)
# Move is_off / playerNum into the columns; the three id levels become ordinary columns again.
df = df[features].unstack(['is_off', 'playerNum']).reset_index(['gameId', 'playId', 'frameId'])
print("Done in:", time.time() - start)

df.head()

Done in: 24.175102949142456


Unnamed: 0_level_0,gameId,playId,frameId,stdR,stdR,stdR,stdR,stdR,stdR,stdR,...,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta
is_off,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
playerNum,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,0,1,2,3,4,5,6,...,13,14,15,16,17,18,19,20,21,22
0,2018090600,37,1,0.553096,0.522896,0.480649,0.660584,0.683967,0.480869,0.439862,...,0.329865,0.68,0.468614,0.611685,0.508762,0.586472,0.583543,0.426882,0.63134,0.658324
1,2018090600,37,2,0.553035,0.522996,0.480588,0.660573,0.683939,0.480799,0.439796,...,0.329646,0.68,0.468796,0.611403,0.508847,0.583207,0.58351,0.42698,0.631282,0.658429
2,2018090600,37,3,0.553071,0.523095,0.480588,0.660503,0.683939,0.480812,0.439769,...,0.329533,0.679953,0.468796,0.611184,0.508931,0.579726,0.58334,0.427077,0.631229,0.658587
3,2018090600,37,4,0.553071,0.523151,0.480611,0.660573,0.68391,0.480812,0.439769,...,0.329524,0.6799,0.468886,0.611194,0.509016,0.576015,0.58329,0.427145,0.631176,0.658798
4,2018090600,37,5,0.553071,0.523306,0.480671,0.660584,0.68391,0.480743,0.439769,...,0.329515,0.6799,0.469105,0.611268,0.5091,0.572099,0.583,0.427378,0.631069,0.658956


Then we'll convert the DataFrame to a 2D array and remove all the `nan` values, since those are for players that are not in the play. The first column is the **gameId**, second is **playId**, and third is **frameId**. This will probably be useful once we implement a recurrent neural net (which can remember previous input), but that's probably pretty far down the road.

In [15]:
# Turn the wide frame into a NumPy matrix, removing the NaN cells of each row.
# The result is only a regular 2-D array if every row keeps the same number of
# non-NaN values; the printed shape confirms that.
start = time.time()
x = np.array([frame_row[np.logical_not(np.isnan(frame_row))] for frame_row in df.values])
print("Done in:", time.time() - start)
print(x.shape)
print(x[0])

Done in: 2.2330360412597656
(618098, 49)
[2.01809060e+09 3.70000000e+01 1.00000000e+00 5.53095559e-01
 5.22896344e-01 4.80648535e-01 6.60584415e-01 6.83967252e-01
 4.80868708e-01 4.39861895e-01 5.16338284e-01 7.95061811e-01
 4.08953143e-01 4.21182966e-01 3.56200050e-01 2.99571912e-01
 2.96243873e-01 4.48049165e-01 3.19173744e-01 3.86027982e-01
 3.30171198e-01 3.44803820e-01 3.67992180e-01 3.05836510e-01
 3.99778786e-01 4.26450259e-01 5.37624925e-01 6.02738911e-01
 5.48160466e-01 3.67949691e-01 4.81729350e-01 3.75553518e-01
 4.70716375e-01 4.74209721e-01 4.00768151e-01 3.56011607e-01
 4.11065449e-01 5.38590477e-01 3.74742907e-01 3.29865065e-01
 6.79999898e-01 4.68614001e-01 6.11685385e-01 5.08762379e-01
 5.86472143e-01 5.83542745e-01 4.26881952e-01 6.31340218e-01
 6.58323573e-01]


The input is only 46 values; the first 3 values in each row of `x` are, in order, `gameId`, `playId`, and `frameId`. We'll drop these for now, but we may want them later if we need them for an RNN.

In [16]:
# Drop the three leading id columns (gameId, playId, frameId) so only the
# position features remain. `x` is a regular 2-D array at this point, so a
# single column slice replaces the previous per-row Python loop.
# NOTE: this cell rebinds `x` from itself and is therefore not idempotent;
# re-running it without re-running the previous cell removes three more columns.
x = x[:, 3:]

In [17]:
# Build the label matrix: one row per (gameId, playId, frameId), taking the
# one-hot result columns from the first row of each frame group. Groups come
# back sorted by the grouping keys.
start = time.time()
y = (
    kickoff_tracking
    .groupby(by=['gameId', 'playId', 'frameId'])[output_cols]
    .first()
    .to_numpy()
)
print("Done in:", time.time() - start)
print(y.shape)
print(y[0])

Done in: 2.333343982696533
(618098, 7)
[0. 0. 0. 0. 0. 0. 1.]


In [18]:
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=1)

In [19]:
# Layer widths for the baseline network, derived from the data matrices:
# one input per feature column, twice as many hidden units, one output per
# one-hot result class.
num_input_nodes = x.shape[1]
num_hidden_nodes = 2 * num_input_nodes
num_output_nodes = y.shape[1]

def baseline_model():
    """Build and compile the baseline classifier.

    The network has a single ReLU hidden layer followed by a softmax output
    layer. Layer widths are read from the module-level `num_input_nodes`,
    `num_hidden_nodes` and `num_output_nodes`.

    Returns:
        A compiled `Sequential` model using categorical cross-entropy loss,
        the Adam optimizer and the accuracy metric.
    """
    layers = [
        Dense(num_hidden_nodes, input_dim=num_input_nodes, activation='relu'),
        Dense(num_output_nodes, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return network

In [20]:
# 10-fold cross-validation of the baseline network.
# `random_state` pins the shuffled fold assignment so that fold membership is
# the same on every run (network weight initialisation is still random).
# NOTE(review): folds are drawn per frame, so frames of the same play can land
# in both the training and the test fold; a grouped split by play would avoid
# that.
# NOTE(review): epochs=200 with batch_size=5 on the full frame matrix, repeated
# for 10 folds, is a very long run -- confirm these values are intended.
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
results = cross_val_score(estimator, x, y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

NameError: name 'KerasClassifier' is not defined