# Model Notebook

This notebook is used only to speed up the data cleaning/transformation and model creation process.

In [1]:
import math
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.animation import FuncAnimation
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Field dimensions used when flipping tracking coordinates
# (presumably yards -- confirm against the data dictionary).
field_height, field_length, endzone_length = 53.3, 120, 10

# Clock lengths in seconds.
# NOTE: despite its name, `seconds_in_a_half` is 15 minutes. The feature cell
# further down adds it to the quarter clock for quarters 1 and 3 and doubles it
# when normalising, so it is effectively used as the length of one quarter.
seconds_in_a_half, seconds_in_overtime = 15 * 60, 10 * 60

In [2]:
# Read the games table and report how long the read took (seconds).
start = time.time()
games = pd.read_csv(filepath_or_buffer='data/games.csv')
elapsed = time.time() - start
print("Done in:", elapsed)

Done in: 0.0047130584716796875


In [3]:
# Read the players table and report how long the read took (seconds).
start = time.time()
players = pd.read_csv(filepath_or_buffer='data/players.csv')
elapsed = time.time() - start
print("Done in:", elapsed)

Done in: 0.011034250259399414


In [4]:
# Read the plays table and report how long the read took (seconds).
start = time.time()
plays = pd.read_csv(filepath_or_buffer='data/plays.csv')
elapsed = time.time() - start
print("Done in:", elapsed)

Done in: 0.0747981071472168


In [5]:
start = time.time()
# Attach the game-level columns to every play, matching on gameId.
plays = pd.merge(plays, games, on='gameId')
print("Done in:", time.time() - start)

# Flag plays where the team in possession is the home team.
plays['is_home_offense'] = plays['possessionTeam'].eq(plays['homeTeamAbbr'])
print("Done in:", time.time() - start)

Done in: 0.026874065399169922
Done in: 0.029808759689331055


In [6]:
# Read the PFF scouting table and report how long the read took (seconds).
start = time.time()
scout = pd.read_csv(filepath_or_buffer='data/PFFScoutingData.csv')
elapsed = time.time() - start
print("Done in:", elapsed)

Done in: 0.04279780387878418


In [7]:
# Load the three per-season tracking files and stack them into one frame.
start = time.time()
tracking_2018 = pd.read_csv('data/tracking2018.csv')
tracking_2019 = pd.read_csv('data/tracking2019.csv')
tracking_2020 = pd.read_csv('data/tracking2020.csv')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported replacement. ignore_index=True gives the stacked frame a fresh
# 0..n-1 index instead of three overlapping per-file ranges, so later
# label-based (.loc) operations never see duplicate row labels.
tracking = pd.concat([tracking_2018, tracking_2019, tracking_2020], ignore_index=True)
print("Done in:", time.time() - start)

Done in: 69.28465628623962


In [8]:
# Standardise tracking coordinates so that every play runs in the same direction,
# then derive polar coordinates (stdR, stdTheta) from the standardised position.
start = time.time()
left_mask = tracking['playDirection'] == 'left'
right_mask = tracking['playDirection'] == 'right'
# Rows whose playDirection is neither 'left' nor 'right' get NaN, as before.
direction_masks = [left_mask, right_mask]

# 'left' plays are mirrored through the centre of the field (both x and y are
# flipped); 'right' plays are kept as they are. np.select works positionally,
# so it does not depend on the row labels of `tracking`.
tracking['stdX'] = np.select(direction_masks, [field_length - tracking['x'], tracking['x']], default=np.nan)
tracking['stdY'] = np.select(direction_masks, [field_height - tracking['y'], tracking['y']], default=np.nan)
# Flipping both axes is a 180-degree rotation, so a direction angle must be
# shifted by 180 degrees. (360 - dir would mirror a single axis only and is
# inconsistent with the x/y transform above.)
tracking['stdDir'] = np.select(direction_masks, [(tracking['dir'] + 180) % 360, tracking['dir']], default=np.nan)

tracking['stdR'] = (tracking['stdX'].pow(2) + tracking['stdY'].pow(2)).pow(1/2)
# arctan2 handles stdX == 0 (no division) and keeps the correct quadrant when
# stdX is negative; for stdX > 0 it equals arctan(stdY / stdX).
tracking['stdTheta'] = np.arctan2(tracking['stdY'], tracking['stdX'])
print("Done in:", time.time() - start)

Done in: 19.19552779197693


In [9]:
num_games = plays['gameId'].unique().size

# --- excitement_score_v1: how uncommon a result is for its play type ---------
start = time.time()
# Rows = specialTeamsResult, columns = specialTeamsPlayType, cells = play counts.
play_results = plays.groupby(['specialTeamsPlayType', 'specialTeamsResult']).size().unstack('specialTeamsPlayType')
# Average count per game (this constant factor cancels in the normalisation below).
play_results = play_results.divide(num_games)
# Share of each result within its play type; every column sums to 1.
play_results_probs = play_results.div(play_results.sum(axis=0), axis=1)
# Score = 1 - share, so results that are rarer for a play type score higher.
excitement_scores = play_results_probs.subtract(1).multiply(-1).unstack().dropna().rename('excitement_score_v1')
plays = plays.join(excitement_scores, on=['specialTeamsPlayType', 'specialTeamsResult'])
print("Done in:", time.time() - start)

# --- excitement_score_v2: v1 blended with closeness of the score -------------
start = time.time()
# Score margin from the point of view of the team in possession.
plays.loc[plays['is_home_offense'], 'score_diff'] = plays['preSnapHomeScore'] - plays['preSnapVisitorScore']
plays.loc[~plays['is_home_offense'], 'score_diff'] = plays['preSnapVisitorScore'] - plays['preSnapHomeScore']
# Map the absolute margin to a closeness score. The last bin is open-ended so
# that margins above 31 points receive the lowest score instead of NaN, which
# would otherwise propagate into excitement_score_v2 / v3.
plays['score_diff_ex'] = pd.cut(plays['score_diff'].abs(),
    bins=[-1, 0, 1, 3, 5, 8, 11, 16, 21, 26, np.inf],
    labels=[.9, 1, .8, .7, .6, .5, .4, .3, .2, .1]
).astype(float)
plays['excitement_score_v2'] = 0.5 * plays['excitement_score_v1'] + 0.5 * plays['score_diff_ex']
print("Done in:", time.time() - start)

# --- clock features -----------------------------------------------------------
start = time.time()
# Split the clock string once; the first two ':'-separated fields are used as
# minutes and seconds.
clock_parts = plays['gameClock'].str.split(':')
plays['gameClockMins'] = clock_parts.str[0]
plays['gameClockSecs'] = clock_parts.str[1]
plays['seconds_left_in_quarter'] = plays['gameClockMins'].astype(int) * 60 + plays['gameClockSecs'].astype(int)
# Quarters 2, 4 and 5 (overtime) end their period, so the quarter clock is the
# time left; quarters 1 and 3 still have one more 15-minute quarter to play
# (`seconds_in_a_half` is 15 * 60 despite its name).
plays.loc[plays['quarter'].isin([2, 4, 5]), 'seconds_left_in_half'] = plays['seconds_left_in_quarter']
plays.loc[plays['quarter'].isin([1, 3]), 'seconds_left_in_half'] = plays['seconds_left_in_quarter'] + seconds_in_a_half
print("Done in:", time.time() - start)

# --- excitement_score_v3: equal blend of result, score, quarter and clock -----
start = time.time()
# Fraction of the half (or of overtime) already played: 0 at the start, 1 at the end.
plays.loc[plays['quarter'] != 5, 'time_left_ex'] = (1 - plays['seconds_left_in_half'] / (seconds_in_a_half * 2))
plays.loc[plays['quarter'] == 5, 'time_left_ex'] = (1 - plays['seconds_left_in_half'] / seconds_in_overtime)
plays['quarter_ex'] = plays['quarter'] / 5
plays['excitement_score_v3'] = 0.25 * (plays['excitement_score_v1'] + plays['score_diff_ex'] + plays['quarter_ex'] + plays['time_left_ex'])
print("Done in:", time.time() - start)

Done in: 0.08814620971679688
Done in: 0.020440101623535156
Done in: 0.08430886268615723
Done in: 0.00700688362121582


In [10]:
# Keep kickoff plays only; the fresh 0..n-1 index lets the one-hot frame below
# be concatenated column-wise without misalignment.
kickoffs = plays[plays['specialTeamsPlayType']=='Kickoff'].reset_index(drop=True)

# One-hot encode the play result; these columns are the model targets.
start = time.time()
# The `sparse=` argument and `get_feature_names()` were removed from recent
# scikit-learn releases. The default (sparse) output plus `.toarray()` and the
# fitted `categories_` attribute behave the same across versions, and column
# names no longer depend on splitting generated feature names on '_'.
encoder = OneHotEncoder(handle_unknown='ignore')
transformed = encoder.fit_transform(kickoffs[['specialTeamsResult']]).toarray()
output_cols = [str(category) for category in encoder.categories_[0]]
ohe_df = pd.DataFrame(transformed, columns=output_cols)
kickoffs = pd.concat([kickoffs, ohe_df], axis=1)
print("Done in:", time.time() - start)

start = time.time()
# Pre-play excitement: mean of the score, quarter and clock components.
kickoffs['initial_excitement'] = 1/3 * (kickoffs['score_diff_ex'] + kickoffs['quarter_ex'] + kickoffs['time_left_ex'])
# One row per tracked entity per frame of every kickoff play.
kickoff_tracking = kickoffs.merge(tracking, on=['gameId', 'playId'])
print("Done in:", time.time() - start)

start = time.time()
# A row belongs to the possession side when "home team has possession" and
# "row is tagged home" agree (both true or both false).
# NOTE(review): every `team` value other than 'home' is treated as the visiting
# side, so any non-team rows (e.g. the ball) are counted with the visitors --
# confirm this is intended.
kickoff_tracking['is_off'] = kickoff_tracking['is_home_offense'] == (kickoff_tracking['team'] == 'home')
print("Done in:", time.time() - start)

Done in: 0.012713909149169922
Done in: 101.9402801990509
Done in: 1.3968241214752197


In [11]:
# kickoff_tracking = kickoff_tracking[(kickoff_tracking['gameId']==2018090600) & (kickoff_tracking['playId'].isin([37, 1387]))]
# kickoff_tracking.head()

In [12]:
# List every column of the merged frame, then preview its first five rows.
print(kickoff_tracking.columns)
kickoff_tracking.head(5)

Index(['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo',
       'possessionTeam', 'specialTeamsPlayType', 'specialTeamsResult',
       'kickerId', 'returnerId', 'kickBlockerId', 'yardlineSide',
       'yardlineNumber', 'gameClock', 'penaltyCodes', 'penaltyJerseyNumbers',
       'penaltyYards', 'preSnapHomeScore', 'preSnapVisitorScore', 'passResult',
       'kickLength', 'kickReturnYardage', 'playResult',
       'absoluteYardlineNumber', 'season', 'week', 'gameDate',
       'gameTimeEastern', 'homeTeamAbbr', 'visitorTeamAbbr', 'is_home_offense',
       'excitement_score_v1', 'score_diff', 'score_diff_ex',
       'excitement_score_v2', 'gameClockMins', 'gameClockSecs',
       'seconds_left_in_quarter', 'seconds_left_in_half', 'time_left_ex',
       'quarter_ex', 'excitement_score_v3', 'Downed', 'Fair Catch',
       'Kickoff Team Recovery', 'Muffed', 'Out of Bounds', 'Return',
       'Touchback', 'initial_excitement', 'time', 'x', 'y', 's', 'a', 'dis',
       'o', 'di

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,specialTeamsPlayType,specialTeamsResult,kickerId,...,position,team,frameId,playDirection,stdX,stdY,stdDir,stdR,stdTheta,is_off
0,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,1,right,43.76,8.1,53.28,44.503344,0.183029,True
1,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,2,right,43.77,8.1,73.05,44.513177,0.182988,True
2,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,3,right,43.78,8.1,70.39,44.52301,0.182947,True
3,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,4,right,43.8,8.11,71.53,44.544496,0.183086,True
4,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,5,right,43.84,8.12,75.17,44.585648,0.183143,True


The only features we're interested in right now are the locations of the players and the ball, so we'll select those columns and scale them to a common range.

In [13]:
# Polar position columns that are fed to the model.
features = ['stdR', 'stdTheta']
sc = MinMaxScaler()

# Fit the scaler on the feature columns and overwrite them with the scaled values.
start = time.time()
kickoff_tracking[features] = sc.fit(kickoff_tracking[features]).transform(kickoff_tracking[features])
print("Done in:", time.time() - start)

Done in: 29.97202777862549


# Need to renumber the player IDs within each frame (0, 1, 2, ...) instead of using raw `nflId`s, to save space (and time), since this step is failing!!
vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv


In [14]:
# start = time.time()
# df = kickoff_tracking.set_index(['gameId', 'playId', 'frameId', 'is_off', 'nflId']).sort_index(level=['is_off', 'frameId', 'nflId'])[features].unstack(['is_off', 'nflId']).reset_index(['gameId', 'playId', 'frameId'])
# print("Done in:", time.time() - start)

# df.head()

In [15]:
# Reshape the long tracking table (one row per entity per frame) into a wide
# table with one row per (gameId, playId, frameId).
start = time.time()
# Index by play, frame, side and player, sort on all of those levels, then drop
# nflId: only the sorted order is needed from here on.
df = kickoff_tracking.set_index(['gameId', 'playId', 'frameId', 'is_off', 'nflId']).sort_index(level=['gameId', 'playId','frameId', 'is_off', 'nflId']).droplevel('nflId')
# Number the rows of each frame 0, 1, 2, ... in that sorted order and append the
# number as a new index level `playerNum`.
df = df.set_index(df.groupby(['gameId', 'playId', 'frameId']).cumcount().rename('playerNum'), append=True)
# Spread the feature values across columns keyed by (feature, is_off, playerNum).
# The numbering runs over the whole frame rather than per side, so the same
# playerNum can appear under either is_off value; combinations absent from a
# frame are left as NaN (visible for playerNum 11 in the output below).
df = df[features].unstack(['is_off', 'playerNum']).reset_index(['gameId', 'playId', 'frameId'])
print("Done in:", time.time() - start)

df.tail(50)

Done in: 160.2979609966278


Unnamed: 0_level_0,gameId,playId,frameId,stdR,stdR,stdR,stdR,stdR,stdR,stdR,...,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta,stdTheta
is_off,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
playerNum,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,11
618048,2021010315,3886,54,0.599991,0.709529,0.618113,0.639264,0.623358,0.593914,0.73325,...,0.479715,0.470085,0.555592,0.540663,0.45024,0.42944,0.431319,0.510823,0.406655,0.453767
618049,2021010315,3886,55,0.603289,0.710428,0.620317,0.64134,0.625751,0.596186,0.733153,...,0.478161,0.468194,0.552839,0.53801,0.449238,0.428312,0.431425,0.508898,0.406375,0.45351
618050,2021010315,3886,56,0.606636,0.711311,0.622393,0.643319,0.628014,0.598489,0.733002,...,0.476648,0.466229,0.54997,0.535431,0.448221,0.427117,0.431389,0.506983,0.406036,0.453273
618051,2021010315,3886,57,0.609894,0.71218,0.624347,0.645178,0.630123,0.60085,0.73285,...,0.475145,0.464371,0.546929,0.532879,0.447293,0.425961,0.431293,0.505103,0.405718,0.453057
618052,2021010315,3886,58,0.613153,0.713079,0.626272,0.646919,0.632144,0.603178,0.732737,...,0.473525,0.462525,0.543822,0.530397,0.446329,0.424773,0.431296,0.503185,0.405356,0.452885
618053,2021010315,3886,59,0.616302,0.713986,0.628071,0.648562,0.634035,0.605538,0.732651,...,0.471943,0.460586,0.540624,0.527976,0.445451,0.423569,0.431258,0.501276,0.40504,0.452685
618054,2021010315,4182,1,0.347062,0.382382,0.471346,0.379773,0.408878,0.53148,0.628187,...,0.653542,0.620223,0.393956,0.454105,0.501412,0.546763,0.59145,0.482256,0.351263,
618055,2021010315,4182,2,0.347388,0.382463,0.471364,0.379857,0.408762,0.531424,0.62814,...,0.653702,0.620223,0.393956,0.454105,0.501325,0.546321,0.59145,0.482217,0.351263,
618056,2021010315,4182,3,0.347799,0.382487,0.471364,0.37987,0.40887,0.531467,0.628052,...,0.653862,0.620215,0.393956,0.454199,0.50128,0.545832,0.591467,0.482038,0.351353,
618057,2021010315,4182,4,0.34827,0.382498,0.471364,0.379949,0.408978,0.531411,0.627937,...,0.653915,0.620215,0.393956,0.454199,0.501362,0.545374,0.591586,0.481949,0.351353,


Then we'll convert the DataFrame to a 2D array and remove all the `nan` values, since those are for players that are not in the play. The first column is the **gameId**, the second is the **playId**, and the third is the **frameId**. Keeping these identifier columns will probably be useful once we implement a recurrent neural net (which can remember previous inputs), but that's probably pretty far down the road.

In [16]:
# Drop the NaN placeholders from every row of the wide table and pack what is
# left into a dense 2-D array (one row per frame).
start = time.time()
frame_matrix = df.values
present = ~np.isnan(frame_matrix)
kept_per_row = present.sum(axis=1)
# A rectangular result needs the same number of surviving values in every row.
# Fail loudly instead of silently building a ragged object array (or raising a
# NumPy-version-dependent error).
if kept_per_row.size and (kept_per_row != kept_per_row[0]).any():
    raise ValueError(
        "Rows have differing numbers of non-NaN values "
        f"(min={kept_per_row.min()}, max={kept_per_row.max()}); cannot build a 2-D array"
    )
row_width = int(kept_per_row[0]) if kept_per_row.size else 0
# Boolean indexing walks the matrix in row-major order, so each row keeps its
# values in their original left-to-right order after the reshape.
values = frame_matrix[present].reshape(len(frame_matrix), row_width)
# Release the large temporaries; only `values` is needed afterwards.
del frame_matrix, present
print("Done in:", time.time() - start)
values.shape

Done in: 2.057347059249878


(618098, 49)

In [17]:
# Build the target matrix: the one-hot result columns, one row per frame.
start = time.time()
frame_keys = ['gameId', 'playId', 'frameId']
# Each one-hot column is constant within a play, so the first value per frame
# group is taken.
output = kickoff_tracking.groupby(frame_keys)[output_cols].first().to_numpy()
print("Done in:", time.time() - start)
output.shape

Done in: 2.4998631477355957


(618098, 7)