# Model Notebook

This notebook is only used to help speed up data cleaning/transformations and model creation process.

In [135]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.animation import FuncAnimation
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

field_height = 53.3
field_length = 120
endzone_length = 10
seconds_in_a_half = 15 * 60
seconds_in_overtime = 10 * 60

In [3]:
games = pd.read_csv('data/games.csv')

In [4]:
players = pd.read_csv('data/players.csv')

In [5]:
plays = pd.read_csv('data/plays.csv')

In [6]:
plays = plays.merge(games, on='gameId')
plays['is_home_offense'] = plays['possessionTeam'] == plays['homeTeamAbbr']

In [7]:
scout = pd.read_csv('data/PFFScoutingData.csv')

In [8]:
tracking_2018 = pd.read_csv('data/tracking2018.csv')
tracking_2019 = pd.read_csv('data/tracking2019.csv')
tracking_2020 = pd.read_csv('data/tracking2020.csv')
tracking = tracking_2018.append([tracking_2019, tracking_2020])

left_mask = tracking['playDirection'] == 'left'
right_mask = tracking['playDirection'] == 'right'
tracking.loc[left_mask, 'stdX'] = 120 - tracking.loc[left_mask, 'x']
tracking.loc[left_mask, 'stdY'] = field_height - tracking.loc[left_mask, 'y']
tracking.loc[left_mask, 'stdDir'] = (360 - tracking.loc[left_mask, 'dir']) % 360
tracking.loc[right_mask, 'stdX'] = tracking.loc[right_mask, 'x']
tracking.loc[right_mask, 'stdY'] = tracking.loc[right_mask, 'y']
tracking.loc[right_mask, 'stdDir'] = tracking.loc[right_mask, 'dir']
tracking['stdR'] = (tracking['stdX'].pow(2) + tracking['stdY'].pow(2)).pow(1/2)
tracking['stdTheta'] = np.arctan(tracking['stdY']/tracking['stdX'])

In [9]:
num_games = plays['gameId'].unique().size

play_results = plays.groupby(['specialTeamsPlayType', 'specialTeamsResult']).size().unstack('specialTeamsPlayType')
play_results = play_results.divide(num_games)
play_results_probs = play_results.div(play_results.sum(axis=0), axis=1)
excitement_scores = play_results_probs.subtract(1).multiply(-1).unstack().dropna().rename('excitement_score_v1')
plays = plays.join(excitement_scores, on=['specialTeamsPlayType', 'specialTeamsResult'])

plays.loc[plays['is_home_offense'], 'score_diff'] = plays['preSnapHomeScore'] - plays['preSnapVisitorScore']
plays.loc[~plays['is_home_offense'], 'score_diff'] = plays['preSnapVisitorScore'] - plays['preSnapHomeScore']
plays['score_diff_ex'] = pd.cut(plays['score_diff'].abs(), 
    bins=[-1, 0, 1, 3, 5, 8, 11, 16, 21, 26, 31], 
    labels=[.9, 1, .8, .7, .6, .5, .4, .3, .2, .1]
).astype(float)
plays['excitement_score_v2'] = 0.5 * plays['excitement_score_v1'] + 0.5 * plays['score_diff_ex']

plays['gameClockMins'] = plays['gameClock'].str.split(':').str[0]
plays['gameClockSecs'] = plays['gameClock'].str.split(':').str[1]
plays['seconds_left_in_quarter'] = plays['gameClockMins'].astype(int) * 60 + plays['gameClockSecs'].astype(int)
plays.loc[plays['quarter'].isin([2, 4, 5]), 'seconds_left_in_half'] = plays['seconds_left_in_quarter']
plays.loc[plays['quarter'].isin([1, 3]), 'seconds_left_in_half'] = plays['seconds_left_in_quarter'] + seconds_in_a_half

plays.loc[plays['quarter'] != 5, 'time_left_ex'] = (1 - plays['seconds_left_in_half'] / (seconds_in_a_half * 2))
plays.loc[plays['quarter'] == 5, 'time_left_ex'] = (1 - plays['seconds_left_in_half'] / seconds_in_overtime)
plays['quarter_ex'] = plays['quarter'] / 5
plays['excitement_score_v3'] = 0.25 * (plays['excitement_score_v1'] + plays['score_diff_ex'] + plays['quarter_ex'] + plays['time_left_ex'])

In [160]:
kickoffs = plays[plays['specialTeamsPlayType']=='Kickoff'].reset_index(drop=True)

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
transformed = encoder.fit_transform(kickoffs[['specialTeamsResult']])
output_cols = [x.split('_')[1] for x in encoder.get_feature_names()]
ohe_df = pd.DataFrame(transformed, columns=output_cols)
kickoffs = pd.concat([kickoffs, ohe_df], axis=1)

kickoffs['initial_excitement'] = 1/3 * (kickoffs['score_diff_ex'] + kickoffs['quarter_ex'] + kickoffs['time_left_ex'])
kickoff_tracking = kickoffs.merge(tracking, on=['gameId', 'playId'])

kickoff_tracking['is_off'] = (kickoff_tracking['is_home_offense'] & (kickoff_tracking['team'] == 'home')) | (~kickoff_tracking['is_home_offense'] & (kickoff_tracking['team'] != 'home'))

NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [145]:
single_play = kickoff_tracking[(kickoff_tracking['gameId'].isin([2018090600, 2018090600])) & (kickoff_tracking['playId'].isin([37, 1387]))]
single_play.head()

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,specialTeamsPlayType,specialTeamsResult,kickerId,...,position,team,frameId,playDirection,stdX,stdY,stdDir,stdR,stdTheta,is_off
0,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,1,right,43.76,8.1,53.28,44.503344,0.183029,True
1,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,2,right,43.77,8.1,73.05,44.513177,0.182988,True
2,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,3,right,43.78,8.1,70.39,44.52301,0.182947,True
3,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,4,right,43.8,8.11,71.53,44.544496,0.183086,True
4,2018090600,37,J.Elliott kicks 65 yards from PHI 35 to end zo...,1,0,0,PHI,Kickoff,Touchback,44966.0,...,FS,home,5,right,43.84,8.12,75.17,44.585648,0.183143,True


In [13]:
single_play.columns

Index(['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo',
       'possessionTeam', 'specialTeamsPlayType', 'specialTeamsResult',
       'kickerId', 'returnerId', 'kickBlockerId', 'yardlineSide',
       'yardlineNumber', 'gameClock', 'penaltyCodes', 'penaltyJerseyNumbers',
       'penaltyYards', 'preSnapHomeScore', 'preSnapVisitorScore', 'passResult',
       'kickLength', 'kickReturnYardage', 'playResult',
       'absoluteYardlineNumber', 'season', 'week', 'gameDate',
       'gameTimeEastern', 'homeTeamAbbr', 'visitorTeamAbbr', 'is_home_offense',
       'excitement_score_v1', 'score_diff', 'score_diff_ex',
       'excitement_score_v2', 'gameClockMins', 'gameClockSecs',
       'seconds_left_in_quarter', 'seconds_left_in_half', 'time_left_ex',
       'quarter_ex', 'excitement_score_v3', 'Downed', 'Fair Catch',
       'Kickoff Team Recovery', 'Muffed', 'Out of Bounds', 'Return',
       'Touchback', 'initial_excitement', 'time', 'x', 'y', 's', 'a', 'dis',
       'o', 'di

The only features we're interested in right now is location of players and the ball. So we'll selected those columns and standardize (scale) them.

In [158]:
features = ['stdR', 'stdTheta']
output_cols = encoder.get_feature_names()

sc = MinMaxScaler()
single_play[features] = sc.fit_transform(single_play[features])

In [159]:
df = single_play.set_index(['gameId', 'playId', 'frameId', 'is_off', 'nflId']).sort_index(level=['is_off', 'frameId', 'nflId'])[[*features, *output_cols]].unstack(['is_off', 'nflId']).reset_index(['gameId', 'playId', 'frameId'])

df.head()

KeyError: "['x0_Downed', 'x0_Fair Catch', 'x0_Kickoff Team Recovery', 'x0_Muffed', 'x0_Out of Bounds', 'x0_Return', 'x0_Touchback'] not in index"

Then we'll convert the DataFrame to a 2D array and remove all the `nan` values, since those are for players that are not in the play. The first column is the **gameId**, second is **playId**, and third is **frameId**. This will probably be useful once we implement a recurrent neural net (which can remember previous input), but that's probably pretty far down the road.

In [155]:
values = np.array([x[~np.isnan(x)] for x in df.values])
values

array([[2.01809060e+09, 3.70000000e+01, 1.00000000e+00, ...,
        3.57523740e-01, 8.76489637e-01, 9.44980100e-01],
       [2.01809060e+09, 3.70000000e+01, 2.00000000e+00, ...,
        3.57771620e-01, 8.76341106e-01, 9.45246562e-01],
       [2.01809060e+09, 3.70000000e+01, 3.00000000e+00, ...,
        3.58019468e-01, 8.76206390e-01, 9.45647612e-01],
       ...,
       [2.01809060e+09, 1.38700000e+03, 6.50000000e+01, ...,
        1.33661814e-01, 3.20206141e-01, 4.41613665e-01],
       [2.01809060e+09, 1.38700000e+03, 6.60000000e+01, ...,
        1.34215107e-01, 3.15584506e-01, 4.35915806e-01],
       [2.01809060e+09, 1.38700000e+03, 6.70000000e+01, ...,
        1.35191457e-01, 3.10712038e-01, 4.30209315e-01]])