In [1]:
# Importing 
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy.spatial import Voronoi, voronoi_plot_2d
import statsmodels.api as sm
from scipy.interpolate import interp1d
import math
import random
import os
from scipy.stats import trim_mean
import scipy
from copy import deepcopy
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import warnings
import time

In [2]:
# Show up to 100 columns when rendering DataFrames; the raw tracking frame is wide.
pd.set_option('display.max_columns', 100)

In [3]:
# Load the training data (one row per player per play). low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk.
raw = pd.read_csv('train.csv.zip', low_memory = False)

In [4]:
# Preview the first rows to see the available columns.
raw.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,DisplayName,JerseyNumber,Season,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,PlayerHeight,PlayerWeight,PlayerBirthDate,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,496723,Eric Berry,29,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,212,12/29/1988,Tennessee,SS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,2495116,Allen Bailey,97,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,288,03/25/1989,Miami,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,3.01,202.73,2495493,Justin Houston,50,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,270,01/21/1989,Georgia,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,2506353,Derrick Johnson,56,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,245,11/22/1982,Texas,ILB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,2530794,Ron Parker,38,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,206,08/17/1987,Newberry,FS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW


In [5]:
# Sanity check that weather is recorded at the game level: reduce Temperature and
# Humidity to one value per play, then compare each game's max and min across its plays.
# (WindSpeed is not part of the comparison, so it is not aggregated.)
weather = raw[['GameId', 'PlayId', 'Temperature', 'Humidity']].groupby(['GameId', 'PlayId']).max()
weather = weather.reset_index()[['GameId', 'Temperature', 'Humidity']].groupby('GameId').agg(['max', 'min'])
# NaN never compares equal to NaN, so fill missing readings before comparing; otherwise
# every game with a missing value would be reported as inconsistent.
weather = weather.fillna(0)
# Any row displayed here is a game whose Temperature or Humidity differs between plays.
weather[
    (weather['Temperature','max'] != weather['Temperature','min']) |
    (weather['Humidity','max'] != weather['Humidity','min'])
]

Unnamed: 0_level_0,Temperature,Temperature,Humidity,Humidity
Unnamed: 0_level_1,max,min,max,min
GameId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2


In [6]:
# Column groups of the raw frame, split by the level at which a value is recorded.
# Orientation may be useful but apparently it is mismeasured in the 2017 data so we drop it.
# Columns considered uninformative (or unreliable) for modelling.
drop_cols = ['DisplayName', 'JerseyNumber', 'PlayerCollegeName', 'Orientation']
# Columns that vary per player within a play.
player_cols = [
    'Team', 'X', 'Y', 'S', 'A', 'Dis', 'Dir', 'NflId', 'PlayerHeight', 
    'PlayerWeight', 'PlayerBirthDate', 'Position'
]
# Columns describing the play as a whole (includes the target, 'Yards').
play_cols = [
    'PlayId', 'YardLine', 'Quarter', 'GameClock', 'PossessionTeam','Down', 
    'Distance', 'FieldPosition', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay',
    'NflIdRusher', 'OffenseFormation', 'OffensePersonnel', 'DefendersInTheBox', 
    'DefensePersonnel', 'PlayDirection','TimeHandoff','TimeSnap','Yards'
]
# Columns describing the game as a whole.
game_cols = [
    'GameId', 'Season', 'HomeTeamAbbr', 'VisitorTeamAbbr','Week','Stadium', 'Location', 
    'StadiumType', 'Turf', 'GameWeather', 'Temperature', 'Humidity', 
    'WindSpeed', 'WindDirection'
]

In [7]:
# Team-abbreviation clean-up: the home/visitor columns spell four teams differently
# from the replacement codes below, so they are rewritten to one convention.
name_maps = {
    'ARI': 'ARZ',
    'BAL': 'BLT',
    'CLE': 'CLV',
    'HOU': 'HST',
}


def clean_name(name):
    """Return the replacement listed in ``name_maps``, or ``name`` itself if it has none."""
    if name in name_maps:
        return name_maps[name]
    return name


def standardize_names(df):
    """Rewrite 'VisitorTeamAbbr' and 'HomeTeamAbbr' through ``clean_name``.

    The frame is modified in place and also returned.
    """
    for abbr_col in ('VisitorTeamAbbr', 'HomeTeamAbbr'):
        df[abbr_col] = df[abbr_col].apply(clean_name)
    return df

In [8]:
# standardize_names edits the frame in place and returns it, so `raw` itself is updated.
raw = standardize_names(raw)

In [9]:
# Standardize locations so that all plays run in the same direction.
def standardize_locations(df):
    """Return a copy of ``df`` with every play oriented the same way and velocities added.

    For rows whose 'PlayDirection' is 'left', X and Y are mirrored (field taken as
    120 x 160/3) and 'Dir' is rotated by 180 degrees. 'YardLine' is re-expressed as
    100 - YardLine whenever 'PossessionTeam' differs from 'FieldPosition'.

    Columns added or overwritten: 'X', 'Y', 'Dir', 'YardLine', 'maxYards',
    'minYards', 'S' (replaced by 10 * 'Dis'), 'V_x' and 'V_y'.

    The input frame is left untouched: the transformation is not idempotent (a
    second pass would mirror left plays back), so mutating the caller's frame made
    re-running a cell silently corrupt the data.
    """
    df = df.copy()
    leftPlays = df['PlayDirection'] == 'left'
    df.loc[leftPlays, 'X'] = 120 - df.loc[leftPlays, 'X']
    # We technically don't have to reflect Y, but it makes Dir easier to deal with:
    # mirroring both axes is a plain 180-degree rotation.
    df.loc[leftPlays, 'Y'] = (160/3) - df.loc[leftPlays, 'Y']
    df.loc[leftPlays, 'Dir'] = np.mod(df.loc[leftPlays, 'Dir'] + 180, 360)
    # Move the discontinuity in direction to the direction away from the goal line,
    # because we don't expect players to go in that direction: (270, 360) -> (-90, 0).
    bigAngles = df['Dir'] > 270
    df.loc[bigAngles, 'Dir'] = df.loc[bigAngles, 'Dir'] - 360
    # A missing FieldPosition also compares unequal; in the sample shown that only
    # happens at YardLine 50, where 100 - 50 leaves the value unchanged.
    past50 = df['PossessionTeam'] != df['FieldPosition']
    df.loc[past50, 'YardLine'] = 100 - df.loc[past50, 'YardLine']
    # Bounds on the target: the most that can be gained and the most that can be lost.
    df['maxYards'] = 100 - df['YardLine']
    df['minYards'] = -1*df['YardLine']
    # Some Kaggle competitors found this is a better measure of speed.
    df['S'] = 10*df['Dis']
    # Dir is in degrees; sin gives the X component and cos the Y component.
    dir_radians = df['Dir']*2*np.pi/360
    df['V_x'] = df['S']*np.sin(dir_radians)
    df['V_y'] = df['S']*np.cos(dir_radians)
    return df

In [10]:
# Reshape data to one row per play
def _pivot_player_group(players, prefix, state_cols):
    """Spread one group of players into a single row per play.

    The k-th row of each play (in the order the rows appear) becomes the columns
    ``<prefix><state><k>``, ordered state-major (all X, then all Y, ...).
    Column names are derived from the pivot itself, so the number of players per
    play is not hard-coded.
    """
    players = players[['PlayId'] + state_cols].copy()
    players['idx'] = players.groupby('PlayId').cumcount()
    wide = players.pivot(index='PlayId', columns='idx')[state_cols]
    wide.columns = [prefix + col + str(i) for col, i in wide.columns]
    return wide


def pivot_locations_and_velocities(df):
    """Return one row per play: the rusher's row plus every other player's state.

    Expects the per-player frame with 'PlayId', 'Team', 'HomeTeamAbbr',
    'VisitorTeamAbbr', 'PossessionTeam', 'NflId', 'NflIdRusher' and the state
    columns 'X', 'Y', 'V_x', 'V_y'.

    The result is the rusher's rows (state columns renamed 'RusherX0', ... and a
    'TeamAbbr' column added) joined with 'def<state><k>' columns for players whose
    team is not the possession team and 'att<state><k>' columns for the rusher's
    team mates. X and Y of those players are expressed relative to the rusher;
    velocities are left as they are. Plays without a rusher row are dropped.

    The caller's frame is not modified, and the function does not depend on the
    frame's index (role flags travel through the merge as columns).
    """
    player_state_cols = ['X', 'Y', 'V_x', 'V_y']
    rusher_state_col_names = {col: 'Rusher' + col + '0' for col in player_state_cols}

    is_home = df['Team'] == 'home'
    # assign() builds a new frame, so 'TeamAbbr' is not added to the caller's frame.
    df = df.assign(TeamAbbr=df['HomeTeamAbbr'].where(is_home, df['VisitorTeamAbbr']))
    is_defender = df['TeamAbbr'] != df['PossessionTeam']
    is_rusher = df['NflId'] == df['NflIdRusher']
    rusher = df[is_rusher].rename(columns=rusher_state_col_names)

    # Carry the role flags as columns: the merge below renumbers the rows, so masks
    # indexed like the input frame could not be applied to its result safely.
    players = df[['PlayId'] + player_state_cols].assign(
        _is_defender=is_defender.values, _is_rusher=is_rusher.values
    )
    origin_cols = [rusher_state_col_names[col] for col in ('X', 'Y')]
    players = players.merge(rusher[['PlayId'] + origin_cols], on='PlayId')
    for col in ('X', 'Y'):
        players[col] = players[col] - players[rusher_state_col_names[col]]

    defenders = _pivot_player_group(
        players[players['_is_defender']], 'def', player_state_cols
    )
    attackers = _pivot_player_group(
        players[~players['_is_defender'] & ~players['_is_rusher']], 'att', player_state_cols
    )
    rusher = rusher.join(defenders, on='PlayId')
    rusher = rusher.join(attackers, on='PlayId')
    return rusher

In [47]:
#Use the week variable to create the test train split and the cross validation folds 
def standardize_week(df):
    """Turn 'Week' into a running week number across seasons.

    Each season after 2017 shifts the week by 17, so week 1 of 2018 becomes 18.
    The frame is modified in place and also returned.
    """
    seasons_elapsed = df['Season'] - 2017
    df['Week'] = df['Week'] + 17*seasons_elapsed
    return df
def train_test_split(df, p):
    """Hold out the most recent weeks as a test set.

    The last ``int(max_week * p)`` weeks go to the test set. Note the return
    order: ``(test, train)``. Both parts are independent copies.
    """
    last_week = max(df['Week'])
    cutoff_week = last_week - int(last_week*p)
    held_out = df['Week'] > cutoff_week
    test_part = df[held_out].copy()
    train_part = df[~held_out].copy()
    return test_part, train_part
def make_cv_folds(df, k):
    """Add a 'fold' column that groups consecutive weeks into ``k`` folds.

    With ``n`` the largest week, the first ``n % k`` folds span ``n // k + 1``
    weeks and the remaining ones ``n // k`` weeks. Fold numbers start at 0.
    The frame is modified in place and also returned.
    """
    step, remainder = divmod(max(df['Week']), k)
    # Last week that still belongs to one of the longer folds.
    long_span = (step + 1)*remainder

    def week_to_fold(week):
        if week > long_span:
            return remainder + (week - long_span - 1)//step
        return (week - 1)//(step + 1)

    df['fold'] = df['Week'].apply(week_to_fold)
    return df
def make_pos_vel_cols():
    """List the position/velocity feature names: 1 rusher, 11 'def' and 10 'att' slots.

    Names are ``<group><state><slot>``, ordered by group, then state
    (X, Y, V_x, V_y), then slot number.
    """
    names = []
    for prefix, count in (('Rusher', 1), ('def', 11), ('att', 10)):
        for state in ('X', 'Y', 'V_x', 'V_y'):
            names.extend(prefix + state + str(slot) for slot in range(count))
    return names

In [53]:
# Apply the preprocessing steps in order. Each step receives a copy so that `raw`
# (and the previous stage) is not modified by functions that edit their input.
processed = standardize_locations(raw.copy())
processed = standardize_week(processed)
# After this call `processed` has one row per play instead of one row per player.
processed = pivot_locations_and_velocities(processed.copy())

In [56]:
# Show the plays in which at least one position/velocity feature is missing.
cols = ['PlayId'] + make_pos_vel_cols()
has_missing = processed[cols].isna().any(axis=1)
processed.loc[has_missing, cols]

Unnamed: 0,PlayId,RusherX0,RusherY0,RusherV_x0,RusherV_y0,defX0,defX1,defX2,defX3,defX4,defX5,defX6,defX7,defX8,defX9,defX10,defY0,defY1,defY2,defY3,defY4,defY5,defY6,defY7,defY8,defY9,defY10,defV_x0,defV_x1,defV_x2,defV_x3,defV_x4,defV_x5,defV_x6,defV_x7,defV_x8,defV_x9,defV_x10,defV_y0,defV_y1,defV_y2,defV_y3,defV_y4,defV_y5,defV_y6,defV_y7,defV_y8,defV_y9,defV_y10,attX0,attX1,attX2,attX3,attX4,attX5,attX6,attX7,attX8,attX9,attY0,attY1,attY2,attY3,attY4,attY5,attY6,attY7,attY8,attY9,attV_x0,attV_x1,attV_x2,attV_x3,attV_x4,attV_x5,attV_x6,attV_x7,attV_x8,attV_x9,attV_y0,attV_y1,attV_y2,attV_y3,attV_y4,attV_y5,attV_y6,attV_y7,attV_y8,attV_y9
294106,20180923010147,98.68,31.833333,3.509392,2.484788,10.86,8.63,9.51,3.91,7.17,6.6,5.31,3.75,5.29,11.14,5.5,3.51,-0.82,-12.59,-8.79,5.8,-1.49,3.22,5.45,1.16,-17.86,-2.8,-1.252347,-2.117229,-1.147547,-3.385747,,1.102987,-1.657688,-4.969857,-0.745649,-0.562658,-1.049547,0.995805,3.155526,-2.663294,-0.886971,,1.15906,0.376922,-1.529877,1.064898,2.946764,4.272991,4.89,5.53,4.12,1.68,4.69,6.47,3.62,6.06,3.53,4.7,-3.19,-18.43,-10.0,-0.79,0.72,-0.79,4.31,-1.57,1.63,2.61,0.18088,2.303575,0.798467,-3.461566,-0.133495,2.392954,-2.383073,1.946264,-0.122502,-0.247168,3.595453,3.748805,-4.021499,2.716166,0.890045,2.124564,1.269237,2.412893,-3.898076,0.865395
294194,20180923010411,34.88,25.953333,3.862798,1.038648,14.03,8.1,11.31,2.48,8.03,4.31,4.13,5.23,4.85,12.92,5.24,8.83,1.1,-3.34,2.66,-5.33,-0.2,-3.85,-8.94,1.28,-10.08,-3.97,2.893181,-1.59622,-1.810042,-2.734841,,-1.12601,-1.611333,0.137102,-0.463296,0.417733,-2.421949,0.198759,1.205023,2.265778,-0.964699,,1.273618,1.497867,1.393271,0.886204,2.768664,3.910775,3.42,9.07,5.84,0.25,4.22,3.83,5.05,6.14,3.59,3.29,-3.81,8.5,-8.79,-0.78,0.59,-0.53,-8.12,-0.77,-2.92,2.29,-1.347614,6.146958,2.454442,-4.252338,-0.224611,-1.104747,1.306574,2.48653,3.35632,-1.554479,1.985935,0.809262,4.240957,2.001905,0.974449,1.667193,0.502856,4.105748,2.687958,0.378941
294215,20180923010432,33.48,26.893333,2.409825,-2.807622,10.73,21.0,4.6,3.2,3.92,4.56,5.64,7.92,8.53,14.21,8.58,-9.2,3.36,3.45,-5.29,0.33,-3.04,-2.76,10.41,1.18,15.3,-3.11,-1.3551,-0.442136,-0.601662,,-0.71665,-0.8687,0.685652,-0.178252,-2.07643,1.038602,-2.252573,0.351715,-0.542693,-2.529427,,-1.202669,0.674804,2.299974,2.79432,-2.165281,-0.938779,-1.298428,4.2,2.19,2.77,-0.77,2.89,3.33,3.9,4.82,4.15,5.61,-2.44,12.77,-4.44,-0.36,-1.21,0.0,-3.49,-2.35,2.03,16.35,0.8029,0.365714,0.722034,-0.154556,-0.009774,-0.938405,0.530776,1.280289,0.885723,1.957064,1.383962,4.585439,3.526849,0.368934,-3.499986,-0.57393,1.928283,0.225519,-0.809627,-1.555604
294237,20180923010499,49.97,28.333333,2.715197,5.684866,10.26,6.58,22.35,4.27,8.1,4.53,5.47,5.07,5.13,8.06,7.11,5.29,1.64,-2.48,-4.81,-5.87,-0.02,3.08,5.39,-2.72,-18.71,-1.3,-1.092886,-2.606125,0.49728,-2.142223,,-0.377128,-0.090803,-0.828249,-0.343559,2.875889,-1.278765,3.534912,3.66853,3.161125,5.497352,,3.982182,4.299041,4.321343,4.988183,1.157266,5.452042,4.03,0.11,5.18,3.72,4.81,4.24,5.94,4.35,4.79,6.59,-3.0,-1.0,1.67,-0.24,4.78,-5.4,-1.2,5.44,3.12,-18.26,0.169981,-3.924343,0.971036,-0.441207,0.074401,-0.911013,1.410364,-0.268501,0.58226,3.585017,5.097167,4.133949,3.983351,4.176762,4.899435,5.119576,5.212569,4.291609,3.957395,1.774162
294258,20180923010544,61.8,29.593333,2.825834,-3.502094,10.35,12.34,7.09,19.91,9.29,4.68,3.09,4.67,12.24,7.95,4.8,6.32,-8.69,-0.14,-1.87,-15.56,4.37,-5.8,0.11,-19.9,-4.34,-3.29,-0.51865,-0.296045,-1.005861,,1.102764,0.136527,-1.299681,0.332039,1.551386,-1.599229,0.756756,-1.93158,-3.085832,-2.826348,,-2.682147,-2.596413,-0.028813,-1.76911,-0.391409,-1.025898,-2.382713,2.47,4.6,0.87,-1.0,4.01,4.28,3.95,4.03,4.37,5.86,-5.14,-16.16,-11.2,-0.08,-0.23,-2.6,3.89,-3.26,0.73,-21.32,0.043279,3.491839,-1.098093,0.097366,1.036336,0.890214,0.894826,1.316954,1.479209,2.762879,-1.599415,-3.717131,-4.775373,-0.022801,-2.275084,-2.336133,-1.561822,-2.006397,-2.258747,-0.45442
294273,20180923010595,68.68,21.753333,-2.251262,7.363547,10.77,9.02,24.88,5.81,7.23,5.54,5.75,10.46,8.69,13.24,5.09,23.82,-1.73,5.26,3.59,-3.33,0.55,-2.91,-4.38,3.44,-5.79,-1.59,4.996795,-0.818871,0.897719,-0.622349,,-1.530413,0.31884,3.059165,-1.595985,1.38249,-1.843904,2.298269,-0.734473,-0.064034,-1.026003,,-0.74016,-1.669833,-0.501506,-0.113282,-0.581997,-2.73679,5.37,9.99,0.53,5.74,4.22,5.92,3.38,7.71,2.94,10.45,-2.59,-4.41,-0.18,1.95,-1.73,-1.88,3.76,4.72,-3.89,25.28,0.11746,3.581012,-2.644899,-0.011239,-1.037321,-0.112278,0.086217,0.973579,2.640516,6.366906,-1.595683,0.930782,-1.415806,-0.69991,-1.471042,-1.39549,6.499428,0.228351,-6.374769,2.659418
294348,20180923010877,25.97,32.153333,4.622398,1.906159,15.43,7.99,16.98,12.73,5.51,5.01,3.83,5.06,6.29,8.94,8.64,3.27,0.89,-10.42,6.94,0.42,2.46,-5.29,4.5,-2.4,-20.63,-2.23,0.879173,-1.861706,-0.290923,,0.260472,-0.767056,-2.937419,-0.016858,0.563112,2.930683,-1.690954,-0.192495,0.971622,-0.073237,,1.477212,0.227212,3.409042,0.699797,1.814636,2.25856,3.064421,5.91,6.73,6.9,0.32,6.1,4.72,4.44,5.51,3.23,4.41,-3.15,-19.67,6.83,-0.87,0.98,0.23,4.08,-2.39,-2.31,2.76,1.71532,4.019284,4.068588,-4.205893,1.761609,0.260816,0.352188,1.155645,1.814952,-0.024607,1.678593,2.974114,-1.67529,1.862917,1.317852,1.882014,1.354977,2.329052,-3.338555,0.999697
294500,20180923011762,56.16,22.98,3.93789,-0.702154,12.88,7.91,22.33,3.29,4.03,8.73,4.66,7.74,4.42,13.08,7.61,13.0,4.49,-1.4,3.95,5.88,-8.75,-3.96,2.87,-0.93,-11.63,0.5,-0.087111,-1.550378,,-1.691443,-1.070408,0.861746,-0.387454,-2.022281,-1.006261,2.395264,-0.998746,-2.59854,-1.832029,,2.477705,-0.253428,-0.973342,1.240919,-1.93659,1.243961,0.150697,-0.05007,3.64,5.63,0.07,5.63,3.34,4.37,5.68,3.74,5.04,4.2,-1.0,-11.51,0.82,3.11,3.32,-3.35,-0.25,5.1,9.92,-7.93,-0.320297,5.299193,-4.399908,2.152957,-1.135514,0.62861,2.271688,-0.062493,1.65889,4.295084,1.669554,-0.092498,-0.028414,-0.452522,2.449614,1.250939,0.774231,0.496079,-5.033695,-0.205556
294522,20180923011783,64.45,26.13,3.841926,0.670523,7.75,20.49,17.58,6.01,5.4,7.28,5.76,7.86,5.78,10.03,7.38,8.67,8.02,-4.19,0.56,1.24,5.14,-7.25,2.8,-2.24,-7.93,-1.61,-0.023596,1.897111,,1.187795,0.782778,1.033423,1.76915,-1.671262,1.505021,-1.604524,-1.2473,-1.299786,0.104736,,0.916047,-0.1651,-0.609948,1.131419,0.311259,2.120121,0.815783,0.366391,4.47,7.07,-0.55,4.95,5.43,5.31,5.0,4.63,5.89,2.79,-2.6,4.05,-0.49,0.46,-0.09,-6.89,-2.15,1.35,5.63,-1.76,1.079076,2.298878,-4.25021,1.121902,1.135707,2.596115,1.906376,0.789597,3.353884,1.156789,2.031156,-0.071844,1.478416,0.837459,1.127018,1.048898,2.050787,-0.128594,-1.562517,-4.967076
294543,20180923012151,20.92,30.15,2.21565,4.258039,5.27,19.31,0.22,2.31,20.06,3.57,7.01,4.83,11.59,4.62,7.5,7.65,-10.12,4.91,1.92,4.27,-3.48,-16.8,1.52,-20.34,-5.56,-0.13,-0.711892,-0.78449,-3.030517,-2.051876,,1.592215,-1.181201,1.033793,0.706882,0.845777,-0.704156,-2.081636,1.159558,-3.592487,-1.428217,,-1.927395,-3.611754,-0.375861,0.374591,0.307671,-0.971681,4.64,2.37,2.67,-1.03,5.15,3.32,2.18,0.9,4.44,5.5,-4.45,-18.89,2.26,-0.31,0.29,-3.24,1.13,5.02,1.25,-19.43,1.694452,1.779613,0.57736,0.42123,2.421348,0.271551,-0.613012,-2.639786,1.171879,1.741661,-0.137225,-4.672577,-2.329518,-1.123639,-1.194602,-1.475215,1.031609,-1.625278,-1.089358,1.793493


In [45]:
# 'Week' is kept only to build the train/test split and the CV folds.
identifiers = ['Week']
# Target column.
label = ['Yards']
other_features = ['maxYards', 'minYards']
features = other_features + make_pos_vel_cols()
# Column order matters downstream: the label must be last, because the modelling
# code selects features and target by position.
processed = processed[identifiers + features + label]

In [48]:
# Hold out the latest 20% of weeks for testing; note train_test_split returns (test, train).
test_df, train_df = train_test_split(processed, 0.2)
test_df = test_df.drop(columns = 'Week')
# Assign 5 week-based folds, then put 'fold' first and the label last.
train_df = make_cv_folds(train_df, 5).drop(columns = 'Week')
train_df = train_df[['fold'] + features + label]

In [74]:
# Some plays have missing values in the velocity features (presumably from a missing
# Dir - TODO confirm). We could try replacing NaNs in Dir with a random direction, 0,
# or the direction of the opposite team's goal line.

# For now we drop every row that has any missing value.
train_df = train_df.dropna()
test_df = test_df.dropna()

In [60]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

#Cross Validation steps
def score_modelCV(df, model):
    """Cross-validate ``model`` over the folds stored in ``df['fold']``.

    Column 0 of ``df`` is skipped, the last column is the target and every column
    in between is a feature. For each fold the model is refit on all other folds
    and scored with mean squared error on the held-out one.

    Returns ``(mean, standard deviation)`` of the per-fold scores.
    """
    feature_cols = df.columns[1:-1]
    target_col = df.columns[-1]
    n_folds = max(df['fold']) + 1
    fold_mse = []
    for held_out in range(n_folds):
        in_fold = df['fold'] == held_out
        model.fit(df.loc[~in_fold, feature_cols], df.loc[~in_fold, target_col])
        predictions = model.predict(df.loc[in_fold, feature_cols])
        fold_mse.append(mean_squared_error(df.loc[in_fold, target_col], predictions))
        print('added a cv score')
    fold_mse = np.array(fold_mse)
    return np.mean(fold_mse), np.std(fold_mse)

def RF_gridCV(df, trees, features):
    """Grid-search a random forest over tree counts and feature fractions.

    ``trees`` are candidate ``n_estimators`` values and ``features`` candidate
    ``max_features`` values; every combination is cross-validated with
    ``score_modelCV``. Returns a DataFrame with one row per combination.
    """
    rows = []
    for n_trees in trees:
        for feature_share in features:
            forest = RandomForestRegressor(n_estimators = n_trees, max_features = feature_share)
            mean_mse, mse_spread = score_modelCV(df, forest)
            row = {'n_estimators': n_trees, 'max_features': feature_share}
            row['avg_score'] = mean_mse
            row['score_std_dev'] = mse_spread
            rows.append(row)
            print('added a result')
    return pd.DataFrame(rows)

In [82]:
# Training slice for fold 0 (everything outside the fold), selected the same way
# score_modelCV does it. NOTE(review): X_train and y_train are not used by any later
# visible cell - this looks like leftover debugging; confirm before removing.
mask = train_df['fold'] == 0
X_train = train_df.loc[~mask,train_df.columns[1:-1]]
y_train = train_df.loc[~mask, train_df.columns[-1]]

In [61]:
# 3 x 3 grid over the number of trees and the share of features tried per split.
RF_results = RF_gridCV(train_df, trees = [30, 60, 100], features = [0.3, 0.6, 1.0])

added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result


In [75]:
def save_CV(df, train_df, test_df, modelName):
    """Save a grid-search table, refit its best configuration and score it on the test set.

    Parameters
    ----------
    df : DataFrame
        Grid-search results: one column per hyper-parameter plus 'avg_score' and
        'score_std_dev'. Written to ``<modelName>_cv1.csv``.
    train_df : DataFrame
        Training data. Column 0 is skipped, the last column is the target and the
        columns in between are the features.
    test_df : DataFrame
        Test data. The last column is the target, all other columns are features.
    modelName : str
        'RandomForest' or 'AdaBoost'; also used as the output file prefix.

    Returns
    -------
    float
        Mean squared error of the refit model on ``test_df``.

    Raises
    ------
    ValueError
        If ``modelName`` is not a supported model. This is checked first, so
        nothing is written or trained for an unknown name.
    """
    if modelName not in ('RandomForest', 'AdaBoost'):
        raise ValueError(
            "modelName must be 'RandomForest' or 'AdaBoost', got %r" % (modelName,)
        )
    # The extension used to be misspelled '.scv'.
    df.to_csv(modelName + '_cv1.csv', index = False)
    print('saved CV')
    # Lowest average CV error wins; ties resolve to the first such row.
    mask = df['avg_score'] == df['avg_score'].min()
    X_train = train_df[train_df.columns[1:-1]]
    y_train = train_df[train_df.columns[-1]]
    X_test = test_df[test_df.columns[:-1]]
    y_test = test_df[test_df.columns[-1]]
    # Every column that is not a score is a hyper-parameter. Values are read column
    # by column so that integer parameters keep their integer dtype.
    param_names = [col for col in df.columns if col not in ['avg_score', 'score_std_dev']]
    params = {col: df.loc[mask, col].values[0] for col in param_names}
    if modelName == 'RandomForest':
        model = RandomForestRegressor(**params)
    else:
        # The weak learner is passed positionally: its keyword is `base_estimator`
        # in older scikit-learn releases and `estimator` in newer ones.
        weak_learner = DecisionTreeRegressor(max_depth = params['max_depth'])
        model = AdaBoostRegressor(weak_learner, n_estimators = params['n_estimators'])
    model.fit(X_train, y_train)
    print('trained best model')
    yhat = model.predict(X_test)
    score = mean_squared_error(y_test, yhat)
    return score

In [66]:
# Align the column names with the RandomForestRegressor keyword arguments that save_CV
# passes through. This is a no-op for tables built by the RF_gridCV defined in this
# notebook, which already uses 'n_estimators' and 'max_features'.
RF_results = RF_results.rename(columns = {'n_trees':'n_estimators', 'proportion_features':'max_features'})

In [70]:
# Number of trees of the configuration with the lowest average CV error.
mask = RF_results['avg_score'] == min(RF_results['avg_score'])
RF_results.loc[mask,'n_estimators'].values[0]

6    100
Name: n_estimators, dtype: int64

In [76]:
# Save the grid results, refit the best random forest on all training data and
# display its mean squared error on the held-out test set.
save_CV(RF_results, train_df, test_df, 'RandomForest')

saved CV
trained best model


42.03236663185379

In [77]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

def Boost_gridCV(df, trees, depths):
    """Grid-search AdaBoost over the number of estimators and the weak learner's depth.

    ``trees`` are candidate ``n_estimators`` values and ``depths`` candidate
    ``max_depth`` values of the decision-tree weak learner; every combination is
    cross-validated with ``score_modelCV``.

    Returns a DataFrame with columns 'n_estimators', 'max_depth', 'avg_score'
    and 'score_std_dev', one row per combination.
    """
    results = []
    for n in trees:
        for d in depths:
            base = DecisionTreeRegressor(max_depth = d)
            # Pass the weak learner positionally: its keyword is `base_estimator`
            # in older scikit-learn releases and `estimator` in newer ones, while
            # it is the first positional parameter in both.
            model = AdaBoostRegressor(base, n_estimators = n)
            avg_score, std_dev = score_modelCV(df, model)
            results.append({
              'n_estimators': n, 'max_depth': d, 'avg_score': avg_score, 'score_std_dev': std_dev
              })
            print('added a result')
    return pd.DataFrame(results)

In [79]:
# 3 x 3 grid over the number of boosting rounds and the depth of each tree.
AB_results = Boost_gridCV(train_df, trees = [20,40,60], depths = [2,3,4])

added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result
added a cv score
added a cv score
added a cv score
added a cv score
added a result


In [80]:
# Save the grid results, refit the best AdaBoost model on all training data and
# display its mean squared error on the held-out test set.
save_CV(AB_results, train_df, test_df, 'AdaBoost')

saved CV
trained best model


59.84160076040958

In [19]:
# NOTE(review): `pivoted` is not defined in any visible cell, so this fails on a fresh
# kernel. It presumably held the output of pivot_locations_and_velocities - confirm,
# then either define it explicitly or delete this cell.
pivoted.head()

Unnamed: 0,Week,GameId,PlayId,Yards,maxYards,minYards,RusherX0,RusherY0,RusherV_x0,RusherV_y0,defX0,defX1,defX2,defX3,defX4,defX5,defX6,defX7,defX8,defX9,defX10,defY0,defY1,defY2,defY3,defY4,defY5,defY6,defY7,defY8,defY9,defY10,defV_x0,defV_x1,defV_x2,defV_x3,defV_x4,defV_x5,defV_x6,defV_x7,defV_x8,defV_x9,defV_x10,defV_y0,defV_y1,defV_y2,defV_y3,defV_y4,defV_y5,defV_y6,defV_y7,defV_y8,defV_y9,defV_y10,attX0,attX1,attX2,attX3,attX4,attX5,attX6,attX7,attX8,attX9,attY0,attY1,attY2,attY3,attY4,attY5,attY6,attY7,attY8,attY9,attV_x0,attV_x1,attV_x2,attV_x3,attV_x4,attV_x5,attV_x6,attV_x7,attV_x8,attV_x9,attV_y0,attV_y1,attV_y2,attV_y3,attV_y4,attV_y5,attV_y6,attV_y7,attV_y8,attV_y9
18,1,2017090700,20170907000118,8,65,-35,41.25,22.803333,3.464423,1.561336,4.84,4.08,4.75,7.29,9.43,3.69,4.64,5.38,22.12,5.4,4.6,-4.31,-2.11,-2.67,2.83,-4.89,6.53,13.89,11.8,3.63,-8.3,1.63,-0.196794,0.032061,1.197806,-0.192595,-0.432692,-1.793123,0.121503,1.3,-2.302281,0.959142,0.099739,3.995156,0.094721,2.859241,0.053918,1.540382,0.157193,-0.158862,-0.000908,-1.593581,5.008997,-0.007219,2.93,3.97,3.32,2.85,-1.01,2.28,4.05,4.15,4.17,4.17,12.97,-2.68,-1.88,5.41,1.04,-6.38,11.34,-1.35,1.41,2.06,5.333125,1.114107,2.172572,0.604451,-0.0,1.535982,5.043717,1.698141,1.943385,2.325327,-1.344537,1.539079,2.349453,1.801288,0.0,5.696557,-0.755589,0.079488,1.031142,0.594015
40,1,2017090700,20170907000139,3,57,-43,48.93,26.173333,2.518736,-2.28385,5.34,5.03,5.05,7.76,10.56,4.47,5.95,5.89,23.02,6.61,4.27,-6.72,-2.93,-4.34,-0.03,0.65,2.89,17.73,10.8,-0.52,7.8,-0.39,-0.493998,0.096492,0.808381,0.528448,-0.37613,-0.137499,2.86938,2.656059,-0.453585,-1.569065,-0.554745,1.938031,0.026253,1.018096,-2.341099,-2.774622,-1.895018,1.17331,2.122582,-0.658984,2.073171,-1.175695,0.53,4.32,4.88,3.67,-0.72,4.15,3.98,4.61,3.73,3.16,8.88,-4.44,-3.13,1.62,-0.33,10.6,17.66,-2.51,-0.66,-1.32,-0.359329,1.575836,1.776946,2.324307,-1.476675,3.378265,4.803169,2.191595,-0.370963,0.861647,4.485631,0.869908,0.287166,-1.561281,1.348863,1.739921,1.389088,-0.192125,-0.471579,-2.239992
62,1,2017090700,20170907000189,5,35,-65,71.34,34.223333,4.01167,4.461671,3.91,4.28,5.27,6.69,7.71,4.03,6.04,8.79,20.42,6.39,4.01,-7.42,-4.31,-6.25,-0.02,-3.35,2.93,10.03,-11.38,-3.55,-13.78,-1.32,-0.902789,0.825989,0.493982,-0.218407,0.310895,0.451436,1.153687,2.079958,-1.060429,2.409684,-0.42063,4.306387,3.298142,4.070133,3.994033,3.686915,3.369897,-0.330161,3.99046,4.37327,2.53839,2.970365,3.54,4.44,3.85,3.42,-0.85,4.83,5.3,5.5,3.2,5.36,-9.87,-6.12,-4.2,2.55,-1.41,-12.72,9.68,-2.87,-0.92,0.13,2.816826,1.434124,1.851773,0.668634,-3.187386,3.790689,1.552878,3.564301,-0.316423,1.919859,4.009425,3.734071,2.85148,3.231552,2.416728,2.944601,-0.385449,3.063619,3.385244,3.394722
84,1,2017090700,20170907000345,2,2,-98,104.47,27.973333,4.579282,-0.436095,3.62,5.86,4.25,4.36,5.77,4.66,5.21,4.4,5.47,6.33,3.32,2.72,6.88,-2.43,-3.78,-0.13,5.04,-8.29,-1.33,-1.62,-4.48,3.31,-2.643288,0.477409,0.212771,0.604094,-0.366768,0.925664,0.681685,0.179136,-1.254224,-1.368029,-1.343249,0.923595,-3.164187,-1.28247,-1.151117,-1.864264,-1.772892,-1.336153,-1.891536,0.341941,-1.722932,-2.108479,4.35,4.61,3.97,-0.77,2.89,3.77,4.06,4.04,5.24,3.58,4.67,-1.48,-3.42,0.5,-1.65,-7.13,2.72,-1.08,0.14,1.89,0.940562,2.031481,1.546318,-2.883204,3.372677,2.427863,0.574577,0.958646,1.476453,-0.890184,-0.745213,-1.457081,-0.706328,-1.388212,-2.667405,-4.370981,-0.818451,-1.523482,-1.029605,0.132562
98,1,2017090700,20170907000395,7,75,-25,29.99,27.12,1.653964,-4.077304,5.71,19.42,4.48,3.96,7.62,17.73,6.53,9.8,4.34,8.81,4.26,8.88,-8.54,-3.67,-6.0,-14.72,5.45,-0.33,12.06,4.0,-3.49,0.49,0.0,0.35448,-0.461027,-1.196993,-1.15085,1.503196,-1.4403,0.171112,-0.695683,-1.99182,-0.763087,-0.0,-3.180306,-2.964364,-1.725459,1.511801,-1.870936,-3.189911,-0.575083,-3.225837,-3.118117,-3.21056,-0.48,3.2,3.35,1.37,4.26,3.62,5.08,6.2,4.19,3.81,0.15,-5.11,0.57,7.61,8.87,-3.51,-0.72,-16.21,-2.58,10.29,0.407594,-0.113037,0.223546,-1.281933,1.701895,0.087462,1.341551,2.899995,0.382628,3.392736,0.68838,-1.996803,-4.294185,3.894438,2.709899,-2.798634,-3.662,-0.005568,-3.076296,3.535441


In [22]:
# Inspect the raw position and direction of the players in a single play.
mask = (raw['PlayId'] == 20170907000118)
cols = ['Week','GameId','PlayId', 'Yards', 'Team', 'X','Y','Dir']
raw[mask][cols]

Unnamed: 0,Week,GameId,PlayId,Yards,Team,X,Y,Dir
0,1,2017090700,20170907000118,8,away,73.91,34.84,177.18
1,1,2017090700,20170907000118,8,away,74.67,32.64,198.7
2,1,2017090700,20170907000118,8,away,74.0,33.2,202.73
3,1,2017090700,20170907000118,8,away,71.46,27.7,105.64
4,1,2017090700,20170907000118,8,away,69.32,35.42,164.31
5,1,2017090700,20170907000118,8,away,75.06,24.0,95.01
6,1,2017090700,20170907000118,8,away,74.11,16.64,322.59
7,1,2017090700,20170907000118,8,away,73.37,18.73,270.04
8,1,2017090700,20170907000118,8,away,56.63,26.9,55.31
9,1,2017090700,20170907000118,8,away,73.35,38.83,190.84


In [23]:
# Same play after standardization, for comparison with the raw values.
# NOTE(review): this needs `processed` to be the per-player frame (with Team, X, Y, Dir,
# V_x, V_y). After the pivot and feature-selection cells `processed` has one row per play
# and none of Team/Dir, so this cell depends on an earlier kernel state - confirm.
processed[mask][cols + ['V_x', 'V_y']]

Unnamed: 0,Week,GameId,PlayId,Yards,Team,X,Y,Dir,V_x,V_y
0,1,2017090700,20170907000118,8,away,46.09,18.493333,-2.82,-0.196794,3.995156
1,1,2017090700,20170907000118,8,away,45.33,20.693333,18.7,0.032061,0.094721
2,1,2017090700,20170907000118,8,away,46.0,20.133333,22.73,1.197806,2.859241
3,1,2017090700,20170907000118,8,away,48.54,25.633333,-74.36,-0.192595,0.053918
4,1,2017090700,20170907000118,8,away,50.68,17.913333,-15.69,-0.432692,1.540382
5,1,2017090700,20170907000118,8,away,44.94,29.333333,-84.99,-1.793123,0.157193
6,1,2017090700,20170907000118,8,away,45.89,36.693333,142.59,0.121503,-0.158862
7,1,2017090700,20170907000118,8,away,46.63,34.603333,90.04,1.3,-0.000908
8,1,2017090700,20170907000118,8,away,63.37,26.433333,235.31,-2.302281,-1.593581
9,1,2017090700,20170907000118,8,away,46.65,14.503333,10.84,0.959142,5.008997


In [157]:
# Hand check of the direction handling: rotate one angle by 180 degrees (as is done
# for left-moving plays) and print the X component of the resulting unit vector.
Dir = np.mod(245.74 + 180,360)

print('Dir_x: ', np.sin(Dir*2*np.pi/360))

Dir_x;  0.9116903457277413


In [118]:
# Play-level columns are repeated on every player row; de-duplicate to preview plays.
raw[play_cols].drop_duplicates().head(10)

Unnamed: 0,PlayId,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards
0,20170907000118,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8
22,20170907000139,43,1,13:52:00,NE,1,10,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:27.000Z,2017-09-08T00:44:26.000Z,3
44,20170907000189,35,1,13:02:00,NE,1,10,KC,0,0,2543773,SINGLEBACK,"1 RB, 1 TE, 3 WR",7.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:45:17.000Z,2017-09-08T00:45:15.000Z,5
66,20170907000345,2,1,12:12:00,NE,2,2,KC,0,0,2539663,JUMBO,"6 OL, 2 RB, 2 TE, 0 WR",9.0,"4 DL, 4 LB, 3 DB",left,2017-09-08T00:48:41.000Z,2017-09-08T00:48:39.000Z,2
88,20170907000395,25,1,12:08:00,KC,1,10,KC,7,0,2557917,SHOTGUN,"1 RB, 3 TE, 1 WR",7.0,"3 DL, 2 LB, 6 DB",right,2017-09-08T00:53:14.000Z,2017-09-08T00:53:13.000Z,7
110,20170907000473,29,1,11:21:00,NE,3,7,KC,7,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",5.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:55:46.000Z,2017-09-08T00:55:45.000Z,10
132,20170907000516,19,1,10:34:00,NE,2,10,KC,7,0,2539265,SINGLEBACK,"1 RB, 2 TE, 2 WR",8.0,"3 DL, 4 LB, 4 DB",left,2017-09-08T00:57:06.000Z,2017-09-08T00:57:05.000Z,-1
154,20170907000653,10,1,09:34:00,NE,4,1,KC,7,0,2539663,SINGLEBACK,"6 OL, 1 RB, 2 TE, 1 WR",9.0,"3 DL, 4 LB, 4 DB",left,2017-09-08T01:03:35.000Z,2017-09-08T01:03:34.000Z,0
176,20170907000680,10,1,09:25:00,KC,1,10,KC,7,0,2557917,SINGLEBACK,"2 RB, 1 TE, 2 WR",7.0,"3 DL, 2 LB, 6 DB",right,2017-09-08T01:06:42.000Z,2017-09-08T01:06:41.000Z,9
198,20170907000801,50,1,06:34:00,KC,1,10,,7,0,2557917,PISTOL,"1 RB, 2 TE, 2 WR",7.0,"3 DL, 2 LB, 6 DB",right,2017-09-08T01:10:03.000Z,2017-09-08T01:10:02.000Z,9


In [94]:
# Look at the first 40 player rows of the raw frame.
raw.head(40)

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,DisplayName,JerseyNumber,Season,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,PlayerHeight,PlayerWeight,PlayerBirthDate,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection,maxYards,minYards,V_x,V_y
0,2017090700,20170907000118,away,73.91,34.84,4.0,1.13,0.4,81.99,2.82,496723,Eric Berry,29,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,212,12/29/1988,Tennessee,SS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65,-35,0.196794,3.995156
1,2017090700,20170907000118,away,74.67,32.64,0.1,1.35,0.01,27.61,0.0,2495116,Allen Bailey,97,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,288,03/25/1989,Miami,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65,-35,0.0,0.1
2,2017090700,20170907000118,away,74.0,33.2,3.1,0.59,0.31,3.01,0.0,2495493,Justin Houston,50,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,270,01/21/1989,Georgia,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65,-35,0.0,3.1
3,2017090700,20170907000118,away,71.46,27.7,0.2,0.54,0.02,359.77,74.36,2506353,Derrick Johnson,56,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,245,11/22/1982,Texas,ILB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65,-35,0.192595,0.053918
4,2017090700,20170907000118,away,69.32,35.42,1.6,2.43,0.16,12.63,15.69,2530794,Ron Parker,38,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,206,08/17/1987,Newberry,FS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65,-35,0.432692,1.540382
5,2017090700,20170907000118,away,75.06,24.0,1.8,0.32,0.18,308.34,84.99,2543494,Dee Ford,55,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-2,252,03/19/1991,Auburn,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65,-35,1.793123,0.157193
6,2017090700,20170907000118,away,74.11,16.64,0.2,0.83,0.02,357.23,0.0,2543637,Terrance Mitchell,39,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,5-11,190,05/17/1992,Oregon,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65,-35,0.0,0.2
7,2017090700,20170907000118,away,73.37,18.73,1.3,0.74,0.13,328.52,0.0,2543851,Phillip Gaines,23,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,193,04/04/1991,Rice,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65,-35,0.0,1.3
8,2017090700,20170907000118,away,56.63,26.9,2.8,1.86,0.28,344.7,124.69,2550257,Daniel Sorensen,49,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-2,208,03/05/1990,Brigham Young,SS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65,-35,2.302281,-1.593581
9,2017090700,20170907000118,away,73.35,38.83,5.1,0.76,0.51,75.47,0.0,2552488,Marcus Peters,22,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,197,01/09/1993,Washington,CB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65,-35,0.0,5.1


In [56]:
# Reduce the per-row frame to one record per play: flag the first row seen
# for each PlayId, then keep only the columns named in playcols + game_cols.
mask = raw.groupby('PlayId').cumcount().eq(0)
play_data = raw.loc[mask, playcols + game_cols]

In [57]:
# Mean X and Y over all rows belonging to each play (one row per PlayId).
avg_data = raw.groupby('PlayId')[['X', 'Y']].mean().reset_index()
# No join key is given, so pandas joins on every column name the two frames share.
play_data = pd.merge(play_data, avg_data)

In [58]:
# Preview the first five rows of the play-level table.
play_data.head(n=5)

Unnamed: 0,PlayId,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,GameId,Season,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection,X,Y
0,20170907000118,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,2017090700,2017,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,73.881818,28.713182
1,20170907000139,43,1,13:52:00,NE,1,10,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:27.000Z,2017-09-08T00:44:26.000Z,3,2017090700,2017,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,65.785455,24.827727
2,20170907000189,35,1,13:02:00,NE,1,10,KC,0,0,2543773,SINGLEBACK,"1 RB, 1 TE, 3 WR",7.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:45:17.000Z,2017-09-08T00:45:15.000Z,5,2017090700,2017,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,43.381364,22.026818
3,20170907000345,2,1,12:12:00,NE,2,2,KC,0,0,2539663,JUMBO,"6 OL, 2 RB, 2 TE, 0 WR",9.0,"4 DL, 4 LB, 3 DB",left,2017-09-08T00:48:41.000Z,2017-09-08T00:48:39.000Z,2,2017090700,2017,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,11.485,25.766818
4,20170907000395,25,1,12:08:00,KC,1,10,KC,7,0,2557917,SHOTGUN,"1 RB, 3 TE, 1 WR",7.0,"3 DL, 2 LB, 6 DB",right,2017-09-08T00:53:14.000Z,2017-09-08T00:53:13.000Z,7,2017090700,2017,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,35.774545,26.824091


In [120]:
# Roughly 20% of the data is held out as test data, taken as the final 20%
# chronologically.
# To locate the cut, count the distinct games and plays in each (Season, Week).
time_info = raw.groupby(['Season', 'Week'])[['GameId', 'PlayId']].nunique()
time_info
# The final ~20% corresponds to weeks 4-12 of the 2019 season.

Unnamed: 0_level_0,Unnamed: 1_level_0,GameId,PlayId
Season,Week,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,1,15,676
2017,2,16,721
2017,3,16,732
2017,4,16,771
2017,5,14,646
2017,6,14,677
2017,7,15,713
2017,8,13,617
2017,9,13,590
2017,10,14,659


In [37]:
# Boolean mask over the rows of `raw`: True for 2019-season rows from week 4 onward.
istest = raw['Season'].eq(2019) & raw['Week'].ge(4)

In [74]:
# Location-related columns to inspect side by side.
location_cols = [
    'YardLine',
    'PossessionTeam',
    'FieldPosition',
    'PlayDirection',
    'HomeTeamAbbr',
    'VisitorTeamAbbr',
    'X',
    'Y',
]

# First few 2017 plays, restricted to the location columns.
play_data.loc[play_data['Season'] == 2017, location_cols].head()

Unnamed: 0,YardLine,PossessionTeam,FieldPosition,PlayDirection,HomeTeamAbbr,VisitorTeamAbbr,X,Y
0,35,NE,NE,left,NE,KC,73.881818,28.713182
1,43,NE,NE,left,NE,KC,65.785455,24.827727
2,35,NE,KC,left,NE,KC,43.381364,22.026818
3,2,NE,KC,left,NE,KC,11.485,25.766818
4,25,KC,KC,right,NE,KC,35.774545,26.824091


In [73]:
# First few 2018 plays, restricted to the location columns.
play_data.loc[play_data['Season'] == 2018, location_cols].head()

Unnamed: 0,YardLine,PossessionTeam,FieldPosition,PlayDirection,HomeTeamAbbr,VisitorTeamAbbr,X,Y
11900,30,ATL,ATL,left,PHI,ATL,79.495,31.341818
11901,41,ATL,ATL,left,PHI,ATL,68.307727,25.040909
11902,6,ATL,PHI,left,PHI,ATL,15.983636,24.878182
11903,1,ATL,PHI,left,PHI,ATL,11.669545,23.599545
11904,1,ATL,PHI,left,PHI,ATL,11.553182,20.898182
