## Import Data

In [43]:
# import packages and set options

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from datetime import datetime
import math
pd.set_option('display.max_rows', 20)

In [6]:
# read data in

games_df = pd.read_csv('../../data/games.csv')
players_df = pd.read_csv('../../data/players.csv')
plays_df = pd.read_csv('../../data/plays.csv')
tackles_df = pd.read_csv('../../data/tackles.csv')
tracking_week_1_df = pd.read_csv('../../data/tracking_week_1.csv')
tracking_week_2_df = pd.read_csv('../../data/tracking_week_2.csv')
tracking_week_3_df = pd.read_csv('../../data/tracking_week_3.csv')
tracking_week_4_df = pd.read_csv('../../data/tracking_week_4.csv')
tracking_week_5_df = pd.read_csv('../../data/tracking_week_5.csv')
tracking_week_6_df = pd.read_csv('../../data/tracking_week_6.csv')
tracking_week_7_df = pd.read_csv('../../data/tracking_week_7.csv')
#tracking_week_8_df = pd.read_csv('tracking_week_8.csv')
#tracking_week_9_df = pd.read_csv('tracking_week_9.csv')

In [7]:
# append all tracking data into one dataframe
tracking_df = tracking_week_1_df.append(tracking_week_2_df, ignore_index=True)
tracking_df = tracking_df.append(tracking_week_3_df, ignore_index=True)
tracking_df = tracking_df.append(tracking_week_4_df, ignore_index=True)
tracking_df = tracking_df.append(tracking_week_5_df, ignore_index=True)
tracking_df = tracking_df.append(tracking_week_6_df, ignore_index=True)
tracking_df = tracking_df.append(tracking_week_7_df, ignore_index=True)

  tracking_df = tracking_week_1_df.append(tracking_week_2_df, ignore_index=True)
  tracking_df = tracking_df.append(tracking_week_3_df, ignore_index=True)
  tracking_df = tracking_df.append(tracking_week_4_df, ignore_index=True)
  tracking_df = tracking_df.append(tracking_week_5_df, ignore_index=True)
  tracking_df = tracking_df.append(tracking_week_6_df, ignore_index=True)
  tracking_df = tracking_df.append(tracking_week_7_df, ignore_index=True)


## Data Preprocessing and Basic Feature Engineering

In [15]:
## players_df edits

# converting heights

# height dictionary
height_mapping = {'5-10': 70, '5-11': 71, '5-6': 66, '5-7': 67, '5-8': 68,
                  '5-9': 69, '6-0': 72, '6-1': 73, '6-2': 74, '6-3': 75,
                  '6-4': 76, '6-5': 77, '6-6': 78, '6-7': 79, '6-8': 80, '6-9': 81}

def height_to_inches(height_str):
    return height_mapping[height_str]

# Apply the function to create a new 'height_in_inch' column
players_df['height_in_inch'] = players_df['height'].apply(height_to_inches)

# converting ages

# Convert birthdates to datetime objects
birthdates_datetime = pd.to_datetime(players_df['birthDate'], errors='coerce')

# Calculate age based on current date
current_date = datetime.now()
ages = (current_date - birthdates_datetime).dt.days / 365.25

# Create a DataFrame with birthdates and ages
players_df['age'] = ages

# creating position groups

# size based / traditional mapping
position_group_mapping = {'C' : 'OL', 'CB' : 'DB', 'DB': 'DB', 'DE' : 'DL', 'DT': 'DL', 'FB' : 'HB',
                  'FS': 'DB', 'G': 'OL', 'ILB' : 'LB', 'LS': 'OL', 'MLB': 'LB', 'NT': 'DL',
                  'OLB': 'LB', 'QB': 'QB', 'RB': 'HB', 'SS': 'DB', 'T': 'OL', 'TE' : 'R', 'WR' : 'R'}

# role based mapping
position_role_mapping = {'C' : 'OL', 'CB' : 'CB', 'DB': 'DB', 'DE' : 'EDGE', 'DT': 'DL', 'FB' : 'TE',
                  'FS': 'DB', 'G': 'OL', 'ILB' : 'LB', 'LS': 'OL', 'MLB': 'LB', 'NT': 'DL',
                  'OLB': 'EDGE', 'QB': 'QB', 'RB': 'HB', 'SS': 'DB', 'T': 'OL', 'TE' : 'TE', 'WR' : 'WR'}


def position_to_pos_group(position):
    return position_group_mapping[position]

def position_to_pos_role(position):
    return position_role_mapping[position]

players_df['position_group'] = players_df['position'].apply(position_to_pos_group)
players_df['position_role'] = players_df['position'].apply(position_to_pos_role)

# select relevant columns
players_df_set = players_df[['nflId','height_in_inch', 'weight',
                         'position','position_group','position_role','displayName']]

In [16]:
# plays_df edits

def calculate_time_remaining(quarter, gameclock):
    def convert_to_seconds(gameclock_str):
        minutes, seconds = map(int, gameclock_str.split(':'))
        return minutes * 60 + seconds

    # Convert 'gameclock' to total seconds
    total_seconds = gameclock.apply(convert_to_seconds)

    # Adjust time based on 'quarter'
    adjusted_time = total_seconds + (4 - quarter) * 900

    return adjusted_time

plays_df['time_remaining'] = calculate_time_remaining(plays_df['quarter'], plays_df['gameClock'])

# drop plays nullified by penalty
plays_df = plays_df[plays_df["playNullifiedByPenalty"] == 'N']

plays_df_set = plays_df[['gameId', 'playId','ballCarrierId', 'ballCarrierDisplayName', 
                         'quarter', 'down', 'yardsToGo', 'time_remaining',
                    'absoluteYardlineNumber', 'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore',
                    'passLength', 'offenseFormation', 'defendersInTheBox', 'passProbability',
                    'preSnapHomeTeamWinProbability', 'preSnapVisitorTeamWinProbability','defensiveTeam']]

In [17]:
# tackles_df edits

tackles_df_set = tackles_df[['gameId', 'playId', 'nflId','tackle', 'assist', 'forcedFumble', 'pff_missedTackle']]

In [19]:
# take subset of tracking_df

tracking_df_set = tracking_df[['gameId', 'playId', 'nflId', 'displayName', 'frameId', 'time',
                               'club', 'playDirection', 'x', 'y', 's', 'a', 'dis', 'o','dir', 'event']]

In [20]:
# merge all available data to tracking data

one_merge = pd.merge(tracking_df_set, games_df, on='gameId')
two_merge = pd.merge(one_merge, players_df_set, on='nflId')
three_merge = pd.merge(two_merge, plays_df_set, on=['gameId', 'playId'])
final_merge = pd.merge(three_merge, tackles_df_set, on=['gameId', 'playId', 'nflId'], how="left")

final_merge.fillna(0, inplace=True)
final_merge.head()

# dropping about 61193 frames because of missing/unexplained data
# tracking_week_1_df[tracking_week_1_df["displayName"] == "football"]

Unnamed: 0,gameId,playId,nflId,displayName_x,frameId,time,club,playDirection,x,y,s,a,dis,o,dir,event,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore,height_in_inch,weight,position,position_group,position_role,displayName_y,ballCarrierId,ballCarrierDisplayName,quarter,down,yardsToGo,time_remaining,absoluteYardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,passLength,offenseFormation,defendersInTheBox,passProbability,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,defensiveTeam,tackle,assist,forcedFumble,pff_missedTackle
0,2022090800,56,35472.0,Rodger Saffold,1,2022-09-08 20:24:05.200000,BUF,left,88.37,27.27,1.62,1.15,0.16,231.74,147.9,,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,77,325,G,OL,OL,Rodger Saffold,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.68996,0.413347,0.586653,LA,,,,
1,2022090800,56,35472.0,Rodger Saffold,2,2022-09-08 20:24:05.299999,BUF,left,88.47,27.13,1.67,0.61,0.17,230.98,148.53,pass_arrived,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,77,325,G,OL,OL,Rodger Saffold,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.68996,0.413347,0.586653,LA,,,,
2,2022090800,56,35472.0,Rodger Saffold,3,2022-09-08 20:24:05.400000,BUF,left,88.56,27.01,1.57,0.49,0.15,230.98,147.05,,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,77,325,G,OL,OL,Rodger Saffold,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.68996,0.413347,0.586653,LA,,,,
3,2022090800,56,35472.0,Rodger Saffold,4,2022-09-08 20:24:05.500000,BUF,left,88.64,26.9,1.44,0.89,0.14,232.38,145.42,,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,77,325,G,OL,OL,Rodger Saffold,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.68996,0.413347,0.586653,LA,,,,
4,2022090800,56,35472.0,Rodger Saffold,5,2022-09-08 20:24:05.599999,BUF,left,88.72,26.8,1.29,1.24,0.13,233.36,141.95,,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,77,325,G,OL,OL,Rodger Saffold,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.68996,0.413347,0.586653,LA,,,,


In [21]:
# final merge checkpoint
# final_merge.to_csv('../../data/final_merge.csv', index=False)
# final_merge = pd.read_csv('../../data/final_merge.csv')

In [31]:
# filter by frames in which the pass is caught
pass_caught = final_merge[final_merge['event'] == 'pass_outcome_caught']

## ADVANCED FEATURE ENGINEERING

In [32]:
# add columns to track cumulative tackles by a player over a game and a season, 
# making sure to shift the feature to not perfectly correlate with when a tackle occurs

pass_caught = pass_caught.sort_values(by=['gameId', 'playId'])
pass_caught['season_rolling_tackle_ct'] = pass_caught.groupby('nflId')['tackle'].cumsum()
pass_caught['season_rolling_tackle_ct'] = pass_caught.groupby('nflId')['season_rolling_tackle_ct'].shift(fill_value=0)
pass_caught['game_rolling_tackle_ct'] = pass_caught.groupby(['gameId', 'nflId'])['tackle'].cumsum()
pass_caught['game_rolling_tackle_ct'] = pass_caught.groupby(['gameId', 'nflId'])['game_rolling_tackle_ct'].shift(fill_value=0)
pass_caught['season_rolling_missed_tackle_ct'] = pass_caught.groupby('nflId')['pff_missedTackle'].cumsum()
pass_caught['season_rolling_missed_tackle_ct'] = pass_caught.groupby('nflId')['season_rolling_missed_tackle_ct'].shift(fill_value=0)
pass_caught['game_rolling_missed_tackle_ct'] = pass_caught.groupby(['gameId', 'nflId'])['pff_missedTackle'].cumsum()
pass_caught['game_rolling_missed_tackle_ct'] = pass_caught.groupby(['gameId', 'nflId'])['game_rolling_missed_tackle_ct'].shift(fill_value=0)

In [40]:
# pass_caught checkpoint
# pass_caught.to_csv('../../data/pass_caught.csv', index=False)
# pass_caught = pd.read_csv('../../data/pass_caught.csv')

In [41]:
# join all players to ball carrier information

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
joined_with_bc = pd.merge(pass_caught,
                          final_merge[['displayName_x','gameId', 'playId', 'frameId','x', 'y', 's', 'a',
            'dis', 'o', 'dir', 'weight', 'position', 'nflId', 'height_in_inch', 'position_group','position_role']],
                     left_on=['gameId', 'playId', 'frameId', 'ballCarrierId'],
                     right_on=['gameId', 'playId', 'frameId', 'nflId'],
                     how='left', suffixes=('_defense', '_offense'))
joined_with_bc.head(5)

Unnamed: 0,gameId,playId,nflId_defense,displayName_x_defense,frameId,time,club,playDirection,x_defense,y_defense,s_defense,a_defense,dis_defense,o_defense,dir_defense,event,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore,height_in_inch_defense,weight_defense,position_defense,position_group_defense,position_role_defense,displayName_y,ballCarrierId,ballCarrierDisplayName,quarter,down,yardsToGo,time_remaining,absoluteYardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,passLength,offenseFormation,defendersInTheBox,passProbability,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,defensiveTeam,tackle,assist,forcedFumble,pff_missedTackle,season_rolling_tackle_ct,game_rolling_tackle_ct,season_rolling_missed_tackle_ct,game_rolling_missed_tackle_ct,displayName_x_offense,x_offense,y_offense,s_offense,a_offense,dis_offense,o_offense,dir_offense,weight_offense,position_offense,nflId_offense,height_in_inch_offense,position_group_offense,position_role_offense
0,2022090800,56,35472.0,Rodger Saffold,6,2022-09-08 20:24:05.700000,BUF,left,88.8,26.7,1.15,1.42,0.12,234.48,139.41,pass_outcome_caught,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,77,325,G,OL,OL,Rodger Saffold,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.68996,0.413347,0.586653,LA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Stefon Diggs,79.85,35.59,4.61,4.82,0.45,114.27,202.2,191,WR,42489.0,72,R,WR
1,2022090800,56,38577.0,Bobby Wagner,6,2022-09-08 20:24:05.700000,LA,left,78.11,28.74,3.35,2.62,0.32,349.47,357.71,pass_outcome_caught,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,72,242,ILB,LB,LB,Bobby Wagner,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.68996,0.413347,0.586653,LA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Stefon Diggs,79.85,35.59,4.61,4.82,0.45,114.27,202.2,191,WR,42489.0,72,R,WR
2,2022090800,56,41239.0,Aaron Donald,6,2022-09-08 20:24:05.700000,LA,left,92.15,29.96,3.62,2.86,0.37,186.16,157.65,pass_outcome_caught,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,73,280,DT,DL,DL,Aaron Donald,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.68996,0.413347,0.586653,LA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Stefon Diggs,79.85,35.59,4.61,4.82,0.45,114.27,202.2,191,WR,42489.0,72,R,WR
3,2022090800,56,42392.0,Mitch Morse,6,2022-09-08 20:24:05.700000,BUF,left,88.21,29.31,1.42,0.64,0.14,282.32,347.15,pass_outcome_caught,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,78,305,C,OL,OL,Mitch Morse,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.68996,0.413347,0.586653,LA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Stefon Diggs,79.85,35.59,4.61,4.82,0.45,114.27,202.2,191,WR,42489.0,72,R,WR
4,2022090800,56,42489.0,Stefon Diggs,6,2022-09-08 20:24:05.700000,BUF,left,79.85,35.59,4.61,4.82,0.45,114.27,202.2,pass_outcome_caught,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,72,191,WR,R,WR,Stefon Diggs,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.68996,0.413347,0.586653,LA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Stefon Diggs,79.85,35.59,4.61,4.82,0.45,114.27,202.2,191,WR,42489.0,72,R,WR


In [42]:
#defense_only = joined_with_bc[joined_with_bc['club'] == joined_with_bc['defensiveTeam']]

joined_with_bc["offense_score"] = np.where(joined_with_bc['homeTeamAbbr'] == joined_with_bc['defensiveTeam'],
                                           joined_with_bc["preSnapVisitorScore"], joined_with_bc["preSnapHomeScore"])
joined_with_bc["defense_score"] = np.where(joined_with_bc['homeTeamAbbr'] == joined_with_bc['defensiveTeam'],
                                           joined_with_bc["preSnapHomeScore"], joined_with_bc["preSnapVisitorScore"])
joined_with_bc["differential"] = joined_with_bc["defense_score"] - joined_with_bc["offense_score"]

joined_with_bc["offense_win_prob"] = np.where(joined_with_bc['homeTeamAbbr'] == joined_with_bc['defensiveTeam'],
                                           joined_with_bc["preSnapVisitorTeamWinProbability"], joined_with_bc["preSnapHomeTeamWinProbability"])
joined_with_bc["defense_win_prob"] = np.where(joined_with_bc['homeTeamAbbr'] == joined_with_bc['defensiveTeam'],
                                           joined_with_bc["preSnapHomeTeamWinProbability"], joined_with_bc["preSnapVisitorTeamWinProbability"])
joined_with_bc["prob_differential"] = joined_with_bc["defense_win_prob"] - joined_with_bc["offense_win_prob"]


In [44]:
# distance to ball carrier
def calculate_distance(row):
    return ((row['x_defense'] - row['x_offense'])**2 + (row['y_defense'] - row['y_offense'])**2)**0.5

# Apply the function to create a new column 'distance'
joined_with_bc['dist_to_bc'] = joined_with_bc.apply(calculate_distance, axis=1)

# adds column for defense
joined_with_bc['is_defense'] = (joined_with_bc['club'] == joined_with_bc['defensiveTeam']).astype(int)

In [54]:
# number of offensive obstacles
joined_with_bc['num_blockers'] = joined_with_bc.apply(lambda row: (
    (joined_with_bc['is_defense'] == 0) & 
    (joined_with_bc['gameId'] == row['gameId']) & 
    (joined_with_bc['playId'] == row['playId']) & 
    (joined_with_bc['dist_to_bc'] < row['dist_to_bc'])
).sum() - 1, axis=1)

# weight of offensive obstacles
joined_with_bc['total_weight_blockers'] = joined_with_bc.apply(lambda row: (
    joined_with_bc.loc[
        (joined_with_bc['is_defense'] == 0) &
        (joined_with_bc['gameId'] == row['gameId']) &
        (joined_with_bc['playId'] == row['playId']) &
        (joined_with_bc['dist_to_bc'] < row['dist_to_bc']),
        'weight_defense'
    ].sum() - row['weight_offense']
), axis=1)

In [57]:
# create defense subset
defensive_players = joined_with_bc[joined_with_bc['is_defense'] == 1]

# rank amongst defenders
defensive_players['rank_to_bc'] = defensive_players.groupby(['playId','gameId'])['dist_to_bc'].rank()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  defensive_players['rank_to_bc'] = defensive_players.groupby(['playId','gameId'])['dist_to_bc'].rank()


In [58]:
# Calculate the differences in x and y coordinates
defensive_players['delta_x'] = defensive_players['x_offense'] - defensive_players['x_defense']
defensive_players['delta_y'] = defensive_players['y_offense'] - defensive_players['y_defense']

# Calculate the angle between the defensive player and the offensive player
defensive_players['angle_to_offensive'] = np.degrees(np.arctan2(defensive_players['delta_y'], defensive_players['delta_x']))


# Function to calculate resulting velocity and acceleration
def calculate_resulting_vectors(row):
    velocity = row['s_defense']
    acceleration = row['a_defense']
    original_direction = row['dir_defense']
    new_direction = row['angle_to_offensive']

    # Convert angles to radians
    original_direction_rad = math.radians(original_direction)
    new_direction_rad = math.radians(new_direction)

    # Calculate the component of velocity in the new direction
    velocity_component_in_new_direction = velocity * math.cos(new_direction_rad - original_direction_rad)

    # Calculate the component of acceleration in the new direction
    acceleration_component_in_new_direction = acceleration * math.cos(new_direction_rad - original_direction_rad)

    return pd.Series({
        'resulting_velocity': velocity_component_in_new_direction,
        'resulting_acceleration': acceleration_component_in_new_direction
    })

# Apply the function to create new columns 'resulting_velocity' and 'resulting_acceleration'
defensive_players[['resulting_velocity', 'resulting_acceleration']] = defensive_players.apply(calculate_resulting_vectors, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  defensive_players['delta_x'] = defensive_players['x_offense'] - defensive_players['x_defense']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  defensive_players['delta_y'] = defensive_players['y_offense'] - defensive_players['y_defense']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  defensive_playe

In [66]:
# 18 seconds into https://www.youtube.com/watch?v=eWPijiN3vGU
defensive_players[(defensive_players["gameId"] == 2022090800) & (defensive_players["ballCarrierDisplayName"] == "Devin Singletary")][["displayName_x_defense", "s_defense","a_defense","tackle","dist_to_bc","is_defense","num_blockers","total_weight_blockers","rank_to_bc","delta_x","delta_y","angle_to_offensive","resulting_velocity","resulting_acceleration"]]

Unnamed: 0,displayName_x_defense,s_defense,a_defense,tackle,dist_to_bc,is_defense,num_blockers,total_weight_blockers,rank_to_bc,delta_x,delta_y,angle_to_offensive,resulting_velocity,resulting_acceleration
23,Bobby Wagner,6.15,0.9,1.0,7.059922,1,2,625,3.0,6.61,2.48,20.565544,6.143889,0.899106
24,Aaron Donald,4.44,3.47,0.0,4.278329,1,0,0,1.0,-1.04,4.15,104.068724,-1.159537,-0.906215
27,Troy Hill,2.81,5.72,0.0,19.615823,1,7,2062,7.0,18.58,-6.29,-18.702814,2.014081,4.099837
28,Jalen Ramsey,4.31,3.89,0.0,23.766626,1,9,2426,9.0,16.67,16.94,45.460266,0.765437,0.690846
29,Leonard Floyd,0.74,1.87,0.0,7.449248,1,2,625,4.0,-2.88,6.87,112.744112,-0.605015,-1.52889
30,A'Shawn Robinson,3.68,2.58,0.0,4.809127,1,0,0,2.0,-3.06,3.71,129.515739,-2.351937,-1.648913
34,Taylor Rapp,3.74,3.87,0.0,21.979775,1,9,2426,8.0,19.92,9.29,25.00272,3.062403,3.16885
37,Greg Gaines,4.6,2.66,0.0,9.385542,1,5,1505,6.0,-7.8,5.22,146.208344,-0.905945,-0.523873
38,Nick Scott,5.02,3.68,0.0,30.639067,1,9,2426,10.0,30.4,3.82,7.162128,2.755206,2.019753
42,David Long,3.89,2.12,0.0,35.097602,1,9,2426,11.0,22.71,26.76,49.680242,-3.828163,-2.086299


## TARGET VARIABLE ENGINEERING

In [35]:
defensive_players

Unnamed: 0,gameId,playId,nflId_defense,displayName_x_defense,frameId,time,club,playDirection,x_defense,y_defense,s_defense,a_defense,dis_defense,o_defense,dir_defense,event,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore,height_in_inch_defense,weight_defense,position_defense,position_group_defense,position_role_defense,displayName_y,ballCarrierId,ballCarrierDisplayName,quarter,down,yardsToGo,time_remaining,absoluteYardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,passLength,offenseFormation,defendersInTheBox,passProbability,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,defensiveTeam,tackle,assist,forcedFumble,pff_missedTackle,season_rolling_tackle_ct,game_rolling_tackle_ct,season_rolling_missed_tackle_ct,game_rolling_missed_tackle_ct,displayName_x_offense,x_offense,y_offense,s_offense,a_offense,dis_offense,o_offense,dir_offense,weight_offense,position_offense,nflId_offense,height_in_inch_offense,position_group_offense,position_role_offense,offense_score,defense_score,differential,offense_win_prob,defense_win_prob,prob_differential,dist_to_bc,is_defense,rank_to_bc
1,2022090800,56,38577.0,Bobby Wagner,6,2022-09-08 20:24:05.700000,LA,left,78.11,28.74,3.35,2.62,0.32,349.47,357.71,pass_outcome_caught,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,72,242,ILB,LB,LB,Bobby Wagner,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.689960,0.413347,0.586653,LA,,,,,,,,,Stefon Diggs,79.85,35.59,4.61,4.82,0.45,114.27,202.20,191,WR,42489.0,72,R,WR,0,0,0,0.586653,0.413347,-0.173306,7.067538,1,3.0
2,2022090800,56,41239.0,Aaron Donald,6,2022-09-08 20:24:05.700000,LA,left,92.15,29.96,3.62,2.86,0.37,186.16,157.65,pass_outcome_caught,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,73,280,DT,DL,DL,Aaron Donald,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.689960,0.413347,0.586653,LA,,,,,,,,,Stefon Diggs,79.85,35.59,4.61,4.82,0.45,114.27,202.20,191,WR,42489.0,72,R,WR,0,0,0,0.586653,0.413347,-0.173306,13.527265,1,9.0
5,2022090800,56,42816.0,Troy Hill,6,2022-09-08 20:24:05.700000,LA,left,70.62,7.66,2.60,4.14,0.27,331.57,278.33,pass_outcome_caught,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,71,184,CB,DB,CB,Troy Hill,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.689960,0.413347,0.586653,LA,,,,,,,,,Stefon Diggs,79.85,35.59,4.61,4.82,0.45,114.27,202.20,191,WR,42489.0,72,R,WR,0,0,0,0.586653,0.413347,-0.173306,29.415605,1,11.0
6,2022090800,56,43294.0,Jalen Ramsey,6,2022-09-08 20:24:05.700000,LA,left,78.15,37.85,5.88,1.23,0.59,140.96,178.50,pass_outcome_caught,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,73,208,CB,DB,CB,Jalen Ramsey,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.689960,0.413347,0.586653,LA,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,Stefon Diggs,79.85,35.59,4.61,4.82,0.45,114.27,202.20,191,WR,42489.0,72,R,WR,0,0,0,0.586653,0.413347,-0.173306,2.828003,1,1.0
7,2022090800,56,43298.0,Leonard Floyd,6,2022-09-08 20:24:05.700000,LA,left,92.11,33.14,1.34,2.21,0.13,159.12,203.53,pass_outcome_caught,2022,1,09/08/2022,20:20:00,LA,BUF,10,31,77,240,DE,DL,EDGE,Leonard Floyd,42489,Stefon Diggs,1,1,10,3600,85,15:00,0,0,5.0,SHOTGUN,6.0,0.689960,0.413347,0.586653,LA,,,,,,,,,Stefon Diggs,79.85,35.59,4.61,4.82,0.45,114.27,202.20,191,WR,42489.0,72,R,WR,0,0,0,0.586653,0.413347,-0.173306,12.502404,1,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97078,2022091200,3826,43436.0,Quinton Jefferson,6,2022-09-12 23:05:53.500000,SEA,left,69.03,28.24,3.50,0.92,0.36,112.81,184.46,pass_outcome_caught,2022,1,09/12/2022,20:15:00,SEA,DEN,17,16,76,291,DE,DL,EDGE,Quinton Jefferson,53464,Javonte Williams,4,3,14,71,65,1:11,17,16,-6.0,SHOTGUN,3.0,0.916875,0.508994,0.491006,SEA,,,,,,,,,Javonte Williams,70.57,11.96,3.91,2.22,0.39,313.03,192.49,220,RB,53464.0,70,HB,HB,16,17,1,0.491006,0.508994,0.017987,16.352676,1,4.0
97079,2022091200,3826,42827.0,Justin Coleman,6,2022-09-12 23:05:53.500000,SEA,left,60.59,12.24,6.30,2.68,0.63,103.59,136.76,pass_outcome_caught,2022,1,09/12/2022,20:15:00,SEA,DEN,17,16,71,190,CB,DB,CB,Justin Coleman,53464,Javonte Williams,4,3,14,71,65,1:11,17,16,-6.0,SHOTGUN,3.0,0.916875,0.508994,0.491006,SEA,0.0,0.0,0.0,1.0,3.0,3.0,2.0,2.0,Javonte Williams,70.57,11.96,3.91,2.22,0.39,313.03,192.49,220,RB,53464.0,70,HB,HB,16,17,1,0.491006,0.508994,0.017987,9.983927,1,1.0
97080,2022091200,3826,44873.0,Josh Jones,6,2022-09-12 23:05:53.500000,SEA,left,49.62,32.42,4.63,4.05,0.44,143.62,159.31,pass_outcome_caught,2022,1,09/12/2022,20:15:00,SEA,DEN,17,16,74,220,FS,DB,DB,Josh Jones,53464,Javonte Williams,4,3,14,71,65,1:11,17,16,-6.0,SHOTGUN,3.0,0.916875,0.508994,0.491006,SEA,,,,,,,,,Javonte Williams,70.57,11.96,3.91,2.22,0.39,313.03,192.49,220,RB,53464.0,70,HB,HB,16,17,1,0.491006,0.508994,0.017987,29.283342,1,8.0
97083,2022091200,3826,54505.0,Boye Mafe,6,2022-09-12 23:05:53.500000,SEA,left,71.56,25.09,4.80,3.11,0.47,183.52,187.27,pass_outcome_caught,2022,1,09/12/2022,20:15:00,SEA,DEN,17,16,76,265,OLB,LB,EDGE,Boye Mafe,53464,Javonte Williams,4,3,14,71,65,1:11,17,16,-6.0,SHOTGUN,3.0,0.916875,0.508994,0.491006,SEA,,,,,,,,,Javonte Williams,70.57,11.96,3.91,2.22,0.39,313.03,192.49,220,RB,53464.0,70,HB,HB,16,17,1,0.491006,0.508994,0.017987,13.167270,1,2.0


In [70]:
# how to handle assists, forced fumbles
#tackle_prob_df['tackle_prob'] = np.where(tackle_prob_df['tackle'] == 1, 1, np.where(tackle_prob_df['assist'] == 1, 0.5, 0))


defensive_players['tackle_prob'] = np.where(defensive_players['tackle'] == 1, 1, 0)
defensive_players.fillna(0, inplace=True)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  defensive_players['tackle_prob'] = np.where(defensive_players['tackle'] == 1, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  defensive_players.fillna(0, inplace=True)


In [71]:
# how many plays are there?
len(defensive_players.groupby(['gameId', 'playId']).count())

4412

In [75]:
# drop all plays that do not end in a tackle
defensive_players = defensive_players.groupby(['gameId', 'playId']).filter(lambda x: x['tackle_prob'].sum() > 0)
len(defensive_players.groupby(['gameId', 'playId']).count())

3757

## EXPORT DATA

In [76]:
defensive_players.to_csv('../model/defensive_players.csv', index=False)