In [23]:
''' Imports '''

import pandas as pd
import polars as pl

import nflreadpy as nfl
import nfl_data_py as nfldy

# Session-wide polars display options for printed DataFrames.
# NOTE(review): -1 is presumably the "no limit" sentinel for each setter — confirm
# against the polars Config docs. Unlimited rows means print(df) dumps the whole
# frame, so keep using .head() when printing large tables.
pl.Config.set_tbl_width_chars(-1)   # table width, in characters
pl.Config.set_tbl_cols(-1)          # number of columns shown
pl.Config.set_tbl_rows(-1)          # number of rows shown

polars.config.Config

# Features

## Questions
- Percentage of plays vs. total plays

## General

### Pace
- Total drives per game (both teams)
- Plays per game (offense)

## Offense
- QB Position / Formation splits
- Personnel usage
- Play types: under center run, under center play action, under center pass, shotgun pass, RPO, shotgun play action, pistol...

- % Pass
- % Pass on downs
- % Pass on "neutral" (open playbook) downs

### Passing
- ADOT
- Depths: Behind LOS, Short, Medium, Long
- Locations: Side of field (left / middle / right), or just middle / boundary
- Target share of top receiver (or the number of receivers that account for the top 80% of targets)
- % Screen pass
- % pass from under center vs shotgun vs pistol
- % pass inbreakers vs outbreakers ??

### Rushing
- Run gaps: middle, guard/tackle, edge
- % rushes from under center vs shotgun vs pistol
- Number of rushers needed to account for 20% of rushes (threshold still to be tuned)

In [None]:
''' Parameters / Constants '''

# Inclusive season window used by every loader in this notebook.
START_YEAR = 2016       # first year of participation data
END_YEAR = 2024

# Every season from START_YEAR through END_YEAR (END_YEAR included).
SEASONS = list(range(START_YEAR, END_YEAR + 1))


In [7]:
''' Load Data '''

# Load play-by-play for every season in SEASONS (returned as a polars DataFrame).
pbp = nfl.load_pbp(seasons=SEASONS)

# Derived columns. None of them depends on another derived column, so they are
# all added in a single with_columns call.
pbp = pbp.with_columns(
    # Drive key '<game_id>_<drive>'. `drive` is cast to an integer before being
    # stringified; Int32 (rather than Int8) so the strict cast cannot overflow.
    DriveID=pl.concat_str(
        [pl.col('game_id'), pl.col('drive').cast(pl.Int32).cast(pl.String)],
        separator='_',
    ),
    # "Neutral" (open playbook) downs: 1st & <=10, 2nd & <=6, 3rd & <=3.
    # Everything else, including rows with a missing down, is 0.
    NeutralDown=(
        pl.when((pl.col('down') == 1) & (pl.col('ydstogo') <= 10)).then(1)
        .when((pl.col('down') == 2) & (pl.col('ydstogo') <= 6)).then(1)
        .when((pl.col('down') == 3) & (pl.col('ydstogo') <= 3)).then(1)
        .otherwise(0)
    ),
    # Air-yards buckets. There is no otherwise(): rows with missing air_yards stay null.
    PassDepth=(
        pl.when(pl.col('air_yards') <= 0).then(pl.lit('Behind LOS'))
        .when(pl.col('air_yards') < 10).then(pl.lit('Short'))
        .when(pl.col('air_yards') < 20).then(pl.lit('Medium'))
        .when(pl.col('air_yards') >= 20).then(pl.lit('Long'))
    ),
    # Trouble with run_gap and the middle rush https://thespade.substack.com/p/run-gap-charts-version-15
    # Rows whose run_gap is none of end / guard / tackle (including missing) stay null.
    RunLocation=(
        pl.when(pl.col('run_gap') == 'end').then(pl.lit('Outside'))
        .when((pl.col('run_gap') == 'guard') | (pl.col('run_gap') == 'tackle')).then(pl.lit('Inside'))
    ),
)

# TODO: Add distance

pbp = pbp.filter(
    # Relevant plays (see nflfastr beginner's guide)
    (pl.col('pass') == 1) | (pl.col('rush') == 1),
    (pl.col('season_type') == 'REG'),
    # Drop plays without EPA. Missing values are nulls in polars, so test for null
    # explicitly instead of relying on is_not_nan() returning null for them.
    (pl.col('epa').is_not_null() & pl.col('epa').is_not_nan()),
    (pl.col('posteam').is_not_null()),
    (pl.col('posteam') != ''),

    # Normal game state
    # (pl.col('qtr') <= 3),
    # (pl.col('half_seconds_remaining') > 120),
    # (pl.col('score_differential') <= 14),
    (pl.col('special_teams_play') == 0),
    (pl.col('play_type_nfl') != 'PAT2'),
    (pl.col('play_type_nfl') != 'UNSPECIFIED'),     # Unspecified seems to be mostly punt / FG formation plays where something weird happened (fake, fumble, botched snap, etc)
)

# Convert to pandas with the dedicated polars API so column dtypes are carried over.
# pd.DataFrame(data=<polars frame>) is not the supported conversion path and
# appears to produce object-dtype columns (see the `None` keys in value_counts).
pbp = pbp.to_pandas()

print(pbp.shape)
print(pbp.head().to_string())

(342624, 376)
  play_id          game_id old_game_id home_team away_team season_type week posteam posteam_type defteam side_of_field yardline_100   game_date quarter_seconds_remaining half_seconds_remaining game_seconds_remaining game_half quarter_end drive   sp  qtr down goal_to_go   time   yrdln ydstogo ydsnet                                                                                                              desc play_type yards_gained shotgun no_huddle qb_dropback qb_kneel qb_spike qb_scramble pass_length pass_location air_yards yards_after_catch run_location run_gap field_goal_result kick_distance extra_point_result two_point_conv_result home_timeouts_remaining away_timeouts_remaining timeout timeout_team td_team td_player_name td_player_id posteam_timeouts_remaining defteam_timeouts_remaining total_home_score total_away_score posteam_score defteam_score score_differential posteam_score_post defteam_score_post score_differential_post no_score_prob opp_fg_prob opp_safety_pr

In [32]:
participation = nfl.load_participation(seasons=SEASONS)

# Formation buckets, keyed on the raw offense_formation value.
# NOTE(review): PISTOL is folded into 'Shotgun' here, while the feature list in
# this notebook treats pistol as its own category — confirm which is intended.
UNDER_CENTER_FORMATIONS = ['SINGLEBACK', 'I_FORM', 'UNDER CENTER', 'JUMBO']
SHOTGUN_FORMATIONS = ['SHOTGUN', 'EMPTY', 'WILDCAT', 'PISTOL']

# Formations in neither list (and missing values) are left null: no otherwise().
formation_expr = (
    pl.when(pl.col('offense_formation').is_in(UNDER_CENTER_FORMATIONS))
    .then(pl.lit('Under Center'))
    .when(pl.col('offense_formation').is_in(SHOTGUN_FORMATIONS))
    .then(pl.lit('Shotgun'))
)

# Personnel: exact offense_personnel string -> two-digit code (<RBs><TEs>).
PERSONNEL_CODES = {
    '0 RB, 1 TE, 4 WR': '01',
    '0 RB, 2 TE, 3 WR': '02',
    '1 RB, 0 TE, 4 WR': '10',
    '1 RB, 1 TE, 3 WR': '11',
    '1 RB, 2 TE, 2 WR': '12',
    '1 RB, 3 TE, 1 WR': '13',
    '2 RB, 0 TE, 3 WR': '20',
    '2 RB, 1 TE, 2 WR': '21',
    '2 RB, 2 TE, 1 WR': '22',
}

# Build the when/then chain from the lookup table; anything unmatched is 'Other'.
personnel_expr = None
for grouping, code in PERSONNEL_CODES.items():
    is_grouping = pl.col('offense_personnel') == grouping
    branch = pl.when(is_grouping) if personnel_expr is None else personnel_expr.when(is_grouping)
    personnel_expr = branch.then(pl.lit(code))
personnel_expr = personnel_expr.otherwise(pl.lit('Other'))

participation = participation.with_columns(
    Formation=formation_expr,
    Personnel=personnel_expr,
)

# Distinct raw personnel strings, kept for ad-hoc inspection of what falls into 'Other'.
unique_values = participation['offense_personnel'].unique().to_list()

print(participation.shape)
print(participation.head())

(433805, 28)
shape: (5, 28)
┌──────────────────┬─────────────┬─────────┬─────────────────┬───────────────────┬────────────────────────┬──────────────────┬───────────────────┬────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬───────────┬───────────┬───────────────┬───────────────┬──────────────┬────────┬───────────────────────┬───────────────────────┬───────────────┬───────────────┬───────────────────┬───────────────────┬─────────────────┬─────────────────┬──────────────┬───────────┐
│ nflverse_game_id ┆ old_game_id ┆ play_id ┆ possession_team ┆ offense_formation ┆ offense_personnel      ┆ defenders_in_box ┆ defense_personnel ┆ number_of_pass_rushers ┆ players_on_play                 ┆ offense_players                 ┆ defense_players                 ┆ n_offense ┆ n_defense ┆ ngs_air_yards ┆ time_to_throw ┆ was_pressure ┆ route  ┆ defense_man_zone_type ┆ defense_coverage_type ┆ offense_names ┆ defense_names ┆ o

In [8]:
# Distribution of run_gap; dropna=False keeps missing values as their own row.
run_gap_counts = pbp['run_gap'].value_counts(dropna=False)
print(run_gap_counts)

None      244506
end        34163
guard      33028
tackle     30927
Name: run_gap, dtype: int64


In [None]:
''' Prep data inputs '''

# Per-play masks for the conditional counts below.
neutral_down = pbp['NeutralDown'].eq(1)
pass_behind_los = pbp['PassDepth'].eq('Behind LOS')
pass_long = pbp['PassDepth'].eq('Long')
rush_inside = pbp['RunLocation'].eq('Inside')
rush_outside = pbp['RunLocation'].eq('Outside')

# Materialise every conditional count as a per-play indicator column so the
# groupby only needs plain sums. The previous lambdas indexed each group with a
# mask built on the global frame, which relied on implicit index alignment
# (it fails on a non-unique index) and re-aligned the mask once per group.
# Only the columns needed for the aggregation are copied.
team_season_inputs = pbp[[
    'posteam', 'season', 'game_id', 'DriveID',
    'pass', 'pass_attempt', 'sack', 'air_yards',
    'rush', 'rush_attempt',
]].assign(
    _NeutralPlay=neutral_down.astype(int),
    _NeutralPass=pbp['pass'].where(neutral_down, 0),
    _PassBehindLOS=pbp['pass'].where(pass_behind_los, 0),
    _PassLong=pbp['pass'].where(pass_long, 0),
    _RushInside=pbp['rush'].where(rush_inside, 0),
    _RushOutside=pbp['rush'].where(rush_outside, 0),
)

# One row per (posteam, season)
gpby = team_season_inputs.groupby(['posteam', 'season']).aggregate(
    Games=('game_id', 'nunique'),
    Drives=('DriveID', 'nunique'),
    Plays=('game_id', 'size'),              # row count; the column choice is irrelevant for 'size'
    Neutral_Down_Plays=('_NeutralPlay', 'sum'),

    Pass_Plays=('pass', 'sum'),
    Pass_Attempts=('pass_attempt', 'sum'),
    Sacks=('sack', 'sum'),
    IAY=('air_yards', 'sum'),
    Pass_BehindLOS=('_PassBehindLOS', 'sum'),
    Pass_Deep=('_PassLong', 'sum'),
    Neutral_Down_Pass=('_NeutralPass', 'sum'),

    Rush_Plays=('rush', 'sum'),
    Rush_Attempts=('rush_attempt', 'sum'),
    Rush_Inside=('_RushInside', 'sum'),
    Rush_Outside=('_RushOutside', 'sum'),
)

# Overall numbers
gpby['% Pass'] = gpby['Pass_Plays'] / gpby['Plays']
gpby['% Pass Neutral Down'] = gpby['Neutral_Down_Pass'] / gpby['Neutral_Down_Plays']

# Passing numbers
# Sacks are subtracted from Pass_Attempts before dividing.
# NOTE(review): this assumes pass_attempt counts sacks — confirm against the nflfastR field descriptions.
gpby['ADOT'] = gpby['IAY'] / (gpby['Pass_Attempts'] - gpby['Sacks'])

# Rushing numbers
gpby['% Rush Inside'] = gpby['Rush_Inside'] / gpby['Rush_Plays']
gpby['% Rush Outside'] = gpby['Rush_Outside'] / gpby['Rush_Plays']


print(gpby.shape)
print(gpby.head(12).to_string())


(320, 14)
                Games  Drives  Plays Pass_Plays Pass_Attempts Sacks     IAY  PassBehindLOS  PassDeep  NeutralDownPass Rush_Plays Rush_Attempts    % Pass       ADOT
posteam season                                                                                                                                                     
ARI     2015       16     175   1065      641.0         583.0  27.0  6084.0           80.0      91.0            331.0      424.0         422.0  0.601878  10.942446
        2016       16     192   1137      738.0         684.0  41.0  6204.0           82.0      79.0            381.0      399.0         391.0  0.649077   9.648523
        2017       16     197   1103      712.0         649.0  52.0  5807.0           93.0      73.0            329.0      391.0         395.0  0.645512   9.726968
        2018       16     182    930      591.0         546.0  52.0  3979.0          111.0      60.0            272.0      339.0         346.0  0.635484   8.054656
      

In [16]:
# Sanity check: which play types survive the filters, split by the special-teams flag.
# Stored under its own name so the team-season table in `gpby` is not overwritten
# by an unrelated Series when this cell runs.
play_type_counts = pbp.groupby(['special_teams_play', 'play_type_nfl']).size()
print(play_type_counts.to_string())

special_teams_play  play_type_nfl               
0.0                 FUMBLE_RECOVERED_BY_OPPONENT        92
                    INTERCEPTION                      2307
                    PASS                            105753
                    PENALTY                           9328
                    RUSH                             90915
                    SACK                              7396


In [None]:
## QB Position Base
# NOTE(review): `league_pbp_normal_gs` and `qb_pos_order` are not defined in the
# visible part of this notebook — make sure the cells that create them run first.

# Per-play masks for the play-type split. Comparisons against True / False are
# kept as-is so missing flags fall into neither the RPO nor the non-RPO bucket.
rpo_play = league_pbp_normal_gs['is_rpo'].eq(True)
non_rpo_play = league_pbp_normal_gs['is_rpo'].eq(False)
play_action = league_pbp_normal_gs['is_play_action'].eq(True)
no_play_action = league_pbp_normal_gs['is_play_action'].eq(False)

# Precompute each play-type count as an indicator column so the groupby uses
# plain sums instead of lambdas that index every group with a mask built on the
# global frame (implicit index alignment, re-done once per group).
qb_pos_inputs = league_pbp_normal_gs[[
    'posteam', 'QB Position', 'rush', 'pass',
    'yards_gained', 'passing_yards', 'rushing_yards',
]].assign(
    _PureRush=league_pbp_normal_gs['rush'].where(non_rpo_play, 0),
    _RPORush=league_pbp_normal_gs['rush'].where(rpo_play, 0),
    _RPOPass=league_pbp_normal_gs['pass'].where(rpo_play, 0),
    _PAPass=league_pbp_normal_gs['pass'].where(non_rpo_play & play_action, 0),
    _PurePass=league_pbp_normal_gs['pass'].where(non_rpo_play & no_play_action, 0),
)

qb_pos = qb_pos_inputs.groupby(['posteam', 'QB Position']).aggregate(
    Plays=('rush', 'size'),                 # row count; the column choice is irrelevant for 'size'
    Rush=('rush', 'sum'),
    Pass=('pass', 'sum'),
    Pure_Rush=('_PureRush', 'sum'),
    RPO_Rush=('_RPORush', 'sum'),
    RPO_Pass=('_RPOPass', 'sum'),
    PA_Pass=('_PAPass', 'sum'),
    Pure_Pass=('_PurePass', 'sum'),
    Yards=('yards_gained', 'sum'),
    PassYards=('passing_yards', 'sum'),
    RushYards=('rushing_yards', 'sum'),
)

# Share of each team's plays by QB position, and the league rank of that share
# within each QB position (rank 1 = highest share; ties take the max rank).
qb_pos['% Plays'] = qb_pos['Plays'] / qb_pos.groupby(level=0)['Plays'].transform('sum')
qb_pos['% Plays LRank'] = qb_pos.groupby(level='QB Position')['% Plays'].rank(ascending=False, method='max')

# Reindex
qb_pos = qb_pos.reindex(labels=qb_pos_order, level='QB Position')

# Names: swap underscores for spaces in the play-type columns
play_type_cols = ['Pure_Rush', 'RPO_Rush', 'RPO_Pass', 'PA_Pass', 'Pure_Pass']
qb_pos = qb_pos.rename(columns={col: col.replace('_', ' ') for col in play_type_cols})
play_type_cols = [col.replace('_', ' ') for col in play_type_cols]


Start with:

- Formations (under center, shotgun, pistol)
- Personnel (% 11, % mult TEs, % no TEs, % mult RBs, % no RBs, % extra OL)
- % Pass
- % Pass neutral downs
- ADOT
- % Screens
- % Long
- % passes from play-action
- % passes from under center vs shotgun vs pistol
- Number of receivers accounting for the top 80% of targets
- % runs middle, guard/tackle, edge
- % rushes from under center vs. shotgun vs. pistol
- Number of rushers needed to account for 20% of rushes