In [None]:
''' Imports '''

import pandas as pd
import polars as pl

import nflreadpy as nfl
import nfl_data_py as nfldy

# Display settings for printed polars tables (width in characters, column count, row count).
# NOTE(review): -1 is presumably "no limit" for all three settings — confirm against the installed polars version.
pl.Config.set_tbl_width_chars(-1)
pl.Config.set_tbl_cols(-1)
pl.Config.set_tbl_rows(-1)

# Features

## Questions
- Percentage of plays vs. total plays (express features as rates or as raw counts?)

## General

### Pace
- Total drives per game (both teams)
- Plays per game (offense)

## Offense
- QB Position / Formation splits
- Personnel usage
- Play types: under center run, under center play action, under center pass, shotgun pass, RPO, shotgun play action, pistol...

- % Pass
- % Pass on downs
- % Pass on "neutral" (open playbook) downs

### Passing
- Time to throw
- ADOT
- Depths: Behind LOS, Short, Medium, Long
- Locations: Side of field (left / middle / right), or just middle / boundary
- Target share of top receiver (or the number of receivers that account for the top 80% of targets)
- % Screen pass
- % pass from under center vs shotgun vs pistol
- % passes to in-breaking vs. out-breaking routes ??

### Rushing
- Run gaps: middle, guard/tackle, edge
- % rushes from under center vs shotgun vs pistol
- Number of rushers needed to account for 20% of rushes (dial in this threshold)

## Defense
https://www.matchquarters.com/p/chargers-run-defense-small-ball-jesse-minter   
https://www.matchquarters.com/p/light-box-run-fits-fitting-the-run-with-a-head-up-nose-rams-chargers-jesse-minter-chris-shula

### Personnel
- Base, Nickel, Dime usage
- % 3 DL vs 4 DL

### Play types
- heavy box rate            (8+ in box)
- light box rate            (6 or less in box)
- blitz rate                (% 5 or more rushers, % 6 or more rushers)
- mult. blitzers rate
- man rate
- cover 1, cover 2, cover 3, cover 4, cover 6 usage

In [None]:
''' Parameters / Constants '''

START_YEAR = 2018       # first year of participation data
END_YEAR = 2024         # last season loaded (inclusive)
SEASONS = list(range(START_YEAR, END_YEAR + 1))     # every season from START_YEAR through END_YEAR


In [None]:
''' PBP Data '''

## Load ##
pbp = nfl.load_pbp(seasons=SEASONS)

## Add columns ##

# String keys: "<game_id>_<play_id>" per play and "<game_id>_<drive>" per drive
play_number_expr = pl.col('play_id').cast(pl.Int32).cast(pl.String)
drive_number_expr = pl.col('drive').cast(pl.Int8).cast(pl.String)
pbp = pbp.with_columns(
    MasterPlayID=pl.concat_str([pl.col('game_id'), play_number_expr], separator='_'),
    DriveID=pl.concat_str([pl.col('game_id'), drive_number_expr], separator='_'),
)

down_expr = pl.col('down')
to_go_expr = pl.col('ydstogo')
air_yards_expr = pl.col('air_yards')
run_gap_expr = pl.col('run_gap')

# 1 on 1st & <=10, 2nd & <=6 or 3rd & <=3; 0 on every other play
neutral_down_expr = (
    pl.when((down_expr == 1) & (to_go_expr <= 10)).then(1)
    .when((down_expr == 2) & (to_go_expr <= 6)).then(1)
    .when((down_expr == 3) & (to_go_expr <= 3)).then(1)
    .otherwise(0)
)

# Air-yards buckets: <=0, 1-9, 10-19, 20+ (no bucket when none of the conditions hold)
pass_depth_expr = (
    pl.when(air_yards_expr <= 0).then(pl.lit('Behind LOS'))
    .when(air_yards_expr < 10).then(pl.lit('Short'))
    .when(air_yards_expr < 20).then(pl.lit('Medium'))
    .when(air_yards_expr >= 20).then(pl.lit('Long'))
)

# Trouble with run_gap and the middle rush https://thespade.substack.com/p/run-gap-charts-version-15
# Only 'end', 'guard' and 'tackle' gaps are labelled; anything else gets no location.
run_location_expr = (
    pl.when(run_gap_expr == 'end').then(pl.lit('Outside'))
    .when((run_gap_expr == 'guard') | (run_gap_expr == 'tackle')).then(pl.lit('Inside'))
)

pbp = pbp.with_columns(
    NeutralDown=neutral_down_expr,
    PassDepth=pass_depth_expr,
    AirYardsToSticks=air_yards_expr - to_go_expr,
    RunLocation=run_location_expr,
)

## Filters ##

# Keep regular-season pass / rush plays with an EPA value and a possession team
# (see nflfastr beginner's guide)
is_pass_or_rush = (pl.col('pass') == 1) | (pl.col('rush') == 1)
pbp = pbp.filter(
    is_pass_or_rush,
    pl.col('season_type') == 'REG',
    pl.col('epa').is_not_nan(),
    pl.col('posteam').is_not_null(),
    pl.col('posteam') != '',
)

# Keep normal game state only. Optional tighter filters, currently switched off:
#   qtr <= 3, half_seconds_remaining > 120, score_differential <= 14
pbp = pbp.filter(
    pl.col('special_teams_play') == 0,
    pl.col('play_type_nfl') != 'PAT2',
    # Unspecified seems to be mostly punt / FG formation plays where something
    # weird happened (fake, fumble, botched snap, etc)
    pl.col('play_type_nfl') != 'UNSPECIFIED',
)

# Sanity checks: frame shape, number of distinct play keys, first rows
print(pbp.shape)
print(pbp['MasterPlayID'].n_unique())
print(pbp.head())

In [None]:
''' Participation Data '''

## Personnel Helper Functions ## 

def clean_personnel(personnel_str: str) -> str:
    """Expand a personnel summary into one ';'-terminated token per player.

    Each ', '-separated group is read as "<count> <position>" and the position
    is repeated <count> times, e.g. '1 RB, 3 WR' -> 'RB;WR;WR;WR;'.
    A falsy input (None or '') yields ''.
    """
    if not personnel_str:
        return ''

    expanded = []
    for group in personnel_str.split(', '):
        fields = group.split(' ')
        # Count is parsed before the position is read, so a malformed group
        # fails on the same step it always did.
        how_many, position = int(fields[0]), fields[1]
        expanded.append(f'{position};' * how_many)

    return ''.join(expanded)

def offensive_personnel(personnel_str: str) -> str:
    """Label an offensive position string with its personnel grouping.

    Args:
        personnel_str: Position tokens each terminated by ';' (the format
            produced by ``clean_personnel``), e.g. 'RB;TE;WR;WR;WR;'.

    Returns:
        '' when the input is not a string or is empty; 'ST' when any kicker,
        punter, long snapper, FS or CB token is present; otherwise
        '<RB count><TE count>' followed by one '*' per offensive lineman
        beyond five (e.g. '11', '12*').
    """
    # Missing personnel has no grouping. Returning '' for the empty string as well
    # keeps it from being reported as a genuine '00' (no RB, no TE) package.
    if not isinstance(personnel_str, str) or not personnel_str:
        return ''

    # Kicking-unit or defensive-back tokens on offense mark a special-teams look
    special_teams_tokens = ('K;', 'P;', 'LS;', 'FS;', 'CB;')
    if any(token in personnel_str for token in special_teams_tokens):
        return 'ST'

    rbs = personnel_str.count('RB')
    tes = personnel_str.count('TE')
    personnel = f'{rbs}{tes}'

    # Linemen may be listed by specific position (C / G / T) or generically (OL);
    # the position-specific count takes precedence when both exceed five.
    listed_linemen = (
        personnel_str.count('C;')
        + personnel_str.count('G;')
        + personnel_str.count('T;')
    )
    generic_linemen = personnel_str.count('OL;')

    if listed_linemen > 5:
        extra_linemen = listed_linemen - 5
    elif generic_linemen > 5:
        extra_linemen = generic_linemen - 5
    else:
        extra_linemen = 0

    return personnel + '*' * extra_linemen

def defensive_personnel(personnel_str: str) -> str:
    """Label a defensive position string with its package and front.

    Args:
        personnel_str: Position tokens each terminated by ';' (the format
            produced by ``clean_personnel``), e.g. 'DL;DL;DL;DL;LB;LB;DB;...'.

    Returns:
        '' when the input is not a string or is empty; 'ST' when any kicker,
        punter, long snapper, WR, RB or TE token is present; otherwise
        '<package> <DL count>-<LB count>', where the package is chosen by the
        number of defensive backs (4 Base, 5 Nickel, 6 Dime, 7 Quarters,
        anything else Other).
    """
    # Missing personnel has no grouping. Returning '' for the empty string as well
    # keeps it from being reported as a made-up 'Other 0-0' package.
    if not isinstance(personnel_str, str) or not personnel_str:
        return ''

    # Kicking-unit or offensive skill tokens on defense mark a special-teams look
    special_teams_tokens = ('K;', 'P;', 'LS;', 'WR;', 'RB;', 'TE;')
    if any(token in personnel_str for token in special_teams_tokens):
        return 'ST'

    # DL
    total_dls = sum(personnel_str.count(token) for token in ('DL;', 'DE;', 'DT;', 'NT;'))

    # LBs: 'LB;' is a substring match, so MLB; / ILB; / OLB; tokens are counted too
    total_lbs = personnel_str.count('LB;')

    # DBs
    total_dbs = sum(personnel_str.count(token) for token in ('DB;', 'CB;', 'SS;', 'FS;'))

    package_by_db_count = {4: 'Base', 5: 'Nickel', 6: 'Dime', 7: 'Quarters'}
    d_type = package_by_db_count.get(total_dbs, 'Other')

    return f'{d_type} {total_dls}-{total_lbs}'
    
## Get data ##
participation = nfl.load_participation(seasons=SEASONS)

## Add columns
game_id_expr = pl.col('nflverse_game_id')
box_count_expr = pl.col('defenders_in_box')
man_zone_expr = pl.col('defense_man_zone_type')
formation_expr = pl.col('offense_formation')

under_center_formations = ['SINGLEBACK', 'I_FORM', 'UNDER CENTER', 'JUMBO']
shotgun_formations = ['SHOTGUN', 'EMPTY', 'WILDCAT', 'PISTOL']

participation = participation.with_columns(
    # Same "<game_id>_<play_id>" key as the play-by-play frame
    MasterPlayID=pl.concat_str(
        [game_id_expr, pl.col('play_id').cast(pl.Int32).cast(pl.String)],
        separator='_',
    ),
    # Season is the first '_'-separated piece of the game id
    season=game_id_expr.str.split('_').list.get(0).cast(int),

    # Defense stuff: 1/0 flags
    LightBox=pl.when(box_count_expr <= 6).then(1).otherwise(0),
    HeavyBox=pl.when(box_count_expr >= 8).then(1).otherwise(0),
    ZoneCoverage=pl.when(man_zone_expr == 'ZONE_COVERAGE').then(1).otherwise(0),
    ManCoverage=pl.when(man_zone_expr == 'MAN_COVERAGE').then(1).otherwise(0),

    # Collapse formations into two families; formations in neither list get no label
    OffenseFormation=(
        pl.when(formation_expr.is_in(under_center_formations)).then(pl.lit('Under Center'))
        .when(formation_expr.is_in(shotgun_formations)).then(pl.lit('Shotgun'))
    ),
)

# Personnel: expand the raw summaries to position tokens, then label the groupings
participation = participation.with_columns(
    OffensePositionsStr=pl.col('offense_personnel').map_elements(clean_personnel, return_dtype=str),
    DefensePositionsStr=pl.col('defense_personnel').map_elements(clean_personnel, return_dtype=str),
)
participation = participation.with_columns(
    OffensePersonnelGroup=pl.col('OffensePositionsStr').map_elements(offensive_personnel, return_dtype=str),
    DefensePersonnelGroup=pl.col('DefensePositionsStr').map_elements(defensive_personnel, return_dtype=str),
)

# Flags read off the grouping label: 1st char = RB count, 2nd char = TE count, trailing '*' = extra OL
offense_group_expr = pl.col('OffensePersonnelGroup')
rb_digit_expr = offense_group_expr.str.slice(0, 1)
te_digit_expr = offense_group_expr.str.slice(1, 1)
multiple_digits = ['2', '3', '4']

participation = participation.with_columns(
    OffenseMultRBs=pl.when(rb_digit_expr.is_in(multiple_digits)).then(1).otherwise(0),
    OffenseZeroRBs=pl.when(rb_digit_expr == '0').then(1).otherwise(0),
    OffenseMultTEs=pl.when(te_digit_expr.is_in(multiple_digits)).then(1).otherwise(0),
    OffenseZeroTEs=pl.when(te_digit_expr == '0').then(1).otherwise(0),
    OffenseExtraOL=pl.when(offense_group_expr.str.tail(1) == '*').then(1).otherwise(0),
    # Package name is the part of the defensive label before the first space
    DefensePersonnelType=pl.col('DefensePersonnelGroup').str.split(' ').list.get(0),
)

# Sanity checks: frame shape, number of distinct play keys, sample of 2024 rows with a route
print(participation.shape)
print(participation['MasterPlayID'].n_unique())
print(participation.filter(pl.col('season') == 2024, pl.col('route') != '').head(100))

In [None]:
''' Combine '''

# Participation columns carried onto the play-by-play rows
participation_cols = [
    'MasterPlayID',
    # Offense
    'OffenseFormation', 'OffensePersonnelGroup',
    'OffenseMultRBs', 'OffenseZeroRBs', 'OffenseMultTEs', 'OffenseZeroTEs', 'OffenseExtraOL',
    'time_to_throw',
    # Defense
    'DefensePersonnelGroup', 'DefensePersonnelType',
    'LightBox', 'HeavyBox', 'number_of_pass_rushers',
    'ZoneCoverage', 'ManCoverage', 'defense_coverage_type',
]

# Left join keeps every play-by-play row, with or without a participation match
pbp = pbp.join(participation.select(participation_cols), on='MasterPlayID', how='left')

# Create dataframe
# to_pandas() is the polars-provided conversion and keeps each column's own dtype,
# instead of pushing the whole frame through the generic pandas constructor.
# NOTE(review): to_pandas() needs pyarrow installed — confirm it is available in this environment.
pbp_df = pbp.to_pandas()

print(pbp_df.shape)
print(pbp_df.head().to_string())

In [None]:
''' Prep Offensive Inputs '''
# TODO - top targets and rushers, or # of receivers / rushers

## Base ##

# Frame-wide boolean masks, built once here rather than inside every per-group lambda call.
# Each lambda below uses one to keep only the matching rows of the group it receives.
neutral_down_rows = pbp_df['NeutralDown'] == 1
behind_los_rows = pbp_df['PassDepth'] == 'Behind LOS'
deep_pass_rows = pbp_df['PassDepth'] == 'Long'
inside_run_rows = pbp_df['RunLocation'] == 'Inside'
outside_run_rows = pbp_df['RunLocation'] == 'Outside'
personnel_11_rows = pbp_df['OffensePersonnelGroup'] == '11'

# One row per offense (posteam) and season
gpby = pbp_df.groupby(['posteam', 'season']).aggregate(
    # General
    Games=('game_id', 'nunique'),
    Drives=('DriveID', 'nunique'),
    Plays=('posteam', 'size'),
    Neutral_Down_Plays=('posteam', lambda x: x[neutral_down_rows].shape[0]),

    # Play Types
    Pass_Plays=('pass', 'sum'),
    Neutral_Down_Pass=('pass', lambda x: x[neutral_down_rows].sum()),
    Pass_Attempts=('pass_attempt', 'sum'),

    # Passing
    IAY=('air_yards', 'sum'),
    IAY_ToSticks=('AirYardsToSticks', 'sum'),
    TotalTimeToThrow=('time_to_throw', 'sum'),
    Pass_BehindLOS=('pass_attempt', lambda x: x[behind_los_rows].sum()),
    Pass_Deep=('pass_attempt', lambda x: x[deep_pass_rows].sum()),
    Sacks=('sack', 'sum'),

    # Rushing
    Rush_Plays=('rush', 'sum'),
    Rush_Attempts=('rush_attempt', 'sum'),
    Rush_Inside=('rush', lambda x: x[inside_run_rows].sum()),
    Rush_Outside=('rush', lambda x: x[outside_run_rows].sum()),

    # Personnel
    Plays_11_Personnel=('posteam', lambda x: x[personnel_11_rows].shape[0]),
    Plays_Mult_RBs=('OffenseMultRBs', 'sum'),
    Plays_Zero_RBs=('OffenseZeroRBs', 'sum'),
    Plays_Mult_TEs=('OffenseMultTEs', 'sum'),
    Plays_Zero_TEs=('OffenseZeroTEs', 'sum'),
    Plays_Extra_OL=('OffenseExtraOL', 'sum'),
)

## Add Formation info ##

# Frame-wide mask built once rather than on every per-group lambda call
neutral_down_rows = pbp_df['NeutralDown'] == 1

# One row per offense, season and formation family
formation_gpby = pbp_df.groupby(['posteam', 'season', 'OffenseFormation']).aggregate(
    Plays=('posteam', 'size'),
    Neutral_Down_Plays=('posteam', lambda x: x[neutral_down_rows].shape[0]),

    Pass_Plays=('pass', 'sum'),
    Rush_Plays=('rush', 'sum'),
)
formation_gpby['% Pass'] = formation_gpby['Pass_Plays'] / formation_gpby['Plays']

# Go wide: one row per offense-season, one column per (formation, metric) pair,
# flattened to names such as 'Shotgun Plays' or 'Under Center % Pass'
formation_data = formation_gpby.reset_index().pivot(
    index=['posteam', 'season'],
    columns='OffenseFormation',
    values=['Plays', 'Neutral_Down_Plays', '% Pass']
).swaplevel(axis=1)
formation_data.columns = [" ".join(col) for col in formation_data.columns.values]

# Formation shares use only plays with a labelled formation as the denominator
labelled_plays = formation_data['Under Center Plays'] + formation_data['Shotgun Plays']
labelled_neutral_plays = (
    formation_data['Under Center Neutral_Down_Plays'] + formation_data['Shotgun Neutral_Down_Plays']
)
formation_data['% Under Center'] = formation_data['Under Center Plays'] / labelled_plays
formation_data['% Shotgun'] = formation_data['Shotgun Plays'] / labelled_plays
formation_data['% Under Center Neutral Downs'] = formation_data['Under Center Neutral_Down_Plays'] / labelled_neutral_plays
formation_data['% Shotgun Neutral Downs'] = formation_data['Shotgun Neutral_Down_Plays'] / labelled_neutral_plays

gpby = gpby.merge(formation_data, left_index=True, right_index=True, how='left')

# Overall numbers
gpby['Plays / Game'] = gpby['Plays'] / gpby['Games']
gpby['Drives / Game'] = gpby['Drives'] / gpby['Games']

# Play Types
gpby['% Pass'] = gpby['Pass_Plays'] / gpby['Plays']
gpby['% Pass Neutral Downs'] = gpby['Neutral_Down_Pass'] / gpby['Neutral_Down_Plays']

# Passing numbers: every passing rate shares the same denominator, pass attempts minus sacks
non_sack_attempts = gpby['Pass_Attempts'] - gpby['Sacks']
gpby['ADOT'] = gpby['IAY'] / non_sack_attempts
gpby['ADOT to Sticks'] = gpby['IAY_ToSticks'] / non_sack_attempts
gpby['Avg Time to Throw'] = gpby['TotalTimeToThrow'] / non_sack_attempts

gpby['% Passes Behind LOS'] = gpby['Pass_BehindLOS'] / non_sack_attempts
gpby['% Passes Deep'] = gpby['Pass_Deep'] / non_sack_attempts

# Rushing numbers
gpby['% Rush Inside'] = gpby['Rush_Inside'] / gpby['Rush_Plays']
gpby['% Rush Outside'] = gpby['Rush_Outside'] / gpby['Rush_Plays']

# Personnel: share of all plays for each personnel count column
for col in ['Plays_11_Personnel', 'Plays_Mult_RBs', 'Plays_Zero_RBs', 'Plays_Mult_TEs', 'Plays_Zero_TEs', 'Plays_Extra_OL']:
    # 'Plays_Mult_RBs' -> 'Mult RBs'. The readable label is what goes into the column
    # name, giving '% Plays Mult RBs' rather than '% Plays Plays_Mult_RBs'.
    label = col.replace('Plays_', '').replace('_', ' ')
    gpby[f'% Plays {label}'] = gpby[col] / gpby['Plays']

# print(gpby.columns)
print(gpby.shape)
print(gpby.head(12).to_string())


Start with

- Formations (under center, shotgun, pistol)
- Personnel (% 11, % mult TEs, % no TEs, % mult RBs, % no RBs, % extra OL)
- % Pass
- % Pass neutral downs
- ADOT
- % Screens
- % Long
- % passes from play-action
- % passes from under center vs shotgun vs pistol
- number of receivers in top 80% of targets
- % runs middle, guard/tackle, edge
- % rushes from under center vs. shotgun vs. pistol
- number of rushers to account for 20% of rushes