In [None]:
''' Imports '''

import pandas as pd
import polars as pl

import nflreadpy as nfl
import nfl_data_py as nfldy

# Display settings for printed polars tables: table width (chars), column count, row count.
# NOTE(review): -1 is presumably "no limit" for each setting — confirm against the polars Config docs.
pl.Config.set_tbl_width_chars(-1)
pl.Config.set_tbl_cols(-1)
pl.Config.set_tbl_rows(-1)

# Features

## Questions
- Percentage of plays vs. total plays

## General

### Pace
- Total drives per game (both teams)
- Plays per game (offense)

## Offense
- QB Position / Formation splits
- Personnel usage
- Play types: under center run, under center play action, under center pass, shotgun pass, RPO, shotgun play action, pistol...

- % Pass
- % Pass on downs
- % Pass on "neutral" (open playbook) downs

### Passing
- ADOT
- Depths: Behind LOS, Short, Medium, Long
- Locations: Side of field (left / middle / right), or just middle / boundary
- Target share of top receiver (or: number of receivers who account for the top 80% of targets)
- % Screen pass
- % pass from under center vs shotgun vs pistol
- % pass inbreakers vs outbreakers ??

### Rushing
- Run gaps: middle, guard/tackle, edge
- % rushes from under center vs shotgun vs pistol
- Number of rushers needed to account for 20% of rushes (threshold still to be dialed in)

## Defense

### Personnel
- Base, Nickel, Dime usage
- % 3 DL vs 4 DL

### Play types
- heavy box rate
- light box rate
- blitz rate (might have to be total blitzes if PFR is the only source available)
- mult. blitzers rate
- man rate
- cover 2, cover 3, cover 4 usage (when in zone??)

In [None]:
''' Parameters / Constants '''

START_YEAR = 2018       # first year of participation data
END_YEAR = 2024         # last season loaded (inclusive)

# Every season from START_YEAR through END_YEAR, inclusive.
SEASONS = list(range(START_YEAR, END_YEAR + 1))


In [None]:
''' PBP Data '''

## Load ##
pbp = nfl.load_pbp(seasons=SEASONS)

## Add columns ##
# Every derived column below reads only raw pbp columns, so they are added in one pass.
pbp = pbp.with_columns(
    # Drive key "<game_id>_<drive>"; drive is cast to an integer before being stringified.
    DriveID=pl.concat_str(
        [pl.col('game_id'), pl.col('drive').cast(pl.Int8).cast(pl.String)],
        separator='_',
    ),
    # 1 on "neutral" (open playbook) downs: 1st & <=10, 2nd & <=6, 3rd & <=3; otherwise 0.
    NeutralDown=(
        pl.when((pl.col('down') == 1) & (pl.col('ydstogo') <= 10)).then(1)
        .when((pl.col('down') == 2) & (pl.col('ydstogo') <= 6)).then(1)
        .when((pl.col('down') == 3) & (pl.col('ydstogo') <= 3)).then(1)
        .otherwise(0)
    ),
    # Air-yards bucket; no .otherwise(), so rows matching no branch stay null.
    PassDepth=(
        pl.when(pl.col('air_yards') <= 0).then(pl.lit('Behind LOS'))
        .when(pl.col('air_yards') < 10).then(pl.lit('Short'))
        .when(pl.col('air_yards') < 20).then(pl.lit('Medium'))
        .when(pl.col('air_yards') >= 20).then(pl.lit('Long'))
    ),
    # Trouble with run_gap and the middle rush https://thespade.substack.com/p/run-gap-charts-version-15
    # Only 'end', 'guard' and 'tackle' are mapped; any other run_gap value stays null.
    RunLocation=(
        pl.when(pl.col('run_gap') == 'end').then(pl.lit('Outside'))
        .when((pl.col('run_gap') == 'guard') | (pl.col('run_gap') == 'tackle')).then(pl.lit('Inside'))
    ),
)

# TODO: add distance

## Filters ##

# Filter to relevant plays (see nflfastr beginner's guide)
# NOTE(review): is_not_nan() targets NaN; if missing EPA is stored as null rather than NaN,
# confirm this predicate still excludes those rows as intended.
pbp = pbp.filter(
    (pl.col('pass') == 1) | (pl.col('rush') == 1),
    (pl.col('season_type') == 'REG'),
    (pl.col('epa').is_not_nan()),
    (pl.col('posteam').is_not_null()),
    (pl.col('posteam') != ''),
)

# Filter to normal game state
pbp = pbp.filter(
    # (pl.col('qtr') <= 3),
    # (pl.col('half_seconds_remaining') > 120),
    # (pl.col('score_differential') <= 14),
    (pl.col('special_teams_play') == 0),
    (pl.col('play_type_nfl') != 'PAT2'),
    (pl.col('play_type_nfl') != 'UNSPECIFIED'),     # Unspecified seems to be mostly punt / FG formation plays where something weird happened (fake, fumble, botched snap, etc)
)

# Convert to pandas DF
# Use polars' own converter (as the personnel cell already does) instead of feeding the
# polars frame to the pd.DataFrame constructor, so each column keeps its own dtype.
pbp = pbp.to_pandas()

print(pbp.shape)
print(pbp.head().to_string())

In [None]:
''' Participation Data '''

# Load participation rows and derive two columns in a single pass; both expressions
# read only columns that come with the loaded data.
participation = nfl.load_participation(seasons=SEASONS).with_columns(
    # Season year = first '_'-separated piece of nflverse_game_id, cast to an integer.
    year=pl.col('nflverse_game_id').str.split('_').list.get(0).cast(int),
    # Formation: collapse offense_formation into two buckets; unlisted values stay null.
    Formation=(
        pl.when(
            pl.col('offense_formation').is_in(['SINGLEBACK', 'I_FORM', 'UNDER CENTER', 'JUMBO'])
        )
        .then(pl.lit('Under Center'))
        .when(
            pl.col('offense_formation').is_in(['SHOTGUN', 'EMPTY', 'WILDCAT', 'PISTOL'])
        )
        .then(pl.lit('Shotgun'))
    ),
)

def clean_personnel(personnel_str: str) -> str:
    """Expand a '<count> <position>, ...' string into repeated '<position>;' tokens.

    Example: '1 RB, 3 WR' becomes 'RB;WR;WR;WR;'.

    Args:
        personnel_str: Groups separated by ', ', each group being a count and a
            position code separated by a single space.

    Returns:
        The expanded token string, or '' when the input is falsy (None or '').
    """
    if not personnel_str:
        return ''

    expanded = []
    for group in personnel_str.split(', '):
        fields = group.split(' ')
        count = int(fields[0])
        position = fields[1]
        expanded.append(f'{position};' * count)

    return ''.join(expanded)

def offensive_personnel(personnel_str: str) -> str:
    """Summarise an expanded offensive personnel string as an 'RB/TE' grouping code.

    The input is a string of '<position>;' tokens (the format produced by
    clean_personnel). Counts are substring counts, so a pattern such as 'T;'
    also matches any longer token that ends in 'T'.

    Args:
        personnel_str: Expanded token string, e.g. 'RB;TE;WR;WR;WR;'.

    Returns:
        '' if the input is not a string; 'ST' if any K, P, LS, FS or CB token is
        present; otherwise '<#RB><#TE>' followed by one '*' per lineman beyond
        five (counted from C/G/T tokens, or from OL tokens when C+G+T <= 5).
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    if not isinstance(personnel_str, str):
        return ''

    # Kickers, punters, long snappers or defensive backs on the offensive side
    # are treated as a special-teams unit.
    special_teams_tokens = ('K;', 'P;', 'LS;', 'FS;', 'CB;')
    if any(token in personnel_str for token in special_teams_tokens):
        return 'ST'

    rbs = personnel_str.count('RB')
    tes = personnel_str.count('TE')
    personnel = f'{rbs}{tes}'

    # Linemen may be listed by position (C/G/T) or generically (OL); the
    # position-specific count takes precedence when it exceeds five.
    listed_linemen = sum(personnel_str.count(token) for token in ('C;', 'G;', 'T;'))
    generic_linemen = personnel_str.count('OL;')

    if listed_linemen > 5:
        personnel += '*' * (listed_linemen - 5)
    elif generic_linemen > 5:
        personnel += '*' * (generic_linemen - 5)

    return personnel

def defensive_personnel(personnel_str: str) -> str:
    """Summarise an expanded defensive personnel string as '<package> <DL>-<LB>'.

    The input is a string of '<position>;' tokens (the format produced by
    clean_personnel). Counts are substring counts.

    Args:
        personnel_str: Expanded token string, e.g. 'DE;DT;DT;DE;LB;LB;LB;CB;CB;SS;FS;'.

    Returns:
        '' if the input is not a string; 'ST' if any K, P, LS, WR, RB or TE token
        is present; otherwise the package name chosen by defensive-back count
        (4 Base, 5 Nickel, 6 Dime, 7 Quarters, anything else Other) followed by
        the DL and LB counts.
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    if not isinstance(personnel_str, str):
        return ''

    # Specialists or offensive skill players on the defensive side are treated
    # as a special-teams unit.
    special_teams_tokens = ('K;', 'P;', 'LS;', 'WR;', 'RB;', 'TE;')
    if any(token in personnel_str for token in special_teams_tokens):
        return 'ST'

    # DL
    total_dls = sum(personnel_str.count(token) for token in ('DL;', 'DE;', 'DT;', 'NT;'))

    # LBs: the 'LB;' substring also matches MLB;/ILB;/OLB;, so counting those
    # separately would double count them.
    total_lbs = personnel_str.count('LB;')

    # DBs
    total_dbs = sum(personnel_str.count(token) for token in ('DB;', 'CB;', 'SS;', 'FS;'))

    package_by_db_count = {4: 'Base', 5: 'Nickel', 6: 'Dime', 7: 'Quarters'}
    d_type = package_by_db_count.get(total_dbs, 'Other')

    return f'{d_type} {total_dls}-{total_lbs}'
    

# Personnel
# Three dependent steps, chained because each one reads columns created by the previous:
#   1. expand the raw personnel strings into '<position>;' tokens,
#   2. classify each expanded string into a grouping label,
#   3. take the first space-separated word of the defensive label as its type.
participation = (
    participation
    .with_columns(
        offense_personnel_str=pl.col('offense_personnel').map_elements(clean_personnel, return_dtype=str),
        defense_personnel_str=pl.col('defense_personnel').map_elements(clean_personnel, return_dtype=str),
    )
    .with_columns(
        offense_personnel_group=pl.col('offense_personnel_str').map_elements(offensive_personnel, return_dtype=str),
        defense_personnel_group=pl.col('defense_personnel_str').map_elements(defensive_personnel, return_dtype=str),
    )
    .with_columns(
        defense_personnel_type=pl.col('defense_personnel_group').str.split(' ').list.get(0),
    )
)


# Quick look at the result: overall shape plus the first ten 2024 rows.
print(participation.shape)
print(participation.filter(pl.col('year') == 2024).head(10))

In [None]:
# Share of 2024 plays by defensive personnel grouping, excluding special-teams units.
personnel_pcts = (
    participation
    .filter(
        pl.col('year') == 2024,
        pl.col('defense_personnel_type') != 'ST',
    )['defense_personnel_group']
    .value_counts()
    .sort(by='count', descending=True)
    .to_pandas()
)
# Each grouping's count as a fraction of all counted plays.
total_plays = personnel_pcts['count'].sum()
personnel_pcts['% Plays'] = personnel_pcts['count'] / total_plays

print(personnel_pcts.head(10).to_string())

In [None]:
''' Pro Football Ref '''
# Get the total number of player blitzes

pfr_def = nfl.load_pfr_advstats(
    seasons=SEASONS,
    stat_type='def',
    summary_level='season',
)
print(pfr_def.head())

# Sum the per-row 'bltz' values within each (season, team) pair.
pfr_def_team_seasons = pfr_def.group_by(['season', 'tm']).agg(
    pl.col('bltz').sum().alias('Blitzes'),
)
print(pfr_def_team_seasons.head())


In [None]:
''' Prep data inputs '''

# Build a narrow working frame with only the columns the aggregation needs, plus
# conditional flag columns computed once on the whole frame. Summing these flags per
# group replaces per-group lambdas that re-evaluated (and re-aligned) a full-frame
# boolean mask for every group.
_agg_input = pbp[[
    'posteam', 'season', 'game_id', 'DriveID',
    'pass', 'pass_attempt', 'sack', 'air_yards',
    'rush', 'rush_attempt',
]].copy()

_is_neutral_down = pbp['NeutralDown'] == 1
_agg_input['_neutral_play'] = _is_neutral_down.astype(int)
# where(mask, 0) keeps the play's pass/rush value when the condition holds, else 0,
# so a plain sum equals "sum of the column over the rows matching the condition".
_agg_input['_neutral_pass'] = pbp['pass'].where(_is_neutral_down, 0)
_agg_input['_pass_behind_los'] = pbp['pass'].where(pbp['PassDepth'] == 'Behind LOS', 0)
_agg_input['_pass_long'] = pbp['pass'].where(pbp['PassDepth'] == 'Long', 0)
_agg_input['_rush_inside'] = pbp['rush'].where(pbp['RunLocation'] == 'Inside', 0)
_agg_input['_rush_outside'] = pbp['rush'].where(pbp['RunLocation'] == 'Outside', 0)

# One row per (offense, season).
gpby = _agg_input.groupby(['posteam', 'season']).aggregate(
    Games=('game_id', 'nunique'),
    Drives=('DriveID', 'nunique'),
    Plays=('posteam', 'size'),
    Neutral_Down_Plays=('_neutral_play', 'sum'),

    Pass_Plays=('pass', 'sum'),
    Pass_Attempts=('pass_attempt', 'sum'),
    Sacks=('sack', 'sum'),
    IAY=('air_yards', 'sum'),
    Pass_BehindLOS=('_pass_behind_los', 'sum'),
    Pass_Deep=('_pass_long', 'sum'),
    Neutral_Down_Pass=('_neutral_pass', 'sum'),

    Rush_Plays=('rush', 'sum'),
    Rush_Attempts=('rush_attempt', 'sum'),
    Rush_Inside=('_rush_inside', 'sum'),
    Rush_Outside=('_rush_outside', 'sum'),
)

# Overall numbers
gpby['% Pass'] = gpby['Pass_Plays'] / gpby['Plays']
gpby['% Pass Neutral Down'] = gpby['Neutral_Down_Pass'] / gpby['Neutral_Down_Plays']

# Passing numbers
# Sacks are subtracted from Pass_Attempts in the denominator.
# NOTE(review): presumably because pass_attempt counts sacks — confirm against the data dictionary.
gpby['ADOT'] = gpby['IAY'] / (gpby['Pass_Attempts'] - gpby['Sacks'])

# Rushing numbers
gpby['% Rush Inside'] = gpby['Rush_Inside'] / gpby['Rush_Plays']
gpby['% Rush Outside'] = gpby['Rush_Outside'] / gpby['Rush_Plays']


print(gpby.shape)
print(gpby.head(12).to_string())


In [None]:
# Diagnostic: row counts for every (special_teams_play, play_type_nfl) combination left in pbp.
# NOTE(review): this rebinds `gpby`, replacing the team-season table built in the
# 'Prep data inputs' cell — re-run that cell (or rename this variable) before using it again.
gpby = pbp.groupby(['special_teams_play', 'play_type_nfl']).size()
print(gpby.to_string())

In [None]:
## QB Position Base
# Aggregates plays per (posteam, QB Position): totals, play-type splits and yardage.
# NOTE(review): `league_pbp_normal_gs` and `qb_pos_order` are not defined anywhere in the
# visible part of this notebook, and the 'QB Position', 'is_rpo' and 'is_play_action'
# columns are not created above — this cell looks carried over from another notebook and
# will fail until those inputs exist. Confirm where they come from.
# NOTE(review): each lambda indexes its group with a boolean mask built from the whole
# frame, which relies on pandas aligning the mask to the group's index.
qb_pos = league_pbp_normal_gs.groupby(['posteam', 'QB Position']).aggregate(
    Plays=('QB Position', 'size'),
    Rush=('rush', 'sum'),
    Pass=('pass', 'sum'),
    Pure_Rush = ('rush', lambda x: x[(league_pbp_normal_gs['is_rpo'] == False)].sum()),
    RPO_Rush = ('rush', lambda x: x[(league_pbp_normal_gs['is_rpo'] == True)].sum()),
    RPO_Pass = ('pass', lambda x: x[(league_pbp_normal_gs['is_rpo'] == True)].sum()),
    PA_Pass = ('pass', lambda x: x[(league_pbp_normal_gs['is_rpo'] == False) & (league_pbp_normal_gs['is_play_action'] == True)].sum()),
    Pure_Pass = ('pass', lambda x: x[(league_pbp_normal_gs['is_rpo'] == False) & (league_pbp_normal_gs['is_play_action'] == False)].sum()),
    Yards=('yards_gained', 'sum'),
    PassYards=('passing_yards', 'sum'),
    RushYards=('rushing_yards', 'sum'),
)
# Share of each team's plays (index level 0 = posteam) taken from each QB position.
qb_pos['% Plays'] = qb_pos['Plays'] / qb_pos.groupby(level=0)['Plays'].transform('sum')
# Rank of that share among all teams within the same QB position (descending; ties get the max rank).
qb_pos['% Plays LRank'] = qb_pos.groupby(level='QB Position')['% Plays'].rank(ascending=False, method='max')

# Reindex
qb_pos = qb_pos.reindex(labels=qb_pos_order, level='QB Position')

# Names
# Replace underscores with spaces in the play-type column names, and keep the list in sync.
play_type_cols = ['Pure_Rush', 'RPO_Rush', 'RPO_Pass', 'PA_Pass', 'Pure_Pass']
qb_pos = qb_pos.rename(columns={col: col.replace('_', ' ') for col in play_type_cols})
play_type_cols = [col.replace('_', ' ') for col in play_type_cols]


Initial feature set to start with:

- Formations (under center, shotgun, pistol)
- Personnel (% 11, % mult TEs, % no TEs, % mult RBs, % no RBs, % extra OL)
- % Pass
- % Pass neutral downs
- ADOT
- % Screens
- % Long
- % passes from play-action
- % passes from under center vs shotgun vs pistol
- number receivers in top 80% targets
- % runs middle, guard/tackle, edge
- % rushes from under center vs. shotgun vs. pistol
- number rushers to account for 20% rushes