In [54]:
import pandas as pd
import numpy as np

# Load supplementary data (same across all weeks)
supplemental_df = pd.read_csv('../data/supplementary_data.csv')

# Load all weeks
all_input_dfs = []
all_output_dfs = []

weeks = range(1, 19)  # Weeks 1-18

print("Loading data...")
for week in weeks:
    try:
        input_path = f'../data/train/input_2023_w{week:02d}.csv'
        output_path = f'../data/train/output_2023_w{week:02d}.csv'
        
        input_df = pd.read_csv(input_path)
        output_df = pd.read_csv(output_path)
        
        all_input_dfs.append(input_df)
        all_output_dfs.append(output_df)
        
        print(f"  ✓ Week {week:02d} loaded")
    except FileNotFoundError:
        print(f"  ✗ Week {week:02d} not found")

# Combine all weeks
input_df = pd.concat(all_input_dfs, ignore_index=True)
output_df = pd.concat(all_output_dfs, ignore_index=True)

print(f"\n✓ Total input rows: {len(input_df):,}")
print(f"✓ Total output rows: {len(output_df):,}")

# Merge and filter to man coverage
df = pd.merge(input_df, supplemental_df, on=['game_id','play_id'], how='left')
df_man = df[~df['team_coverage_type'].str.contains("ZONE", na=False)]

print(f"✓ Man coverage rows: {len(df_man):,}")

  supplemental_df = pd.read_csv('../data/supplementary_data.csv')


Loading data...
  ✓ Week 01 loaded
  ✓ Week 02 loaded
  ✓ Week 03 loaded
  ✓ Week 04 loaded
  ✓ Week 05 loaded
  ✓ Week 06 loaded
  ✓ Week 07 loaded
  ✓ Week 08 loaded
  ✓ Week 09 loaded
  ✓ Week 10 loaded
  ✓ Week 11 loaded
  ✓ Week 12 loaded
  ✓ Week 13 loaded
  ✓ Week 14 loaded
  ✓ Week 15 loaded
  ✓ Week 16 loaded
  ✓ Week 17 loaded
  ✓ Week 18 loaded

✓ Total input rows: 4,880,579
✓ Total output rows: 562,936
✓ Man coverage rows: 1,293,453


In [59]:
test_play = df_man[df_man['pass_result'] == 'IN'].iloc[0]
test_game_id = 2023090700
test_play_id = 3032

# Filter output data to just this one play
output_test = output_df[
    (output_df['game_id'] == test_game_id) & 
    (output_df['play_id'] == test_play_id)
]

# From input_df, who are the players on this play?
input_test = input_df[
    (input_df['game_id'] == test_game_id) & 
    (input_df['play_id'] == test_play_id)
]

# Look at player roles
print(input_test[['nfl_id', 'player_name', 'player_position', 'player_role']].drop_duplicates())

       nfl_id        player_name player_position         player_role
15964   54527         Bryan Cook              FS  Defensive Coverage
15984   54600    Joshua Williams              CB  Defensive Coverage
16004   46137        Justin Reid              SS  Defensive Coverage
16024   53487        Nick Bolton             MLB  Defensive Coverage
16044   54486     Trent McDuffie              CB  Defensive Coverage
16064   52471     Willie Gay Jr.             ILB  Defensive Coverage
16084   47856   David Montgomery              RB  Other Route Runner
16104   43584      Kalif Raymond              WR  Other Route Runner
16124   38696       Marvin Jones              WR  Other Route Runner
16144   55899        Sam LaPorta              TE  Other Route Runner
16164   43290         Jared Goff              QB              Passer
16184   53541  Amon-Ra St. Brown              WR   Targeted Receiver


In [60]:
# Constants
BOUNDARY_THRESHOLD = 12

# Get targeted receiver
targeted_wr = input_test[input_test['player_role'] == 'Targeted Receiver']
if len(targeted_wr) == 0:
    print("No targeted receiver - skipping")
else:
    wr_id = targeted_wr['nfl_id'].iloc[0]
    wr_name = targeted_wr['player_name'].iloc[0]
    
    print(f"Targeted: {wr_name} (ID: {wr_id})")
    
    # Get all CBs/DBs in coverage
    cb_ids = input_test[
        (input_test['player_role'] == 'Defensive Coverage') & 
        (input_test['player_position'].isin(['CB', 'DB', 'SS', 'FS']))
    ]['nfl_id'].unique()
    
    # Filter to CBs that have output data
    cbs_in_output = []
    for cb_id in cb_ids:
        if len(output_test[output_test['nfl_id'] == cb_id]) > 0:
            cbs_in_output.append(cb_id)
    
    if len(cbs_in_output) == 0:
        print("No CBs in output - skipping")
    else:
        print(f"\nCBs in output: {len(cbs_in_output)}")
        
        # Calculate AVERAGE distance for each CB across all frames
        cb_avg_distances = {}
        
        for cb_id in cbs_in_output:
            # Get CB and WR frames
            cb_frames = output_test[output_test['nfl_id'] == cb_id][['frame_id', 'x', 'y']]
            wr_frames = output_test[output_test['nfl_id'] == wr_id][['frame_id', 'x', 'y']]
            
            # Merge on frame_id to align them
            merged = pd.merge(
                cb_frames,
                wr_frames,
                on='frame_id',
                suffixes=('_cb', '_wr')
            )
            
            if len(merged) == 0:
                continue
            
            # Calculate distance at each frame
            merged['distance'] = np.sqrt(
                (merged['x_cb'] - merged['x_wr'])**2 + 
                (merged['y_cb'] - merged['y_wr'])**2
            )
            
            # OPTION A: Simple average
            avg_distance = merged['distance'].mean()
            
            # OPTION B: Weighted average (later frames matter more)
            # Uncomment to use weighted:
            # weights = np.linspace(1, 2, len(merged))
            # avg_distance = np.average(merged['distance'], weights=weights)
            
            cb_avg_distances[cb_id] = avg_distance
            
            # Print info
            cb_name = input_test[input_test['nfl_id'] == cb_id]['player_name'].iloc[0]
            cb_pos = input_test[input_test['nfl_id'] == cb_id]['player_position'].iloc[0]
            print(f"  {cb_name} ({cb_pos}):")
            print(f"    First frame: {merged['distance'].iloc[0]:.2f} yards")
            print(f"    Average: {avg_distance:.2f} yards")
            print(f"    Final frame: {merged['distance'].iloc[-1]:.2f} yards")
        
        if len(cb_avg_distances) == 0:
            print("No valid CB-WR matchups - skipping")
        else:
            # Pick CB with LOWEST average distance
            primary_cb_id = min(cb_avg_distances, key=cb_avg_distances.get)
            primary_cb_name = input_test[input_test['nfl_id'] == primary_cb_id]['player_name'].iloc[0]
            
            print(f"\n✓ Primary defender: {primary_cb_name} (ID: {primary_cb_id})")
            print(f"✓ Average distance: {cb_avg_distances[primary_cb_id]:.2f} yards")
            
            # NOW calculate your metrics using this CB
            cb_frames = output_test[output_test['nfl_id'] == primary_cb_id].sort_values('frame_id')
            wr_frames = output_test[output_test['nfl_id'] == wr_id].sort_values('frame_id')
            
            cb_start = cb_frames.iloc[0]
            cb_end = cb_frames.iloc[-1]
            wr_start = wr_frames.iloc[0]
            wr_end = wr_frames.iloc[-1]
            
            # Depth change
            cb_depth_change = abs(cb_end['x'] - cb_start['x'])
            wr_depth_change = abs(wr_end['x'] - wr_start['x'])
            depth_ratio = cb_depth_change / wr_depth_change if wr_depth_change > 0 else 0
            
            # Separation
            separation_start = np.sqrt(
                (cb_start['x'] - wr_start['x'])**2 + 
                (cb_start['y'] - wr_start['y'])**2
            )
            separation_end = np.sqrt(
                (cb_end['x'] - wr_end['x'])**2 + 
                (cb_end['y'] - wr_end['y'])**2
            )
            separation_change = separation_end - separation_start
            
            print(f"\n--- METRICS ---")
            print(f"Frames: {len(cb_frames)}")
            print(f"Depth ratio: {depth_ratio:.2f}")
            print(f"Separation start: {separation_start:.2f} yards")
            print(f"Separation end: {separation_end:.2f} yards")
            print(f"Separation change: {separation_change:+.2f} yards")
            
            # Classification
            if separation_end < 3:
                if depth_ratio < 0.9:
                    classification = "AGGRESSIVE (successful undercut)"
                else:
                    classification = "CONSERVATIVE (tight coverage)"
            elif separation_change < -2:
                if depth_ratio < 0.9 and separation_end < 6:
                    classification = "AGGRESSIVE (closing on route)"
                else:
                    classification = "BEATEN (trailing but closing)"
            else:
                if separation_end > 5:
                    classification = "BEATEN (lost coverage)"
                else:
                    classification = "CONSERVATIVE (maintaining distance)"
            
            print(f"\n✓ Classification: {classification}")

Targeted: Amon-Ra St. Brown (ID: 53541)

CBs in output: 1
  Justin Reid (SS):
    First frame: 2.51 yards
    Average: 1.30 yards
    Final frame: 0.51 yards

✓ Primary defender: Justin Reid (ID: 46137)
✓ Average distance: 1.30 yards

--- METRICS ---
Frames: 9
Depth ratio: 9.09
Separation start: 2.51 yards
Separation end: 0.51 yards
Separation change: -2.00 yards

✓ Classification: CONSERVATIVE (tight coverage)


In [61]:
# Simpler approach - any CB on any receiver
for idx in range(min(20, len(df_man))):
    test_play = df_man.iloc[idx]
    test_game_id = test_play['game_id']
    test_play_id = test_play['play_id']
    
    input_test = input_df[
        (input_df['game_id'] == test_game_id) & 
        (input_df['play_id'] == test_play_id)
    ]
    
    output_test = output_df[
        (output_df['game_id'] == test_game_id) & 
        (output_df['play_id'] == test_play_id)
    ]
    
    # Get targeted receiver
    targeted_wr = input_test[input_test['player_role'] == 'Targeted Receiver']
    if len(targeted_wr) == 0:
        continue
    
    wr_id = targeted_wr['nfl_id'].iloc[0]
    
    # Check if WR in output
    if len(output_test[output_test['nfl_id'] == wr_id]) == 0:
        continue
    
    # Get CBs
    cb_ids = input_test[
        (input_test['player_role'] == 'Defensive Coverage') & 
        (input_test['player_position'].isin(['CB', 'DB', 'SS', 'FS']))  # Include safeties too
    ]['nfl_id'].unique()
    
    # Check if any CB in output
    cbs_in_output = []
    for cb_id in cb_ids:
        if len(output_test[output_test['nfl_id'] == cb_id]) > 0:
            cbs_in_output.append(cb_id)
    
    if len(cbs_in_output) > 0:
        print(f"✓ Valid play: game={test_game_id}, play={test_play_id}")
        print(f"  Result: {test_play['pass_result']}")
        break

✓ Valid play: game=2023090700, play=194
  Result: C


In [62]:
results = []
skipped = 0

print(f"\nAnalyzing {len(unique_plays)} unique plays across all 18 weeks...")

for idx in range(len(unique_plays)):
    test_play = unique_plays.iloc[idx]
    test_game_id = test_play['game_id']
    test_play_id = test_play['play_id']
    los_x = test_play['absolute_yardline_number']
    
    if idx % 500 == 0:  # Progress every 500 plays instead of 50
        print(f"Processing play {idx}/{len(unique_plays)}...")
    
    # [Rest of your classification loop - exact same code]
    # Load play data
    input_test = input_df[
        (input_df['game_id'] == test_game_id) & 
        (input_df['play_id'] == test_play_id)
    ]
    
    output_test = output_df[
        (output_df['game_id'] == test_game_id) & 
        (output_df['play_id'] == test_play_id)
    ]
    
    # Get targeted receiver
    targeted_wr = input_test[input_test['player_role'] == 'Targeted Receiver']
    if len(targeted_wr) == 0:
        skipped += 1
        continue
    
    wr_id = targeted_wr['nfl_id'].iloc[0]
    wr_name = targeted_wr['player_name'].iloc[0]
    
    if len(output_test[output_test['nfl_id'] == wr_id]) == 0:
        skipped += 1
        continue
    
    # Get CBs in coverage
    cb_ids = input_test[
        (input_test['player_role'] == 'Defensive Coverage') & 
        (input_test['player_position'].isin(['CB', 'DB', 'SS', 'FS']))
    ]['nfl_id'].unique()
    
    cbs_in_output = []
    for cb_id in cb_ids:
        if len(output_test[output_test['nfl_id'] == cb_id]) > 0:
            cbs_in_output.append(cb_id)
    
    if len(cbs_in_output) == 0:
        skipped += 1
        continue
    
    # Find primary coverage
    cb_avg_distances = {}
    
    for cb_id in cbs_in_output:
        cb_frames = output_test[output_test['nfl_id'] == cb_id][['frame_id', 'x', 'y']]
        wr_frames = output_test[output_test['nfl_id'] == wr_id][['frame_id', 'x', 'y']]
        
        merged = pd.merge(cb_frames, wr_frames, on='frame_id', suffixes=('_cb', '_wr'))
        
        if len(merged) == 0:
            continue
        
        merged['distance'] = np.sqrt(
            (merged['x_cb'] - merged['x_wr'])**2 + 
            (merged['y_cb'] - merged['y_wr'])**2
        )
        
        avg_distance = merged['distance'].mean()
        cb_avg_distances[cb_id] = avg_distance
    
    if len(cb_avg_distances) == 0:
        skipped += 1
        continue
    
    primary_cb_id = min(cb_avg_distances, key=cb_avg_distances.get)
    primary_cb_info = input_test[input_test['nfl_id'] == primary_cb_id].iloc[0]
    
    # Get frame-by-frame data
    cb_frames = output_test[output_test['nfl_id'] == primary_cb_id].sort_values('frame_id').reset_index(drop=True)
    wr_frames = output_test[output_test['nfl_id'] == wr_id].sort_values('frame_id').reset_index(drop=True)
    
    if len(cb_frames) < 5 or len(wr_frames) < 5:
        skipped += 1
        continue
    
    # Calculate metrics
    num_frames = len(cb_frames)
    early_end = num_frames // 3
    late_start = 2 * num_frames // 3
    
    cb_early = cb_frames.iloc[:early_end]
    cb_late = cb_frames.iloc[late_start:]
    
    early_dx = cb_early.iloc[-1]['x'] - cb_early.iloc[0]['x']
    early_dy = cb_early.iloc[-1]['y'] - cb_early.iloc[0]['y']
    early_direction = np.arctan2(early_dy, early_dx)
    
    late_dx = cb_late.iloc[-1]['x'] - cb_late.iloc[0]['x']
    late_dy = cb_late.iloc[-1]['y'] - cb_late.iloc[0]['y']
    late_direction = np.arctan2(late_dy, late_dx)
    
    direction_change = abs(late_direction - early_direction)
    if direction_change > np.pi:
        direction_change = 2 * np.pi - direction_change
    direction_change_degrees = np.degrees(direction_change)
    
    cb_start = cb_frames.iloc[0]
    cb_end = cb_frames.iloc[-1]
    wr_start = wr_frames.iloc[0]
    wr_end = wr_frames.iloc[-1]
    
    separation_start = np.sqrt(
        (cb_start['x'] - wr_start['x'])**2 + 
        (cb_start['y'] - wr_start['y'])**2
    )
    separation_end = np.sqrt(
        (cb_end['x'] - wr_end['x'])**2 + 
        (cb_end['y'] - wr_end['y'])**2
    )
    separation_change = separation_end - separation_start
    
    cb_los_start = abs(cb_start['x'] - los_x)
    cb_los_end = abs(cb_end['x'] - los_x)
    wr_los_start = abs(wr_start['x'] - los_x)
    wr_los_end = abs(wr_end['x'] - los_x)
    
    cb_los_change = cb_los_end - cb_los_start
    wr_los_change = wr_los_end - wr_los_start
    los_change_diff = cb_los_change - wr_los_change
    
    pass_result = test_play['pass_result']
    
    # Classification
    made_big_break = direction_change_degrees > 30
    separated_from_wr = separation_change > 0.5
    undercut_route = los_change_diff < -2.5 and separation_end > 4
    
    is_aggressive = (made_big_break and separated_from_wr) or undercut_route
    
    if is_aggressive:
        if pass_result in ['IN', 'I']:
            if pass_result == 'IN':
                classification = "AGG: Successful Jump (INT)"
            else:
                classification = "AGG: Successful Jump (INC)"
        else:
            if separation_end > 8:
                classification = "AGG: Overcommit (Beaten)"
            elif direction_change_degrees > 60:
                classification = "AGG: Hard Break (Complete)"
            else:
                classification = "AGG: Undercut Fail (Complete)"
    else:
        if separation_end > 8:
            classification = "BEATEN"
        elif separation_end < 3 and direction_change_degrees < 25:
            if pass_result == 'C':
                classification = "CONS: Press (Complete)"
            elif pass_result == 'IN':
                classification = "CONS: Press (INT)"
            else:
                classification = "CONS: Press (INC)"
        elif 25 <= direction_change_degrees <= 50 and separation_end < 5:
            if separation_change < -0.5:
                classification = "CONS: Reactive Close"
            else:
                classification = "CONS: Reactive Mirror"
        elif 3 <= separation_end <= 7:
            classification = "CONS: Trail Coverage"
        elif direction_change_degrees > 50 and separation_change < 0:
            classification = "CONS: Broke With WR"
        else:
            classification = "CONS: Standard"
    
    results.append({
        'game_id': test_game_id,
        'play_id': test_play_id,
        'pass_result': pass_result,
        'wr_id': wr_id,
        'wr_name': wr_name,
        'cb_id': primary_cb_id,
        'cb_name': primary_cb_info['player_name'],
        'cb_position': primary_cb_info['player_position'],
        'avg_distance': cb_avg_distances[primary_cb_id],
        'direction_change_degrees': direction_change_degrees,
        'separation_start': separation_start,
        'separation_end': separation_end,
        'separation_change': separation_change,
        'los_change_diff': los_change_diff,
        'classification': classification,
        'decision_type': 'AGGRESSIVE' if is_aggressive else 'CONSERVATIVE',
        'num_frames': num_frames
    })

print(f"\n✓ Analysis complete!")
print(f"  Analyzed: {len(results)} plays")
print(f"  Skipped: {skipped} plays")

results_df = pd.DataFrame(results)

# === SAME ANALYSIS AS BEFORE ===
print("\n" + "="*60)
print("ANALYSIS ACROSS ALL 18 WEEKS")
print("="*60)

print("\n" + "="*60)
print("HIGH-LEVEL DECISION DISTRIBUTION")
print("="*60)
print(results_df['decision_type'].value_counts())
print(f"\nPercentages:")
print((results_df['decision_type'].value_counts(normalize=True) * 100).round(1))

print("\n" + "="*60)
print("DETAILED CLASSIFICATION DISTRIBUTION")
print("="*60)
print(results_df['classification'].value_counts())

print("\n" + "="*60)
print("AGGRESSIVE: Success Rate")
print("="*60)
aggressive = results_df[results_df['decision_type'] == 'AGGRESSIVE']
if len(aggressive) > 0:
    print(f"Total aggressive decisions: {len(aggressive)}")
    print("\nBreakdown:")
    print(aggressive['classification'].value_counts())
    print(f"\nOutcomes:")
    print(aggressive['pass_result'].value_counts())
    
    success_rate = len(aggressive[aggressive['pass_result'].isin(['I', 'IN'])]) / len(aggressive) * 100
    int_rate = len(aggressive[aggressive['pass_result'] == 'IN']) / len(aggressive) * 100
    print(f"\nSuccess rate (INT or INC): {success_rate:.1f}%")
    print(f"Interception rate: {int_rate:.1f}%")

print("\n" + "="*60)
print("CONSERVATIVE: Breakdown by Style")
print("="*60)
conservative = results_df[results_df['decision_type'] == 'CONSERVATIVE']
if len(conservative) > 0:
    print(f"Total conservative decisions: {len(conservative)}")
    print("\nTop classifications:")
    print(conservative['classification'].value_counts().head(10))
    
    print("\nPress Coverage outcomes:")
    press = conservative[conservative['classification'].str.contains('Press')]
    if len(press) > 0:
        print(press['pass_result'].value_counts())
        comp_rate = len(press[press['pass_result'] == 'C']) / len(press) * 100
        int_rate = len(press[press['pass_result'] == 'IN']) / len(press) * 100
        print(f"Completion rate: {comp_rate:.1f}%")
        print(f"Interception rate: {int_rate:.1f}%")

print("\n" + "="*60)
print("PASS RESULT BY DECISION TYPE")
print("="*60)
crosstab = pd.crosstab(results_df['decision_type'], results_df['pass_result'], normalize='index')
print((crosstab * 100).round(1))

print("\n" + "="*60)
print("COMPARISON: Week 1 vs All Weeks")
print("="*60)
print(f"Week 1 sample: 211 plays")
print(f"All weeks sample: {len(results_df)} plays")
print(f"\nWeek 1 aggressive rate: 13.3%")
print(f"All weeks aggressive rate: {(len(aggressive) / len(results_df) * 100):.1f}%")


Analyzing 4055 unique plays across all 18 weeks...
Processing play 0/4055...
Processing play 500/4055...
Processing play 1000/4055...
Processing play 1500/4055...
Processing play 2000/4055...
Processing play 2500/4055...
Processing play 3000/4055...
Processing play 3500/4055...
Processing play 4000/4055...

✓ Analysis complete!
  Analyzed: 3742 plays
  Skipped: 313 plays

ANALYSIS ACROSS ALL 18 WEEKS

HIGH-LEVEL DECISION DISTRIBUTION
decision_type
CONSERVATIVE    3295
AGGRESSIVE       447
Name: count, dtype: int64

Percentages:
decision_type
CONSERVATIVE    88.1
AGGRESSIVE      11.9
Name: proportion, dtype: float64

DETAILED CLASSIFICATION DISTRIBUTION
classification
CONS: Press (Complete)           959
CONS: Press (INC)                848
CONS: Trail Coverage             471
CONS: Reactive Close             269
CONS: Reactive Mirror            242
CONS: Broke With WR              205
BEATEN                           159
AGG: Successful Jump (INC)       151
AGG: Undercut Fail (Complet

In [63]:
# Add contextual variables
results_with_context = pd.merge(
    results_df,
    supplemental_df[[
        'game_id', 'play_id',
        'route_of_targeted_receiver',
        'down', 'yards_to_go',
        'yardline_number', 'quarter',
        'pre_snap_home_score', 'pre_snap_visitor_score'
    ]],
    on=['game_id', 'play_id'],
    how='left'
)

# Calculate score differential
results_with_context['score_diff'] = (
    results_with_context['pre_snap_home_score'] - 
    results_with_context['pre_snap_visitor_score']
)

# Analyze: When does aggressive work?
aggressive_context = results_with_context[
    results_with_context['decision_type'] == 'AGGRESSIVE'
]

print("WHEN DOES AGGRESSIVE SUCCEED?")
print("\nBy route type:")
route_analysis = pd.crosstab(
    aggressive_context['route_of_targeted_receiver'],
    aggressive_context['pass_result'],
    normalize='index'
)
print((route_analysis * 100).round(1))

print("\nBy down:")
down_analysis = pd.crosstab(
    aggressive_context['down'],
    aggressive_context['pass_result'],
    normalize='index'
)
print((down_analysis * 100).round(1))

print("\nBy distance (3rd downs only):")
third_down_agg = aggressive_context[aggressive_context['down'] == 3]
print(third_down_agg.groupby('yards_to_go')['pass_result'].value_counts(normalize=True))

WHEN DOES AGGRESSIVE SUCCEED?

By route type:
pass_result                    C     I   IN
route_of_targeted_receiver                 
ANGLE                       50.0  50.0  0.0
CORNER                      51.2  43.9  4.9
CROSS                       68.6  31.4  0.0
FLAT                        80.9  19.1  0.0
GO                          52.8  47.2  0.0
HITCH                       66.0  30.2  3.8
IN                          53.3  43.3  3.3
OUT                         72.9  26.0  1.0
POST                        78.1  21.9  0.0
SCREEN                      66.7  33.3  0.0
SLANT                       52.2  39.1  8.7
WHEEL                       45.5  54.5  0.0

By down:
pass_result     C     I   IN
down                        
1            65.2  33.9  0.9
2            65.4  32.3  2.3
3            63.6  34.8  1.6
4            61.1  33.3  5.6

By distance (3rd downs only):
yards_to_go  pass_result
1            C              0.636364
             I              0.363636
2            C          

In [64]:
# Look at ALL interceptions
all_ints = results_df[results_df['pass_result'] == 'IN']

print(f"Total INTs: {len(all_ints)}")
print("\nINT classification breakdown:")
print(all_ints['classification'].value_counts())
print(f"\nWhat % of INTs are classified as aggressive?")
print(f"Aggressive INTs: {len(all_ints[all_ints['decision_type'] == 'AGGRESSIVE'])} ({len(all_ints[all_ints['decision_type'] == 'AGGRESSIVE'])/len(all_ints)*100:.1f}%)")

# Check metrics for INT plays
print("\nAverage metrics for INT plays:")
print(all_ints.groupby('decision_type')[[
    'direction_change_degrees', 
    'separation_change', 
    'los_change_diff'
]].mean())

Total INTs: 78

INT classification breakdown:
classification
CONS: Press (INT)             42
CONS: Reactive Close          12
AGG: Successful Jump (INT)     8
CONS: Trail Coverage           6
CONS: Broke With WR            5
CONS: Reactive Mirror          2
BEATEN                         2
CONS: Standard                 1
Name: count, dtype: int64

What % of INTs are classified as aggressive?
Aggressive INTs: 8 (10.3%)

Average metrics for INT plays:
               direction_change_degrees  separation_change  los_change_diff
decision_type                                                              
AGGRESSIVE                    50.000851           1.985388        -0.851250
CONSERVATIVE                  24.210624          -0.555119        -0.820857


In [65]:
# Calculate aggression rate per CB
cb_aggression = results_df.groupby('cb_name').agg({
    'decision_type': lambda x: (x == 'AGGRESSIVE').sum() / len(x),
    'game_id': 'count'
}).rename(columns={'decision_type': 'aggression_rate', 'game_id': 'total_plays'})

# Filter to CBs with 20+ plays
cb_aggression = cb_aggression[cb_aggression['total_plays'] >= 20]
cb_aggression = cb_aggression.sort_values('aggression_rate', ascending=False)

print("Most aggressive CBs (players with 20+ plays):")
print(cb_aggression.head(10))

print("\nMost conservative CBs:")
print(cb_aggression.tail(10))

Most aggressive CBs (players with 20+ plays):
                     aggression_rate  total_plays
cb_name                                          
Darious Williams            0.320000           25
Patrick Peterson            0.280000           25
Sauce Gardner               0.250000           28
Sean Murphy-Bunting         0.225806           31
A.J. Terrell                0.222222           36
Carlton Davis III           0.206897           29
L'Jarius Sneed              0.200000           30
Jerry Jacobs                0.190476           21
Martin Emerson              0.189189           37
Darius Slay                 0.166667           36

Most conservative CBs:
                   aggression_rate  total_plays
cb_name                                        
Ja'Quan McMillian         0.045455           22
Greg Newsome II           0.041667           24
Jonathan Jones            0.035714           28
Michael Davis             0.030303           33
Asante Samuel             0.000000        

In [67]:
# For each aggressive play, when does separation start increasing?
def analyze_break_timing(game_id, play_id, cb_id, wr_id):
    """Calculate frame-by-frame separation to find break point"""
    
    cb_data = output_df[
        (output_df['game_id'] == game_id) & 
        (output_df['play_id'] == play_id) & 
        (output_df['nfl_id'] == cb_id)
    ].sort_values('frame_id')
    
    wr_data = output_df[
        (output_df['game_id'] == game_id) & 
        (output_df['play_id'] == play_id) & 
        (output_df['nfl_id'] == wr_id)
    ].sort_values('frame_id')
    
    merged = pd.merge(cb_data, wr_data, on='frame_id', suffixes=('_cb', '_wr'))
    merged['separation'] = np.sqrt(
        (merged['x_cb'] - merged['x_wr'])**2 + 
        (merged['y_cb'] - merged['y_wr'])**2
    )
    
    # When does separation start increasing?
    merged['sep_change'] = merged['separation'].diff()
    
    return merged

# Sample a few aggressive plays
aggressive_sample = results_df[
    results_df['decision_type'] == 'AGGRESSIVE'
].sample(5)

for idx, row in aggressive_sample.iterrows():
    timing = analyze_break_timing(
        row['game_id'], row['play_id'], 
        row['cb_id'], row['wr_id']
    )
    print(f"\n{row['cb_name']} vs {row['wr_name']} ({row['classification']})")
    print(timing[['frame_id', 'separation', 'sep_change']])


Kyle Dugger vs Isaiah McKenzie (AGG: Overcommit (Beaten))
    frame_id  separation  sep_change
0          1    8.498435         NaN
1          2    8.576788    0.078353
2          3    8.673506    0.096717
3          4    8.788822    0.115317
4          5    8.917937    0.129115
5          6    9.039386    0.121449
6          7    9.168042    0.128656
7          8    9.286980    0.118938
8          9    9.404169    0.117189
9         10    9.513622    0.109453
10        11    9.614141    0.100519
11        12    9.708785    0.094644
12        13    9.814443    0.105659
13        14    9.908628    0.094184
14        15   10.003929    0.095302
15        16   10.110020    0.106091

Jordan Battle vs Devin Singletary (AGG: Undercut Fail (Complete))
   frame_id  separation  sep_change
0         1    8.918705         NaN
1         2    8.497635   -0.421070
2         3    8.060484   -0.437151
3         4    7.619324   -0.441160
4         5    7.174204   -0.445120
5         6    6.748874   -0.

In [70]:
# Make sure you're using the dataframe with context data
if 'route_of_targeted_receiver' not in results_df.columns:
    print("Need to merge with supplemental data first...")
    results_with_context = pd.merge(
        results_df,
        supplemental_df[[
            'game_id', 'play_id',
            'route_of_targeted_receiver',
            'down', 'yards_to_go'
        ]],
        on=['game_id', 'play_id'],
        how='left'
    )
else:
    results_with_context = results_df

# Now run threshold test on results_with_context
thresholds_to_test = [25, 30, 35, 40, 45, 50]

for threshold in thresholds_to_test:
    # Reclassify with new threshold
    results_with_context['is_agg_test'] = (
        (results_with_context['direction_change_degrees'] > threshold) & 
        (results_with_context['separation_change'] > 0.5)
    ) | (
        (results_with_context['los_change_diff'] < -2.5) & 
        (results_with_context['separation_end'] > 4)
    )
    
    agg_test = results_with_context[results_with_context['is_agg_test']]
    
    print(f"\nThreshold = {threshold}°:")
    print(f"  Aggressive plays: {len(agg_test)} ({len(agg_test)/len(results_with_context)*100:.1f}%)")
    
    if len(agg_test) > 0:
        success_rate = len(agg_test[agg_test['pass_result'].isin(['I','IN'])]) / len(agg_test) * 100
        print(f"  Success rate: {success_rate:.1f}%")
        
        # Check route patterns (only if we have route data)
        if agg_test['route_of_targeted_receiver'].notna().any():
            route_success = agg_test.groupby('route_of_targeted_receiver').apply(
                lambda x: len(x[x['pass_result'].isin(['I','IN'])]) / len(x) * 100
            ).sort_values(ascending=False)
            print(f"  Top routes: {route_success.head(3).to_dict()}")
    else:
        print("  No aggressive plays at this threshold")

Need to merge with supplemental data first...

Threshold = 25°:
  Aggressive plays: 497 (13.3%)
  Success rate: 35.6%
  Top routes: {'WHEEL': 54.54545454545454, 'GO': 50.847457627118644, 'ANGLE': 50.0}

Threshold = 30°:
  Aggressive plays: 447 (11.9%)
  Success rate: 35.6%
  Top routes: {'WHEEL': 54.54545454545454, 'ANGLE': 50.0, 'CORNER': 48.78048780487805}

Threshold = 35°:
  Aggressive plays: 393 (10.5%)
  Success rate: 37.4%
  Top routes: {'WHEEL': 54.54545454545454, 'ANGLE': 50.0, 'CORNER': 48.57142857142857}

Threshold = 40°:
  Aggressive plays: 342 (9.1%)
  Success rate: 38.0%
  Top routes: {'SLANT': 60.0, 'WHEEL': 54.54545454545454, 'ANGLE': 50.0}

Threshold = 45°:
  Aggressive plays: 306 (8.2%)
  Success rate: 37.6%
  Top routes: {'SLANT': 54.54545454545454, 'WHEEL': 54.54545454545454, 'ANGLE': 50.0}

Threshold = 50°:
  Aggressive plays: 281 (7.5%)
  Success rate: 38.1%
  Top routes: {'SLANT': 54.54545454545454, 'WHEEL': 54.54545454545454, 'ANGLE': 50.0}


In [73]:
# Calculate velocity change (acceleration proxy)
def calculate_velocity_metrics(game_id, play_id, cb_id):
    cb_data = output_df[
        (output_df['game_id'] == game_id) & 
        (output_df['play_id'] == play_id) & 
        (output_df['nfl_id'] == cb_id)
    ].sort_values('frame_id')
    
    # Frame-to-frame movement
    cb_data['dx'] = cb_data['x'].diff()
    cb_data['dy'] = cb_data['y'].diff()
    cb_data['speed'] = np.sqrt(cb_data['dx']**2 + cb_data['dy']**2)
    
    # Acceleration (change in speed)
    cb_data['acceleration'] = cb_data['speed'].diff()
    
    # When does CB accelerate hard?
    max_accel_frame = cb_data['acceleration'].idxmax()
    
    return {
        'max_accel': cb_data.loc[max_accel_frame, 'acceleration'],
        'max_accel_frame': cb_data.loc[max_accel_frame, 'frame_id'],
        'avg_speed': cb_data['speed'].mean()
    }

# Compare aggressive vs conservative
sample_agg = results_df[results_df['decision_type'] == 'AGGRESSIVE'].sample(min(50, len(results_df[results_df['decision_type'] == 'AGGRESSIVE'])))
sample_cons = results_df[results_df['decision_type'] == 'CONSERVATIVE'].sample(min(50, len(results_df[results_df['decision_type'] == 'CONSERVATIVE'])))

# This would show if aggressive plays have different velocity patterns
sample_agg
sample_cons

Unnamed: 0,game_id,play_id,pass_result,wr_id,wr_name,cb_id,cb_name,cb_position,avg_distance,direction_change_degrees,separation_start,separation_end,separation_change,los_change_diff,classification,decision_type,num_frames,is_agg_test
3192,2023122405,2604,C,53541,Amon-Ra St. Brown,54583,Akayleb Evans,CB,3.620372,29.629538,4.65244,2.597306,-2.055134,-2.38,CONS: Reactive Close,CONSERVATIVE,10,False
1421,2023102300,1376,C,52433,Brandon Aiyuk,54583,Akayleb Evans,CB,3.329014,28.978414,2.467894,3.322845,0.854951,0.14,CONS: Reactive Mirror,CONSERVATIVE,13,True
62,2023091004,1746,I,55887,Jordan Addison,47877,Jamel Dean,CB,1.144485,39.616014,1.578005,1.008811,-0.569194,0.3,CONS: Reactive Close,CONSERVATIVE,17,False
2784,2023121005,119,I,52659,Stephen Sullivan,40017,Tyrann Mathieu,FS,1.477298,22.979943,0.809938,2.38533,1.575392,-1.97,CONS: Press (INC),CONSERVATIVE,25,False
2853,2023121010,3311,I,48456,Donald Parham,43387,Justin Simmons,FS,2.716909,21.492317,5.828182,0.682495,-5.145686,-4.78,CONS: Press (INC),CONSERVATIVE,10,False
3712,2024010711,3327,C,52608,Quez Watkins,44830,Adoree' Jackson,CB,3.29469,0.877713,3.637857,3.023194,-0.614663,-1.34,CONS: Trail Coverage,CONSERVATIVE,9,False
1588,2023102908,1602,C,54481,Jahan Dotson,54808,Reed Blankenship,FS,2.631102,5.082455,2.312488,2.650019,0.337531,-0.25,CONS: Press (Complete),CONSERVATIVE,11,False
619,2023092411,522,I,40011,Travis Kelce,56030,Terell Smith,CB,1.994348,3.010364,2.584956,1.394597,-1.19036,0.69,CONS: Press (INC),CONSERVATIVE,9,False
1819,2023110510,2460,I,52425,CeeDee Lamb,39984,Darius Slay,CB,0.838587,0.332135,0.977446,0.565685,-0.41176,0.31,CONS: Press (INC),CONSERVATIVE,22,False
2647,2023120309,637,C,53439,DeVonta Smith,53531,Ambry Thomas,CB,3.380094,157.958889,4.031241,2.583118,-1.448122,-1.49,CONS: Broke With WR,CONSERVATIVE,11,False
