In [42]:
import pandas as pd 
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

input_df = pd.read_csv('../data/train/input_2023_w01.csv')
print(input_df.head())

output_df = pd.read_csv('../data/train/output_2023_w01.csv')
print(output_df.head())

supplemental_df = pd.read_csv('../data/supplementary_data.csv')

      game_id  play_id  player_to_predict  nfl_id  frame_id play_direction  \
0  2023090700      101              False   54527         1          right   
1  2023090700      101              False   54527         2          right   
2  2023090700      101              False   54527         3          right   
3  2023090700      101              False   54527         4          right   
4  2023090700      101              False   54527         5          right   

   absolute_yardline_number player_name player_height  player_weight  ...  \
0                        42  Bryan Cook           6-1            210  ...   
1                        42  Bryan Cook           6-1            210  ...   
2                        42  Bryan Cook           6-1            210  ...   
3                        42  Bryan Cook           6-1            210  ...   
4                        42  Bryan Cook           6-1            210  ...   

          player_role      x      y     s     a     dir       o  \
0

  supplemental_df = pd.read_csv('../data/supplementary_data.csv')


In [43]:
# Merge input with supplemental data on game_id and play_id
final_df = pd.merge(supplemental_df, input_df, on=['game_id', 'play_id'], how='left')

print(f"Final shape: {final_df.shape}")
print(final_df.head())

Final shape: (302904, 62)
      game_id  season  week   game_date game_time_eastern home_team_abbr  \
0  2023090700    2023     1  09/07/2023          20:20:00             KC   
1  2023090700    2023     1  09/07/2023          20:20:00             KC   
2  2023090700    2023     1  09/07/2023          20:20:00             KC   
3  2023090700    2023     1  09/07/2023          20:20:00             KC   
4  2023090700    2023     1  09/07/2023          20:20:00             KC   

  visitor_team_abbr  play_id  \
0               DET     3461   
1               DET     3461   
2               DET     3461   
3               DET     3461   
4               DET     3461   

                                    play_description  quarter  ...  \
0  (10:46) (Shotgun) J.Goff pass deep left to J.R...        4  ...   
1  (10:46) (Shotgun) J.Goff pass deep left to J.R...        4  ...   
2  (10:46) (Shotgun) J.Goff pass deep left to J.R...        4  ...   
3  (10:46) (Shotgun) J.Goff pass deep left t

In [44]:
# Sort by game_id and play_id to ensure proper order
supplemental_df = supplemental_df.sort_values(['game_id', 'play_id'])

# Get the previous play's expected_points_added within each game
supplemental_df['prev_expected_points_added'] = supplemental_df.groupby('game_id')['expected_points_added'].shift(1)

# Calculate the change in EPA from the last play
supplemental_df['epa_change_from_last_play'] = supplemental_df['expected_points_added'] - supplemental_df['prev_expected_points_added']

# Check the result
#print(supplemental_df[['game_id', 'play_id', 'expected_points_added', 'prev_expected_points_added', 'epa_change_from_last_play']].head(20))
print(supplemental_df.columns)
print(supplemental_df.shape)

Index(['game_id', 'season', 'week', 'game_date', 'game_time_eastern',
       'home_team_abbr', 'visitor_team_abbr', 'play_id', 'play_description',
       'quarter', 'game_clock', 'down', 'yards_to_go', 'possession_team',
       'defensive_team', 'yardline_side', 'yardline_number',
       'pre_snap_home_score', 'pre_snap_visitor_score',
       'play_nullified_by_penalty', 'pass_result', 'pass_length',
       'offense_formation', 'receiver_alignment', 'route_of_targeted_receiver',
       'play_action', 'dropback_type', 'dropback_distance',
       'pass_location_type', 'defenders_in_the_box', 'team_coverage_man_zone',
       'team_coverage_type', 'penalty_yards', 'pre_penalty_yards_gained',
       'yards_gained', 'expected_points', 'expected_points_added',
       'pre_snap_home_team_win_probability',
       'pre_snap_visitor_team_win_probability',
       'home_team_win_probability_added', 'visitor_team_win_probility_added',
       'prev_expected_points_added', 'epa_change_from_last_play']

In [45]:
categorical_vars = ['offense_formation', 'receiver_alignment', 'route_of_targeted_receiver',
                    'play_action', 'dropback_type', 'pass_location_type', 
                    'team_coverage_man_zone', 'coverage_shell']

# Use epa_change_from_last_play to see actual impact


# For continuous variables
print("\nDropback Distance correlation with EPA Change:")
print(supplemental_df[['dropback_distance', 'epa_change_from_last_play']].corr())

# Create a new simplified coverage column
supplemental_df['coverage_shell'] = supplemental_df['team_coverage_type'].str.replace('_MAN', '').str.replace('_ZONE', '')

# Check the new values
print("Simplified Coverage Types:")
print(supplemental_df.groupby('coverage_shell')['epa_change_from_last_play'].agg(['mean', 'count', 'std']).sort_values('mean', ascending=False))

for var in categorical_vars:
    print(f"\n{var.upper()}:")
    print(supplemental_df.groupby(var)['epa_change_from_last_play'].agg(['mean', 'count', 'std']).sort_values('mean', ascending=False))


Dropback Distance correlation with EPA Change:
                           dropback_distance  epa_change_from_last_play
dropback_distance                    1.00000                   -0.00317
epa_change_from_last_play           -0.00317                    1.00000
Simplified Coverage Types:
                    mean  count       std
coverage_shell                           
COVER_1         0.069325   4025  2.400311
COVER_2         0.026411   2803  2.252582
COVER_3        -0.032846   5531  2.286238
COVER_4        -0.068406   2815  2.201964
COVER_6        -0.070321   1660  2.266725
COVER_0        -0.081306    773  2.503411
PREVENT        -0.467970     48  1.980563

OFFENSE_FORMATION:
                       mean  count       std
offense_formation                           
WILDCAT            2.466206      3  1.182529
JUMBO              0.334705     36  2.247562
EMPTY              0.026039   2090  2.374883
SHOTGUN            0.020335  12536  2.277655
I_FORM            -0.057051    383  2.260

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

def train_model_subset(df, var_list):
    """Train a model on a subset of variables"""
    df_ml = df.dropna(subset=var_list + ['epa_change_from_last_play'])
    
    encoders = {}
    encoded_cols = []
    
    for var in var_list:
        if var in ['defenders_in_the_box', 'dropback_distance']:
            # Numeric variables - no encoding needed
            encoded_cols.append(var)
        else:
            # Categorical - needs encoding
            le = LabelEncoder()
            df_ml[f'{var}_encoded'] = le.fit_transform(df_ml[var].astype(str))
            encoders[var] = le
            encoded_cols.append(f'{var}_encoded')
    
    X = df_ml[encoded_cols]
    y = df_ml['epa_change_from_last_play']
    
    # Train model
    rf = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42)
    rf.fit(X, y)
    
    # Get score
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf.fit(X_train, y_train)
    score = rf.score(X_test, y_test)
    
    print(f"Model trained on {len(var_list)} variables - R¬≤ Score: {score:.3f}")
    
    return rf, encoders

def train_all_models(supplemental_df):
    """Train multiple models for different scenarios"""
    
    print("Training multiple models...\n")
    
    models = {}
    
    # Model 1: Pre-snap offensive playcalling only
    print("1. OFFENSE ONLY MODEL (pre-snap decisions)")
    offense_vars = ['offense_formation', 'receiver_alignment', 'route_of_targeted_receiver', 'play_action']
    models['offense_only'] = {
        'model': train_model_subset(supplemental_df, offense_vars)[0],
        'encoders': train_model_subset(supplemental_df, offense_vars)[1],
        'vars': offense_vars,
        'description': 'Use when planning plays (no defensive info)'
    }
    
    # Model 2: With defensive alignment (pre-snap reads)
    print("\n2. PRE-SNAP READ MODEL (offense + defensive alignment)")
    presnap_vars = ['offense_formation', 'receiver_alignment', 'route_of_targeted_receiver', 
                    'play_action', 'defenders_in_the_box', 'coverage_shell']
    models['presnap'] = {
        'model': train_model_subset(supplemental_df, presnap_vars)[0],
        'encoders': train_model_subset(supplemental_df, presnap_vars)[1],
        'vars': presnap_vars,
        'description': 'Use when you can read defensive alignment pre-snap'
    }
    
    # Model 3: With coverage ID (post-snap reads)
    print("\n3. COVERAGE READ MODEL (offense + identified coverage)")
    coverage_vars = ['offense_formation', 'receiver_alignment', 'route_of_targeted_receiver',
                     'play_action', 'defenders_in_the_box', 'coverage_shell','team_coverage_man_zone']
    models['coverage'] = {
        'model': train_model_subset(supplemental_df, coverage_vars)[0],
        'encoders': train_model_subset(supplemental_df, coverage_vars)[1],
        'vars': coverage_vars,
        'description': 'Use when you know the coverage type'
    }
    
    # Model 4: Full model (everything known)
    print("\n4. FULL MODEL (all variables)")
    full_vars = ['offense_formation', 'receiver_alignment', 'route_of_targeted_receiver',
                 'play_action', 'dropback_type', 'pass_location_type', 
                 'team_coverage_man_zone', 'team_coverage_type', 
                 'defenders_in_the_box', 'dropback_distance']
    models['full'] = {
        'model': train_model_subset(supplemental_df, full_vars)[0],
        'encoders': train_model_subset(supplemental_df, full_vars)[1],
        'vars': full_vars,
        'description': 'Use for post-play analysis with all info'
    }
    
    print("\n" + "="*60)
    print("All models trained successfully!")
    print("="*60)
    
    return models

def predict_epa_smart(models, **play_params):
    """
    Automatically selects the best model based on provided parameters
    """
    # Determine which model to use based on available params
    param_keys = set(play_params.keys())
    
    if 'dropback_distance' in param_keys or 'pass_location_type' in param_keys:
        model_choice = 'full'
    elif 'team_coverage_type' in param_keys:
        model_choice = 'coverage'
    elif 'defenders_in_the_box' in param_keys or 'team_coverage_man_zone' in param_keys:
        model_choice = 'presnap'
    else:
        model_choice = 'offense_only'
    
    print(f"Using {model_choice.upper()} model")
    print(f"({models[model_choice]['description']})\n")
    
    model_info = models[model_choice]
    model = model_info['model']
    encoders = model_info['encoders']
    required_vars = model_info['vars']
    
    # Check if all required variables are provided
    missing = [var for var in required_vars if var not in play_params]
    if missing:
        print(f"ERROR: Missing required variables for {model_choice} model: {missing}")
        print(f"Required: {required_vars}")
        return None
    
    # Encode inputs
    encoded_values = []
    for var in required_vars:
        value = play_params[var]
        
        if var in ['defenders_in_the_box', 'dropback_distance']:
            # Numeric - use directly
            encoded_values.append(value)
        else:
            # Categorical - encode
            value_str = str(value)
            if value_str not in encoders[var].classes_:
                print(f"ERROR: '{value_str}' not a valid value for {var}")
                print(f"Valid values: {list(encoders[var].classes_[:10])}")
                return None
            encoded_values.append(encoders[var].transform([value_str])[0])
    
    # Predict
    prediction = model.predict([encoded_values])[0]
    
    print(f"{'='*60}")
    print(f"PREDICTED EPA CHANGE: {prediction:+.3f}")
    print(f"{'='*60}")
    print("\nPlay Details:")
    for key, value in play_params.items():
        print(f"  {key}: {value}")
    
    return prediction

# Train all models
all_models = train_all_models(supplemental_df)

In [49]:
# Test with different scenarios
print("\n\n" + "="*60)
print("EXAMPLE 1: Planning a play (offense only)")
print("="*60)
predict_epa_smart(all_models,
                  offense_formation='SHOTGUN',
                  receiver_alignment='3x1',
                  route_of_targeted_receiver='WHEEL',
                  play_action='True')

print("\n\n" + "="*60)
print("EXAMPLE 2: Pre-snap read (identified 6 defenders in box)")
print("="*60)
predict_epa_smart(all_models,
                  offense_formation='SHOTGUN',
                  receiver_alignment='3x1',
                  route_of_targeted_receiver='SLANT',
                  play_action='False',
                  defenders_in_the_box=6,
                  coverage_shell='COVER_1', )

print("\n\n" + "="*60)
print("EXAMPLE 3: Post-snap (identified Cover 3)")
print("="*60)
predict_epa_smart(all_models,
                  offense_formation='SHOTGUN',
                  receiver_alignment='2x2',
                  route_of_targeted_receiver='GO',
                  play_action='False',
                  defenders_in_the_box=6,
                  coverage_shell='COVER_3',
                  team_coverage_man_zone='MAN')



EXAMPLE 1: Planning a play (offense only)
Using OFFENSE_ONLY model
(Use when planning plays (no defensive info))

PREDICTED EPA CHANGE: +0.111

Play Details:
  offense_formation: SHOTGUN
  receiver_alignment: 3x1
  route_of_targeted_receiver: WHEEL
  play_action: True


EXAMPLE 2: Pre-snap read (identified 6 defenders in box)
Using PRESNAP model
(Use when you can read defensive alignment pre-snap)

PREDICTED EPA CHANGE: +0.264

Play Details:
  offense_formation: SHOTGUN
  receiver_alignment: 3x1
  route_of_targeted_receiver: SLANT
  play_action: False
  defenders_in_the_box: 6
  coverage_shell: COVER_1


EXAMPLE 3: Post-snap (identified Cover 3)
Using PRESNAP model
(Use when you can read defensive alignment pre-snap)

PREDICTED EPA CHANGE: +0.093

Play Details:
  offense_formation: SHOTGUN
  receiver_alignment: 2x2
  route_of_targeted_receiver: GO
  play_action: False
  defenders_in_the_box: 6
  coverage_shell: COVER_3
  team_coverage_man_zone: MAN




0.0934004790145611

In [54]:

from dash import Dash, dcc, html, Input, Output, State, ALL
import plotly.express as px
import pandas as pd
from dash.exceptions import PreventUpdate

# Initialize the Dash app
app = Dash(__name__, suppress_callback_exceptions=True)

# Define model options
model_options = [
    {'label': 'Offense Only (Pre-snap planning)', 'value': 'offense_only'},
    {'label': 'Pre-snap Read (Defensive alignment)', 'value': 'presnap'},
    {'label': 'Coverage Read (Post-snap coverage ID)', 'value': 'coverage'},
    {'label': 'Full Model (All variables)', 'value': 'full'}
]

# Function to get dropdown options from data
def get_dropdown_options(column_name):
    unique_values = supplemental_df[column_name].dropna().unique()
    return [{'label': str(val), 'value': str(val)} for val in sorted(unique_values)]

# App layout
app.layout = html.Div([
    html.H1("NFL EPA Prediction Dashboard", style={'textAlign': 'center', 'color': '#013369'}),
    
    html.Div([
        html.Label("Select Model:", style={'fontSize': 18, 'fontWeight': 'bold'}),
        dcc.Dropdown(
            id='model-selector',
            options=model_options,
            value='offense_only',
            style={'width': '100%'}
        ),
    ], style={'padding': '20px', 'backgroundColor': '#f0f0f0', 'borderRadius': '5px', 'margin': '20px'}),
    
    html.Div(id='input-fields', style={'padding': '20px'}),
    
    html.Div([
        html.Button('PREDICT EPA', id='predict-button', n_clicks=0, 
                   style={'fontSize': 20, 'padding': '15px 30px', 'backgroundColor': '#013369', 
                          'color': 'white', 'border': 'none', 'borderRadius': '5px', 'cursor': 'pointer'})
    ], style={'textAlign': 'center', 'margin': '20px'}),
    
    html.Div(id='prediction-output', style={'padding': '20px'})
])

# Callback to update input fields based on selected model
@app.callback(
    Output('input-fields', 'children'),
    Input('model-selector', 'value')
)
def update_input_fields(selected_model):
    if not selected_model:
        raise PreventUpdate
    
    required_vars = all_models[selected_model]['vars']
    
    inputs = []
    inputs.append(html.H3(f"Enter variables for {selected_model.replace('_', ' ').title()} model:", 
                         style={'color': '#013369'}))
    
    for var in required_vars:
        if var in ['defenders_in_the_box', 'dropback_distance']:
            # Numeric input
            if var == 'defenders_in_the_box':
                inputs.append(html.Div([
                    html.Label(f"{var.replace('_', ' ').title()}:", style={'fontWeight': 'bold'}),
                    dcc.Input(
                        id={'type': 'dynamic-input', 'index': var},
                        type='number',
                        placeholder='Enter number (e.g., 6)',
                        min=0,
                        max=11,
                        step=1,
                        style={'width': '100%', 'padding': '8px', 'margin': '5px 0'}
                    )
                ], style={'marginBottom': '15px'}))
            else:  # dropback_distance
                inputs.append(html.Div([
                    html.Label(f"{var.replace('_', ' ').title()}:", style={'fontWeight': 'bold'}),
                    dcc.Input(
                        id={'type': 'dynamic-input', 'index': var},
                        type='number',
                        placeholder='Enter distance in yards (e.g., 5.5)',
                        min=0,
                        max=20,
                        step=0.1,
                        style={'width': '100%', 'padding': '8px', 'margin': '5px 0'}
                    )
                ], style={'marginBottom': '15px'}))
        else:
            # Categorical dropdown
            inputs.append(html.Div([
                html.Label(f"{var.replace('_', ' ').title()}:", style={'fontWeight': 'bold'}),
                dcc.Dropdown(
                    id={'type': 'dynamic-input', 'index': var},
                    options=get_dropdown_options(var),
                    placeholder=f'Select {var.replace("_", " ")}...',
                    style={'width': '100%'}
                )
            ], style={'marginBottom': '15px'}))
    
    return inputs

# Callback to make prediction
@app.callback(
    Output('prediction-output', 'children'),
    Input('predict-button', 'n_clicks'),
    State('model-selector', 'value'),
    State({'type': 'dynamic-input', 'index': ALL}, 'value'),
    State({'type': 'dynamic-input', 'index': ALL}, 'id')
)
def predict_epa_callback(n_clicks, selected_model, input_values, input_ids):
    if n_clicks == 0:
        return html.Div("Select your inputs and click PREDICT EPA", 
                       style={'textAlign': 'center', 'fontSize': 18, 'color': '#666'})
    
    if not selected_model:
        raise PreventUpdate
    
    # Map inputs to variable names
    play_params = {}
    for input_id, value in zip(input_ids, input_values):
        var_name = input_id['index']
        if value is None or value == '':
            return html.Div([
                html.H3("‚ùå Error", style={'color': 'red'}),
                html.P(f"Please fill in all required fields. Missing: {var_name.replace('_', ' ').title()}")
            ], style={'padding': '20px', 'backgroundColor': '#ffe6e6', 'borderRadius': '5px'})
        play_params[var_name] = value
    
    # Get model info
    model_info = all_models[selected_model]
    model = model_info['model']
    encoders = model_info['encoders']
    required_vars = model_info['vars']
    
    # Encode inputs
    try:
        encoded_values = []
        for var in required_vars:
            value = play_params[var]
            
            if var in ['defenders_in_the_box', 'dropback_distance']:
                encoded_values.append(float(value))
            else:
                value_str = str(value)
                if value_str not in encoders[var].classes_:
                    return html.Div([
                        html.H3("‚ùå Error", style={'color': 'red'}),
                        html.P(f"Invalid value '{value_str}' for {var}")
                    ], style={'padding': '20px', 'backgroundColor': '#ffe6e6', 'borderRadius': '5px'})
                encoded_values.append(encoders[var].transform([value_str])[0])
        
        # Make prediction
        prediction = model.predict([encoded_values])[0]
        
        # Create output display
        output = html.Div([
            html.H2("üèà Prediction Results", style={'color': '#013369', 'textAlign': 'center'}),
            html.Div([
                html.H1(f"{prediction:+.3f}", 
                       style={'fontSize': 60, 'color': '#28a745' if prediction > 0 else '#dc3545', 
                              'textAlign': 'center', 'margin': '20px'})
            ], style={'backgroundColor': '#f8f9fa', 'padding': '20px', 'borderRadius': '10px'}),
            html.H4("Expected Points Added", style={'textAlign': 'center', 'color': '#666'}),
            
            html.Hr(),
            
            html.H4("Play Configuration:", style={'color': '#013369'}),
            html.Ul([
                html.Li(f"{var.replace('_', ' ').title()}: {play_params[var]}", 
                       style={'fontSize': 16, 'padding': '5px'})
                for var in required_vars
            ])
        ], style={'padding': '30px', 'backgroundColor': '#ffffff', 'borderRadius': '10px', 
                 'boxShadow': '0 4px 6px rgba(0,0,0,0.1)'})
        
        return output
        
    except Exception as e:
        return html.Div([
            html.H3("‚ùå Error", style={'color': 'red'}),
            html.P(f"Prediction failed: {str(e)}")
        ], style={'padding': '20px', 'backgroundColor': '#ffe6e6', 'borderRadius': '5px'})

# Run the app
if __name__ == '__main__':
    app.run(debug=True, port=8050)


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but RandomFo