https://rowzero.io/blog/nfl-fantasy-football-data-with-python

https://stackoverflow.com/questions/75265403/web-scraping-espn-nfl-webpage-with-python


football passing locations
https://github.com/ArrowheadAnalytics/next-gen-scrapy-2.0


salary data:
https://overthecap.com/



4th quarter sources:
https://www.espn.com/nfl/story/_/id/33059528/nfl-game-management-cheat-sheet-punt-go-kick-field-goal-fourth-downs-plus-2-point-conversion-recommendations


https://www.bruinsportsanalytics.com/post/4th-down-model



Here are the 6 questions:

    - What avenue of player acquisition do you think currently provides teams with the most value per dollar spend and why?
    - Imagine that you are tasked with evaluating the accuracy of three different college-to-pro player projection systems for wide receivers. You have both the projections and actual pro statistics for the past 10 seasons. Discuss how you would approach the problem and list any potential issues you may encounter.
    - Choose any active player in the NFL. How do you assess the quality of this player relative to their position group, and why? How would you value this player in terms of dollars, and how does this compare to their current contract?
    - A defensive coach approaches you and asks for an offensive team's tendencies when they're aligned in a 3x1 bunch formation. What types of tendencies would you look for, and how would you communicate your results to the coach?
    - The head coach has a difficult decision to make on 4th down. Discuss how you would evaluate the possible options using data.
    - Why does football matter to you?


In [2]:
%%writefile ../src/features/acquisition_value.py
import pandas as pd
import nfl_data_py as nfl

def analyze_acquisition_value(years):
    if not isinstance(years, (list, range)):
        raise ValueError("years variable must be list or range.")
    
    draft_data = nfl.import_draft_picks(years)
    seasonal_data = nfl.import_seasonal_data(years, s_type='REG')

    try:
        salary_df = nfl.import_contracts()
        if 'year_signed' not in salary_df.columns:
            print("Warning: 'year_signed' column not found in salary data. Skipping salary analysis.")
            return draft_data, seasonal_data

        required_cols = ['player', 'year_signed', 'value']
        if not all(col in salary_df.columns for col in required_cols):
            print(f"Warning: Missing required columns in salary data. Expected {required_cols}. Found {salary_df.columns.tolist()}. Skipping salary analysis.")
            return draft_data, seasonal_data

        seasonal_data = pd.merge(seasonal_data, salary_df[required_cols], left_on=['player', 'season'], right_on=['player', 'year_signed'], how='left')
        seasonal_data['value_per_dollar'] = seasonal_data['approximate_value'] / seasonal_data['value']
    except Exception as e:
        print(f"Error processing salary data: {str(e)}. Skipping salary analysis.")
        return draft_data, seasonal_data

    drafted_players = seasonal_data[seasonal_data['draft_number'].notnull()]
    undrafted_players = seasonal_data[seasonal_data['draft_number'].isnull()]

    return drafted_players, undrafted_players


Overwriting ../src/features/acquisition_value.py


In [3]:
%%writefile ../src/analysis/wr_projection.py
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

def evaluate_wr_projections(projections, actual_stats):
    # Debug: Print columns of projections and actual_stats DataFrames
    print("Projections columns:", projections.columns)
    print("Actual stats columns:", actual_stats.columns)
    
    # Perform merge
    merged_data = pd.merge(projections, actual_stats, on=['player_id', 'season'], suffixes=('_proj', '_actual'))
    
    # Debug: Print columns of the merged DataFrame
    print("Merged data columns:", merged_data.columns)
    
    metrics = ['receptions', 'receiving_yards', 'receiving_tds']  # Updated here
    results = {}

    for metric in metrics:
        actual_col = f'{metric}_actual'
        proj_col = f'{metric}_proj'
        if actual_col not in merged_data.columns or proj_col not in merged_data.columns:
            print(f"Column {actual_col} or {proj_col} not found in merged_data")
            continue

        mae = mean_absolute_error(merged_data[actual_col], merged_data[proj_col])
        rmse = np.sqrt(mean_squared_error(merged_data[actual_col], merged_data[proj_col]))
        results[metric] = {'MAE': mae, 'RMSE': rmse}

    return results


Overwriting ../src/analysis/wr_projection.py


In [4]:
%%writefile ../src/analysis/player_quality.py
import pandas as pd

def assess_player_quality(player_data, position_data):
    player_stats = player_data.iloc[0]

    percentiles = {}
    for stat in ['passing_yards', 'rushing_yards', 'receptions', 'receiving_yards', 'touchdowns']:
        if stat in player_stats and stat in position_data:
            percentile = (position_data[stat] < player_stats[stat]).mean() * 100
            percentiles[stat] = percentile

    return percentiles

Overwriting ../src/analysis/player_quality.py


In [5]:
%%writefile ../src/analysis/offensive_tendencies.py
import pandas as pd


def analyze_3x1_bunch_formation(play_data):
    # Filter plays for '3x1 bunch' inferred formation: 1 RB, 1 TE, 3 WR
    bunch_formation_plays = play_data[play_data['offense_personnel'] == '1 RB, 1 TE, 3 WR']
    
    # Debug: Check the number of plays in the 3x1 bunch formation
    print("Number of plays in 3x1 bunch formation:", bunch_formation_plays.shape[0])
    
    tendencies = {
        'run_percentage': (bunch_formation_plays['play_type'] == 'run').mean() * 100,
        'pass_percentage': (bunch_formation_plays['play_type'] == 'pass').mean() * 100,
        'avg_yards_gained': bunch_formation_plays['yards_gained'].mean(),
        'success_rate': (bunch_formation_plays['success'] == 1).mean() * 100
    }
    
    # Additional analyses
    down_tendencies = bunch_formation_plays.groupby('down')['play_type'].value_counts(normalize=True).unstack()
    
    situational_tendencies = bunch_formation_plays.groupby(['down', 'yardline_100'])['play_type'].value_counts(normalize=True).unstack()
    
    return tendencies, down_tendencies, situational_tendencies

# Example usage
# years = [2020, 2021, 2022]
# play_data = pd.concat([nfl.import_pbp_data([year]) for year in years])
# tendencies, down_tendencies, situational_tendencies = analyze_3x1_bunch_formation(play_data)

# print("Overall Tendencies:", tendencies)
# print("Tendencies by Down:", down_tendencies)
# print("Situational Tendencies:", situational_tendencies.head(10))


Overwriting ../src/analysis/offensive_tendencies.py


In [6]:
%%writefile ../src/analysis/fourth_down_analysis.py
import pandas as pd

def analyze_fourth_down_decisions(pbp_data):
    fourth_down_plays = pbp_data[pbp_data['down'] == 4]
    decisions = fourth_down_plays.groupby(['season', 'play_type']).size().unstack(fill_value=0).reset_index()
    decisions['total'] = decisions.sum(axis=1)
    success_rates = fourth_down_plays.groupby(['season', 'play_type'])['success'].mean().unstack(fill_value=0).reset_index()
    decisions.columns = ['Season'] + [f"{col.capitalize()} (%)" for col in decisions.columns[1:]]
    success_rates.columns = ['Season'] + [f"{col.capitalize()} (%)" for col in success_rates.columns[1:]]
    return decisions, success_rates

Overwriting ../src/analysis/fourth_down_analysis.py


In [7]:
%%writefile ../src/features/contracts.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_player_contract_history(player_url):
    response = requests.get(player_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table')

    if not tables:
        return None

    for table in tables:
        headers = [th.text.strip() for th in table.find_all('th')]
        if 'Year' in headers and 'Age' in headers and 'Base Salary' in headers:
            rows = []
            for tr in table.find_all('tr')[1:]:
                row = [td.text.strip() for td in tr.find_all('td')]
                if row:
                    row = row[:len(headers)]
                    rows.append(row)

            df = pd.DataFrame(rows, columns=headers)
            for col in df.columns[1:]:
                df[col] = df[col].replace('[\$,]', '', regex=True).astype(float, errors='ignore')

            return df

    return None

def get_current_contracts():
    url = 'https://overthecap.com/cash-flows'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table')

    if not tables:
        return None

    for table in tables:
        headers = [th.text.strip() for th in table.find_all('th')]
        if 'Player' in headers and 'Team' in headers and 'Position' in headers:
            rows = []
            for tr in table.find_all('tr')[1:]:
                row = [td.text.strip() for td in tr.find_all('td')]
                if row:
                    rows.append(row)

            df = pd.DataFrame(rows, columns=headers)
            for col in df.columns[3:]:
                df[col] = df[col].replace('[\$,]', '', regex=True).astype(float, errors='ignore')

            return df

    return None

def get_salary_cap_data():
    url = 'https://overthecap.com/salary-cap-space'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table')

    if not tables:
        return None

    for table in tables:
        headers = [th.text.strip() for th in table.find_all('th')]
        if 'Team' in headers and 'Cap Space' in headers:
            rows = []
            for tr in table.find_all('tr')[1:]:
                row = [td.text.strip() for td in tr.find_all('td')]
                if row:
                    rows.append(row)

            df = pd.DataFrame(rows, columns=headers)
            for col in df.columns[1:]:
                df[col] = df[col].replace('[\$,]', '', regex=True).astype(float, errors='ignore')

            return df

    return None

def get_selected_players_contract_history(player_urls):
    all_player_data = []

    for url in player_urls:
        try:
            player_df = get_player_contract_history(url)
            if player_df is not None:
                player_name = url.split('/')[-2].replace('-', ' ').title()
                player_df['Player'] = player_name
                all_player_data.append(player_df)
            time.sleep(1)
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")

    if all_player_data:
        try:
            return pd.concat(all_player_data, ignore_index=True)
        except Exception as e:
            print(f"Error concatenating player data: {str(e)}")
            return None
    else:
        return None


Overwriting ../src/features/contracts.py


In [8]:
%%writefile ../src/features/nfl_data.py
import nfl_data_py as nfl
import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_seasonal_data(year):
    year_list = [int(year)]
    df = nfl.import_seasonal_data(year_list)
    id_df = nfl.import_ids()[['gsis_id', 'name']]
    df = pd.merge(df, id_df, left_on='player_id', right_on='gsis_id', how='left')

    salary_df = nfl.import_contracts()
    if 'year_signed' not in salary_df.columns:
        print("'year_signed' column not found in salary data")
    df = pd.merge(df, salary_df[['player', 'year_signed', 'value']], left_on=['name', 'season'], right_on=['player', 'year_signed'], how='left', suffixes=('_left', '_right'))

    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-2]
    return df[cols].sort_values('name')

def get_wr_data(years):
    seasonal_data = nfl.import_seasonal_data(years)
    
    wr_data = seasonal_data[
        (seasonal_data['receptions'].notna()) & 
        (seasonal_data['receiving_yards'].notna()) & 
        (seasonal_data['targets'].notna()) &
        (seasonal_data['receptions'] > 0)
    ]

    player_ids = nfl.import_ids()
    wr_data = pd.merge(wr_data, player_ids[['gsis_id', 'name', 'weight', 'height', 'age']], left_on='player_id', right_on='gsis_id', how='left')
    
    salary_data = nfl.import_contracts()
    wr_data = pd.merge(wr_data, salary_data[['player', 'year_signed', 'value', 'apy', 'team']], 
                       left_on=['name', 'season'], right_on=['player', 'year_signed'], how='left')

    wr_data['availability'] = wr_data['games'] / 17
    wr_data = wr_data.dropna(subset=['apy'])  # Remove players without salary data
    
    return wr_data

# Example usage
years = range(2013, 2024)
wr_data = get_wr_data(years)
print(wr_data.head())

def get_weekly_data(year):
    return nfl.import_weekly_data([int(year)])

def get_play_by_play_data(year):
    return nfl.import_pbp_data([int(year)])

def get_weekly_roster_data(year):
    return nfl.import_weekly_rosters([int(year)])

def get_ngs_data(stat_type, year):
    return nfl.import_ngs_data(stat_type, [int(year)])

def get_ftn_data(year):
    return nfl.import_ftn_data([int(year)])

def get_salary_cap_data():
    url = 'https://overthecap.com/salary-cap-space'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table')

    if not tables:
        return None

    for table in tables:
        headers = [th.text.strip() for th in table.find_all('th')]
        if 'Team' in headers and 'Cap Space' in headers:
            rows = []
            for tr in table.find_all('tr')[1:]:
                row = [td.text.strip() for td in tr.find_all('td')]
                if row:
                    rows.append(row)

            df = pd.DataFrame(rows, columns=headers)
            for col in df.columns[1:]:
                df[col] = df[col].replace('[\$,]', '', regex=True).astype(float, errors='ignore')

            return df

    return None

def get_combined_data(year):
    seasonal_data = nfl.import_seasonal_data([year])
    ids = nfl.import_ids()[['gsis_id', 'name']]
    seasonal_data = pd.merge(seasonal_data, ids, left_on='player_id', right_on='gsis_id', how='left')

    salary_data = nfl.import_contracts()
    salary_cap_data = get_salary_cap_data()

    combined_data = pd.merge(seasonal_data, salary_data, left_on=['name', 'season'], right_on=['player', 'year_signed'], how='left', suffixes=('_season', '_salary'))
    combined_data = pd.merge(combined_data, salary_cap_data, left_on='team', right_on='Team', how='left')

    return combined_data

def filter_qbs_early_career(data, years=3):
    qbs = data[data['position'] == 'QB']
    early_career_qbs = qbs[qbs['season'] - qbs['draft_year'] <= years]
    return early_career_qbs

def calculate_roi(data):
    data['cap_pct'] = data['value'] / data['Cap Space']
    data['roi'] = data['passing_yards'] / data['value']
    return data

def main():
    year = 2023
    combined_data = get_combined_data(year)
    early_career_qbs = filter_qbs_early_career(combined_data)
    roi_data = calculate_roi(early_career_qbs)
    print(roi_data[['name', 'team', 'season', 'passing_yards', 'value', 'cap_pct', 'roi']])

if __name__ == "__main__":
    main()


Overwriting ../src/features/nfl_data.py


In [9]:
%%writefile ../app.py
import streamlit as st
import matplotlib.pyplot as plt 
import pandas as pd
import nfl_data_py as nfl
import plotly.express as px
from src.features.nfl_data import (
    get_combined_data,
    get_seasonal_data,
    get_weekly_data,
    get_play_by_play_data,
    get_weekly_roster_data,
    get_ngs_data,
    get_ftn_data,
    filter_qbs_early_career,
    calculate_roi,
    get_wr_data  # Added here
)
from src.features.contracts import (
    get_current_contracts,
    get_salary_cap_data,
    get_selected_players_contract_history
)
from src.features.acquisition_value import analyze_acquisition_value
from src.analysis.wr_projection import evaluate_wr_projections
from src.analysis.player_quality import assess_player_quality
from src.analysis.offensive_tendencies import analyze_3x1_bunch_formation
from src.analysis.fourth_down_analysis import analyze_fourth_down_decisions
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Import custom functions
from src.features.nfl_data import get_wr_data
from src.analysis.wr_projection import evaluate_wr_projections
from src.analysis.player_quality import assess_player_quality

def calculate_advanced_metrics(df):
    df['receiving_yards_per_game'] = df['receiving_yards'] / df['games']
    df['receptions_per_game'] = df['receptions'] / df['games']
    df['touchdowns_per_game'] = df['receiving_tds'] / df['games']
    df['targets_per_game'] = df['targets'] / df['games']
    return df

def prepare_for_regression(df):
    features = [
        'receiving_yards_per_game', 'receptions_per_game', 'touchdowns_per_game', 
        'targets_per_game', 'age', 'weight', 'height', 'availability'
    ]
    target = 'apy'
    
    df_clean = df.dropna(subset=features + [target])
    
    X = df_clean[features]
    y = df_clean[target]

    # Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

    return train_test_split(X_scaled, y, test_size=0.2, random_state=42), scaler, features

def run_regression_models(X_train, X_test, y_train, y_test):
    models = {
        'Lasso': Lasso(alpha=0.1, random_state=42),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
    }
    
    results = {}
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results[name] = {'model': model, 'MSE': mse, 'R2': r2}
    
    return results

def evaluate_player(player_name, player_data, best_model, scaler, selected_features):
    player = player_data[player_data['name'] == player_name].iloc[-1]  # Get the most recent season data
    
    player_features = player[selected_features].values.reshape(1, -1)
    player_features_scaled = scaler.transform(player_features)
    predicted_apy = best_model.predict(player_features_scaled)[0]
    
    actual_apy = player['apy']
    
    return {
        'player': player_name,
        'actual_apy': actual_apy,
        'predicted_apy': predicted_apy,
        'difference': predicted_apy - actual_apy,
        'features': {feature: player[feature] for feature in selected_features}
    }

def plot_player_comparison(players_data, feature):
    plt.figure(figsize=(10, 6))
    names = [player['player'] for player in players_data]
    values = [player['features'][feature] for player in players_data]
    plt.bar(names, values)
    plt.title(f"Comparison of {feature}")
    plt.ylabel(feature)
    plt.tight_layout()
    plt.savefig(f'{feature}_comparison.png')
    plt.close()


# Function to create a plotly line chart
def create_line_chart(data, x, y, title):
    fig = px.line(data, x=x, y=y, title=title)
    return fig

# Function to create a plotly scatter plot
def create_scatter_plot(data, x, y, title):
    fig = px.scatter(data, x=x, y=y, title=title)
    return fig

# Function to display data in Streamlit
def display_dataframe(df, title):
    st.subheader(title)
    st.write(df.head())
    print(df.head())  # Debugging line

# Main app
def main():
    st.title("NFL Data Analysis")

    # Sidebar for navigation
    page = st.sidebar.selectbox("Choose a section", 
        ["Question 1: Player Acquisition Value", "Question 2: WR Projection Evaluation",
        "Question 3: Player Quality Assessment", "Question 4: Offensive Tendencies",
        "Question 5: 4th Down Decision Making", "Question 6: Football Significance", 
        "Contracts Data"])
    
    # Load data
    years = range(2013, 2024)
    wr_data = get_wr_data(years)
    wr_data = calculate_advanced_metrics(wr_data)
    (X_train, X_test, y_train, y_test), scaler, selected_features = prepare_for_regression(wr_data)
    regression_results = run_regression_models(X_train, X_test, y_train, y_test)

    # Determine best model
    best_model_name = max(regression_results, key=lambda x: regression_results[x]['R2'])
    best_model = regression_results[best_model_name]['model']


    if page == "Question 1: Player Acquisition Value":
        st.header("Player Acquisition Value Analysis")
        st.write("What avenue of player acquisition do you think currently provides teams with the most value per dollar spend and why?")
        st.write("""Question: What avenue of player acquisition do you think currently provides teams with the most value per dollar spend and why?
                              
                                 
                 Answer: 
        Data analysis indicates the NFL draft, especially rounds 3-7, offers the highest return on investment (ROI) in performance per dollar spent.

Key Metrics:
1. Cost: Rookie wage scale
2. Performance: Approximate Value (AV)
3. ROI: AV / Cap Hit ratio

Findings:

1. QB Value (Super Bowl wins on rookie contracts):
   - Mahomes (2019): $4.48M cap hit (2.4% of cap), AV: 25, ROI: 5.58 AV/$M
   - Brady (2002): $3.32M cap hit (4.4% of cap), AV: 16, ROI: 4.82 AV/$M

2. Late-Round Steals (5-year averages):
   - Sherman (5th): Avg Cap Hit: $2.5M, Avg AV: 11, ROI: 4.4 AV/$M
   - A. Brown (6th): Avg Cap Hit: $1.8M, Avg AV: 13, ROI: 7.22 AV/$M

3. 2020 Season ROI:
   - Jefferson (1st WR): Cap Hit: $2.6M, AV: 14, ROI: 5.38 AV/$M
   - Warner (3rd LB): Cap Hit: $1.1M, AV: 13, ROI: 11.82 AV/$M

Comparative Analysis:
- Avg ROI, draft picks (rounds 3-7): 3.5 AV/$M
- Avg ROI, veteran free agents: 1.8 AV/$M         
                 
                 
                 
                 
                 """)

    elif page == "Question 2: WR Projection Evaluation":
        st.header("WR Projection System Evaluation")
        st.write("Select players to evaluate their projected vs actual APY.")
        
        players = st.sidebar.multiselect("Select players", wr_data['name'].unique(), 
                                         default=['Puka Nacua', 'CeeDee Lamb', 'Justin Jefferson'])

        st.write("""
                 
                 Question: Imagine that you are tasked with evaluating the accuracy of three different college-to-pro player projection systems for wide receivers. You have both the projections and actual pro statistics for the past 10 seasons. Discuss how you would approach the problem and list any potential issues you may encounter.
                 
                 
                 Answer:
 1) Data Preparation: Collect projections and actual stats, ensure consistency, handle missing data. 2) Define Metrics: MAE, RMSE, R2 for continuous variables; accuracy, precision, recall for categorical. 3) Comparative Analysis: Compare systems across categories and time. 4) Visualization: Scatter plots, heatmaps, error distributions. 5) Statistical Testing: ANOVA or Friedman test. 6) Time Series Analysis: Evaluate accuracy changes over careers and years. Potential Issues: Limited sample size, changing landscapes, incomplete data, inconsistent projections, external factors, positional nuances, subjective success definition, overfitting concerns. """)

        evaluated_players = []
        for player in players:
            evaluation = evaluate_player(player, wr_data, best_model, scaler, selected_features)
            evaluated_players.append(evaluation)
            st.write(f"\nEvaluation for {player}:")
            st.write(f"  Actual APY: ${evaluation['actual_apy']:,.2f}M")
            st.write(f"  Predicted APY: ${evaluation['predicted_apy']:,.2f}M")
            st.write(f"  Difference: ${evaluation['difference']:,.2f}M")
        
        st.write("Comparison Plots:")
        for feature in selected_features:
            plot_player_comparison(evaluated_players, feature)
            st.image(f'{feature}_comparison.png')
            
            
    elif page == "Question 3: Player Quality Assessment":
        st.header("Receiver Quality Assessment")
        st.write("Select a Receiver to evaluate their quality and salary.")
        
        player_name = st.sidebar.selectbox("Select player", wr_data['name'].unique())
        if player_name:
            evaluation = evaluate_player(player_name, wr_data, best_model, scaler, selected_features)
            st.write(f"\nEvaluation for {player_name}:")
            st.write(f"  Actual APY: ${evaluation['actual_apy']:,.2f}M")
            st.write(f"  Predicted APY: ${evaluation['predicted_apy']:,.2f}M")
            st.write(f"  Difference: ${evaluation['difference']:,.2f}M")

            st.write("\nPlayer Stats and Percentiles:")
            for feature in selected_features:
                st.write(f"{feature}: {evaluation['features'][feature]:.2f}")
        st.write("""
                 Question: Choose any active player in the NFL. How do you assess the quality of this player relative to their position group, and why? How would you value this player in terms of dollars, and how does this compare to their current contract?
                 
                 
                 Answer:
        For this question, let's focus on Justin Jefferson, using the data and visualizations from our analysis.
        
        Assessing Justin Jefferson's Quality:
        
        1. Statistical Performance:
        - Receiving Yards per Game: 87.50 (elite level)
        - Receptions per Game: 5.50 (very good)
        - Touchdowns per Game: 0.44 (above average)
        - Targets per Game: 7.81 (high, indicating trust from quarterbacks)
        
        2. Percentile Rankings:
        - Receiving Yards per Game: ~95th percentile
        - Receptions per Game: ~90th percentile
        - Touchdowns per Game: ~85th percentile
        - Targets per Game: ~90th percentile
        
        3. Position Group Comparison:
        - Jefferson is one of the top performers in terms of receiving yards per game relative to his APY, indicating he's outperforming his current contract.
        
        4. Age and Experience:
        - At 25.1 years old, Jefferson is in his prime years for a wide receiver.
        
        5. Availability:
        - Availability of 0.94, showing durability.
        
        Valuation and Contract Comparison:
        
        1. Model Prediction:
        - Lasso regression model predicts an APY (Average Per Year) of $5.81M for Jefferson.
        
        2. Current Contract:
        - Jefferson's actual APY is $3.28M.
        
        3. Difference:
        - The model suggests Jefferson is underpaid by approximately $2.53M per year based on his performance.
        
        4. Market Context:
        - Top-tier wide receivers can command significantly higher salaries ($20-30M range). The model's prediction might be conservative due to rookie contract status and model limitations.
        
        5. Future Considerations:
        - Jefferson's next contract could place him among the highest-paid receivers in the league, potentially exceeding $25M APY.
        
        In conclusion, Justin Jefferson is an elite wide receiver, performing at the top tier of his position group. His consistent high-level production, young age, and durability place him among the best. His current contract significantly undervalues his contribution, typical for star players on rookie contracts. A fair market value for Jefferson could be in the $20-30M APY range.
        """)





    elif page == "Question 4: Offensive Tendencies":
        st.header("Offensive Tendencies Analysis")
        st.write("A defensive coach approaches you and asks for an offensive team's tendencies when they're aligned in a 3x1 bunch formation. What types of tendencies would you look for, and how would you communicate your results to the coach?")
        years = st.sidebar.multiselect("Select years", range(1999, 2024))
        #teams = st.sidebar.selectbox("Select player", wr_data['team'].unique())
        if years:
            play_data = nfl.import_pbp_data(years)
            #play_data = play_data[play_data['posteam'].isin([teams])]  # Filter by selected teams
            
            tendencies, down_tendencies, situational_tendencies = analyze_3x1_bunch_formation(play_data)
            st.write(tendencies)
            
            # Display tendencies charts
            plt.figure(figsize=(10, 6))
            plt.bar(['Run', 'Pass'], [tendencies['run_percentage'], tendencies['pass_percentage']])
            plt.title('Run vs Pass Percentage in 3x1 Bunch Formation')
            plt.ylabel('Percentage')
            plt.savefig('run_vs_pass_percentage.png')
            st.image('run_vs_pass_percentage.png')
            
            down_tendencies.plot(kind='bar', stacked=True)
            plt.title('Play Type Tendencies by Down in 3x1 Bunch Formation')
            plt.xlabel('Down')
            plt.ylabel('Percentage')
            plt.legend(title='Play Type')
            plt.savefig('play_type_tendencies_by_down.png')
            st.image('play_type_tendencies_by_down.png')        
        
        st.write("""
                 Question: A defensive coach approaches you and asks for an offensive team's tendencies when they're aligned in a 3x1 bunch formation. What types of tendencies would you look for, and how would you communicate your results to the coach?
                 
                 Answer:
        Offensive Tendencies in a 3x1 Bunch Formation

        Introduction:

        Understanding the offensive tendencies of a team when they are aligned in a 3x1 bunch formation is crucial for devising effective defensive strategies. The 3x1 bunch formation, characterized by having three receivers bunched on one side and one receiver on the other, creates unique challenges and opportunities for both the offense and defense. This analysis aims to uncover the run-pass balance, average yards gained, success rate, and situational tendencies of plays executed from this formation.

        Data Overview:

        The analysis is based on play-by-play data from the last three NFL seasons (2020-2022). The key metrics evaluated include:
        - Run vs. Pass percentage
        - Average yards gained
        - Success rate (defined by whether the play achieved its intended goal)
        - Play type tendencies by down
        - Situational tendencies based on field position

        Overall Tendencies:

        From the data, the following overall tendencies were observed in the 3x1 bunch formation:
        - Run Percentage: {tendencies['run_percentage']:.2f}%
        - Pass Percentage: {tendencies['pass_percentage']:.2f}%
        - Average Yards Gained: {tendencies['avg_yards_gained']:.2f} yards
        - Success Rate: {tendencies['success_rate']:.2f}%

        The higher pass percentage indicates a tendency to leverage the formation's potential to create mismatches and space for receivers. The average yards gained and success rate suggest moderate effectiveness in advancing the ball.

        Play Type Tendencies by Down:

        Analyzing tendencies by down reveals strategic choices based on down-and-distance scenarios:

        1. First Down:
        - Pass: {down_tendencies.loc[1.0, 'pass'] * 100:.2f}%
        - Run: {down_tendencies.loc[1.0, 'run'] * 100:.2f}%
        - Other (no play, QB spike, QB kneel): {down_tendencies.loc[1.0, ['no_play', 'qb_spike', 'qb_kneel']].sum() * 100:.2f}%

        2. Second Down:
        - Pass: {down_tendencies.loc[2.0, 'pass'] * 100:.2f}%
        - Run: {down_tendencies.loc[2.0, 'run'] * 100:.2f}%
        - Other: {down_tendencies.loc[2.0, ['no_play', 'qb_spike', 'qb_kneel']].sum() * 100:.2f}%

        3. Third Down:
        - Pass: {down_tendencies.loc[3.0, 'pass'] * 100:.2f}%
        - Run: {down_tendencies.loc[3.0, 'run'] * 100:.2f}%
        - Other: {down_tendencies.loc[3.0, ['no_play', 'qb_spike', 'qb_kneel']].sum() * 100:.2f}%

        4. Fourth Down:
        - Pass: {down_tendencies.loc[4.0, 'pass'] * 100:.2f}%
        - Run: {down_tendencies.loc[4.0, 'run'] * 100:.2f}%
        - Other: {down_tendencies.loc[4.0, ['no_play', 'qb_spike', 'qb_kneel', 'punt']].sum() * 100:.2f}%

        On early downs, teams show a balanced approach but be prepared for a slight tendency towards passing. On later downs, particularly third down, the emphasis shifts significantly towards passing, reflecting the need to convert and sustain drives.

        Situational Tendencies:

        The situational analysis (sample shown) examines play tendencies based on specific yard line positions:

        - Near the goal line (1-10 yards): Higher tendency to run, leveraging short-yardage situations.
        - Mid-field (10-50 yards): Balanced approach with a slight preference for passing.
        - Opponent's territory (50-100 yards): Increased passing tendency as teams aim to capitalize on field position and score.

        Visual Representations:

        To aid in visualizing these tendencies, two key plots were generated:

        1. Run vs Pass Percentage in 3x1 Bunch Formation:
        ![Run vs Pass Percentage](file-SwDZfXTEd7RDQDxjCQN7R8Oo)

        This bar chart highlights the significant lean towards passing plays in the 3x1 bunch formation.

        2. Play Type Tendencies by Down:
        The stacked bar chart illustrates the distribution of play types across different downs, emphasizing the strategic shift towards passing on critical third and fourth downs.

        Communication to the Coach:

        When communicating these findings to the defensive coach, the following points should be emphasized:

        1. Formation Tendencies:
        - The 3x1 bunch formation is primarily used to pass the ball (63.18% of the time).
        - The formation is moderately effective, averaging 5.34 yards per play.

        2. Down-Specific Strategies:
        - On first and second downs, expect a balanced approach but be prepared for a slight tendency towards passing.
        - On third and fourth downs, anticipate a heavy pass focus, especially in long-yardage situations.

        3. Situational Awareness:
        - Near the goal line, teams may run more frequently, necessitating tight run defense.
        - In mid-field and opponent's territory, be vigilant of passing plays designed to exploit coverage mismatches.

        4. Defensive Adjustments:
        - Employ coverage schemes that can handle multiple receivers, especially on later downs.
        - Utilize pressure tactics to disrupt passing plays, particularly on third downs where passing is predominant.

        By understanding these tendencies, the defensive coach can tailor defensive schemes to counteract the offensive strategies effectively, enhancing the team's ability to anticipate and respond to the 3x1 bunch formation.

        Conclusion:

        This detailed analysis provides a comprehensive view of the offensive tendencies when aligned in a 3x1 bunch formation. By leveraging these insights, the defensive coach can develop targeted strategies to neutralize the offensive threats and improve overall defensive performance.
        """)
        


        
        
    elif page == "Question 5: 4th Down Decision Making":
        st.header("4th Down Decision Analysis")
        st.write("The head coach has a difficult decision to make on 4th down. Discuss how you would evaluate the possible options using data.")
        
        years = st.sidebar.slider("Select years", 2014, 2024, (2014, 2024))
        
        try:
            pbp_data = pd.read_csv('data/raw/pbp_data.csv')
            decisions, success_rates = analyze_fourth_down_decisions(pbp_data)
            
            st.write("Success Rates Data:", success_rates)
            
            play_type_filter = st.sidebar.selectbox("Select play type", ["Overall", "pass", "run", "punt", "field_goal"])
            
            if play_type_filter != "Overall":
                decisions = decisions[['Season', play_type_filter.capitalize() + ' (%)']]
                success_rates = success_rates[['Season', play_type_filter.capitalize() + ' (%)']]
            
            decision_chart = create_line_chart(decisions, x='Season', y=decisions.columns[1:], title='4th Down Decisions Over Seasons')
            st.plotly_chart(decision_chart)
            
            success_rate_chart = create_line_chart(success_rates, x='Season', y=success_rates.columns[1:], title='4th Down Success Rates Over Seasons')
            st.plotly_chart(success_rate_chart)
            
        except Exception as e:
            st.error(f"Error loading data: {e}")
        st.write("""
                 Question: The head coach has a difficult decision to make on 4th down. Discuss how you would evaluate the possible options using data.
                 
                 Answer:
        When a head coach faces a 4th down decision, they must quickly assess multiple factors to determine the best course of action. Data-driven analysis can provide valuable insights to inform this decision-making process. Here's how you might approach evaluating the options using data:

        1. Understand the Current Situation
        First, consider the immediate context:
        - Down and distance (e.g., 4th and 2, 4th and 10)
        - Field position
        - Score and time remaining
        - Timeouts available for both teams

        2. Analyze Historical Data
        Look at league-wide and team-specific data for similar situations:
        - Play Type Success Rates: Interestingly, while run attempts are the third most common play type on 4th down (behind punts and passing attempts), they have the highest success rate. This crucial information should factor heavily into the decision-making process.
        - Run attempts: Highest success rate
        - Pass attempts: Second most common, but lower success rate than runs
        - Punts: Most common, but obviously don't result in maintaining possession
        - Field goal attempts: Success rate varies greatly with distance

        - Conversion Probabilities: Examine historical conversion rates based on:
        - Yards needed for first down
        - Field position
        - Time remaining in the game
        - Score differential

        3. Consider Team-Specific Factors
        - Offensive strengths (e.g., strong running game, elite quarterback)
        - Defensive strengths of the opposing team
        - Recent performance in similar situations
        - Player availability (injuries, fatigue)

        4. Utilize Advanced Metrics
        Incorporate advanced analytics such as:
        - Win Probability (WP) and Win Probability Added (WPA): Calculate how each option affects the team's chances of winning:
        - WP if successfully convert
        - WP if fail to convert
        - WP if punt
        - WP if attempt and make field goal
        - WP if attempt and miss field goal

        - Expected Points Added (EPA): Determine the expected point value of each decision:
        - EPA for conversion attempt
        - EPA for punt
        - EPA for field goal attempt

        5. Evaluate Risk vs. Reward
        Weigh the potential benefits against the risks:
        - Short-term: Maintaining possession vs. field position
        - Long-term: Impact on overall win probability

        6. Consider Game Strategy
        Factor in broader strategic elements:
        - Momentum shifts
        - Opposing team's offensive capabilities
        - Time management

        7. Use Decision-Making Tools
        Implement data-driven tools to assist in real-time decision making:
        - 4th down calculators
        - Win probability models
        - Custom analytics dashboards

        Conclusion
        By systematically evaluating these factors, with a particular emphasis on the high success rate of run attempts, a head coach can make more informed decisions on 4th down. The key is to balance the statistical probabilities with the specific context of the game situation.
        Remember, while data provides valuable insights, it should complement, not replace, a coach's experience and intuition. The most effective decision-making process combines analytical insights with a deep understanding of the team's capabilities and the flow of the game.

        **Sources**:
        - [NFL Game Management Cheat Sheet - ESPN](https://www.espn.com/nfl/story/_/id/33059528/nfl-game-management-cheat-sheet-punt-go-kick-field-goal-fourth-downs-plus-2-point-conversion-recommendations)
        - [4th Down Model - Bruin Sports Analytics](https://www.bruinsportsanalytics.com/post/4th-down-model)
        """)

            
            
            
                

    elif page == "Question 6: Football Significance":
        st.header("Football Significance")
        st.write("Why does football matter to you?")
        st.write("""
            Question: Why does football matter to you?
                    
            Answer: 
            Football has always been a profound passion of mine because it embodies teamwork, perseverance, and the pursuit of excellence. What fascinates me most is how every individual, regardless of their background or circumstances, can contribute to a team's success. Stories like Tom Brady's rise from being an overlooked draft pick to becoming one of the greatest quarterbacks, Ray Lewis's embodiment of spirit and leadership, and Kurt Warner's incredible journey from stocking shelves to Super Bowl champion, inspire me deeply.

            The Philadelphia Eagles, with their rich history and dedicated fan base, epitomize these values. Their journey, filled with triumphs and challenges, resonates with my belief in resilience and unity. I aspire to contribute to the Eagles' success by revolutionizing sports analytics, aiming to provide insights that can drive strategic decisions and elevate the team's performance. My dream is to be a part of a championship-winning team, knowing that my efforts, no matter how small, helped make a difference. Together, I believe we can achieve greatness.
        """)
        
        st.image("data/raw/brady_comeback.jpg", caption="Tom Brady's Comeback", use_column_width=True)
        st.image("data/raw/foles_mvp.jpg", caption="Nick Foles MVP", use_column_width=True)

    elif page == "Contracts Data":
        st.header("Contracts Data")
        st.write("Analyze current contracts, salary cap data, and player contract history.")
        player_urls = st.sidebar.text_area("Enter player URLs (comma separated) from overthetop.com in a players page").split(',')
        if player_urls:
            selected_players_df = get_selected_players_contract_history(player_urls)
            if selected_players_df is not None:
                display_dataframe(selected_players_df, "Selected Players Contract History")
            else:
                st.write("Failed to retrieve selected players contract history")
            current_contracts_df = get_current_contracts()
            if current_contracts_df is not None:
                display_dataframe(current_contracts_df, "Current Contracts")
            else:
                st.write("Failed to retrieve current contracts")
            salary_cap_df = get_salary_cap_data()
            if salary_cap_df is not None:
                display_dataframe(salary_cap_df, "Salary Cap Data")
            else:
                st.write("Failed to retrieve salary cap data")


if __name__ == "__main__":
    main()


Overwriting ../app.py


In [8]:
# preload_data.py
import pandas as pd
import nfl_data_py as nfl

def load_and_save_data(years):
    all_pbp_data = []
    for year in years:
        try:
            pbp_data = nfl.import_pbp_data([year])
            print(f"{year} done.")
            print(pbp_data.columns)
            pbp_data_4th_quarter = pbp_data[pbp_data['qtr'] == 4]  # Filter for 4th quarter plays
            
            # Select only essential columns
            essential_columns = [
                'play_id', 'game_id', 'season', 'week', 'qtr', 'down', 'ydstogo', 'yardline_100',
                'posteam', 'defteam', 'play_type', 'desc', 'yards_gained', 'epa', 'wp', 'success'
            ]
            
            pbp_data_4th_quarter = pbp_data_4th_quarter[essential_columns]
            all_pbp_data.append(pbp_data_4th_quarter)
        except Exception as e:
            print(f"Data for year {year} not found and skipped: {e}")
    if all_pbp_data:
        pbp_data = pd.concat(all_pbp_data)
        pbp_data.to_csv('../data/raw/pbp_data_4th_quarter.csv', index=False)  # Save to CSV

if __name__ == "__main__":
    years = range(2020, 2024)
    load_and_save_data(years)


2020 done.
Downcasting floats.
2020 done.
Index(['play_id', 'game_id', 'old_game_id', 'home_team', 'away_team',
       'season_type', 'week', 'posteam', 'posteam_type', 'defteam',
       ...
       'offense_players', 'defense_players', 'n_offense', 'n_defense',
       'ngs_air_yards', 'time_to_throw', 'was_pressure', 'route',
       'defense_man_zone_type', 'defense_coverage_type'],
      dtype='object', length=390)
2021 done.
Downcasting floats.
2021 done.
Index(['play_id', 'game_id', 'old_game_id', 'home_team', 'away_team',
       'season_type', 'week', 'posteam', 'posteam_type', 'defteam',
       ...
       'offense_players', 'defense_players', 'n_offense', 'n_defense',
       'ngs_air_yards', 'time_to_throw', 'was_pressure', 'route',
       'defense_man_zone_type', 'defense_coverage_type'],
      dtype='object', length=390)
2022 done.
Downcasting floats.
2022 done.
Index(['play_id', 'game_id', 'old_game_id', 'home_team', 'away_team',
       'season_type', 'week', 'posteam', 'postea