Importing Libraries

In [None]:
# Imports and basic configuration

# Core utilities
import os
import random
import warnings

# Numerics and dataframes
import numpy as np
import pandas as pd

# Modeling utilities
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Reproducibility: a single seed shared by NumPy's and the stdlib's global generators
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Widen pandas output so wide feature tables stay readable in notebook cells
pd.set_option('display.max_columns', 120)
pd.set_option('display.width', 140)

# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

# Directory that holds the input CSV files
DATA_DIR = "."

# Full paths for the play-by-play, final-score and upcoming-schedule files
PBP_FILE, SCORES_FILE, UPCOMING_FILE = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "nfl_2024_new.csv",
        "nfl_2024_scores.csv",
        "upcoming_games_2025_week1.csv",
    )
)

Loading and Merging Data

In [None]:
# Data ingestion and merging
#
# Attaches the final score of the corresponding game to every play-by-play row.
# The scores file is joined twice because the offense can be either the visiting
# or the home team; the two results are then coalesced into one set of columns.

# Load 2024 play-by-play data (Weeks 1–17 only)
pbp_df = pd.read_csv(PBP_FILE)

# Load 2024 final scores for all games
scores_df = pd.read_csv(SCORES_FILE)

# Columns contributed by the scores file on each join
score_cols = ['Week', 'Visitor', 'VisitorScore', 'Home', 'HomeScore', 'OT']

# Merge 1: plays where the offense is the visitor and the defense is the home team
merged = pbp_df.merge(
    scores_df,
    left_on=['Date', 'OffenseTeam', 'DefenseTeam'],
    right_on=['Date', 'Visitor', 'Home'],
    how='left'
)

# Merge 2: the reverse roles (offense is the home team); clashing columns get '_reverse'
merged = merged.merge(
    scores_df,
    left_on=['Date', 'OffenseTeam', 'DefenseTeam'],
    right_on=['Date', 'Home', 'Visitor'],
    how='left',
    suffixes=('', '_reverse')
)

# Fill each score column from the reverse match wherever the first match missed
for col in score_cols:
    merged[col] = merged[col].combine_first(merged[f"{col}_reverse"])

# Drop the redundant reverse columns (reassignment instead of inplace keeps the cell re-runnable)
merged = merged.drop(columns=[f"{col}_reverse" for col in score_cols])

# Plays that matched no game in either direction still have NaN teams/scores here.
# Comparing NaN scores yields False, which would label them as home losses, so
# remove them before the target is created.
merged = (
    merged
    .dropna(subset=['Home', 'Visitor', 'HomeScore', 'VisitorScore'])
    .reset_index(drop=True)
)
if merged.empty:
    raise ValueError(
        "No play-by-play row matched a game in the scores file; "
        "check that Date and team codes use the same format in both CSVs."
    )

# Add binary target column: True if the home team outscored the visitor (ties are False)
merged['HomeWon'] = merged['HomeScore'] > merged['VisitorScore']

# Preview merged dataset
merged[['Date', 'Home', 'Visitor', 'HomeScore', 'VisitorScore', 'HomeWon']].head(50)

Team Feature Extraction

In [None]:
# Feature engineering: basic scoring and win rate metrics
#
# `merged` is built from play-by-play rows, so a game appears once per play.
# Scores and results are per-game quantities; averaging them over plays would
# weight each game by its number of plays. They are therefore computed on a
# frame that keeps exactly one row per (Date, Home, Visitor) game.

# Load schedule for upcoming games (Week 1 of 2025 season)
upcoming_games = pd.read_csv(UPCOMING_FILE)

# Preview the upcoming schedule to verify the structure
print(upcoming_games.head(16))

# --- Collapse plays to games ---

games = (
    merged
    .dropna(subset=['Home', 'Visitor', 'HomeScore', 'VisitorScore'])
    .drop_duplicates(subset=['Date', 'Home', 'Visitor'])
)

# --- Average Points Scored ---

# Mean points scored per game when playing at home and when visiting
avg_scored_home = games.groupby('Home')['HomeScore'].mean()
avg_scored_away = games.groupby('Visitor')['VisitorScore'].mean()

# Equal-weight blend of the home and away averages (NaN if a team lacks one side)
avg_points_scored = (avg_scored_home + avg_scored_away) / 2

# --- Average Points Allowed ---

# Mean opponent points per game when playing at home and when visiting
avg_allowed_home = games.groupby('Home')['VisitorScore'].mean()
avg_allowed_away = games.groupby('Visitor')['HomeScore'].mean()

# Equal-weight blend of the home and away averages
avg_points_allowed = (avg_allowed_home + avg_allowed_away) / 2

# --- Win Rate ---

# Wins are counted from the scores directly so that a tie is a win for neither side
home_wins = (games['HomeScore'] > games['VisitorScore']).groupby(games['Home']).sum()
away_wins = (games['VisitorScore'] > games['HomeScore']).groupby(games['Visitor']).sum()

# Games played as home and as visitor
home_games = games['Home'].value_counts()
away_games = games['Visitor'].value_counts()

# fill_value=0 keeps teams that appear on only one side instead of turning them into NaN
total_wins = home_wins.add(away_wins, fill_value=0)
total_games = home_games.add(away_games, fill_value=0)

# Overall win rate = wins / games played
win_rate = total_wins / total_games

# --- Assemble team-level features ---

# Series are aligned on team label; sorting fixes the row order, and the index
# becomes an explicit 'Team' column
team_features = (
    pd.DataFrame({
        'AvgPointsScored': avg_points_scored,
        'AvgPointsAllowed': avg_points_allowed,
        'WinRate': win_rate
    })
    .rename_axis('Team')
    .sort_index()
    .reset_index()
)

# Preview the engineered features
team_features.head(32)


In [None]:
# Feature engineering: defensive metrics (conceded plays and turnovers)
#
# Both metrics describe what a team's defense allowed or forced, so each side is
# restricted to the plays where that team is listed as DefenseTeam.

# Cast the play flags to real booleans first. On 0/1 integer columns `~` is a
# bitwise complement (giving -1 / -2), not a logical NOT. Missing flags count as False.
is_touchdown = merged['IsTouchdown'].fillna(0).astype(bool)
is_interception = merged['IsInterception'].fillna(0).astype(bool)
is_fumble = merged['IsFumble'].fillna(0).astype(bool)

# --- Conceded Plays ---

# A successful offensive play is a touchdown or any play without an interception/fumble
merged['SuccessfulPlay'] = is_touchdown | ~(is_interception | is_fumble)

# --- Forced Turnovers ---

# A turnover play is one flagged as an interception or a fumble
merged['Turnover'] = is_interception | is_fumble

# Plays defended by the home team and by the visiting team, respectively
defense_at_home = merged[merged['DefenseTeam'] == merged['Home']]
defense_on_road = merged[merged['DefenseTeam'] == merged['Visitor']]

# Share of defended plays that were successful for the opposing offense
avg_conceded_home = defense_at_home.groupby('Home')['SuccessfulPlay'].mean()
avg_conceded_away = defense_on_road.groupby('Visitor')['SuccessfulPlay'].mean()

# Equal-weight blend of the home and away rates
avg_conceded_plays = (avg_conceded_home + avg_conceded_away) / 2

# Share of defended plays that ended in a turnover
avg_turnovers_home = defense_at_home.groupby('Home')['Turnover'].mean()
avg_turnovers_away = defense_on_road.groupby('Visitor')['Turnover'].mean()

# Equal-weight blend of the home and away rates
avg_forced_turnovers = (avg_turnovers_home + avg_turnovers_away) / 2

# --- Merge Defensive Features ---

# Look values up by team label instead of attaching them by position, so a
# missing or re-ordered team can never shift numbers onto the wrong row
team_names = team_features['Team']
team_features_def = pd.DataFrame({
    'Team': team_names,
    'AvgPointsDefended': team_features['AvgPointsAllowed'],  # Already calculated earlier
    'AvgConcededPlays': team_names.map(avg_conceded_plays),
    'AvgForcedTurnovers': team_names.map(avg_forced_turnovers)
})

# Join defensive features into existing feature set
team_features_combined = team_features.merge(team_features_def, on='Team', how='left')

# Preview combined features
team_features_combined.head(32)

In [None]:
# Feature engineering: additional offensive metrics
#
# Every metric here uses only the plays where the team is listed as OffenseTeam.
# Per-game values divide season totals by the number of distinct game dates
# (assumes a team plays at most one game per Date).

# Plays run by the home team and by the visiting team, respectively
home_offense = merged[merged['OffenseTeam'] == merged['Home']]
away_offense = merged[merged['OffenseTeam'] == merged['Visitor']]

# Group keys, plus a shared index naming so home and away results align on the
# team label rather than on differently named index levels
home_keys = ['SeasonYear', 'Home']
away_keys = ['SeasonYear', 'Visitor']
season_team = ['SeasonYear', 'Team']

# Games played per season on each side
home_offense_games = home_offense.groupby(home_keys)['Date'].nunique().rename_axis(season_team)
away_offense_games = away_offense.groupby(away_keys)['Date'].nunique().rename_axis(season_team)

# --- Average Yards Per Play ---

# Mean yards gained per offensive play at home and away
avg_yards_home = home_offense.groupby('Home')['Yards'].mean()
avg_yards_away = away_offense.groupby('Visitor')['Yards'].mean()

# Combine for team-wide average
avg_yards_per_play = (avg_yards_home + avg_yards_away) / 2

# --- Average Yards Per Game ---

# Season yards gained divided by games played, per side
total_yards_home = home_offense.groupby(home_keys)['Yards'].sum().rename_axis(season_team) / home_offense_games
total_yards_away = away_offense.groupby(away_keys)['Yards'].sum().rename_axis(season_team) / away_offense_games

# Blend home and away, then average across seasons
avg_yards_per_game = ((total_yards_home + total_yards_away) / 2).groupby(level='Team').mean()

# --- Pass Completion Rate ---

# Completion rate = 1 - incompletion rate
# NOTE(review): the denominator is every offensive play, not only pass attempts — confirm intent
pass_comp_home = 1 - home_offense.groupby('Home')['IsIncomplete'].mean()
pass_comp_away = 1 - away_offense.groupby('Visitor')['IsIncomplete'].mean()

# Combine to get overall pass completion rate
avg_pass_completion = (pass_comp_home + pass_comp_away) / 2

# --- Touchdowns Per Game ---

# Touchdown-flagged offensive plays per season divided by games played, per side
tds_home = home_offense.groupby(home_keys)['IsTouchdown'].sum().rename_axis(season_team) / home_offense_games
tds_away = away_offense.groupby(away_keys)['IsTouchdown'].sum().rename_axis(season_team) / away_offense_games

# Blend home and away, then average across seasons
avg_td_per_game = ((tds_home + tds_away) / 2).groupby(level='Team').mean()

# --- Rush Success Rate ---

# Rush success = average yards per rushing play
rush_success_home = home_offense[home_offense['IsRush'] == 1].groupby('Home')['Yards'].mean()
rush_success_away = away_offense[away_offense['IsRush'] == 1].groupby('Visitor')['Yards'].mean()

# Combine for team-wide average
avg_rush_success = (rush_success_home + rush_success_away) / 2

# --- Combine and Merge Offensive Features ---

# Look each metric up by team label (not by position) so rows cannot be misaligned
offense_teams = team_features_combined['Team']
offensive_features = pd.DataFrame({
    'Team': offense_teams,
    'AvgYardsPerPlay': offense_teams.map(avg_yards_per_play),
    'AvgYardsPerGame': offense_teams.map(avg_yards_per_game),
    'AvgPassCompletionRate': offense_teams.map(avg_pass_completion),
    'AvgTouchdownsPerGame': offense_teams.map(avg_td_per_game),
    'AvgRushSuccessRate': offense_teams.map(avg_rush_success)
})

# Merge into existing team feature set
team_features_expanded = team_features_combined.merge(offensive_features, on='Team')

# Preview final offensive features
team_features_expanded.head(32)


In [None]:
# Feature engineering: additional defensive metrics
#
# Every metric here uses only the plays where the team is listed as DefenseTeam,
# i.e. what the opposing offense achieved against it. Per-game values divide
# season totals by the number of distinct game dates (assumes a team plays at
# most one game per Date).

# Plays defended by the home team and by the visiting team, respectively
home_defense = merged[merged['DefenseTeam'] == merged['Home']]
away_defense = merged[merged['DefenseTeam'] == merged['Visitor']]

# Group keys, plus a shared index naming so home and away results align on the
# team label rather than on differently named index levels
home_keys = ['SeasonYear', 'Home']
away_keys = ['SeasonYear', 'Visitor']
season_team = ['SeasonYear', 'Team']

# Games played per season on each side
home_defense_games = home_defense.groupby(home_keys)['Date'].nunique().rename_axis(season_team)
away_defense_games = away_defense.groupby(away_keys)['Date'].nunique().rename_axis(season_team)

# --- Yards Allowed Per Play ---

# Mean yards allowed per defended play at home and away
yards_allowed_home = home_defense.groupby('Home')['Yards'].mean()
yards_allowed_away = away_defense.groupby('Visitor')['Yards'].mean()

# Combine to get total average yards allowed per team
avg_yards_allowed = (yards_allowed_home + yards_allowed_away) / 2

# --- Total Yards Allowed Per Game ---

# Season yards allowed divided by games played, per side
total_yards_home = home_defense.groupby(home_keys)['Yards'].sum().rename_axis(season_team) / home_defense_games
total_yards_away = away_defense.groupby(away_keys)['Yards'].sum().rename_axis(season_team) / away_defense_games

# Blend home and away, then average across seasons
avg_yards_allowed_per_game = ((total_yards_home + total_yards_away) / 2).groupby(level='Team').mean()

# --- Pass Completion Allowed Rate ---

# Completion rate allowed = 1 - incompletion rate
# NOTE(review): the denominator is every defended play, not only pass attempts — confirm intent
comp_allowed_home = 1 - home_defense.groupby('Home')['IsIncomplete'].mean()
comp_allowed_away = 1 - away_defense.groupby('Visitor')['IsIncomplete'].mean()

# Combine for overall allowed completion rate
avg_pass_completion_allowed = (comp_allowed_home + comp_allowed_away) / 2

# --- Touchdowns Allowed Per Game ---

# Touchdown-flagged defended plays per season divided by games played, per side
tds_allowed_home = home_defense.groupby(home_keys)['IsTouchdown'].sum().rename_axis(season_team) / home_defense_games
tds_allowed_away = away_defense.groupby(away_keys)['IsTouchdown'].sum().rename_axis(season_team) / away_defense_games

# Blend home and away, then average across seasons
avg_tds_allowed_per_game = ((tds_allowed_home + tds_allowed_away) / 2).groupby(level='Team').mean()

# --- Rush Success Allowed Rate ---

# Average yards allowed per rushing play at home and away
rush_allowed_home = home_defense[home_defense['IsRush'] == 1].groupby('Home')['Yards'].mean()
rush_allowed_away = away_defense[away_defense['IsRush'] == 1].groupby('Visitor')['Yards'].mean()

# Combine for overall rush success rate allowed
avg_rush_success_allowed = (rush_allowed_home + rush_allowed_away) / 2

# --- Combine and Merge Defensive Features ---

# Look each metric up by team label (not by position) so rows cannot be misaligned
defense_teams = team_features_expanded['Team']
defensive_features = pd.DataFrame({
    'Team': defense_teams,
    'AvgYardsAllowedPerPlay': defense_teams.map(avg_yards_allowed),
    'AvgYardsAllowedPerGame': defense_teams.map(avg_yards_allowed_per_game),
    'AvgPassCompletionAllowedRate': defense_teams.map(avg_pass_completion_allowed),
    'AvgTouchdownsAllowedPerGame': defense_teams.map(avg_tds_allowed_per_game),
    'AvgRushSuccessAllowedRate': defense_teams.map(avg_rush_success_allowed)
})

# Merge with the previous feature set
team_features_complete = team_features_expanded.merge(defensive_features, on='Team')

# Preview the completed team feature dataset
team_features_complete.head(32)


In [None]:
# Game-level feature encoding for upcoming Week 1 matchups

# Fresh read of the schedule so this cell does not rely on an earlier load
upcoming_games = pd.read_csv(UPCOMING_FILE)

# Attach the team feature table twice: first keyed on the home team, then on the
# visitor. The second join is where the column names clash, so that is where the
# '_Home' / '_Visitor' suffixes are applied.
upcoming_merged = (
    upcoming_games
    .merge(
        team_features_complete,
        how='left',
        left_on='Home',
        right_on='Team',
    )
    .merge(
        team_features_complete,
        how='left',
        left_on='Visitor',
        right_on='Team',
        suffixes=('_Home', '_Visitor'),
    )
)

# Preview encoded game matrix
upcoming_merged.head(16)

Training Data Preparation

In [None]:
# Training set construction: join game data with team features and build home-vs-visitor deltas
#
# `merged` is built from play-by-play rows, while the features and the HomeWon
# label are constant within a game. Training on play rows would repeat each game
# once per play and let the same game land in both the training and validation
# folds. The matrix is therefore built from one row per game, and games without
# a recorded score are excluded.

game_rows = (
    merged
    .dropna(subset=['Home', 'Visitor', 'HomeScore', 'VisitorScore'])
    .drop_duplicates(subset=['Date', 'Home', 'Visitor'])
    .reset_index(drop=True)
)

# Merge team features for the home side
training_encoded_home = game_rows.merge(
    team_features_complete,
    left_on='Home',
    right_on='Team',
    how='left'
)

# Merge team features for the visitor side (suffixes disambiguate feature columns)
training_encoded_both = training_encoded_home.merge(
    team_features_complete,
    left_on='Visitor',
    right_on='Team',
    how='left',
    suffixes=('_Home', '_Visitor')
)

# List the team-level feature names to difference (explicit for reproducibility)
diff_source_cols = [
    'AvgPointsScored', 'AvgPointsAllowed', 'WinRate',
    'AvgPointsDefended', 'AvgConcededPlays', 'AvgForcedTurnovers',
    'AvgYardsPerPlay', 'AvgYardsPerGame', 'AvgPassCompletionRate',
    'AvgTouchdownsPerGame', 'AvgRushSuccessRate',
    'AvgYardsAllowedPerPlay', 'AvgYardsAllowedPerGame',
    'AvgPassCompletionAllowedRate', 'AvgTouchdownsAllowedPerGame',
    'AvgRushSuccessAllowedRate'
]

# Create home-minus-visitor differences for each feature
for col in diff_source_cols:
    training_encoded_both[f'Diff_{col}'] = (
        training_encoded_both[f'{col}_Home'] - training_encoded_both[f'{col}_Visitor']
    )

# Select model inputs (exactly the Diff_ columns created above, in list order)
# and the target (home team win flag)
training_data = training_encoded_both[[f'Diff_{col}' for col in diff_source_cols]]
training_labels = training_encoded_both['HomeWon']

# Preview a sample of the finalized training matrix
training_data.head(50)

In [None]:
# Sanity checks: verify training matrix and labels before modeling

# Gather every diagnostic first, then report them in one place

# Do X and y share the same row index (row i of X belongs to row i of y)?
aligned = training_data.index.equals(training_labels.index)

# Missing-value counts for the features (all cells) and for the labels
total_feature_nas = int(training_data.isna().to_numpy().sum())
label_nas = int(training_labels.isna().sum())

# Feature columns whose dtype is not numeric
non_numeric_cols = list(training_data.select_dtypes(exclude=[np.number]).columns)

# Report: shapes, alignment, missing values, dtypes
print("training_data shape:", training_data.shape)
print("training_labels shape:", training_labels.shape)
print("Index aligned:", aligned)
print("Total missing values in training_data:", total_feature_nas)
print("Missing labels:", label_nas)
print("Non-numeric feature columns:", non_numeric_cols or "None")

# Class balance of the target, including any missing labels
print("\nLabel distribution (HomeWon):")
print(training_labels.value_counts(dropna=False))

# Names of the first few features, followed by a small sample of the matrix
print("\nFeature columns (first 10):", list(training_data.columns[:10]))
training_data.head(10)

AI Model Training

In [None]:
# AI model training: calibrated logistic regression with optimal threshold selection

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.base import clone
import numpy as np

# Debug flag for reduced search space and faster runtime during development
DEBUG_FAST = True  # set to False for full weekly run

# Sanitize training matrix: infinities become NaN, NaN becomes 0.0, everything is float
X_train = training_data.replace([np.inf, -np.inf], np.nan).fillna(0.0).astype(float)
y_train = training_labels.astype(int)

# Define pipeline: scaling + logistic regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        max_iter=2000,
        solver='lbfgs',
        class_weight='balanced',
        random_state=RANDOM_STATE
    ))
])

# Define parameter grid (compact in debug, wider for full run)
if DEBUG_FAST:
    param_dist = {
        'model__C': np.logspace(-2, 1, 6),   # 0.01 → 10
        'model__penalty': ['l2'],
    }
    n_iter = 8
    n_splits = 5
else:
    param_dist = {
        'model__C': np.logspace(-3, 2, 10),  # 0.001 → 100
        'model__penalty': ['l2'],
    }
    n_iter = 20
    n_splits = 10

# The search space is a finite grid of lists, so it cannot yield more distinct
# candidates than the grid holds. Cap n_iter explicitly instead of relying on
# scikit-learn's truncation warning, which this notebook silences.
n_candidates = int(np.prod([len(values) for values in param_dist.values()]))
n_iter = min(n_iter, n_candidates)

# Define cross-validation strategy
cv_strategy = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

# Randomized search for accuracy
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=n_iter,
    scoring='accuracy',
    cv=cv_strategy,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=0
)

# Fit randomized search on the training matrix
random_search.fit(X_train, y_train)

# Retrieve best pipeline (refit on all of X_train by the search)
best_pipeline = random_search.best_estimator_

# Calibrate probabilities: sigmoid calibration fitted with the same CV splitter
calibrated_model = CalibratedClassifierCV(
    estimator=best_pipeline,
    method='sigmoid',
    cv=cv_strategy
)
calibrated_model.fit(X_train, y_train)

print(f"Best CV Accuracy (search): {random_search.best_score_:.4f}")
print(f"Chosen Parameters: {random_search.best_params_}")

# Function to compute optimal probability threshold using out-of-fold predictions
def _take_rows(data, positions):
    """Select rows by integer position from a pandas object or any array-like."""
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return np.asarray(data)[positions]


def find_optimal_threshold_oof(estimator, X, y, cv, thresholds=None):
    """Pick the probability cut-off that maximizes out-of-fold accuracy.

    A fresh clone of ``estimator`` is fitted on each training fold produced by
    ``cv.split(X, y)`` and used to predict the positive-class probability
    (column 1 of ``predict_proba``) for the matching validation fold. Each
    candidate threshold is then scored by the accuracy of
    ``proba >= threshold`` against ``y``.

    Parameters
    ----------
    estimator : object
        Unfitted or fitted estimator exposing ``fit`` and ``predict_proba``;
        it is cloned per fold and never modified.
    X : pandas.DataFrame or array-like
        Feature matrix, one row per sample.
    y : pandas.Series or array-like
        Labels compared against the 0/1 predictions.
    cv : object
        Splitter exposing ``split(X, y)`` that yields
        ``(train_positions, valid_positions)`` pairs.
    thresholds : array-like of float, optional
        Candidate cut-offs. Defaults to 401 evenly spaced values from 0.30 to 0.70.

    Returns
    -------
    (float, float)
        The best threshold and its out-of-fold accuracy. When several
        thresholds tie, the first one in ``thresholds`` is returned.

    Raises
    ------
    ValueError
        If ``thresholds`` is empty, or if ``cv`` leaves some samples without an
        out-of-fold prediction (they would otherwise be scored as probability 0).
    """
    y_true = np.asarray(y)

    if thresholds is None:
        candidates = np.linspace(0.30, 0.70, 401)
    else:
        candidates = np.asarray(thresholds, dtype=float).ravel()
    if candidates.size == 0:
        raise ValueError("thresholds must contain at least one candidate value")

    oof_proba = np.zeros(len(y_true), dtype=float)
    covered = np.zeros(len(y_true), dtype=bool)
    for train_idx, valid_idx in cv.split(X, y):
        fold_est = clone(estimator)
        fold_est.fit(_take_rows(X, train_idx), _take_rows(y, train_idx))
        oof_proba[valid_idx] = fold_est.predict_proba(_take_rows(X, valid_idx))[:, 1]
        covered[valid_idx] = True

    if not covered.all():
        raise ValueError(
            "cv did not produce an out-of-fold prediction for every sample; "
            "use a splitter whose validation folds cover all rows"
        )

    # Accuracy of each candidate; argmax keeps the first of any tied maxima
    accuracies = np.array([
        np.mean((oof_proba >= thr).astype(int) == y_true) for thr in candidates
    ])
    best = int(np.argmax(accuracies))
    return float(candidates[best]), float(accuracies[best])

# Search for the accuracy-maximizing cut-off on out-of-fold predictions of the calibrated model
OPTIMAL_THRESHOLD, OOF_ACC = find_optimal_threshold_oof(
    calibrated_model,
    X_train,
    y_train,
    cv_strategy,
)

print(f"Optimal OOF threshold: {OPTIMAL_THRESHOLD:.3f} (OOF accuracy at threshold: {OOF_ACC:.4f})")

# Keep handles that the prediction cells read
final_model, best_threshold = calibrated_model, OPTIMAL_THRESHOLD

# Never pick a home winner below 0.50: use the tuned threshold only when it is higher
decision_threshold = best_threshold if best_threshold > 0.50 else 0.50

# Column order of the training matrix, reused to align features at inference
feature_order_ = X_train.columns

Upcoming Game Predictions

In [None]:
# Prediction preparation: construct diffs for upcoming games and generate probabilities

# Work on a copy so the encoded schedule itself stays untouched
prediction_results = upcoming_merged.copy()

# Build each home-minus-visitor difference and record its column name as we go;
# the resulting list matches the Diff_ features used during training
feature_cols = []
for base_name in diff_source_cols:
    diff_name = f"Diff_{base_name}"
    home_values = prediction_results[f'{base_name}_Home']
    visitor_values = prediction_results[f'{base_name}_Visitor']
    prediction_results[diff_name] = home_values - visitor_values
    feature_cols.append(diff_name)

# Model input: same sanitizing as training (inf -> NaN -> 0.0, float dtype),
# then columns put in the training order (absent columns are filled with 0.0)
upcoming_X = (
    prediction_results[feature_cols]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
    .astype(float)
    .reindex(columns=feature_order_, fill_value=0.0)
)

# Calibrated probability of the positive class (home win)
predicted_proba = final_model.predict_proba(upcoming_X)[:, 1]

# Store probabilities, then name the winner using the clamped decision_threshold
prediction_results['Home Win Probability'] = predicted_proba
home_is_pick = prediction_results['Home Win Probability'] >= decision_threshold
prediction_results['Predicted Winner'] = np.where(
    home_is_pick,
    prediction_results['Home'],
    prediction_results['Visitor']
)

In [None]:
# Display predictions

# Columns to show; 'Week' goes first when the schedule provides it
display_cols = ['Home', 'Visitor', 'Home Win Probability', 'Predicted Winner']
if 'Week' in prediction_results.columns:
    display_cols.insert(0, 'Week')

# Most confident home picks at the top
upcoming_predictions = (
    prediction_results
    .loc[:, display_cols]
    .sort_values('Home Win Probability', ascending=False)
)

# Show final prediction table
upcoming_predictions