# Pre-Snap Game Prediction!

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from xgboost import XGBClassifier
from sklearn.datasets import make_classification 
import xgboost as xgb

# Function to drop unwanted features from the dataset
def drop_unwanted_features(df, features_to_drop):
    """
    Return a copy of the DataFrame without the requested columns.

    Names in ``features_to_drop`` that are not columns of ``df`` are skipped
    instead of raising, so one drop list can be reused on different frames.

    Parameters:
    df (pd.DataFrame): The input DataFrame; it is left unmodified.
    features_to_drop (list): Column names to remove where present.

    Returns:
    pd.DataFrame: A new DataFrame lacking the matching columns.
    """
    # Only names that really are columns may be handed to DataFrame.drop
    available = set(df.columns)
    removable = [name for name in features_to_drop if name in available]

    return df.drop(columns=removable)

# Function to convert game clock time into seconds
def convert_to_seconds(game_clock):
    """
    Turn a game clock reading in 'MM:SS' format into a number of seconds.

    Parameters:
    game_clock (str): The game clock time as a string in 'MM:SS' format.

    Returns:
    int: Minutes times sixty plus seconds.
    """
    # Exactly two colon-separated fields are expected; anything else raises ValueError
    minute_text, second_text = game_clock.split(':')

    return int(minute_text) * 60 + int(second_text)

# Function to calculate remaining time till the end of the half
def calculate_remaining_time_end_of_half(row):
    """
    Calculates the remaining time in seconds until the end of the current half.

    The game clock counts down within a quarter, so the position of the quarter
    inside its half decides whether a full extra quarter is still to be played:
    in quarters 1 and 3 the remaining time is one quarter plus the clock, in
    quarters 2 and 4 it is the clock alone.

    Parameters:
    row (pd.Series): A row containing 'gameClock' ('MM:SS') and 'quarter'.
        If 'quarter' is absent or missing, the function falls back to the
        previous 'half'-based formula, which needs a 'half' entry.

    Returns:
    int: The remaining time in seconds until the end of the half.
    """
    quarter_length = 15 * 60  # 900 seconds per regulation quarter

    # Seconds left on the clock in the current quarter
    quarter_time_remaining = convert_to_seconds(row['gameClock'])

    if 'quarter' not in row or pd.isna(row['quarter']):
        # Legacy fallback kept for callers that only provide 'half'. The half
        # number alone cannot tell quarter 1 from 2 (or 3 from 4), so this
        # result is only reliable for quarters 1 and 4.
        elapsed_time = (row['half'] - 1) * quarter_length + (quarter_length - quarter_time_remaining)
        return 2 * quarter_length - elapsed_time

    quarter = int(row['quarter'])

    # NOTE(review): quarters above 4 are assumed to be overtime periods with
    # nothing scheduled after them, so only the clock remains - confirm.
    if quarter > 4:
        return quarter_time_remaining

    # Quarters 1 and 3 open a half: a whole further quarter is still to come
    if quarter % 2 == 1:
        return quarter_length + quarter_time_remaining

    # Quarters 2 and 4 close a half: only the running clock is left
    return quarter_time_remaining


### Initial Data Understanding and Cleaning

In [None]:
# Folder holding the Kaggle "NFL Big Data Bowl 2025" CSV files. The original
# machine-specific location stays the default; set the NFL_BDB_DATA_DIR
# environment variable to run the notebook elsewhere without editing this cell.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get(
    'NFL_BDB_DATA_DIR',
    r'C:\Users\Karahan C\Desktop\Portfolio Projects\Kaggle\nfl-big-data-bowl-2025',
))

plays_df = pd.read_csv(DATA_DIR / 'plays.csv')
games_df = pd.read_csv(DATA_DIR / 'games.csv')


In [None]:
# Preview five randomly selected rows of the plays table
plays_df.sample(5)

As we can see, this dataframe contains descriptive, pre-snap and post-snap data about each play. Since we are investigating a pre-snap decision, we can drop irrelevant features such as playDescription.

In [None]:
# Column dtypes and non-null counts of the plays table
plays_df.info()

In [None]:
# Preview five randomly selected rows of the games table
games_df.sample(5)

In [None]:
# Column names available in the plays table
plays_df.columns

In [None]:
# Column names available in the games table
games_df.columns

In [None]:
# Left-join the home and visitor team abbreviations of each game onto its plays,
# matching rows on gameId; every play row is kept.
df = pd.merge(
    plays_df,
    games_df[['gameId','homeTeamAbbr', 'visitorTeamAbbr']],
    how='left',
    on='gameId',
)

I've merged the two datasets so that we can work out whether the possession team is leading or not.

In [None]:
# Column names of the merged plays + games frame
df.columns

In [None]:
# Columns removed before feature engineering: identifiers, free text and
# outcome/probability fields that are not used as model inputs here.
# NOTE(review): 'visitorTeamWinProbilityAdded' is spelled differently from
# 'homeTeamWinProbabilityAdded' - confirm it matches the CSV header, because
# drop_unwanted_features silently ignores names that are not columns.
features_to_drop = ['gameId','playId','isDropback', 'expectedPointsAdded', 'yardsGained', 'visitorTeamWinProbilityAdded',
                    'homeTeamWinProbabilityAdded','prePenaltyYardsGained','penaltyYards','rushLocationType','qbSneak',
                    'qbKneel','qbSpike','unblockedPressure','passTippedAtLine','timeToSack','timeInTackleBox','timeToThrow',
                    'passLocationType','dropbackDistance','dropbackType','playAction','targetX', 'targetY','passLength',
                    'playClockAtSnap', 'expectedPoints','preSnapVisitorTeamWinProbability','preSnapHomeTeamWinProbability',
                    'playNullifiedByPenalty','yardlineSide','yardlineNumber','playDescription','pff_manZone','pff_passCoverage',
                    'pff_runPassOption','pff_runConceptSecondary','pff_runConceptPrimary']
 

Here I've dropped the columns that are unnecessary for the prediction.

In [None]:
# New frame without the listed columns; df itself is left untouched
df_cleaned = drop_unwanted_features(df,features_to_drop)


In [None]:
# Preview five randomly selected rows of the reduced frame
df_cleaned.sample(5)

In [None]:
# Column dtypes and non-null counts after dropping the unused columns
df_cleaned.info()

## Feature Creation

I have tried to create my features within the scope of situational football:
- Is my team behind? I would tend to pass because I want to tie the game as soon as possible.
- Does my team need to hurry for a TD because we are behind and running out of time?
- Are we leading inside the last two minutes? I would tend to run the ball because I want to burn the clock.

In [None]:
# Half feature: quarters below 3 map to half 1, every other quarter value maps to half 2
df_cleaned['half'] = df_cleaned['quarter'].apply(lambda row: 1 if row < 3 else 2)

In [None]:
# Remaining-time feature: seconds left until the end of the half, computed row by row
df_cleaned.loc[:, 'remainingTime'] = df_cleaned.apply(calculate_remaining_time_end_of_half, axis=1)


In [None]:
# Scores are stored per home/visitor team (preSnapHomeScore, preSnapVisitorScore),
# while the teams are stored per role (possessionTeam, defensiveTeam).
# Re-key the scores by role so they line up with the teams. The comparison is
# vectorised over whole columns instead of applying a lambda to every row.
offense_is_home = df_cleaned['possessionTeam'] == df_cleaned['homeTeamAbbr']

# Possession team's score: the home score when the offense is the home team
df_cleaned['possessionTeamScore'] = np.where(
    offense_is_home, df_cleaned['preSnapHomeScore'], df_cleaned['preSnapVisitorScore'])

# Defensive team's score: the opposite pick
df_cleaned['defensiveTeamScore'] = np.where(
    offense_is_home, df_cleaned['preSnapVisitorScore'], df_cleaned['preSnapHomeScore'])

In [None]:
# Yards left to a touchdown, taken as absoluteYardlineNumber minus 10
# NOTE(review): presumably the 10 removes the end-zone depth and assumes
# absoluteYardlineNumber is measured towards the target goal line - confirm.
df_cleaned['yardsToTd'] = df_cleaned['absoluteYardlineNumber'] - 10

In [None]:
# Score difference from the possession team's point of view:
# positive when it leads, negative when it trails, zero when tied.
df_cleaned['possessionTeamLeadsBy'] = df_cleaned['possessionTeamScore'] - df_cleaned['defensiveTeamScore']

In [None]:
# Flag is 1 when the possession team is ahead and 0 when it is tied or behind.
# (The previous version produced the opposite encoding: 0 for a lead, 1 otherwise.)
df_cleaned['isPossessionTeamLead'] = (df_cleaned['possessionTeamLeadsBy'] > 0).astype('byte')

In [None]:
# Columns to keep, in display order: time and situation features first,
# then teams and scores, then the raw formation fields and the pass outcome
new_order = ['half','quarter', 'gameClock','remainingTime', 'down', 'yardsToGo', 'yardsToTd','possessionTeamLeadsBy','isPossessionTeamLead',
             'possessionTeam', 'defensiveTeam','homeTeamAbbr', 'visitorTeamAbbr','possessionTeamScore', 'defensiveTeamScore',
             'preSnapHomeScore','preSnapVisitorScore', 'absoluteYardlineNumber', 'offenseFormation', 'receiverAlignment', 'passResult']

# Reorder the columns in the DataFrame; columns not listed above are left out
df_cleaned = df_cleaned[new_order]

# Check the reordered DataFrame
print(df_cleaned.head())


In [None]:
# Number of plays per offensive formation
df_cleaned['offenseFormation'].value_counts()

As we know, some formations tend to favour either the pass or the run.

In [None]:
# Work on an independent copy so the new columns are written to a frame of its own
df_cleaned = df_cleaned.copy()
heavyPass = ['SHOTGUN', 'EMPTY']
# 1 for the formations listed in heavyPass, 0 for any other (or missing) formation
df_cleaned['isHeavyRun/Pass'] = df_cleaned['offenseFormation'].isin(heavyPass).astype('byte')


In [None]:
# Number of plays per receiver alignment
df_cleaned['receiverAlignment'].value_counts()

In [None]:
heavyRun = ['1x0', '2x0', '3x0', '1x1', '2x1']
# 0 for the alignments listed in heavyRun, 1 for any other (or missing) alignment
listed_as_run = df_cleaned['receiverAlignment'].isin(heavyRun)
df_cleaned['isWRsHeavyRun/Pass'] = (~listed_as_run).astype('byte')

In [None]:
# Binary target: 1 when passResult holds a value, 0 when it is missing
df_cleaned['WasItPass'] = (~pd.isnull(df_cleaned['passResult'])).astype('int64')


In [None]:
# Column dtypes and non-null counts after feature creation
df_cleaned.info()

## Train - Test Split

In [None]:
# Build the model inputs from the engineered pre-snap features instead of a
# synthetic make_classification sample, so the classifier is actually trained
# and evaluated on the NFL plays prepared above.
feature_columns = ['half', 'quarter', 'remainingTime', 'down', 'yardsToGo', 'yardsToTd',
                   'possessionTeamLeadsBy', 'isPossessionTeamLead',
                   'isHeavyRun/Pass', 'isWRsHeavyRun/Pass']

# NumPy arrays are used because the split loop below indexes X and y by position.
# Missing feature values stay NaN in the float matrix.
X = df_cleaned[feature_columns].to_numpy(dtype=float)

# Target: 1 for a pass play, 0 otherwise
y = df_cleaned['WasItPass'].to_numpy()

## Train XGboost Model - Make Predictions

In [None]:
# XGBoost classifier using log-loss as its evaluation metric
# NOTE(review): use_label_encoder looks deprecated in recent xgboost releases - confirm against the installed version
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# Five stratified random splits, each holding out 20% of the rows, with a fixed seed
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

In [None]:
# Labels pooled over all splits: ground truth and the matching hard predictions
y_true = []
y_preds = []

# Refit the same model object on every split and predict that split's held-out rows
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba for this split; it is not accumulated, so only
    # the last split's values are left after the loop
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    
    # Store true and predicted labels 
    y_true.extend(y_test)
    y_preds.extend(y_pred)
    

## Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the labels pooled over every stratified split (y_true / y_preds).
# Using y_test / y_pred here would only reflect the last split of the loop.
accuracy = accuracy_score(y_true, y_preds)
precision = precision_score(y_true, y_preds)
recall = recall_score(y_true, y_preds)
f1 = f1_score(y_true, y_preds)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# ROC analysis needs continuous scores: use the class-1 probabilities from the
# model as fitted on the last split, evaluated on that split's held-out rows.
# (Feeding the hard 0/1 predictions collapses the curve to a single threshold.)
y_pred_prob = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)

plt.plot(fpr, tpr, label=f'XGBoost (AUC = {auc:.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


In [None]:
# Hyper-parameter grid: seven parameters with three values each,
# i.e. 3**7 = 2187 candidate combinations
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.3],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]}

# Exhaustive search scored on accuracy with 5-fold cross-validation, so every
# candidate is fitted five times on X_train / y_train as left by the split loop
xgb_clf = XGBClassifier(objective='binary:logistic')
grid = GridSearchCV(xgb_clf, param_grid, cv=5, scoring='accuracy',verbose=1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)

# Get the best parameters
best_params = grid.best_params_


# Take the estimator GridSearchCV exposes for the best parameter combination
# (no additional fit is performed in this cell)
best_xgb_model = grid.best_estimator_

## Feature Importance

In [None]:
# Plot feature importance of the estimator selected by the grid search
xgb.plot_importance(best_xgb_model)
plt.show()


In [None]:
# Accuracy of the tuned model on X_test / y_test, i.e. the held-out rows of the
# last split produced by the cross-validation loop
y_pred_final = best_xgb_model.predict(X_test)
accuracy_final = accuracy_score(y_test, y_pred_final)
print(f'Final Model Accuracy: {accuracy_final}') 