In [None]:
import pandas as pd
import numpy as np

# Load the week-1 player tracking data.
df = pd.read_csv('tracking_week_1.csv')

# Structural overview: dimensions, column names and a pre-snap sample.
print("Dataset Shape:", df.shape)
# "\n" restores the intended blank separator line. The previous form (a
# backslash followed by a physical line break inside the literal) is a line
# continuation in Python and therefore printed no newline at all.
print("\nColumns:", df.columns.tolist())
print("\nSample of pre-snap data:")
print(df[df['frameType'] == 'BEFORE_SNAP'].head())

# Distinct event tags present in the data (may include NaN for untagged rows).
print("\nUnique events in the data:")
print(df['event'].unique())

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Load the week-1 tracking rows
df = pd.read_csv(filepath_or_buffer='tracking_week_1.csv')

# One group per play, keyed by (gameId, playId)
play_groups = df.groupby(by=['gameId', 'playId'])

# Function to analyze pre-snap motion
def analyze_presnap(group):
    """Summarise one play's pre-snap activity and whether it had a touchdown event.

    Parameters
    ----------
    group : pandas.DataFrame
        Tracking rows of a single play. The columns read are ``frameType``,
        ``event``, ``s``, ``dis`` and ``nflId``.

    Returns
    -------
    pandas.Series
        ``has_motion`` / ``has_shift`` / ``touchdown`` flags (taken from the
        play's ``event`` tags), ``max_presnap_speed`` (largest ``s`` among
        ``BEFORE_SNAP`` rows) and ``avg_displacement`` (per-``nflId`` sum of
        ``dis`` over ``BEFORE_SNAP`` rows, averaged across ids).
    """
    # Event tags are looked up over the whole play, not only the pre-snap rows.
    tagged_events = set(group['event'].dropna().unique())

    before_snap = group.loc[group['frameType'] == 'BEFORE_SNAP']
    distance_by_player = before_snap.groupby('nflId')['dis'].sum()

    summary = {
        'has_motion': 'man_in_motion' in tagged_events,
        'has_shift': 'shift' in tagged_events,
        'touchdown': 'touchdown' in tagged_events,
        'max_presnap_speed': before_snap['s'].max(),
        'avg_displacement': distance_by_player.mean(),
    }
    return pd.Series(summary)

# Apply analysis to each play
play_metrics = play_groups.apply(analyze_presnap)

# Touchdown rate with and without pre-snap motion.
# The flag is cast to a real boolean mask first: if `apply` hands back an
# object-dtype column, `~` would be applied element-wise to Python bools
# (giving -1 / -2) instead of producing the logical negation.
used_motion = play_metrics['has_motion'].astype(bool)
motion_success = play_metrics.loc[used_motion, 'touchdown'].mean()
no_motion_success = play_metrics.loc[~used_motion, 'touchdown'].mean()

print("Plays with motion touchdown rate: {:.1%}".format(motion_success))
print("Plays without motion touchdown rate: {:.1%}".format(no_motion_success))

# Visualize the relationship between motion and pre-snap speed
plt.figure(figsize=(10, 6))
sns.boxplot(data=play_metrics, x='has_motion', y='max_presnap_speed')
plt.title('Pre-snap Speed Distribution by Motion Usage')
plt.show()

# Basic stats about motion usage.
# "\n" restores the intended blank line; the earlier backslash + line break
# inside the literal was a line continuation and printed no newline.
print("\nMotion usage statistics:")
print("Total plays:", len(play_metrics))
print("Plays with motion:", play_metrics['has_motion'].sum())
print("Plays with shifts:", play_metrics['has_shift'].sum())

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Function to extract pre-snap features
def extract_presnap_features(group):
    """Build the pre-snap feature vector for a single play.

    Parameters
    ----------
    group : pandas.DataFrame
        Tracking rows of one play. The columns read are ``frameType``,
        ``event``, ``frameId``, ``nflId``, ``s``, ``dis``, ``x`` and ``y``.

    Returns
    -------
    pandas.Series
        ``has_motion`` / ``has_shift`` event flags, max and mean ``s`` over
        ``BEFORE_SNAP`` rows, mean per-``nflId`` summed ``dis``, x/y extent of
        the last ``BEFORE_SNAP`` frame, and ``motion_timing`` (frames between
        the first ``man_in_motion`` tag and the snap; 0 when there is no
        motion tag).
    """
    events = group['event']
    event_values = events.values
    motion_seen = 'man_in_motion' in event_values
    shift_seen = 'shift' in event_values

    before_snap = group.loc[group['frameType'] == 'BEFORE_SNAP']
    speeds = before_snap['s']
    distance_by_player = before_snap.groupby('nflId')['dis'].sum()

    # Extent of all tracked rows in the final pre-snap frame; stays 0 unless
    # that frame has at least two rows.
    final_frame_id = before_snap['frameId'].max()
    final_frame = before_snap[before_snap['frameId'] == final_frame_id]
    x_spread = y_spread = 0
    if len(final_frame) > 1:
        x_spread = final_frame['x'].max() - final_frame['x'].min()
        y_spread = final_frame['y'].max() - final_frame['y'].min()

    # Snap frame: first 'ball_snap' tag, else fall back to the play's last frame.
    if 'ball_snap' in event_values:
        snap_frame = group.loc[events == 'ball_snap', 'frameId'].min()
    else:
        snap_frame = group['frameId'].max()

    frames_before_snap = 0
    if motion_seen:
        first_motion_frame = group.loc[events == 'man_in_motion', 'frameId'].min()
        frames_before_snap = snap_frame - first_motion_frame

    features = {
        'has_motion': motion_seen,
        'has_shift': shift_seen,
        'max_presnap_speed': speeds.max(),
        'avg_presnap_speed': speeds.mean(),
        'avg_displacement': distance_by_player.mean(),
        'formation_x_spread': x_spread,
        'formation_y_spread': y_spread,
        'motion_timing': frames_before_snap,
    }
    return pd.Series(features)

# Group by play and extract features
# NOTE(review): `merged_df` is not created in any cell visible here. It must
# already exist in the kernel and carry the tracking columns plus
# expectedPoints / yardsGained / offenseFormation — confirm the upstream cell.
play_groups = merged_df.groupby(['gameId', 'playId'])
play_features = play_groups.apply(extract_presnap_features)

# Add play outcomes, taking the first row per play
# (assumes these are play-level values repeated on every row — TODO confirm).
play_outcomes = play_groups.agg({
    'expectedPoints': 'first',
    'yardsGained': 'first',
    'offenseFormation': 'first'
}).reset_index()

# Combine features and outcomes
analysis_df = play_features.reset_index().merge(play_outcomes, on=['gameId', 'playId'])

# Average expected points and yards gained by motion usage
# (the column read is `expectedPoints`, not an "added" / EPA column).
motion_analysis = analysis_df.groupby('has_motion').agg({
    'expectedPoints': ['mean', 'std', 'count'],
    'yardsGained': ['mean', 'std']
}).round(2)

print("Motion Analysis:")
print(motion_analysis)

# Visualize the expected-points distribution by motion
plt.figure(figsize=(10, 6))
sns.boxplot(data=analysis_df, x='has_motion', y='expectedPoints')
plt.title('Expected Points by Motion Usage')
plt.show()

# Analyze motion timing impact (motion plays only); the cast keeps the mask
# boolean even if `apply` produced an object-dtype flag column.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=analysis_df[analysis_df['has_motion'].astype(bool)],
                x='motion_timing', y='expectedPoints')
plt.title('Motion Timing vs Expected Points')
plt.show()

# Formation analysis
formation_motion = pd.crosstab(analysis_df['offenseFormation'],
                               analysis_df['has_motion'],
                               values=analysis_df['expectedPoints'],
                               aggfunc='mean').round(2)
# "\n" restores the intended blank line; the earlier backslash + line break
# inside the literal was a line continuation and printed no newline.
print("\nExpected Points by Formation and Motion:")
print(formation_motion)

# Calculate correlations between features and outcomes
feature_cols = ['has_motion', 'has_shift', 'max_presnap_speed',
                'avg_presnap_speed', 'avg_displacement',
                'formation_x_spread', 'formation_y_spread']
# Cast to float so boolean (or object-typed) flag columns take part in the
# correlation instead of being rejected or silently dropped.
correlation_matrix = (
    analysis_df[feature_cols + ['expectedPoints', 'yardsGained']]
    .astype(float)
    .corr()
)

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='RdBu', center=0)
plt.title('Feature Correlations with Outcomes')
plt.show()

In [None]:
# Restrict the play-level table to the two formations of interest
focus_formations = ['I_FORM', 'EMPTY']
formation_focus = analysis_df.loc[analysis_df['offenseFormation'].isin(focus_formations)]

# Outcome summary per (formation, motion flag)
outcome_stats = {
    'expectedPoints': ['mean', 'std', 'count'],
    'yardsGained': ['mean', 'std'],
}
formation_analysis = (
    formation_focus
    .groupby(['offenseFormation', 'has_motion'])
    .agg(outcome_stats)
    .round(2)
)

print("Detailed Analysis for I_FORM and EMPTY formations:")
print(formation_analysis)

# Expected points per formation, split by motion usage
plt.figure(figsize=(10, 6))
sns.boxplot(x='offenseFormation', y='expectedPoints', hue='has_motion', data=formation_focus)
plt.title('Expected Points by Formation and Motion Usage')
plt.show()

# Pairwise correlations of features and outcomes within these formations
corr_columns = [
    'has_motion', 'max_presnap_speed', 'avg_presnap_speed',
    'avg_displacement', 'formation_x_spread', 'formation_y_spread',
    'expectedPoints', 'yardsGained',
]
formation_corr = formation_focus[corr_columns].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(formation_corr, annot=True, cmap='RdBu', center=0)
plt.title('Feature Correlations for I_FORM and EMPTY Formations')
plt.show()

In [None]:
# Additional analysis of play success rates
def calculate_success_rate(group):
    """Return the share of rows in ``group`` that count as a successful play.

    A row is successful when ``yardsGained`` reaches 40% of ``yardsToGo`` on
    down 1, 60% on down 2, or the full ``yardsToGo`` on down 3 or 4.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with ``down``, ``yardsGained`` and ``yardsToGo`` columns.

    Returns
    -------
    float
        Mean of the per-row boolean success flag.
    """
    down = group['down']
    gained = group['yardsGained']
    to_go = group['yardsToGo']

    first_down_ok = (down == 1) & (gained >= 0.4 * to_go)
    second_down_ok = (down == 2) & (gained >= 0.6 * to_go)
    late_down_ok = down.isin([3, 4]) & (gained >= to_go)

    return (first_down_ok | second_down_ok | late_down_ok).mean()

# Merge with plays data to get down and distance information.
# NOTE(review): `plays_df` is not created in any cell visible here; it must
# already exist in the kernel with gameId / playId / down / yardsToGo columns.
down_distance = plays_df[['gameId', 'playId', 'down', 'yardsToGo']]
# Drop any down / yardsToGo columns left by a previous run of this cell so the
# merge stays idempotent (otherwise pandas produces down_x / down_y and the
# code below fails with a KeyError on re-execution).
formation_focus = (
    formation_focus
    .drop(columns=['down', 'yardsToGo'], errors='ignore')
    .merge(down_distance, on=['gameId', 'playId'])
)

# Calculate success rates by formation and motion
success_rates = formation_focus.groupby(['offenseFormation', 'has_motion']).apply(calculate_success_rate)
# "\n" restores the intended blank line; the earlier backslash + line break
# inside the literal was a line continuation and printed no newline.
print("\nSuccess Rates by Formation and Motion:")
print(success_rates)

# Analyze by down
down_analysis = formation_focus.groupby(['offenseFormation', 'has_motion', 'down']).agg({
    'expectedPoints': ['mean', 'count'],
    'yardsGained': 'mean'
}).round(2)

print("\nAnalysis by Down:")
print(down_analysis)

# Visualize expected points by down for each formation
plt.figure(figsize=(12, 6))
sns.boxplot(data=formation_focus, x='down', y='expectedPoints',
            hue='offenseFormation', palette='Set2')
plt.title('Expected Points by Down and Formation')
plt.show()

In [None]:
# Let's first load and prepare our data properly
import pandas as pd
import numpy as np

# Read the tracking data
df = pd.read_csv('tracking_week_1.csv')

# Row-level flags used for the play-level aggregation below.
# `na=False` already maps missing events to False, so no separate notna()
# mask is required.
df['has_motion'] = df['event'].str.contains('motion', na=False)
df['presnap'] = df['frameType'] == 'BEFORE_SNAP'

# Play-level features over BEFORE_SNAP rows. Named aggregation ties every
# output column to its source column and statistic explicitly, instead of
# relying on the positional order of a flattened MultiIndex.
play_features = (
    df[df['presnap']]
    .groupby(['gameId', 'playId'])
    .agg(
        x_spread=('x', 'std'), x_mean=('x', 'mean'),
        y_spread=('y', 'std'), y_mean=('y', 'mean'),
        max_speed=('s', 'max'), avg_speed=('s', 'mean'),        # speed
        max_accel=('a', 'max'), avg_accel=('a', 'mean'),        # acceleration
        has_motion=('has_motion', 'max'),                       # any motion-tagged row
        o_spread=('o', 'std'), o_mean=('o', 'mean'),            # orientation
    )
    .reset_index()
)

print("Features created per play:")
print(play_features.head())

# Look at distributions
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(15, 5))

histogram_panels = [
    ('x_spread', 'Formation Spread (X) Distribution'),
    ('max_speed', 'Max Speed Distribution'),
    ('y_spread', 'Formation Spread (Y) Distribution'),
]
for panel_index, (column, title) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 3, panel_index)
    sns.histplot(data=play_features, x=column, hue='has_motion')
    plt.title(title)

plt.tight_layout()
plt.show()

# Print summary statistics.
# "\n" restores the intended blank line; the earlier backslash + line break
# inside the literal was a line continuation and printed no newline.
print("\nSummary statistics for key features:")
print(play_features.describe())

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error

# Prepare data for modeling
# Target variable: a rule-based binary flag built from max_speed and x_spread.
# NOTE(review): this is a proxy computed from pre-snap features, not an
# observed play outcome — interpret the scores below accordingly.
play_features['successful_play'] = (
    (play_features['max_speed'] > 4) & (play_features['x_spread'] > 3)
).astype(int)

# Candidate features for classification and regression
feature_cols = ['x_spread', 'y_spread', 'max_speed', 'avg_speed', 'max_accel', 'avg_accel', 'o_spread']
X = play_features[feature_cols]

# Classification target
y_class = play_features['successful_play']

# Regression target
y_reg = play_features['x_spread']  # Example regression target

# Remove the columns each target is computed from. Leaving them in lets the
# model read the answer straight off its inputs (target leakage), which makes
# the accuracy / AUC / MSE and the importance plots meaningless.
class_feature_cols = [col for col in feature_cols if col not in ('max_speed', 'x_spread')]
reg_feature_cols = [col for col in feature_cols if col != 'x_spread']

# Split data into training and testing sets
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X[class_feature_cols], y_class, test_size=0.3, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X[reg_feature_cols], y_reg, test_size=0.3, random_state=42)

# Classification model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_class, y_train_class)
y_pred_class = clf.predict(X_test_class)

# Regression model
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train_reg, y_train_reg)
y_pred_reg = reg.predict(X_test_reg)

# Evaluate models
classification_accuracy = accuracy_score(y_test_class, y_pred_class)
classification_auc = roc_auc_score(y_test_class, clf.predict_proba(X_test_class)[:, 1])
regression_mse = mean_squared_error(y_test_reg, y_pred_reg)

print("Classification Model Performance:")
print("Accuracy:", classification_accuracy)
print("AUC:", classification_auc)

# "\n" restores the intended blank line; the earlier backslash + line break
# inside the literal was a line continuation and printed no newline.
print("\nRegression Model Performance:")
print("Mean Squared Error:", regression_mse)

# Feature importance
importances_class = clf.feature_importances_
importances_reg = reg.feature_importances_

# Plot feature importance (each model against the columns it was trained on)
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.barh(class_feature_cols, importances_class)
plt.title('Feature Importance (Classification)')

plt.subplot(1, 2, 2)
plt.barh(reg_feature_cols, importances_reg)
plt.title('Feature Importance (Regression)')

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the data again and create more sophisticated features
df = pd.read_csv('tracking_week_1.csv')

# Frames remaining until the play's last frame.
# NOTE(review): despite the name, this is measured against the final frameId
# of the play rather than the snap frame; it is aggregated below but is not
# used as a model feature.
df['time_to_snap'] = df.groupby(['gameId', 'playId'])['frameId'].transform('max') - df['frameId']
df['presnap'] = df['frameType'] == 'BEFORE_SNAP'

# Play-level pre-snap features. Named aggregation binds every output column
# to its source column and statistic explicitly, instead of relying on the
# positional order of a flattened MultiIndex.
play_features = (
    df[df['presnap']]
    .groupby(['gameId', 'playId'])
    .agg(
        # Formation spread and position
        x_spread=('x', 'std'), x_mean=('x', 'mean'), x_min=('x', 'min'), x_max=('x', 'max'),
        y_spread=('y', 'std'), y_mean=('y', 'mean'), y_min=('y', 'min'), y_max=('y', 'max'),
        # Speed features
        speed_max=('s', 'max'), speed_mean=('s', 'mean'), speed_std=('s', 'std'),
        # Acceleration features
        accel_max=('a', 'max'), accel_mean=('a', 'mean'), accel_std=('a', 'std'),
        # Directional movement
        dir_std=('dir', 'std'), dir_mean=('dir', 'mean'),
        # Orientation
        orient_std=('o', 'std'), orient_mean=('o', 'mean'),
        # Timing
        time_to_snap=('time_to_snap', 'min'),
    )
    .reset_index()
)

# Add formation complexity metrics
play_features['formation_area'] = (
    (play_features['x_max'] - play_features['x_min'])
    * (play_features['y_max'] - play_features['y_min'])
)
play_features['speed_complexity'] = play_features['speed_std'] * play_features['speed_max']
play_features['movement_complexity'] = play_features['dir_std'] * play_features['speed_mean']

# Rule-based label: a play is "deceptive" when all three complexity metrics
# are above their medians.
label_source_cols = ['speed_complexity', 'movement_complexity', 'formation_area']
play_features['deceptive_play'] = (
    (play_features['speed_complexity'] > play_features['speed_complexity'].median()) &
    (play_features['movement_complexity'] > play_features['movement_complexity'].median()) &
    (play_features['formation_area'] > play_features['formation_area'].median())
).astype(int)

# Prepare features for modeling. The three columns the label is thresholded
# on are deliberately excluded: with them in X the classifier only has to
# re-learn the median cut-offs (target leakage), so the CV score and the
# importance ranking would say nothing about the remaining features.
# NOTE(review): the label is still a function of pre-snap features, so scores
# remain optimistic compared with predicting a real play outcome.
feature_cols = ['x_spread', 'y_spread', 'speed_max', 'speed_mean', 'speed_std',
                'accel_max', 'accel_mean', 'accel_std', 'dir_std', 'orient_std']

X = play_features[feature_cols]
y = play_features['deceptive_play']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=feature_cols)

# Train a gradient boosting model with cross-validation
gb_model = GradientBoostingClassifier(random_state=42)
cv_scores = cross_val_score(gb_model, X_scaled, y, cv=5)

print("Cross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())

# Fit the model on full dataset for feature importance
gb_model.fit(X_scaled, y)

# Plot feature importances
plt.figure(figsize=(12, 6))
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': gb_model.feature_importances_
}).sort_values('importance', ascending=True)

plt.barh(y=range(len(importance_df)), width=importance_df['importance'])
plt.yticks(range(len(importance_df)), importance_df['feature'])
plt.xlabel('Feature Importance')
plt.title('Feature Importance in Predicting Deceptive Plays')
plt.tight_layout()
plt.show()

# Analyze relationships between key features
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
sns.scatterplot(data=play_features, x='speed_complexity', y='movement_complexity',
                hue='deceptive_play', alpha=0.6)
plt.title('Speed vs Movement Complexity')

plt.subplot(1, 3, 2)
sns.scatterplot(data=play_features, x='formation_area', y='speed_max',
                hue='deceptive_play', alpha=0.6)
plt.title('Formation Area vs Max Speed')

plt.subplot(1, 3, 3)
sns.boxplot(data=play_features, x='deceptive_play', y='speed_complexity')
plt.title('Speed Complexity by Play Type')

plt.tight_layout()
plt.show()

# Print summary statistics for deceptive vs non-deceptive plays.
# "\n" restores the intended blank line; the earlier backslash + line break
# inside the literal was a line continuation and printed no newline.
print("\nSummary Statistics for Deceptive vs Non-Deceptive Plays:")
print(play_features.groupby('deceptive_play')[label_source_cols].describe().round(2))

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# Prepare data for transformer model
class PreSnapDataset(Dataset):
    """Map-style dataset pairing each feature row with its label.

    ``data`` and ``labels`` are stored as given; every lookup converts the
    indexed feature row and label to ``float32`` tensors.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # Dataset size follows the feature container.
        return len(self.data)

    def __getitem__(self, idx):
        features = torch.tensor(self.data[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return features, target

# Each play contributes one flat feature vector and one binary label
sequence_features = [
    'x_spread', 'y_spread',
    'speed_max', 'speed_mean', 'speed_std',
    'accel_max', 'accel_mean', 'accel_std',
    'dir_std', 'orient_std',
]

X_sequences = play_features[sequence_features].values
y_sequences = play_features['deceptive_play'].values

# Hold out 30% of the plays for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_sequences,
    y_sequences,
    test_size=0.3,
    random_state=42,
)

# Wrap both splits as datasets, then as mini-batch loaders
train_dataset = PreSnapDataset(X_train, y_train)
test_dataset = PreSnapDataset(X_test, y_test)

# Only the training loader is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Define transformer model
class TransformerModel(nn.Module):
    """Transformer-encoder classifier/regressor over per-play feature inputs.

    Parameters
    ----------
    input_dim : int
        Number of input features per time step.
    num_heads : int
        Attention heads per encoder layer (``hidden_dim`` must be divisible
        by it, as required by ``nn.TransformerEncoderLayer``).
    num_layers : int
        Number of stacked encoder layers.
    hidden_dim : int
        Embedding / model width.
    output_dim : int
        Size of the output produced per sample.

    Notes
    -----
    The encoder is built with ``batch_first=True`` so inputs are
    ``(batch, seq_len, input_dim)``. A 2-D ``(batch, input_dim)`` input is
    treated as a batch of length-1 sequences. Previously such a batch was
    interpreted by the encoder as one unbatched sequence (samples attended to
    each other) and the pooling collapsed the hidden dimension, so the final
    linear layer failed unless the batch size happened to equal ``hidden_dim``.
    """

    def __init__(self, input_dim, num_heads, num_layers, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=num_heads, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return a ``(batch, output_dim)`` tensor for 2-D or 3-D input ``x``."""
        if x.dim() == 2:
            # One flat feature vector per sample -> sequence of length 1.
            x = x.unsqueeze(1)
        x = self.embedding(x)      # (batch, seq_len, hidden_dim)
        x = self.transformer(x)    # (batch, seq_len, hidden_dim)
        x = x.mean(dim=1)          # global average pooling over the sequence axis
        return self.fc(x)          # (batch, output_dim)

# Model parameters
input_dim = len(sequence_features)
hidden_dim = 64
num_heads = 4
num_layers = 2
output_dim = 1

# Seed torch before any weights are created so initialisation (and the
# shuffled batch order drawn from the default generator) is repeatable,
# matching the random_state=42 used by the scikit-learn steps.
torch.manual_seed(42)

# Initialize model, loss, and optimizer
model = TransformerModel(input_dim, num_heads, num_layers, hidden_dim, output_dim)
criterion = nn.BCEWithLogitsLoss()  # expects raw logits; sigmoid is applied inside the loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Model called as ``model(X_batch)``.
    train_loader : iterable
        Yields ``(X_batch, y_batch)`` pairs and supports ``len()``.
    criterion : callable
        Loss called as ``criterion(outputs, targets)``.
    optimizer : torch.optim.Optimizer
        Optimizer stepping ``model``'s parameters.
    num_epochs : int, default 10
        Number of passes over ``train_loader``.
    """
    model.train()
    # Guard the average against an empty loader.
    num_batches = max(len(train_loader), 1)
    for epoch in range(num_epochs):
        total_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            # Flatten instead of squeeze(): squeeze() turns a 1-sample batch
            # into a 0-d tensor, which no longer matches the (1,) target and
            # makes the loss raise a size-mismatch error.
            outputs = model(X_batch).reshape(-1)
            loss = criterion(outputs, y_batch.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print("Epoch", epoch + 1, "Loss:", total_loss / num_batches)

# Fit the transformer on the training loader (default number of epochs)
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
)

# Evaluate the model
def evaluate_model(model, test_loader):
    """Collect rounded sigmoid predictions and true labels over ``test_loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Model returning one logit per sample.
    test_loader : iterable
        Yields ``(X_batch, y_batch)`` pairs.

    Returns
    -------
    tuple of (list, list)
        ``predictions`` (0.0 / 1.0 values from ``sigmoid(logit).round()``) and
        ``actuals`` (the label values), in loader order.
    """
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            # Flatten instead of squeeze(): squeeze() turns a 1-sample batch
            # into a 0-d tensor, and list.extend() cannot iterate a 0-d array.
            outputs = model(X_batch).reshape(-1)
            predictions.extend(torch.sigmoid(outputs).round().numpy())
            actuals.extend(y_batch.reshape(-1).numpy())
    return predictions, actuals

predictions, actuals = evaluate_model(model, test_loader)

# Share of test samples whose rounded prediction equals the label
matches = np.asarray(predictions) == np.asarray(actuals)
accuracy = matches.mean()
print("Test Accuracy:", accuracy)