In [1]:
import pandas as pd
import glob

# Locations of the input CSVs, relative to the notebook's working directory
games_path = 'data/games.csv'
plays_path = 'data/plays.csv'

# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting
# gives the tracking files a stable order so the notebook is reproducible.
tracking_files = sorted(glob.glob('data/tracking_week_*.csv'))

# Load the game-level and play-level tables
games = pd.read_csv(games_path)
plays = pd.read_csv(plays_path)

In [2]:
# Read every tracking file and stack the results into one DataFrame with a fresh index
tracking_data = pd.concat([pd.read_csv(path) for path in tracking_files], ignore_index=True)

# Attach the play-level columns to each tracking row (inner join on gameId and playId)
merged_data = pd.merge(tracking_data, plays, on=['gameId', 'playId'])

# Keep only pre-snap frames (frameType == 'BEFORE_SNAP'). The explicit .copy()
# makes pre_snap_data an independent DataFrame, so adding a column to it below
# no longer triggers pandas' SettingWithCopyWarning.
pre_snap_data = merged_data.loc[merged_data['frameType'] == 'BEFORE_SNAP'].copy()

# Flag rows whose 'event' value contains 'man_in_motion'; missing events count as False
pre_snap_data['motion_detected'] = pre_snap_data['event'].str.contains('man_in_motion', na=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_snap_data['motion_detected'] = pre_snap_data['event'].str.contains('man_in_motion', na=False)


In [3]:
# Collapse the frame-level flags to one row per play: a play is marked as
# having motion if any of its pre-snap rows was flagged.
play_keys = ['gameId', 'playId']
motion_summary = (
    pre_snap_data
    .groupby(play_keys)['motion_detected']
    .any()
    .reset_index()
)

# Join each play's motion flag with its play-level columns (inner join)
motion_analysis = motion_summary.merge(plays, on=play_keys)

# For plays with and without motion: mean yardsGained and number of plays
outcome_by_motion = motion_analysis.groupby('motion_detected').agg(
    avg_yards_gained=('yardsGained', 'mean'),
    play_count=('playId', 'count'),
)
motion_outcome = outcome_by_motion.reset_index()

In [4]:
# Display the summary table: avg_yards_gained and play_count for each motion_detected value
motion_outcome

Unnamed: 0,motion_detected,avg_yards_gained,play_count
0,False,5.445203,10986
1,True,5.495617,5133
