In [5]:
#Import necessary modules
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, when, lag, round, sqrt, pow, sum as sum_func, max as max_func,row_number, lit, min as min_func, abs
from pyspark.sql.types import StructType, StructField, LongType, StringType, BooleanType, DoubleType, IntegerType
from pyspark.sql.functions import monotonically_increasing_id

In [6]:
#Start (or reuse) the Spark session that the rest of the notebook reads data through
spark = (
    SparkSession.builder
    .appName("Formation insights")
    .getOrCreate()
)

In [7]:
#Tracking columns used by the pre-snap analysis; every field is declared nullable
schema = StructType([
    StructField(fieldName, fieldType, True)
    for fieldName, fieldType in (
        ("gameId", LongType()),
        ("playId", LongType()),
        ("nflId", DoubleType()),
        ("frameId", LongType()),
        ("x", DoubleType()),
        ("y", DoubleType()),
        ("frameType", StringType()),
    )
])

#Read everything under data/tracking/ into one dataframe.
#It is cached because both snapEventsDf and presnapDf below are derived from it.
trackingDf = (
    spark.read
    .schema(schema)
    .parquet("data/tracking/")
    .cache()
)

#For every play, record the earliest frame whose frameType is SNAP
snapEventsDf = (
    trackingDf
    .filter(col("frameType") == "SNAP")
    .groupBy("gameId", "playId")
    .agg(min_func("frameId").alias("snapEvent"))
)

#Keep the 10 frames immediately before the snap frame (snapEvent - 10 up to snapEvent - 1),
#i.e. the last second before the snap at an update rate of 0.1 seconds.
#The inner join also discards plays for which no SNAP frame was found.
presnapDf = (
    trackingDf
    .join(snapEventsDf, ["gameId", "playId"])
    .filter(col("frameId") < col("snapEvent"))
    .filter(col("frameId") >= (col("snapEvent") - 10))
)

25/06/10 07:49:27 WARN CacheManager: Asked to cache already cached data.


In [8]:
#Read in columns of interest from players data source
schema = StructType([
    StructField("nflId", LongType(), True),
    StructField("position", StringType(), True)
])

#Read in players datasource and apply schema.
#Deliberately not cached: Spark caching is lazy, and the previous cache()/unpersist()
#pair ran without any action in between, so the cache was never materialised.
playersDf = spark.read.schema(schema).parquet("data/players.parquet")

#Add positions column, through a left join so that tracking rows without a
#matching player are kept (their position is null)
presnapDf = presnapDf.join(playersDf.select("nflId", "position"), on="nflId", how="left")

#Position classifications
offensivePositions = ["QB", "RB", "FB", "HB", "WR", "TE", "LT", "LG", "C", "RG", "RT"]
defensivePositions = ["CB", "S", "FS", "SS", "MLB", "OLB", "ILB", "LB", "DT", "DE", "NT", "DB"]

#Create a classifier column for if the player is on offence or defence.
#True = offence, False = defence, null = position missing or in neither list.
#NOTE(review): confirm the two lists cover every position code present in players.parquet;
#any code not listed silently ends up in the null group.
presnapDf = presnapDf.withColumn(
    "isOffence",
    when(col("position").isin(*offensivePositions), True)
    .when(col("position").isin(*defensivePositions), False)
    .otherwise(lit(None))
    .cast(BooleanType()),
)

#Restrict the pre-snap frames to the two safety positions
safetyDf = presnapDf.filter(col("position").isin(["SS", "FS"]))

#Only the play keys and the yardline are read from the plays data source
schema = StructType([
    StructField(fieldName, LongType(), True)
    for fieldName in ("gameId", "playId", "yardlineNumber")
])

playsDf = spark.read.schema(schema).parquet("data/plays.parquet").cache()

#Attach each play's yardline (left join keeps safeties whose play is missing), then derive:
#  play_direction   - "right" when x < yardlineNumber, otherwise "left"
#  line_of_scrimage - 100 - yardlineNumber for "left" plays, yardlineNumber otherwise
#  distance         - absolute gap between the player's x and line_of_scrimage
#NOTE(review): the displayed output contains x values above 100 while line_of_scrimage is
#derived from 100 - yardlineNumber - confirm both use the same coordinate origin.
safetyDf = (
    safetyDf
    .join(playsDf.select("gameId", "playId", "yardlineNumber"), on=["gameId", "playId"], how="left")
    .withColumn("play_direction", when(col("x") < col("yardlineNumber"), "right").otherwise("left"))
    .withColumn(
        "line_of_scrimage",
        when(col("play_direction") == "left", 100 - col("yardlineNumber")).otherwise(col("yardlineNumber")),
    )
    .withColumn("distance", abs(col("x") - col("line_of_scrimage")))
)

#Highest (latest) pre-snap frameId for every player within every play
frameGroupByDf = (
    safetyDf
    .groupBy("nflId", "gameId", "playId")
    .agg(max_func("frameId").alias("frameId"))
)

#Inner join on the keys plus frameId keeps only the row of that latest frame
safetyDf = safetyDf.join(frameGroupByDf, on=["nflId", "gameId", "playId", "frameId"], how="inner")

safetyDf.show(5)

#Use the frame number as the per-player ordering key.
#The rows have been through several joins by this point, so their physical order - which is
#all monotonically_increasing_id() reflects - is not guaranteed to follow frame order.
#Ordering the window by it could pair non-consecutive frames in the lag() below.
presnapDf = presnapDf.withColumn("sequenceId", col("frameId"))

#Drop redundant columns (frameId lives on as sequenceId)
presnapDf = presnapDf.drop("frameId", "frameType", "snapEvent", "position")

#One window per player per play, walked in frame order
sparkWindow = Window.partitionBy("gameId", "playId", "nflId").orderBy("sequenceId")

#Add orderId using row number (for later sorting)
presnapDf = presnapDf.withColumn("orderId", row_number().over(sparkWindow))

#Position in the previous frame of the same player/play (null on the first frame)
presnapDf = presnapDf.withColumn("prevX", round(lag("x").over(sparkWindow), 2))
presnapDf = presnapDf.withColumn("prevY", round(lag("y").over(sparkWindow), 2))

#Euclidean distance moved since the previous frame; 0.0 when there is no previous frame.
#Despite the column name this is a per-frame step, the running total is taken below.
presnapDf = presnapDf.withColumn(
    "cumulativeDistance",
    round(
        when(col("prevX").isNull() | col("prevY").isNull(), 0.0)
        .otherwise(sqrt(pow(col("x") - col("prevX"), 2) + pow(col("y") - col("prevY"), 2))),
        5,
    ),
)

#Sum all the movement values - this gives total distance moved in a second
playerMotion = presnapDf.groupBy("gameId", "playId", "nflId", "isOffence").agg(sum_func("cumulativeDistance").alias("distanceMoved"))

#Using a threshold of 2 yards in the 1 second time frame, determine if the player was in motion
playerMotion = playerMotion.withColumn("motion", col("distanceMoved") > 2)

#Determine if any player on each side was in motion (max over booleans is True if any is True).
#Rows whose isOffence is null form their own third group per play.
playMotion = playerMotion.groupBy("gameId", "playId", "isOffence").agg(max_func("motion").alias("isMotion"))

#Create final result
playMotion = playMotion.select("gameId", "playId", "isOffence", "isMotion")

25/06/10 07:49:27 WARN CacheManager: Asked to cache already cached data.
25/06/10 07:49:44 WARN MemoryStore: Not enough space to cache rdd_11_0 in memory! (computed 152.0 MiB so far)
25/06/10 07:49:44 WARN BlockManager: Persisting block rdd_11_0 to disk instead.
25/06/10 07:49:49 WARN MemoryStore: Not enough space to cache rdd_11_2 in memory! (computed 42.7 MiB so far)
25/06/10 07:49:49 WARN BlockManager: Persisting block rdd_11_2 to disk instead.
25/06/10 07:49:52 WARN MemoryStore: Not enough space to cache rdd_11_3 in memory! (computed 65.8 MiB so far)
25/06/10 07:49:52 WARN BlockManager: Persisting block rdd_11_3 to disk instead.
25/06/10 07:49:59 WARN MemoryStore: Not enough space to cache rdd_11_2 in memory! (computed 65.8 MiB so far)
25/06/10 07:49:59 WARN MemoryStore: Not enough space to cache rdd_11_3 in memory! (computed 65.8 MiB so far)
25/06/10 07:50:02 WARN MemoryStore: Not enough space to cache rdd_11_4 in memory! (computed 42.7 MiB so far)
25/06/10 07:50:02 WARN BlockMana

+-------+----------+------+-------+------+-----+-----------+---------+--------+---------+--------------+--------------+----------------+------------------+
|  nflId|    gameId|playId|frameId|     x|    y|  frameType|snapEvent|position|isOffence|yardlineNumber|play_direction|line_of_scrimage|          distance|
+-------+----------+------+-------+------+-----+-----------+---------+--------+---------+--------------+--------------+----------------+------------------+
|44848.0|2022092510|  2891|    107| 98.23|34.14|BEFORE_SNAP|      108|      SS|    false|            20|          left|              80|18.230000000000004|
|47882.0|2022092512|   713|     91| 77.51|38.45|BEFORE_SNAP|       92|      FS|    false|            29|          left|              71| 6.510000000000005|
|53592.0|2022092508|  3710|    158| 64.26|35.48|BEFORE_SNAP|      159|      FS|    false|            45|          left|              55| 9.260000000000005|
|38843.0|2022092502|  2337|     70|105.99| 35.2|BEFORE_SNAP|    