In [121]:
#Import necessary modules
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, when, lag, round, sqrt, pow, sum as sum_func, max as max_func,row_number, lit, min as min_func
from pyspark.sql.types import StructType, StructField, LongType, StringType, BooleanType, DoubleType


In [122]:
#Start a Spark session for this notebook, or reuse the one that is already running
spark = (
    SparkSession.builder
    .appName("Formation insights")
    .getOrCreate()
)

In [123]:
#Column layout of the tracking fields this analysis relies on.
#NOTE(review): this schema is declared but never handed to the reader below,
#so the parquet files' own embedded schema (and every column they hold) is what gets loaded.
schema = StructType([
    StructField(fieldName, fieldType, True)
    for fieldName, fieldType in [
        ("gameId", LongType()),
        ("playId", LongType()),
        ("nflId", DoubleType()),
        ("frameId", LongType()),
        ("x", DoubleType()),
        ("y", DoubleType()),
        ("frameType", StringType()),
    ]
])

#Load every tracking file under data/tracking/ into a single cached dataframe.
#NOTE(review): the whole dataset is cached here; the Spark log reports partitions
#that do not fit in memory - consider caching only the much smaller pre-snap slice.
trackingDf = spark.read.parquet("data/tracking/").cache()

#For each play, find the first frame flagged as the snap - the reference point for pre-snap motion
snapEventsDf = (
    trackingDf
    .where(col("frameType") == "SNAP")
    .groupBy("gameId", "playId")
    .agg(min_func("frameId").alias("snapEvent"))
)

#Attach the snap frame to every tracking row of the play, then keep only the 10 frames
#immediately before it (the last second before the snap at an update rate of 0.1 seconds)
presnapDf = (
    trackingDf
    .join(snapEventsDf, ["gameId", "playId"])
    .where(
        (col("frameId") < col("snapEvent"))
        & (col("frameId") >= (col("snapEvent") - 10))
    )
)

25/06/04 16:15:47 WARN CacheManager: Asked to cache already cached data.


In [127]:
#Read in columns of interest from players data source
schema = StructType([
    StructField("nflId", LongType(), True),
    StructField("position", StringType(), True)
])

#Distance (in yards, per the tracking coordinates) a player must cover inside the
#pre-snap window to be counted as "in motion"
MOTION_THRESHOLD_YARDS = 2

#Read in players datasource and apply schema
playersDf = spark.read.schema(schema).parquet("data/players.parquet").cache()

#Add positions column through a left join, so tracking rows without a matching
#player are kept (with a null position) rather than silently dropped here
presnapDf = presnapDf.join(playersDf.select("nflId", "position"), on="nflId", how="left")

#Position classifications.
#"T" and "G" are the side-agnostic tackle/guard codes; without them those linemen
#match neither list and end up unclassified.
#NOTE(review): confirm the full set of codes with playersDf.select("position").distinct()
offensivePositions = ["QB", "RB", "FB", "HB", "WR", "TE", "LT", "LG", "C", "RG", "RT", "T", "G"]
defensivePositions = ["CB", "S", "FS", "SS", "MLB", "OLB", "ILB", "LB", "DT", "DE", "NT", "DB"]

#Create a classifier column for if the player is on offence (True) or defence (False);
#positions in neither list - including a null position - give null
presnapDf = presnapDf.withColumn(
    "isOffence",
    when(col("position").isin(*offensivePositions), True)
    .when(col("position").isin(*defensivePositions), False)
    .otherwise(lit(None))
    .cast(BooleanType())
)

#Drop redundant columns. frameId is deliberately kept: it is the integer frame index
#already used to select the pre-snap window, so it is the reliable ordering key below
#(and keeping it lets this cell be re-run without the ordering column going missing).
presnapDf = presnapDf.drop("frameType", "snapEvent", "position")

#Define window for ordering each player's frames chronologically within a play
sparkWindow = Window.partitionBy("gameId", "playId", "nflId").orderBy("frameId")

#Add orderId using row number (for later sorting)
presnapDf = presnapDf.withColumn("orderId", row_number().over(sparkWindow))

#Previous frame's coordinates for the same player (null on the first frame of the window)
#NOTE(review): only the previous coordinates are rounded, not x/y themselves - harmless
#if the tracking coordinates already carry two decimals; confirm.
presnapDf = presnapDf.withColumn("prevX", round(lag("x").over(sparkWindow), 2))
presnapDf = presnapDf.withColumn("prevY", round(lag("y").over(sparkWindow), 2))

#Distance covered since the previous frame (0.0 for the first frame of the window).
#Despite the column name this is a per-step distance; the summing happens in the next step.
stepDistance = sqrt(pow(col("x") - col("prevX"), 2) + pow(col("y") - col("prevY"), 2))
presnapDf = presnapDf.withColumn(
    "cumulativeDistance",
    round(when(col("prevX").isNull() | col("prevY").isNull(), 0.0).otherwise(stepDistance), 5)
)

#Sum all the movement values - this gives total distance moved by each player in the window.
#Rows with a null isOffence belong to neither side (no matching player or an unlisted
#position), so they are left out instead of forming a third, meaningless "side" per play.
playerMotion = (
    presnapDf
    .filter(col("isOffence").isNotNull())
    .groupBy("gameId", "playId", "nflId", "isOffence")
    .agg(sum_func("cumulativeDistance").alias("distanceMoved"))
)

#Using the distance threshold over the pre-snap window, determine if the player was in motion
playerMotion = playerMotion.withColumn("motion", col("distanceMoved") > MOTION_THRESHOLD_YARDS)

#Determine if any player on each side was in motion (max over booleans is true if any is true)
playMotion = playerMotion.groupBy("gameId", "playId", "isOffence").agg(max_func("motion").alias("isMotion"))

#Create final result
playMotion = playMotion.select("gameId", "playId", "isOffence", "isMotion")

playMotion.show()


25/06/04 16:20:42 WARN CacheManager: Asked to cache already cached data.
25/06/04 16:20:44 WARN MemoryStore: Not enough space to cache rdd_345_0 in memory! (computed 209.7 MiB so far)
25/06/04 16:20:44 WARN MemoryStore: Not enough space to cache rdd_345_1 in memory! (computed 209.6 MiB so far)
25/06/04 16:20:51 WARN MemoryStore: Not enough space to cache rdd_345_3 in memory! (computed 209.6 MiB so far)
25/06/04 16:20:51 WARN MemoryStore: Not enough space to cache rdd_345_2 in memory! (computed 209.7 MiB so far)
25/06/04 16:20:59 WARN MemoryStore: Not enough space to cache rdd_345_4 in memory! (computed 209.6 MiB so far)
25/06/04 16:20:59 WARN MemoryStore: Not enough space to cache rdd_345_5 in memory! (computed 209.7 MiB so far)
25/06/04 16:21:06 WARN MemoryStore: Not enough space to cache rdd_345_6 in memory! (computed 209.6 MiB so far)
25/06/04 16:21:06 WARN MemoryStore: Not enough space to cache rdd_345_7 in memory! (computed 209.6 MiB so far)
25/06/04 16:21:12 WARN MemoryStore: Not

+----------+------+---------+--------+
|    gameId|playId|isOffence|isMotion|
+----------+------+---------+--------+
|2022090800|  1504|    false|   false|
|2022090800|  1504|     true|   false|
|2022090800|  3674|     true|    true|
|2022090800|  3674|    false|   false|
|2022091100|   501|    false|    true|
|2022091100|   501|     true|    true|
|2022091100|   828|    false|    true|
|2022091100|   828|     true|   false|
|2022091100|  2984|    false|   false|
|2022091100|  2984|     true|   false|
|2022091100|  3504|    false|   false|
|2022091100|  3504|     true|   false|
|2022091100|  4187|    false|   false|
|2022091100|  4187|     true|   false|
|2022091101|  1192|     true|    true|
|2022091101|  1192|    false|   false|
|2022091101|  1213|     true|   false|
|2022091101|  1213|    false|   false|
|2022091101|  1537|     true|    true|
|2022091101|  1537|    false|    true|
+----------+------+---------+--------+
only showing top 20 rows


                                                                                