In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType
import datetime

df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("/Volumes/workspace/default/vdl/vdl_dummy_with_windows.csv")
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Define motion and special rule sensors
motion_sensors = ["SkidIndexOut", "LocatorRtn", "LifttoLower", "SkidIndexIn",
                  "RaiseoPloc", "LocatorAdv", "LifttoRaise", "SkidInOut", "LiftoRaise"]
special_rule_sensors = ["LoweroPloc", "LocatorAdv"]

# Convert tagtimestamp to timestamp column
df = df.withColumn("timestamp", F.to_timestamp("tagtimestamp", "MM/dd/yyyy HH:mm:ss:SSSSSS"))
df = df.withColumn("timestamp_secs", F.col("timestamp").cast("double"))

# Add index for ordering
df = df.withColumn("row_id", F.monotonically_increasing_id())

# Handle standard motion sensors (not in special rule)
motion_normal = [s for s in motion_sensors if s not in special_rule_sensors]
df = df.withColumn("start", 
    F.when(F.col("tagid").isin(motion_normal), F.col("timestamp_secs"))
)
df = df.withColumn("end",
    F.when(F.col("tagid").isin(motion_normal), F.col("timestamp_secs") + F.col("tagvalue") / 1000)
)

# Handle "LoweroPloc" using next LocatorRtn
window_forward = Window.orderBy("timestamp_secs").rowsBetween(1, Window.unboundedFollowing)
locator_rtns = df.filter(F.col("tagid") == "LocatorRtn") \
    .select("row_id", F.col("timestamp_secs").alias("locatorrtn_ts"), (F.col("timestamp_secs") + F.col("tagvalue") / 1000).alias("locatorrtn_end"))

df = df.join(locator_rtns, on="row_id", how="left") \
    .withColumn("start", F.when(F.col("tagid") == "LoweroPloc", F.col("timestamp_secs")).otherwise(F.col("start"))) \
    .withColumn("end", F.when(F.col("tagid") == "LoweroPloc", F.col("locatorrtn_end")).otherwise(F.col("end")))

# Handle "LocatorAdv" using next LifttoRaise
liftraise = df.filter(F.col("tagid") == "LifttoRaise") \
    .select("row_id", F.col("timestamp_secs").alias("liftraise_ts"), (F.col("timestamp_secs") + F.col("tagvalue") / 1000).alias("liftraise_end"))

df = df.join(liftraise, on="row_id", how="left") \
    .withColumn("start", F.when(F.col("tagid") == "LocatorAdv", F.col("timestamp_secs")).otherwise(F.col("start"))) \
    .withColumn("end", F.when(F.col("tagid") == "LocatorAdv", F.col("liftraise_end")).otherwise(F.col("end")))

# Create a marker for motion sensor rows (used for forward filling)
df = df.withColumn("motion_flag", F.when(F.col("tagid").isin(motion_sensors), F.col("timestamp_secs")))

# Forward fill motion_flag, start, and end using last() over window
fill_window = Window.orderBy("timestamp_secs").rowsBetween(Window.unboundedPreceding, 0)
df = df.withColumn("ff_start", F.last("start", ignorenulls=True).over(fill_window)) \
       .withColumn("ff_end", F.last("end", ignorenulls=True).over(fill_window))

# Replace start and end for non-motion rows
df = df.withColumn("start", F.when(F.col("start").isNull(), F.col("ff_start")).otherwise(F.col("start"))) \
       .withColumn("end", F.when(F.col("end").isNull(), F.col("ff_end")).otherwise(F.col("end")))

# Clean up helper columns
df = df.drop("motion_flag", "ff_start", "ff_end", "locatorrtn_ts", "locatorrtn_end", "liftraise_ts", "liftraise_end")

# Optional: order by timestamp
#df = df.orderBy("timestamp_secs")

df.display()




row_id,tagtimestamp,tagid,tagvalue,_c3,_c4,timestamp,timestamp_secs,start,end
0,04/28/2025 10:00:40:207077,SkidIndexOut,1817.0,,,2025-04-28T10:00:40.207Z,1745834440.207077,1745834440.207077,1745834442.024077
1,04/28/2025 10:00:42:074077,LocatorRtn,760.0,,,2025-04-28T10:00:42.074Z,1745834442.074077,1745834442.074077,1745834442.834077
2,04/28/2025 10:00:42:884077,LoweroPloc,1432.0,,,2025-04-28T10:00:42.884Z,1745834442.884077,1745834442.884077,1745834442.834077
3,04/28/2025 10:00:44:366077,LocatorAdv,1013.0,,,2025-04-28T10:00:44.366Z,1745834444.366077,1745834444.366077,1745834442.834077
4,04/28/2025 10:00:45:429077,LifttoLower,1704.0,,,2025-04-28T10:00:45.429Z,1745834445.429077,1745834445.429077,1745834447.133077
5,04/28/2025 10:00:47:183077,SkidIndexIn,1485.0,,,2025-04-28T10:00:47.183Z,1745834447.183077,1745834447.183077,1745834448.668077
6,04/28/2025 10:00:48:718077,LocatorRtn,1254.0,,,2025-04-28T10:00:48.718Z,1745834448.718077,1745834448.718077,1745834449.972077
7,04/28/2025 10:00:50:022077,RaiseoPloc,1337.0,,,2025-04-28T10:00:50.022Z,1745834450.022077,1745834450.022077,1745834451.359077
8,04/28/2025 10:00:51:409077,LocatorAdv,1021.0,,,2025-04-28T10:00:51.409Z,1745834451.409077,1745834451.409077,1745834451.359077
9,04/28/2025 10:00:52:480077,LifttoRaise,1062.0,,,2025-04-28T10:00:52.480Z,1745834452.480077,1745834452.480077,1745834453.542077
