In [1]:
import pyspark.sql.functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('arrival_ml')
    .getOrCreate()
)
# Switch the time parser policy to LEGACY.
# NOTE(review): presumably needed so the 'MM/dd/yyyy HH:mm:ss' timestampFormat
# used by load_data parses as expected -- confirm.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

23/12/05 06:19:15 WARN Utils: Your hostname, codespaces-05ff3a resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
23/12/05 06:19:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/05 06:19:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame[key: string, value: string]

In [3]:
def load_data(file_path: str):
    '''read a csv file into a spark dataframe with an explicit schema

        csv is read for development, the source may change to BQ in the future

        param:
            file_path: path of the csv file, its first line is treated as a header
        return:
            a dataframe with the columns declared below; timestamp columns
            are parsed with the 'MM/dd/yyyy HH:mm:ss' format
    '''
    # (column name, spark type) pairs in schema order; every column is nullable
    # NOTE(review): lat / lon are read as strings -- confirm whether numeric
    # types are wanted downstream
    columns = [
        ("rn", IntegerType()),
        ("destSt", IntegerType()),
        ("destNm", StringType()),
        ("trDr", IntegerType()),
        ("nextStaId", IntegerType()),
        ("nextStpId", IntegerType()),
        ("nextStaNm", StringType()),
        ("prdt", TimestampType()),
        ("arrT", TimestampType()),
        ("flags", StringType()),
        ("lat", StringType()),
        ("lon", StringType()),
        ("heading", IntegerType()),
        ("resp_time", TimestampType()),
        ("isApp", IntegerType()),
        ("isDly", IntegerType()),
    ]
    csv_schema = StructType(
        [StructField(name, dtype, nullable=True) for name, dtype in columns]
    )

    reader = spark.read.option('timestampFormat', 'MM/dd/yyyy HH:mm:ss')
    reader = reader.schema(csv_schema)
    return reader.csv(file_path, header=True)

# Raw records loaded from the local development csv.
df = load_data('blue.csv')

In [4]:
def arrival_morning(raw_data: DataFrame,
                    stop: str = 'Division',
                    before_hour: int = 10,
                    destinations: tuple = ('Forest Park', 'UIC-Halsted'))-> DataFrame:
    '''select the morning arrivals of trains approaching one stop
        param:
            raw_data: initial data from data load
            stop: name of the stop, compared with the nextStaNm column
            before_hour: keep rows whose arrT hour is strictly below this value
            destinations: accepted values of the destNm column (direction filter)
        return:
            the rows of raw_data that pass every filter, sorted by arrT
    '''
    # morning
    is_morning = func.hour(func.col('arrT')) < before_hour
    # only rows flagged as approaching
    is_approaching = func.col('isApp') == 1
    # the train's next station is the requested stop
    at_stop = func.col('nextStaNm') == stop
    # direction filter (towards Forest Park or UIC-Halsted by default);
    # the column is spelled exactly as in the schema so the lookup also
    # resolves when spark.sql.caseSensitive is enabled
    towards_destination = func.col('destNm').isin(*destinations)

    return (raw_data
            .filter(is_morning & is_approaching & at_stop & towards_destination)
            .sort('arrT'))

# Morning arrivals for the default stop; preview only the arrival timestamps.
df_morning = arrival_morning(df)
(
    df_morning
    .select('arrT')
    .show()
)

                                                                                

+-------------------+
|               arrT|
+-------------------+
|2023-02-15 08:00:52|
|2023-02-15 08:08:52|
|2023-02-15 08:20:55|
|2023-02-15 08:26:56|
|2023-02-15 08:30:52|
|2023-02-15 08:53:16|
|2023-02-15 09:10:55|
|2023-02-15 09:12:53|
|2023-02-15 09:20:54|
|2023-02-16 08:02:59|
|2023-02-16 08:08:54|
|2023-02-16 08:30:51|
|2023-02-16 09:00:56|
|2023-02-16 09:14:50|
|2023-02-16 09:40:53|
|2023-02-16 09:54:56|
+-------------------+



In [5]:
def time_between_arrival(arrival: DataFrame
                         )-> DataFrame:
    '''compute the time from each arrival to the next arrival on the same day
        param:
            arrival: dataframe holding a timestamp column arrT
        return:
            the input rows with two extra columns:
                next_arrT: arrT of the following row within the same calendar day
                arrival_diff: next_arrT minus arrT, both cast to long
            rows without a following arrival on their day (null arrival_diff)
            are dropped
    '''
    # one window per calendar day, ordered by arrival time, so that lead()
    # never pairs the last arrival of a day with the first of the next day
    per_day = (Window
               .partitionBy(func.date_format("arrT", "yyyy-MM-dd"))
               .orderBy("arrT"))

    with_next = arrival.withColumn('next_arrT', func.lead('arrT').over(per_day))

    # a null next_arrT makes the subtraction null on its own, so no explicit
    # when/otherwise guard is needed before dropping those rows
    with_diff = with_next.withColumn(
        'arrival_diff',
        func.col("next_arrT").cast("long") - func.col("arrT").cast("long"))

    return with_diff.dropna(subset=['arrival_diff'])

# Preview each morning arrival next to its following arrival and the gap between them.
(
    time_between_arrival(df_morning)
    .select('arrT', 'next_arrT', 'arrival_diff')
    .show()
)

+-------------------+-------------------+------------+
|               arrT|          next_arrT|arrival_diff|
+-------------------+-------------------+------------+
|2023-02-15 08:00:52|2023-02-15 08:08:52|         480|
|2023-02-15 08:08:52|2023-02-15 08:20:55|         723|
|2023-02-15 08:20:55|2023-02-15 08:26:56|         361|
|2023-02-15 08:26:56|2023-02-15 08:30:52|         236|
|2023-02-15 08:30:52|2023-02-15 08:53:16|        1344|
|2023-02-15 08:53:16|2023-02-15 09:10:55|        1059|
|2023-02-15 09:10:55|2023-02-15 09:12:53|         118|
|2023-02-15 09:12:53|2023-02-15 09:20:54|         481|
|2023-02-16 08:02:59|2023-02-16 08:08:54|         355|
|2023-02-16 08:08:54|2023-02-16 08:30:51|        1317|
|2023-02-16 08:30:51|2023-02-16 09:00:56|        1805|
|2023-02-16 09:00:56|2023-02-16 09:14:50|         834|
|2023-02-16 09:14:50|2023-02-16 09:40:53|        1563|
|2023-02-16 09:40:53|2023-02-16 09:54:56|         843|
+-------------------+-------------------+------------+

