In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, lag, sum, avg
from pyspark.sql.window import Window

In [2]:
spark = (
    SparkSession.builder
        .appName('NYC Taxi query 2')
        .getOrCreate()
)

In [3]:
df = spark.read.load('../output/output.parquet')

In [4]:
# Can be moved to 0_data_enrichment

DATE_FORMAT = 'dd-MM-yy HH:mm'
df_ts = (df
         .withColumn('pickup_ts', unix_timestamp(to_timestamp(col("pickup_datetime"), DATE_FORMAT)))
         .withColumn('dropoff_ts', unix_timestamp(to_timestamp(col("dropoff_datetime"), DATE_FORMAT)))
        )

In [5]:
window = Window.partitionBy('medallion').orderBy('pickup_ts')

df_ts = (df_ts
         .withColumn('dropoff_ts_prev', lag('dropoff_ts').over(window)) # lag() returns previous window row value*
         .withColumn('idle_time', col('pickup_ts') - col('dropoff_ts_prev'))
        )

df_time_to_next = df_ts.groupBy('dropoff_borough').agg(
    avg(col('idle_time')).alias('avg_time_for_next_ride')
)

In [6]:
df_time_to_next.show()

+---------------+----------------------+
|dropoff_borough|avg_time_for_next_ride|
+---------------+----------------------+
|         Queens|     5043.346613545817|
|       Brooklyn|    5372.8348909657325|
|  Staten Island|                6390.0|
|      Manhattan|    2204.6238589486093|
|          Bronx|     6753.784615384616|
+---------------+----------------------+

