In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, lag, sum, avg
from pyspark.sql.window import Window

In [2]:
spark = (
    SparkSession.builder
        .appName('NYC Taxi query 1')
        .getOrCreate()
)

In [3]:
# Load trip data

# Can use both output file or original
df = spark.read.csv('../data/Sample NYC Data.csv', header=True, inferSchema=True)

In [4]:
DATE_FORMAT = 'dd-MM-yy HH:mm'
df_ts = (df
         .withColumn('pickup_ts', unix_timestamp(to_timestamp(col("pickup_datetime"), DATE_FORMAT)))
         .withColumn('dropoff_ts', unix_timestamp(to_timestamp(col("dropoff_datetime"), DATE_FORMAT)))
        )

window = Window.partitionBy('medallion').orderBy('pickup_ts')

df_ts = (df_ts
         .withColumn('dropoff_ts_prev', lag('dropoff_ts').over(window)) # lag() returns previous window row value
         .withColumn('idle_time', col('pickup_ts') - col('dropoff_ts_prev'))
         .withColumn('trip_time', col('dropoff_ts') - col('pickup_ts'))
        )

# Remove neg idle_time and idle_time > 4h
HOURS_4 = 4 * 60 * 60 # 14400 seconds
df_ts_wo_exceptions = df_ts.filter((col('idle_time') > 0) & (col('idle_time') <= 14400))


# Group by taxi (using medallion col)
df_util = df_ts_wo_exceptions.groupBy('medallion').agg(
    avg(col('trip_time')).alias('total_trip_time'),
    avg(col('idle_time')).alias('total_idle_time')
)

# Calculate utilization col
df_util = df_util.withColumn(
    'utilization', col('total_trip_time') / (col('total_trip_time') + col('total_idle_time'))
)


In [5]:
df_util.show()

+--------------------+-----------------+------------------+-------------------+
|           medallion|  total_trip_time|   total_idle_time|        utilization|
+--------------------+-----------------+------------------+-------------------+
|000318C2E3E638158...|589.0909090909091| 790.9090909090909|0.42687747035573126|
|002B4CFC5B8920A87...|711.4285714285714|1264.2857142857142| 0.3600867678958785|
|002E3B405B6ABEA23...|886.6666666666666|1793.3333333333333| 0.3308457711442786|
|0030AD2648D81EE87...|           1440.0|             720.0| 0.6666666666666666|
|0035520A854E4F276...|            655.0|            1240.0|0.34564643799472294|
|0036961468659D0BF...|603.5294117647059|1161.1764705882354|0.34199999999999997|
|003889E315BFDD985...|            457.5|            1185.0| 0.2785388127853881|
|0038EF45118925A51...|517.8947368421053| 795.7894736842105| 0.3942307692307693|
|003D87DB553C6F00F...|            500.0|             507.5|0.49627791563275436|
|003EEA559FA618008...|643.6363636363636|