In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, lag, sum, avg
from pyspark.sql.window import Window

In [2]:
spark = (
    SparkSession.builder
        .appName('NYC Taxy')    
        .getOrCreate()
)

In [5]:
trip_data = spark.read.csv('../data/trip_data/trip_data_1.csv', header=True, inferSchema=True)
trip_fare = spark.read.csv('../data/trip_fare/trip_fare_1.csv', header=True, inferSchema=True)

# Remove whitespace from column names
trip_data = trip_data.toDF(*[col_name.strip() for col_name in trip_data.columns])
trip_fare = trip_fare.toDF(*[col_name.strip() for col_name in trip_fare.columns])

In [6]:
from itertools import zip_longest

print('trip_data', '||', 'trip_fare')
print('===', '||', '===')
for cols in zip_longest(trip_data.columns, trip_fare.columns, fillvalue=''):
    print(cols[0], '||', cols[1])

trip_data || trip_fare
=== || ===
medallion || medallion
hack_license || hack_license
vendor_id || vendor_id
rate_code || pickup_datetime
store_and_fwd_flag || payment_type
pickup_datetime || fare_amount
dropoff_datetime || surcharge
passenger_count || mta_tax
trip_time_in_secs || tip_amount
trip_distance || tolls_amount
pickup_longitude || total_amount
pickup_latitude || 
dropoff_longitude || 
dropoff_latitude || 


In [7]:
# Join trip_data and trip_fare on medallion, hack_license, pickup_datetime
df_join = trip_data.join(trip_fare, on=["medallion", "hack_license", "pickup_datetime"], how="inner")

# Add pickup and dropoff as timestamps
df_ts = (df_join.withColumn("ts_pickup", unix_timestamp(col("pickup_datetime")))
      .withColumn("ts_dropoff", unix_timestamp(col("dropoff_datetime"))))

window_spec = Window.partitionBy("medallion").orderBy("ts_pickup")

# Add previous delivery dropoff timestamp using lag function
# lag() returns previous window row value
df_ts = df_ts.withColumn("prev_ts_dropoff", lag("ts_dropoff").over(window_spec))

df_ts = df_ts.withColumn("idle_time", col("ts_pickup") - col("prev_ts_dropoff"))

HOURS_4 = 4 * 60 * 60 # 14400 seconds
# Remove neg idle_time and idle_time > 4h
df_ts_wo_exceptions = df_ts.filter((col("idle_time") > 0) & (col("idle_time") <= 14400))

# Group by taxi (using medallion col)
df_util = df_ts_wo_exceptions.groupBy("medallion").agg(
    avg(col("trip_time_in_secs")).alias("total_trip_time"),
    avg(col("idle_time")).alias("total_idle_time")
)

# Calculate utilization col
df_util = df_util.withColumn(
    "utilization", col("total_trip_time") / (col("total_trip_time") + col("total_idle_time"))
)

df_util.show()

+--------------------+-----------------+------------------+-------------------+
|           medallion|  total_trip_time|   total_idle_time|        utilization|
+--------------------+-----------------+------------------+-------------------+
|0038EF45118925A51...|609.1419406575782| 1210.392943063352| 0.3347789295536281|
|00BD5D1AD3A96C997...|695.9139784946236|  968.010752688172| 0.4182364535203076|
|01A2F4366180AEB43...|670.1421736158578| 828.4401913875598|0.44718407827675827|
|01D13A056D9A26F84...|676.9428238039674| 963.9206534422403|  0.412552801126424|
|01F24976B8E3FF46A...|779.4729411764706|            535.32| 0.5928484377768273|
|024E99A049B748C44...|957.7852348993289|1440.4026845637584|0.39937872554781295|
|025B4E80E8A06FDB0...|875.8581235697941|1296.7242562929061| 0.4031415018772017|
|026B27179DE85CFDC...|690.6886120996442| 845.1316725978647| 0.4497196833389138|
|02B196981B24858BC...|729.7250361794501|1392.2431259044863| 0.3438906620836402|
|02C49A409C2DC66B1...|            634.3|

In [8]:
#df_util.select('medallion', 'utilization', 'total_trip_time', 'total_idle_time').write.mode('overwrite').option('header', True).csv('./utilization.csv')