In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session, creating it under the application name
# 'project1' if one is not already active.
spark = SparkSession.builder.appName('project1').getOrCreate()

In [16]:
from pyspark.sql.functions import *
from pyspark.sql import Window

In [25]:
# Load the rides CSV: comma-separated, first row holds the column names, and
# Spark is asked to infer each column's data type from the file contents.
rides_df = spark.read.csv(
    "input/Sample NYC Data.csv",
    sep=",",
    header=True,
    inferSchema=True,
)

In [26]:
# rides_df.printSchema()

In [27]:
# rides_df.show()

In [28]:
# Pattern of the pickup/drop-off strings: day-month-two-digit-year, then
# 24-hour time without seconds.
DATETIME_FORMAT = "dd-MM-yy HH:mm"

# Parse both timestamp strings into epoch seconds, then subtract them to get
# the length of each ride in seconds.
rides_df = (
    rides_df
    .withColumn("pickup_unix", unix_timestamp("pickup_datetime", DATETIME_FORMAT))
    .withColumn("dropoff_unix", unix_timestamp("dropoff_datetime", DATETIME_FORMAT))
    .withColumn("duration_sec", col("dropoff_unix") - col("pickup_unix"))
)

# Spot-check the computed durations next to the raw timestamp strings.
rides_df.select("pickup_datetime", "dropoff_datetime", "duration_sec").show()

+---------------+----------------+------------+
|pickup_datetime|dropoff_datetime|duration_sec|
+---------------+----------------+------------+
| 01-01-13 15:11|  01-01-13 15:18|         420|
| 06-01-13 00:18|  06-01-13 00:22|         240|
| 05-01-13 18:49|  05-01-13 18:54|         300|
| 07-01-13 23:54|  07-01-13 23:58|         240|
| 07-01-13 23:25|  07-01-13 23:34|         540|
| 07-01-13 15:27|  07-01-13 15:38|         660|
| 08-01-13 11:01|  08-01-13 11:08|         420|
| 07-01-13 12:39|  07-01-13 13:10|        1860|
| 07-01-13 18:15|  07-01-13 18:20|         300|
| 07-01-13 15:33|  07-01-13 15:49|         960|
| 08-01-13 13:11|  08-01-13 13:19|         480|
| 08-01-13 09:50|  08-01-13 10:02|         720|
| 10-01-13 12:07|  10-01-13 12:17|         600|
| 07-01-13 07:35|  07-01-13 07:46|         660|
| 10-01-13 15:42|  10-01-13 16:04|        1320|
| 10-01-13 14:27|  10-01-13 14:45|        1080|
| 07-01-13 22:09|  07-01-13 22:19|         600|
| 07-01-13 17:18|  07-01-13 17:20|      

In [29]:
# Longest ride duration that is kept: 4 hours, expressed in seconds.
DURATION_THRESHOLD = 4 * 60 * 60

# Keep only rides whose duration lies in [0, DURATION_THRESHOLD], both ends
# inclusive. Rows with a null duration do not satisfy the predicate and are
# dropped as well.
rides_df = rides_df.filter(col("duration_sec").between(0, DURATION_THRESHOLD))

In [31]:
# A gap of this many seconds (4 hours) or more between two consecutive rides of
# the same hack_license is not counted as idle time.
DRIVER_SESSION_LENGTH = 14400

# Order each hack_license's rides chronologically by the parsed epoch seconds.
# No DataFrame-level sort is performed beforehand: the window applies its own
# ordering inside each partition, and "pickup_datetime" is a "dd-MM-yy HH:mm"
# string whose lexicographic order is not chronological. "dropoff_unix" breaks
# ties between rides that share a pickup minute so lag() is deterministic.
window_spec = Window.partitionBy("hack_license").orderBy("pickup_unix", "dropoff_unix")

# Drop-off time of the previous ride for the same hack_license; null for the
# first ride in each partition because lag() has no preceding row.
rides_df = rides_df.withColumn("prev_dropoff_unix", lag("dropoff_unix").over(window_spec))

# Seconds between the previous drop-off and this pickup (null when there is no
# previous ride).
gap_sec = col("pickup_unix") - col("prev_dropoff_unix")

# Idle time is the gap when it is positive and shorter than
# DRIVER_SESSION_LENGTH. Everything else is recorded as 0: null gaps, gaps of
# DRIVER_SESSION_LENGTH or more, and non-positive gaps (a pickup at or before
# the previous drop-off), so the column never holds a negative idle time.
rides_df = rides_df.withColumn(
    "idle_time_sec",
    when((gap_sec > 0) & (gap_sec < DRIVER_SESSION_LENGTH), gap_sec).otherwise(0),
)

# Total idle seconds per hack_license, counting only rides with positive idle time.
idle_time_df = (
    rides_df
    .filter(col("idle_time_sec") > 0)
    .groupBy("hack_license")
    .sum("idle_time_sec")
    .withColumnRenamed("sum(idle_time_sec)", "total_idle_time_sec")
)

idle_time_df.show()

+--------------------+-------------------+
|        hack_license|total_idle_time_sec|
+--------------------+-------------------+
|001C8AAB90AEE49F3...|              12960|
|0025133AD810DBE80...|               2400|
|002C093A2CB9FD40C...|              15300|
|00447A6197DBB329F...|              13440|
|0046F1E91AA13DEDE...|               9960|
|00567B1CBFD51DDFA...|              10080|
|006114F940CB87B3A...|              24000|
|006313464EC98A24B...|              31500|
|006B6BD90C7B5C985...|               6180|
|00711D0CC3FB5BC90...|               6000|
|007357E7FFE212879...|              18660|
|007439EEDB510EF82...|               3240|
|007E686365B4421FB...|               3840|
|00927C48BA4C1B2B1...|              14460|
|00A2DC1380E44036A...|              11100|
|00AE05F56D451E89E...|              22200|
|00B442110FA2D04A1...|              10680|
|00B7691D86D96AEBD...|              12120|
|00BB5ECED533BF463...|              10380|
|00BF52E4A8E6DBB01...|               9720|
+----------