# Project 1 Report

#### Joosep Orasmäe, Kaupo Humal, Tanel Tiisler, Kalju Jake Nekvasil

## Initialization

This section is for importing libraries and visualizing the dataset. 

In [2]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder
                    .appName('project1')
                    .getOrCreate()
        )

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from shapely.geometry import Point, Polygon
from shapely import from_wkt
from pyspark.sql.types import *

In [4]:
rides_df = (spark.read
             .option("sep", ",") # separator
             .option("header", True) # file has header row
             .option("inferSchema", True) # spark tries to infer data types
             .csv("input/Sample NYC Data.csv") #path
            )

In [5]:
rides_df.printSchema()

root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- rate_code: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)



In [6]:
rides_df.show()

+--------------------+--------------------+---------+---------+------------------+---------------+----------------+---------------+----------------+---------------+-----------------+----------------+
|           medallion|        hack_license|vendor_id|rate_code|store_and_fwd_flag|pickup_datetime|dropoff_datetime|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|
+--------------------+--------------------+---------+---------+------------------+---------------+----------------+---------------+----------------+---------------+-----------------+----------------+
|89D227B655E5C82AE...|BA96DE419E711691B...|      CMT|        1|                 N| 01-01-13 15:11|  01-01-13 15:18|              4|      -73.978165|      40.757977|       -73.989838|       40.751171|
|0BD7C8F5BA12B88E0...|9FD8F69F0804BDB55...|      CMT|        1|                 N| 06-01-13 00:18|  06-01-13 00:22|              1|      -74.006683|      40.731781|       -73.994499|        40.75066|


## Query 1

In this query we calculate utilization per taxi/driver.

In [7]:
# Use the unix_timestamp function to convert the date time of the respective times into milliseconds
DATETIME_FORMAT = "dd-MM-yy HH:mm"

rides_df = rides_df.withColumn("pickup_unix", unix_timestamp("pickup_datetime", DATETIME_FORMAT)) \
                   .withColumn("dropoff_unix", unix_timestamp("dropoff_datetime", DATETIME_FORMAT))

# Subtract the times to find the difference (in seconds)
rides_df = rides_df.withColumn("duration_sec", rides_df["dropoff_unix"] - rides_df["pickup_unix"])

# Output table with pickup time, dropoff time, and duration (seconds) for verification
rides_df.select("pickup_datetime", "dropoff_datetime", "duration_sec").show()

+---------------+----------------+------------+
|pickup_datetime|dropoff_datetime|duration_sec|
+---------------+----------------+------------+
| 01-01-13 15:11|  01-01-13 15:18|         420|
| 06-01-13 00:18|  06-01-13 00:22|         240|
| 05-01-13 18:49|  05-01-13 18:54|         300|
| 07-01-13 23:54|  07-01-13 23:58|         240|
| 07-01-13 23:25|  07-01-13 23:34|         540|
| 07-01-13 15:27|  07-01-13 15:38|         660|
| 08-01-13 11:01|  08-01-13 11:08|         420|
| 07-01-13 12:39|  07-01-13 13:10|        1860|
| 07-01-13 18:15|  07-01-13 18:20|         300|
| 07-01-13 15:33|  07-01-13 15:49|         960|
| 08-01-13 13:11|  08-01-13 13:19|         480|
| 08-01-13 09:50|  08-01-13 10:02|         720|
| 10-01-13 12:07|  10-01-13 12:17|         600|
| 07-01-13 07:35|  07-01-13 07:46|         660|
| 10-01-13 15:42|  10-01-13 16:04|        1320|
| 10-01-13 14:27|  10-01-13 14:45|        1080|
| 07-01-13 22:09|  07-01-13 22:19|         600|
| 07-01-13 17:18|  07-01-13 17:20|      

In [8]:
# If the difference between a ride and its subsequent ride is more than 4 hours, we ignore that as it represents a new session. 
# Differences below 4 hours is considered as an idle time
DURATION_THRESHOLD = 14400 # 4 hours in seconds
# Filter OUT rides longer than 4 hours from the dataset
rides_df = rides_df.filter((col("duration_sec") >= 0) & (col("duration_sec") <= DURATION_THRESHOLD))

In [9]:
# Re-declare 4 hour threshold
DRIVER_SESSION_LENGTH = 14400 # 4 hours in seconds
# Sort by driver ID and pickup time so trips for each driver are in chronological order 
rides_df = rides_df.orderBy("hack_license", "pickup_datetime")

# Separate data by driver and order by pickup time
window_spec = Window.partitionBy("hack_license").orderBy("pickup_unix")

# Include previous dropoff time and idle time (if idle time is < 4 hours, else 0)
rides_df = rides_df.withColumn("prev_dropoff_unix", lag("dropoff_unix").over(window_spec)) \
                   .withColumn("idle_time_sec", 
                               when((col("pickup_unix") - col("prev_dropoff_unix")) < DRIVER_SESSION_LENGTH,
                                    col("pickup_unix") - col("prev_dropoff_unix"))
                               .otherwise(0))

# Filter out rows with 0 idle_time, group by driver id, sum total idle time for each driver
idle_time_df = rides_df.filter(col("idle_time_sec") > 0) \
                       .groupBy("hack_license") \
                       .sum("idle_time_sec") \
                       .withColumnRenamed("sum(idle_time_sec)", "total_idle_time_sec")
# Output table of total IDLE time per driver
idle_time_df.show()

+--------------------+-------------------+
|        hack_license|total_idle_time_sec|
+--------------------+-------------------+
|001C8AAB90AEE49F3...|              12960|
|0025133AD810DBE80...|               2400|
|002C093A2CB9FD40C...|              15300|
|00447A6197DBB329F...|              13440|
|0046F1E91AA13DEDE...|               9960|
|00567B1CBFD51DDFA...|              10080|
|006114F940CB87B3A...|              24000|
|006313464EC98A24B...|              31500|
|006B6BD90C7B5C985...|               6180|
|00711D0CC3FB5BC90...|               6000|
|007357E7FFE212879...|              18660|
|007439EEDB510EF82...|               3240|
|007E686365B4421FB...|               3840|
|00927C48BA4C1B2B1...|              14460|
|00A2DC1380E44036A...|              11100|
|00AE05F56D451E89E...|              22200|
|00B442110FA2D04A1...|              10680|
|00B7691D86D96AEBD...|              12120|
|00BB5ECED533BF463...|              10380|
|00BF52E4A8E6DBB01...|               9720|
+----------

In [130]:
# Calc total ACTIVE times by summing all ride durations (for each driver)
active_time_df = rides_df.groupBy("hack_license") \
                        .sum("duration_sec") \
                        .withColumnRenamed("sum(duration_sec)", "total_active_time_sec")

# Join with IDLE time data
driver_times_df = active_time_df.join(idle_time_df, "hack_license")

# Calc total SESSION times by summing idle and ACTIVE times
# Calc utilization rate difference: ACTIVE/SESSION
utilization_df = driver_times_df.withColumn(
    "total_session_time", 
    col("total_active_time_sec") + col("total_idle_time_sec")
).withColumn(
    "utilization_rate", 
    col("total_active_time_sec") / col("total_session_time")
)

# Output sorted table for verification
utilization_df.select("hack_license", "utilization_rate").orderBy("utilization_rate", ascending=False).show()

+--------------------+------------------+
|        hack_license|  utilization_rate|
+--------------------+------------------+
|751EE304AD669A5C9...|0.9885057471264368|
|0D61304FB9E9E7CF6...|            0.9875|
|3836463623075CCF1...|0.9850746268656716|
|9BAB1E0D579D293F4...|0.9833333333333333|
|B24D87EA86C349F5B...|0.9830508474576272|
|06150B4FF9CD737D0...|0.9824561403508771|
|31A1F192A01B30B2C...|0.9814814814814815|
|7E73B38D829AA77F2...|0.9811320754716981|
|1735076F27B86A649...|0.9807692307692307|
|56F5F59C3EC8ACA1F...|0.9777777777777777|
|958460D2C89BCF8C7...|0.9777777777777777|
|BA510F229E9E7E292...|0.9772727272727273|
|5DE0F6ED31F876A28...|             0.975|
|B7F88F199CE4624DF...|0.9743589743589743|
|D57EC7DC90ABE6176...|0.9743589743589743|
|1603132156F27D303...|0.9736842105263158|
|25D94112137704AE1...|0.9736842105263158|
|B708C3D9C585DA845...|0.9736842105263158|
|49365436007E31EE7...|0.9722222222222222|
|AB2569908ED389C92...| 0.967741935483871|
+--------------------+------------

## Query 2

In this query we calculate the average time it takes for a taxi to find its next fare(trip) per destination borough.

*The difference (in seconds) between the drop off of a trip and the pick up of the next trip*

In [131]:
geo_df = (spark.read.option("multiline", "true").json("input/nyc-boroughs.geojson")).select(explode("features").alias("borough")) \
        .select(col("borough.geometry").alias("geometry"),
                col("borough.properties.borough").alias("borough_name"),
                col("borough.properties.boroughCode").alias("borough_code"))

In [132]:
# wkt serializes the geometry so we can store it as a string
geo_to_poly_udf = udf(lambda geo: Polygon(geo["coordinates"][0]).wkt, returnType=StringType())
geo_to_area_udf = udf(lambda geo: Polygon(geo["coordinates"][0]).area, returnType=DoubleType())
coords_to_point_udf = udf(lambda coords: Point(coords[0], coords[1]).wkt, returnType=StringType()) # For the main dataframe
is_point_in_polygon_udf = udf(lambda point_wkt, polygon_wkt: from_wkt(polygon_wkt).contains(from_wkt(point_wkt)), BooleanType())

In [133]:
geo_df = geo_df.withColumn("poly", geo_to_poly_udf(col("geometry"))). \
        withColumn("area", geo_to_area_udf(col("geometry"))). \
        drop("geometry"). \
        orderBy("borough_code", desc("area"))

In [134]:
geo_df.printSchema()

root
 |-- borough_name: string (nullable = true)
 |-- borough_code: long (nullable = true)
 |-- poly: string (nullable = true)
 |-- area: double (nullable = true)



In [135]:
geo_df.show()

+------------+------------+--------------------+--------------------+
|borough_name|borough_code|                poly|                area|
+------------+------------+--------------------+--------------------+
|   Manhattan|           1|POLYGON ((-73.926...|0.005859077996035753|
|   Manhattan|           1|POLYGON ((-73.921...|2.327165585676201...|
|   Manhattan|           1|POLYGON ((-74.016...|  7.6037752599342E-5|
|   Manhattan|           1|POLYGON ((-73.941...| 6.23157479510608E-5|
|   Manhattan|           1|POLYGON ((-73.906...|3.265859127204495...|
|   Manhattan|           1|POLYGON ((-74.039...|1.182888313767709...|
|   Manhattan|           1|POLYGON ((-74.043...|6.143638903459381E-6|
|   Manhattan|           1|POLYGON ((-73.995...|3.383127367444441...|
|   Manhattan|           1|POLYGON ((-74.001...|2.858823502476497E-6|
|   Manhattan|           1|POLYGON ((-74.000...|2.393654308790746E-6|
|   Manhattan|           1|POLYGON ((-74.001...|2.334554077223592...|
|   Manhattan|      

In [136]:
# Convert pickup and dropoff coordinates to points
rides_df = rides_df.withColumn("pickup_point", coords_to_point_udf(array("pickup_longitude", "pickup_latitude"))) \
        .withColumn("dropoff_point", coords_to_point_udf(array("dropoff_longitude", "dropoff_latitude"))) \
        .drop("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude")

In [120]:
# OPTIMIZATION: Broadcast geo data for quicker borough lookup
borough_lookup = spark.sparkContext.broadcast(geo_df.collect())

# Similar to the previously declared udf: is_point_in_polygon_udf
# Some data does not fall into any borough (Unsure if data is relevant or error)
# Grouped these outliers in "N/A"
find_borough_udf = udf(lambda point_wkt: \
    next((b['borough_name'] for b in borough_lookup.value \
          if point_wkt and from_wkt(b['poly']).contains(from_wkt(point_wkt))), \
         "N/A"), \
    StringType())

# Add pickup + dropoff borough columns
rides_df = rides_df.withColumn("pickup_borough", find_borough_udf(col("pickup_point"))) \
                   .withColumn("dropoff_borough", find_borough_udf(col("dropoff_point")))

# Separate data by driver and order by pickup time
window_spec = Window.partitionBy("hack_license").orderBy("pickup_unix")
# Calc next fare time by borough - Filter out nulls and 4 hours
next_fare_df = rides_df.withColumn("next_pickup_unix", 
                                  lead("pickup_unix").over(window_spec)) \
                       .withColumn("next_fare_time", 
                                  when(col("next_pickup_unix").isNotNull(), 
                                       col("next_pickup_unix") - col("dropoff_unix"))
                                  .otherwise(None)) \
                       .filter(col("next_fare_time").isNotNull() & 
                              (col("next_fare_time") <= DRIVER_SESSION_LENGTH))

# Calc avg. next fare time by dropoff borough and sort
avg_next_fare_by_borough = next_fare_df.groupBy("dropoff_borough") \
                                       .agg(avg("next_fare_time").alias("avg_next_fare_time_sec")) \
                                       .orderBy("avg_next_fare_time_sec")

# Show results
avg_next_fare_by_borough.show()

+---------------+----------------------+
|dropoff_borough|avg_next_fare_time_sec|
+---------------+----------------------+
|      Manhattan|     899.7157508788288|
|            N/A|    1484.9464558898521|
|       Brooklyn|     2075.489067894131|
|          Bronx|    2223.5643564356437|
|         Queens|    2665.1269765213224|
|  Staten Island|                4710.0|
+---------------+----------------------+



## Query 3

The number of trips that started and ended within the same borough

In [1]:
# Query 3. The number of trips that started and ended within the same borough
geo_broadcast = broadcast(geo_df)

# Find pickup borough for each ride
pickup_borough = rides_df \
    .crossJoin(geo_broadcast) \
    .filter(is_point_in_polygon_udf(col("pickup_point"), col("poly"))) \
    .select(
        rides_df["medallion"], 
        rides_df["hack_license"], 
        rides_df["pickup_datetime"],
        rides_df["dropoff_point"],
        col("borough_name").alias("pickup_borough"),
        col("borough_code").alias("pickup_borough_code")
    )

# Find dropoff borough for each ride
dropoff_borough = rides_df \
    .crossJoin(geo_broadcast) \
    .filter(is_point_in_polygon_udf(col("dropoff_point"), col("poly"))) \
    .select(
        rides_df["medallion"], 
        rides_df["hack_license"], 
        rides_df["pickup_datetime"],
        col("borough_name").alias("dropoff_borough"),
        col("borough_code").alias("dropoff_borough_code")
    )

# Join the two DFs to find same-borough trips
same_borough_trips = pickup_borough \
    .join(
        dropoff_borough, 
        ["medallion", "hack_license", "pickup_datetime"],
        "inner"
    ) \
    .filter(col("pickup_borough_code") == col("dropoff_borough_code"))

# Count trips by borough
borough_counts = same_borough_trips \
    .groupBy("pickup_borough", "pickup_borough_code") \
    .count() \
    .orderBy("pickup_borough_code")

# Get total
total_same_borough = same_borough_trips.count()

NameError: name 'broadcast' is not defined

In [None]:
total_same_borough

## Query 4

The number of trips that started and ended within the same borough

In [None]:
# Query 4. The number of trips that started in one borough and ended in another one

rides_df.count() - total_same_borough