# Data enrichment

* Join prickup and dropoff coordinates with location name
* Convert datetimes to timestamps
* Save the result in `../output/output.parquet`

In [1]:
from pyspark.sql import SparkSession
from sedona.utils import SedonaKryoRegistrator
from pyspark.sql.functions import col, udf, unix_timestamp, to_timestamp
from pyspark.sql.functions import expr
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

import geopandas as gpd
from shapely import Point

In [2]:
# Setup SparkSession and add Sedona
# Sedona enables us to use PostGIS like functions to find intersections

sedona_jar = '/home/jovyan/jars/sedona-spark-shaded-3.0_2.12-1.6.1.jar'
geotools_jar = '/home/jovyan/jars/geotools-wrapper-1.7.0-28.5.jar'
spark = (
    SparkSession.builder
        .config("spark.jars", f"{sedona_jar},{geotools_jar}") 
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions")
        .appName('NYC Taxi')    
        .getOrCreate()
)

In [3]:
# Load trip data

# StructField third argumen True -> can have NULL values (as some long/lat's are NULL)
# Instead of infer we define data types to ensure data integrity
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("vendor_id", StringType(), True),
    StructField("rate_code", IntegerType(), True),
    StructField("store_and_fwd_flag", StringType(), True),
    StructField("pickup_datetime", StringType(), True),
    StructField("dropoff_datetime", StringType(), True),
    StructField("passenger_count", IntegerType(), True),
    StructField("pickup_longitude", StringType(), True),
    StructField("pickup_latitude", StringType(), True),
    StructField("dropoff_longitude", StringType(), True),
    StructField("dropoff_latitude", StringType(), True)
])

df_trip = spark.read.csv('../data/Sample NYC Data.csv', header=True, schema=schema)

# Select only the data we need to optimize memory usage
df_trip = df_trip.selectExpr("medallion", "pickup_datetime", "dropoff_datetime", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude")

In [4]:
df_trip.show()

+--------------------+---------------+----------------+----------------+---------------+-----------------+----------------+
|           medallion|pickup_datetime|dropoff_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|
+--------------------+---------------+----------------+----------------+---------------+-----------------+----------------+
|89D227B655E5C82AE...| 01-01-13 15:11|  01-01-13 15:18|      -73.978165|      40.757977|       -73.989838|       40.751171|
|0BD7C8F5BA12B88E0...| 06-01-13 00:18|  06-01-13 00:22|      -74.006683|      40.731781|       -73.994499|        40.75066|
|0BD7C8F5BA12B88E0...| 05-01-13 18:49|  05-01-13 18:54|      -74.004707|       40.73777|       -74.009834|       40.726002|
|DFD2202EE08F7A8DC...| 07-01-13 23:54|  07-01-13 23:58|      -73.974602|      40.759945|       -73.984734|       40.759388|
|DFD2202EE08F7A8DC...| 07-01-13 23:25|  07-01-13 23:34|       -73.97625|      40.748528|       -74.002586|       40.747868|
|20D9ECB

In [5]:
# Load spatial data of NY

# First we use geopandas to read the geojson
gdf = gpd.read_file('../data/nyc-boroughs.geojson')

# Convert geom to WKT
gdf['geom'] = gdf['geometry'].apply(lambda geom: geom.wkt if geom else None)

# geopandas df to pandas df
pdf = gdf.astype(str)

# pandas df to spark df
df_geom = spark.createDataFrame(pdf)

# Convert WKT to geom (format) which is used for intersections
df_geom = df_geom.withColumn("geom", expr("ST_GeomFromWKT(geom)"))

In [6]:
# Helper function to convert long and lat to Point

udf_to_point = udf(lambda lon, lat: Point(lon, lat).wkt if lon is not None and lat is not None else '')

In [7]:
# Trip df to WKT -> geom 

df_trip_w_points = (df_trip
     .withColumn('pickup_point', udf_to_point(col('pickup_longitude'), col('pickup_latitude')))
     .withColumn('dropoff_point', udf_to_point(col('dropoff_longitude'), col('dropoff_latitude')))
     .withColumn('geom_pickup', expr('ST_GeomFromWKT(pickup_point)'))
     .withColumn('geom_dropoff', expr('ST_GeomFromWKT(dropoff_point)'))
    )

In [8]:
df_trip_w_points.first()

Row(medallion='89D227B655E5C82AECF13C3F540D4CF4', pickup_datetime='01-01-13 15:11', dropoff_datetime='01-01-13 15:18', pickup_longitude='-73.978165', pickup_latitude='40.757977', dropoff_longitude='-73.989838', dropoff_latitude='40.751171', pickup_point='POINT (-73.978165 40.757977)', dropoff_point='POINT (-73.989838 40.751171)', geom_pickup=<POINT (-73.978 40.758)>, geom_dropoff=<POINT (-73.99 40.751)>)

In [9]:
# Join geospatial data to trip pickup locations

df_trip_w_pickup = df_geom.alias('geo').join(
    df_trip_w_points.alias('travel'),
    expr('ST_Intersects(geo.geom, travel.geom_pickup)'),
    'inner'
)

In [10]:
# Distinguish pickup borough
df_trip_w_pickup = df_trip_w_pickup.withColumn('pickup_borough', col('borough'))

# Remove excess columns
df_trip_w_pickup = df_trip_w_pickup.drop("geometry", "@id", "geom", "borough", "boroughCode")

In [11]:
# Join geospatial data to trip dropoff locations

df_trip_w_pickup_n_dropoff = df_geom.alias('geo').join(
    df_trip_w_pickup.alias('travel'),
    expr('ST_Intersects(geo.geom, travel.geom_dropoff)'),
    'inner'
)

In [12]:
# Distinguish dropoff borough
df_trip_w_pickup_n_dropoff = df_trip_w_pickup_n_dropoff.withColumn('dropoff_borough', col('borough'))

# Remove excess columns
df_trip_w_pickup_n_dropoff = df_trip_w_pickup_n_dropoff.drop("geometry", "@id", "geom", "borough", "boroughCode")

In [13]:
# Test that the solution works

df_trip_w_pickup_n_dropoff.first()

Row(medallion='F32C996F49594F21C3C14A9E1821A81A', pickup_datetime='13-01-13 03:10', dropoff_datetime='13-01-13 03:37', pickup_longitude='-74.074799', pickup_latitude='40.645111', dropoff_longitude='-74.177147', dropoff_latitude='40.540531', pickup_point='POINT (-74.074799 40.645111)', dropoff_point='POINT (-74.177147 40.540531)', geom_pickup=<POINT (-74.075 40.645)>, geom_dropoff=<POINT (-74.177 40.541)>, pickup_borough='Staten Island', dropoff_borough='Staten Island')

In [14]:
# Convert pcikup and dropoff time to timestamps

DATE_FORMAT = 'dd-MM-yy HH:mm'
df_trip_w_pickup_n_dropoff_n_ts = (df_trip_w_pickup_n_dropoff
         .withColumn('pickup_ts', unix_timestamp(to_timestamp(col("pickup_datetime"), DATE_FORMAT)))
         .withColumn('dropoff_ts', unix_timestamp(to_timestamp(col("dropoff_datetime"), DATE_FORMAT)))
        )

In [15]:
# Pruning the final df to reduce data load

df_final = df_trip_w_pickup_n_dropoff_n_ts.select("medallion", "pickup_borough", "dropoff_borough", "pickup_ts", "dropoff_ts")

In [16]:
df_final.show()

+--------------------+--------------+---------------+----------+----------+
|           medallion|pickup_borough|dropoff_borough| pickup_ts|dropoff_ts|
+--------------------+--------------+---------------+----------+----------+
|F32C996F49594F21C...| Staten Island|  Staten Island|1358046600|1358048220|
|AB89F24DEC0C91478...|     Manhattan|  Staten Island|1358047980|1358049540|
|125A1E8783A1DE113...|     Manhattan|  Staten Island|1358046900|1358048400|
|5ADE47CF8F60E045E...|     Manhattan|  Staten Island|1358070300|1358071980|
|772B7A41B65FF59FA...|     Manhattan|  Staten Island|1358070300|1358072040|
|889230A3561FAED35...|     Manhattan|  Staten Island|1358061600|1358063940|
|CE9AD89BCD75060B0...|     Manhattan|  Staten Island|1358041260|1358043420|
|71AEBB5F4EC620B12...|     Manhattan|  Staten Island|1358041260|1358044680|
|7BA812A3FE9F443B6...|     Manhattan|  Staten Island|1358072160|1358074320|
|46C0D5EB24E9138C2...|     Manhattan|  Staten Island|1358081340|1358083980|
|9B5FB0AA7D3

In [17]:
# Save the result to parquet to use later for queries

df_final.write.mode('overwrite').parquet("../output/output.parquet")