# Data enrichment

* Join pickup and dropoff coordinates with the name of the borough they fall in
* Convert datetimes to timestamps
* Save the result in `../output/data_ready.parquet`

In [1]:
from pyspark.sql import SparkSession
from sedona.utils import SedonaKryoRegistrator
from pyspark.sql.functions import col, udf, unix_timestamp, to_timestamp
from pyspark.sql.functions import expr
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

import geopandas as gpd
from shapely import Point

from pathlib import Path

In [2]:
# Jars that have to be on the Spark classpath for the Sedona SQL functions used below.
sedona_jar = '/home/jovyan/jars/sedona-spark-shaded-3.0_2.12-1.6.1.jar'
geotools_jar = '/home/jovyan/jars/geotools-wrapper-1.7.0-28.5.jar'

# Session settings, applied one by one to the builder.
spark_settings = {
    # Comma-separated list of extra jars.
    "spark.jars": f"{sedona_jar},{geotools_jar}",
    # Kryo serialization, with Sedona's registrator for its own types.
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryo.registrator": SedonaKryoRegistrator.getName,
    # Makes the ST_* functions available in SQL expressions.
    "spark.sql.extensions": "org.apache.sedona.sql.SedonaSqlExtensions",
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
}

builder = SparkSession.builder.appName('NYC Taxi')
for key, value in spark_settings.items():
    builder = builder.config(key, value)

spark = builder.getOrCreate()

In [None]:
# Location the enriched trips are written to at the end of the notebook.
data_ready_path = '../output/data_ready.parquet'
# Same location as a Path; the processing cells below only run while this path does not exist.
# NOTE(review): the guards test for existence only — an interrupted write may leave the
# directory behind and make the notebook skip all work; confirm whether that matters here.
data_ready_file = Path(data_ready_path)

In [3]:
if not data_ready_file.exists():
    # Column layout of the raw trip CSVs. Only the two counters are typed as
    # integers; everything else (including the coordinates) is read as text.
    schema = StructType([
        StructField("medallion", StringType(), True),
        StructField("hack_license", StringType(), True),
        StructField("vendor_id", StringType(), True),
        StructField("rate_code", IntegerType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("pickup_datetime", StringType(), True),
        StructField("dropoff_datetime", StringType(), True),
        StructField("passenger_count", IntegerType(), True),

        StructField("trip_time_in_secs", StringType(), True),
        StructField("trip_distance", StringType(), True),

        StructField("pickup_longitude", StringType(), True),
        StructField("pickup_latitude", StringType(), True),
        StructField("dropoff_longitude", StringType(), True),
        StructField("dropoff_latitude", StringType(), True)
    ])

    # Intermediate cache: the CSVs reduced to the columns needed downstream.
    data_path = '../output/data.parquet'
    data_file = Path(data_path)

    # Spark writes a dataset as a directory and, by default, adds a `_SUCCESS`
    # marker only after the job has committed. Checking for the bare directory
    # would accept the leftovers of an interrupted write as a finished cache,
    # so require the marker (a plain single Parquet file is accepted as well).
    data_is_cached = data_file.is_file() or (data_file / '_SUCCESS').is_file()

    if not data_is_cached:
        csv_data = spark.read.csv('../data/trip_data/*.csv', header=True, schema=schema)
        cols = ["medallion", "pickup_datetime", "dropoff_datetime", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]
        csv_data_cols = csv_data.selectExpr(cols)
        # Drop every trip that misses any of the selected values.
        csv_data_cleaned = csv_data_cols.dropna(subset=cols)
        csv_data_cleaned.write.mode('overwrite').parquet(data_path)

    # Read the cached trips back through the session's default data source.
    df_trip = spark.read.load(data_path)


In [4]:
# Convert pickup and dropoff time to timestamps

if not data_ready_file.exists():
    DATE_FORMAT = 'yyyy-MM-d HH:mm:ss'

    # New timestamp column -> text column it is parsed from.
    ts_sources = {
        'pickup_ts': 'pickup_datetime',
        'dropoff_ts': 'dropoff_datetime',
    }

    df_trip_w_ts = df_trip
    for ts_name, datetime_name in ts_sources.items():
        # Parse the text with DATE_FORMAT, then turn the timestamp into a unix timestamp.
        parsed = to_timestamp(col(datetime_name), DATE_FORMAT)
        df_trip_w_ts = df_trip_w_ts.withColumn(ts_name, unix_timestamp(parsed))

In [5]:
# Load spatial data of NY

if not data_ready_file.exists():
    # geopandas does the GeoJSON parsing.
    gdf = gpd.read_file('../data/nyc-boroughs.geojson')

    # Extra column with each shape as WKT text (None where the shape is falsy).
    gdf['geom'] = [shape.wkt if shape else None for shape in gdf['geometry']]

    # Stringify every column so the frame can be handed to Spark.
    # NOTE(review): a None in `geom` presumably becomes the text 'None' here —
    # confirm the file contains no empty geometries.
    pdf = gdf.astype(str)

    # Move to Spark and parse the WKT text into the geometry type used for the intersections.
    df_geom = (
        spark.createDataFrame(pdf)
             .withColumn("geom", expr("ST_GeomFromWKT(geom)"))
    )

In [13]:
if not data_ready_file.exists():
    # Number of partitions both join inputs are redistributed into.
    NUM_PARTITIONS = 40

    # Build the point geometries natively in Sedona. The coordinates are stored
    # as text, so they are cast to DOUBLE first. This replaces a row-at-a-time
    # Python UDF that produced WKT only for Sedona to parse it again; a missing
    # or unparsable coordinate now yields a NULL geometry, which the inner joins
    # below simply do not match.
    df_trip_w_points = (df_trip_w_ts
         .withColumn('geom_pickup', expr('ST_Point(CAST(pickup_longitude AS DOUBLE), CAST(pickup_latitude AS DOUBLE))'))
         .withColumn('geom_dropoff', expr('ST_Point(CAST(dropoff_longitude AS DOUBLE), CAST(dropoff_latitude AS DOUBLE))'))
         .repartition(NUM_PARTITIONS)
        )

    # Repartition into a new name so `df_geom` itself is not modified by this cell.
    df_boroughs = df_geom.repartition(NUM_PARTITIONS)

    # Join geospatial data to trip pickup locations
    # (inner join: trips whose pickup lies in no borough shape are dropped).
    df_trip_w_pickup = df_boroughs.alias('geo').join(
        df_trip_w_points.alias('travel'),
        expr('ST_Intersects(geo.geom, travel.geom_pickup)'),
        'inner'
    )

    # Distinguish pickup borough
    df_trip_w_pickup = df_trip_w_pickup.withColumn('pickup_borough', col('borough'))

    # Remove the borough columns again so the second join does not produce
    # duplicate column names.
    df_trip_w_pickup = df_trip_w_pickup.drop("geometry", "@id", "geom", "borough", "boroughCode")

    # Join geospatial data to trip dropoff locations
    df_trip_w_pickup_n_dropoff = df_boroughs.alias('geo').join(
        df_trip_w_pickup.alias('travel'),
        expr('ST_Intersects(geo.geom, travel.geom_dropoff)'),
        'inner'
    )

    # Distinguish dropoff borough
    df_trip_w_pickup_n_dropoff = df_trip_w_pickup_n_dropoff.withColumn('dropoff_borough', col('borough'))

    # Keep only the enriched columns and persist them as the notebook's result.
    df_final = df_trip_w_pickup_n_dropoff.select("medallion", "pickup_borough", "dropoff_borough", "pickup_ts", "dropoff_ts")
    df_final.write.mode('overwrite').parquet(data_ready_path)