# Data enrichment

* Join pickup and dropoff coordinates with the borough (location name) they fall in
* Convert datetimes to timestamps
* Save the result in `../output/output.parquet`

In [1]:
from pyspark.sql import SparkSession
from sedona.utils import SedonaKryoRegistrator
from pyspark.sql.functions import col, udf, unix_timestamp, to_timestamp
from pyspark.sql.functions import expr
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

import geopandas as gpd
from shapely import Point

In [2]:
# Jars handed to Spark through spark.jars: the shaded Sedona build and the GeoTools wrapper.
sedona_jar = '/home/jovyan/jars/sedona-spark-shaded-3.0_2.12-1.6.1.jar'
geotools_jar = '/home/jovyan/jars/geotools-wrapper-1.7.0-28.5.jar'

# Session options: both jars, Kryo serialization with Sedona's registrator, and
# Sedona's SQL extensions (the notebook later calls ST_* functions through expr()).
_session_options = {
    "spark.jars": f"{sedona_jar},{geotools_jar}",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryo.registrator": SedonaKryoRegistrator.getName,
    "spark.sql.extensions": "org.apache.sedona.sql.SedonaSqlExtensions",
}

_builder = SparkSession.builder.appName('NYC Taxi')
for _option, _value in _session_options.items():
    _builder = _builder.config(_option, _value)

spark = _builder.getOrCreate()

In [3]:
# Expected column order of the trip CSV files:
# medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,
# passenger_count,trip_time_in_secs,trip_distance,
# pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude

# (column name, Spark type) pairs. Every field is declared nullable.
# Note that trip time, trip distance and all four coordinates are read as strings.
_trip_fields = [
    ("medallion", StringType),
    ("hack_license", StringType),
    ("vendor_id", StringType),
    ("rate_code", IntegerType),
    ("store_and_fwd_flag", StringType),
    ("pickup_datetime", StringType),
    ("dropoff_datetime", StringType),
    ("passenger_count", IntegerType),
    ("trip_time_in_secs", StringType),
    ("trip_distance", StringType),
    ("pickup_longitude", StringType),
    ("pickup_latitude", StringType),
    ("dropoff_longitude", StringType),
    ("dropoff_latitude", StringType),
]

schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in _trip_fields
])

# Read every CSV in the trip data directory with the explicit schema (first line of each file is a header).
df_trip = spark.read.csv('../data/trip_data/*.csv', header=True, schema=schema)
#


In [4]:
# Columns kept for the enrichment: the medallion, both datetimes and both coordinate pairs.
cols = [
    "medallion",
    "pickup_datetime",
    "dropoff_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]

# Drop every other column from the trip DataFrame.
df_trip = df_trip.selectExpr(*cols)

In [5]:
# Peek at the first 20 rows of df_trip to sanity-check the parsed columns.
df_trip.take(20)

[Row(medallion='89D227B655E5C82AECF13C3F540D4CF4', pickup_datetime='2013-01-01 15:11:48', dropoff_datetime='2013-01-01 15:18:10', pickup_longitude='-73.978165', pickup_latitude='40.757977', dropoff_longitude='-73.989838', dropoff_latitude='40.751171'),
 Row(medallion='0BD7C8F5BA12B88E0B67BED28BEA73D8', pickup_datetime='2013-01-06 00:18:35', dropoff_datetime='2013-01-06 00:22:54', pickup_longitude='-74.006683', pickup_latitude='40.731781', dropoff_longitude='-73.994499', dropoff_latitude='40.75066'),
 Row(medallion='0BD7C8F5BA12B88E0B67BED28BEA73D8', pickup_datetime='2013-01-05 18:49:41', dropoff_datetime='2013-01-05 18:54:23', pickup_longitude='-74.004707', pickup_latitude='40.73777', dropoff_longitude='-74.009834', dropoff_latitude='40.726002'),
 Row(medallion='DFD2202EE08F7A8DC9A57B02ACB81FE2', pickup_datetime='2013-01-07 23:54:15', dropoff_datetime='2013-01-07 23:58:20', pickup_longitude='-73.974602', pickup_latitude='40.759945', dropoff_longitude='-73.984734', dropoff_latitude='40.

In [6]:
#df_trip.count()

In [7]:
# Discard every trip that has a null in any of the selected columns.
df_trip_cleaned = df_trip.na.drop(how='any', subset=cols)

In [8]:
# Count the rows that remain after the null filtering.
df_trip_cleaned.count()

173176321

In [9]:
# Convert pickup and dropoff times to Unix timestamps (seconds)

# NOTE(review): the day field uses a single 'd' although the sample rows are
# zero-padded ("2013-01-01"); confirm whether 'dd' was intended.
DATE_FORMAT = 'yyyy-MM-d HH:mm:ss'


def _to_epoch_seconds(column_name):
    """Parse the named string column with DATE_FORMAT and return it as a Unix-seconds column."""
    parsed = to_timestamp(col(column_name), DATE_FORMAT)
    return unix_timestamp(parsed)


df_trip_w_ts = df_trip_cleaned.withColumn('pickup_ts', _to_epoch_seconds("pickup_datetime"))
df_trip_w_ts = df_trip_w_ts.withColumn('dropoff_ts', _to_epoch_seconds("dropoff_datetime"))

In [10]:
#df_pd = df.toPandas()

# Create GeoDataFrame for pickup locations
#df_pd["pickup_geometry"] = df_pd.apply(lambda row: Point(row["pickup_longitude"], row["pickup_latitude"]), axis=1)
#df_pd["dropoff_geometry"] = df_pd.apply(lambda row: Point(row["dropoff_longitude"], row["dropoff_latitude"]), axis=1)


In [11]:
#df_trip.write.mode('overwrite').parquet("../output/test.parquet")
#df_trip = spark.read.load('../output/output.parquet')

In [12]:
# Print a tabular preview of df_trip (the frame without the pickup_ts/dropoff_ts columns).
df_trip.show()

+--------------------+-------------------+-------------------+----------------+---------------+-----------------+----------------+
|           medallion|    pickup_datetime|   dropoff_datetime|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|
+--------------------+-------------------+-------------------+----------------+---------------+-----------------+----------------+
|89D227B655E5C82AE...|2013-01-01 15:11:48|2013-01-01 15:18:10|      -73.978165|      40.757977|       -73.989838|       40.751171|
|0BD7C8F5BA12B88E0...|2013-01-06 00:18:35|2013-01-06 00:22:54|      -74.006683|      40.731781|       -73.994499|        40.75066|
|0BD7C8F5BA12B88E0...|2013-01-05 18:49:41|2013-01-05 18:54:23|      -74.004707|       40.73777|       -74.009834|       40.726002|
|DFD2202EE08F7A8DC...|2013-01-07 23:54:15|2013-01-07 23:58:20|      -73.974602|      40.759945|       -73.984734|       40.759388|
|DFD2202EE08F7A8DC...|2013-01-07 23:25:03|2013-01-07 23:34:24|       -73.97625|    

In [13]:
# Load the NYC borough shapes and expose them to Spark with a Sedona geometry column


def _wkt_or_none(geom):
    """Return the WKT text of a geometry, or None when the geometry is falsy."""
    return geom.wkt if geom else None


# Read the GeoJSON with geopandas
gdf = gpd.read_file('../data/nyc-boroughs.geojson')

# Add a 'geom' column holding each shape as WKT text
gdf['geom'] = gdf['geometry'].apply(_wkt_or_none)

# Cast every column to str, giving a plain-string frame to hand to Spark
pdf = gdf.astype(str)

# Build the Spark DataFrame and parse the WKT text into a geometry,
# which is the form used for the intersection tests in the joins
df_geom = (
    spark.createDataFrame(pdf)
    .withColumn("geom", expr("ST_GeomFromWKT(geom)"))
)

In [14]:
# Helper UDF that turns a longitude/latitude pair into the WKT text of a Point


def _point_wkt(lon, lat):
    """Return the WKT of Point(lon, lat), or an empty string if either coordinate is None."""
    if lon is None or lat is None:
        return ''
    return Point(lon, lat).wkt


udf_to_point = udf(_point_wkt)

In [15]:
# Trip coordinates -> WKT text -> Sedona geometry, for both ends of the trip

# WKT text for each end of the trip
pickup_wkt = udf_to_point(col('pickup_longitude'), col('pickup_latitude'))
dropoff_wkt = udf_to_point(col('dropoff_longitude'), col('dropoff_latitude'))

# The two WKT columns are added first, then parsed into geometry columns
df_trip_w_points = (
    df_trip_w_ts
    .withColumn('pickup_point', pickup_wkt)
    .withColumn('dropoff_point', dropoff_wkt)
    .withColumn('geom_pickup', expr('ST_GeomFromWKT(pickup_point)'))
    .withColumn('geom_dropoff', expr('ST_GeomFromWKT(dropoff_point)'))
)

In [16]:
# Inspect one row to check the WKT strings and the parsed geometry columns.
df_trip_w_points.first()

Row(medallion='89D227B655E5C82AECF13C3F540D4CF4', pickup_datetime='2013-01-01 15:11:48', dropoff_datetime='2013-01-01 15:18:10', pickup_longitude='-73.978165', pickup_latitude='40.757977', dropoff_longitude='-73.989838', dropoff_latitude='40.751171', pickup_ts=1357053108, dropoff_ts=1357053490, pickup_point='POINT (-73.978165 40.757977)', dropoff_point='POINT (-73.989838 40.751171)', geom_pickup=<POINT (-73.978 40.758)>, geom_dropoff=<POINT (-73.99 40.751)>)

In [17]:
# Spatial join on the pickup point: pair each trip with every borough shape its
# pickup geometry intersects. As an inner join, trips matching no shape are dropped.

df_trip_w_pickup = df_geom.alias('geo').join(
    other=df_trip_w_points.alias('travel'),
    on=expr('ST_Intersects(geo.geom, travel.geom_pickup)'),
    how='inner',
)

In [18]:
# Copy the joined borough name into 'pickup_borough', then drop the columns that
# came from the borough frame so they do not clash with the next join.
df_trip_w_pickup = (
    df_trip_w_pickup
    .withColumn('pickup_borough', col('borough'))
    .drop("geometry", "@id", "geom", "borough", "boroughCode")
)

In [19]:
# Spatial join on the dropoff point: pair each trip with every borough shape its
# dropoff geometry intersects. As an inner join, trips matching no shape are dropped.

df_trip_w_pickup_n_dropoff = df_geom.alias('geo').join(
    other=df_trip_w_pickup.alias('travel'),
    on=expr('ST_Intersects(geo.geom, travel.geom_dropoff)'),
    how='inner',
)

In [20]:
# Copy the joined borough name into 'dropoff_borough', then drop the columns that
# came from the borough frame.
df_trip_w_pickup_n_dropoff = (
    df_trip_w_pickup_n_dropoff
    .withColumn('dropoff_borough', col('borough'))
    .drop("geometry", "@id", "geom", "borough", "boroughCode")
)

In [21]:
# Show the first row of df_trip (the un-enriched trip frame) for reference.
df_trip.first()

Row(medallion='89D227B655E5C82AECF13C3F540D4CF4', pickup_datetime='2013-01-01 15:11:48', dropoff_datetime='2013-01-01 15:18:10', pickup_longitude='-73.978165', pickup_latitude='40.757977', dropoff_longitude='-73.989838', dropoff_latitude='40.751171')

In [None]:
# Smoke test: fetch a single row from the fully joined frame to check that
# the pickup and dropoff enrichment runs end to end.

df_trip_w_pickup_n_dropoff.first()

In [None]:
# Pruning the final df to reduce data load: keep only the medallion, the two
# borough names and the two Unix timestamps.
# pickup_ts / dropoff_ts were added before the spatial joins and are carried
# through them, so the joined frame already holds every column selected here.
# (The previously referenced `df_trip_w_pickup_n_dropoff_n_ts` is never defined
# in this notebook and raised a NameError.)

df_final = df_trip_w_pickup_n_dropoff.select("medallion", "pickup_borough", "dropoff_borough", "pickup_ts", "dropoff_ts")

In [None]:
# Print a tabular preview of the pruned result before it is written out.
df_final.show()

In [None]:
# Persist the enriched trips as parquet for later queries, replacing any
# output already present at that path.

df_final.write.parquet("../output/output.parquet", mode='overwrite')