In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.conf import SparkConf
from pyspark import SparkFiles

from pathlib import Path

### Setup Spark Session

In [2]:
spark = SparkSession.builder\
            .master("local[*]")\
            .config("spark.driver.memory", "2g")\
            .config("spark.executor.memory", "8g")\
            .config("spark.cores.max", 8) \
            .appName("pyspark-playground")\
            .getOrCreate()

23/05/22 01:47:25 WARN Utils: Your hostname, magi.local resolves to a loopback address: 127.0.0.1; using 192.168.15.29 instead (on interface en0)
23/05/22 01:47:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/22 01:47:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/22 01:47:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/22 01:47:25 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/05/22 01:47:25 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


### FHV Dataset

In [3]:
fhv_dataset_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-01.csv.gz"
fhv_filename = Path(fhv_dataset_url).name

In [4]:
spark.sparkContext.addFile(fhv_dataset_url)

In [5]:
fhv_df = spark.read\
            .option("header", True)\
            .option("inferSchema", True)\
            .csv(f"file://{SparkFiles.get(fhv_filename)}")

                                                                                

In [6]:
fhv_df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: integer (nullable = true)
 |-- DOlocationID: integer (nullable = true)
 |-- SR_Flag: integer (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [7]:
fhv_df.createOrReplaceTempView("fhv")

### Taxi Lookup Zones Dataset

In [8]:
zones_dataset_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv"
zones_filename = Path(zones_dataset_url).name

In [9]:
spark.sparkContext.addFile(zones_dataset_url)

In [10]:
zone_lookup_df = spark.read\
                    .option("header", True)\
                    .option("inferSchema", True)\
                    .csv(f"file://{SparkFiles.get(zones_filename)}")

In [11]:
zone_lookup_df.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [12]:
zone_lookup_df.createOrReplaceTempView("zones")

### SparkSQL - Joining DataFrames

In [13]:
df = spark.sql("""
    WITH t_fhv AS (
        SELECT
            dispatching_base_num, 
            Affiliated_base_number as affiliated_base_num,
            pickup_datetime,
            dropOff_datetime as dropoff_datetime,
            PUlocationID as pickup_location_id,
            DOlocationID as dropoff_location_id
        FROM fhv
    ),
    
    t_zones AS (
        SELECT
            LocationID as location_id,
            Borough as borough,
            Zone as zone,
            service_zone as service_zone                
        FROM zones        
    )    
    
    SELECT 
        f.dispatching_base_num,
        f.affiliated_base_num,

        -- Pickup Location
        f.pickup_datetime,
        pu.zone as pickup_zone,
        pu.service_zone as pickup_service_zone,
        
        -- Dropoff Location
        f.dropoff_location_id,
        do.zone as dropoff_zone,
        do.service_zone as dropoff_service_zone
                
    FROM t_fhv f    
    INNER JOIN t_zones pu ON f.pickup_location_id  = pu.location_id
    INNER JOIN t_zones do ON f.dropoff_location_id = do.location_id
""")

In [14]:
df.show(5, 100, False)

+--------------------+-------------------+-------------------+---------------------+-------------------+-------------------+-------------------------+--------------------+
|dispatching_base_num|affiliated_base_num|    pickup_datetime|          pickup_zone|pickup_service_zone|dropoff_location_id|             dropoff_zone|dropoff_service_zone|
+--------------------+-------------------+-------------------+---------------------+-------------------+-------------------+-------------------------+--------------------+
|              B00254|             B02356|2019-01-01 00:33:03|      Lenox Hill East|        Yellow Zone|                 52|              Cobble Hill|           Boro Zone|
|              B00254|             B00254|2019-01-01 00:03:00|      Lenox Hill West|        Yellow Zone|                237|    Upper East Side South|         Yellow Zone|
|              B00254|             B00254|2019-01-01 00:45:48|Upper East Side South|        Yellow Zone|                236|    Upper East S

In [15]:
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- affiliated_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- pickup_zone: string (nullable = true)
 |-- pickup_service_zone: string (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- dropoff_zone: string (nullable = true)
 |-- dropoff_service_zone: string (nullable = true)



### Spark DataFrame API - Joining DataFrames

In [16]:
fhv = fhv_df.select(
    col("dispatching_base_num"),
    col("Affiliated_base_number").alias("affiliated_base_num"),
    col("pickup_datetime"),
    col("dropOff_datetime").alias("dropoff_datetime"),
    col("PUlocationID").alias("pickup_location_id"),
    col("DOlocationID").alias("dropoff_location_id")
)

In [17]:
zone_lookup = zone_lookup_df.select(
    col("LocationID").alias("location_id"),
    col("Borough").alias("borough"),
    col("Zone").alias("zone"),
    col("service_zone")
)

In [18]:
df = fhv.alias("f")\
        .join(zone_lookup.alias("pu"), col("f.pickup_location_id") == col("pu.location_id"), how="inner")\
        .join(zone_lookup.alias("do"), col("f.pickup_location_id") == col("do.location_id"), how="inner")\
        .select(
            col("f.dispatching_base_num"),
            col("f.affiliated_base_num"),
            # Pickup Data
            col("f.pickup_datetime"),
            col("pu.zone").alias("pickup_zone"),
            col("pu.service_zone").alias("pickup_service_zone"),
            # Dropoff Data
            col("f.dropoff_datetime"),
            col("do.zone").alias("dropoff_zone"),
            col("do.service_zone").alias("dropoff_service_zone")                    
        )

In [19]:
df.show(10, 25, False)

+--------------------+-------------------+-------------------+-------------------------+-------------------+-------------------+-------------------------+--------------------+
|dispatching_base_num|affiliated_base_num|    pickup_datetime|              pickup_zone|pickup_service_zone|   dropoff_datetime|             dropoff_zone|dropoff_service_zone|
+--------------------+-------------------+-------------------+-------------------------+-------------------+-------------------+-------------------------+--------------------+
|              B00254|             B02356|2019-01-01 00:33:03|          Lenox Hill East|        Yellow Zone|2019-01-01 01:37:24|          Lenox Hill East|         Yellow Zone|
|              B00254|             B00254|2019-01-01 00:03:00|          Lenox Hill West|        Yellow Zone|2019-01-01 00:34:25|          Lenox Hill West|         Yellow Zone|
|              B00254|             B00254|2019-01-01 00:45:48|    Upper East Side South|        Yellow Zone|2019-01-01 0