In [1]:
from os import getenv, environ as env

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.conf import SparkConf

In [2]:
aws_access_key_id = getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = getenv("AWS_SECRET_ACCESS_KEY")

### Setup Spark Session

In [3]:
spark = (
    SparkSession.builder
        .master("spark://localhost:7077")
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "16g")
        .config("spark.cores.max", 8)
        .config("spark.hadoop.fs.s3a.access.key", aws_access_key_id)
        .config("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key)
        .appName("pyspark_aws_docker_master")
        .getOrCreate()
)

23/05/22 01:45:03 WARN Utils: Your hostname, magi.local resolves to a loopback address: 127.0.0.1; using 192.168.15.29 instead (on interface en0)
23/05/22 01:45:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/22 01:45:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/22 01:45:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/22 01:45:04 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
spark.sparkContext.setLogLevel("ERROR")

## Load Datasets from AWS

#### FHV Dataset

In [5]:
fhv_df = spark.read.parquet("s3a://iobruno-datalake-raw/dtc_ny_taxi_tripdata/fhv/fhv_tripdata_2019-01.snappy.parquet")

                                                                                

In [6]:
fhv_df.createOrReplaceTempView("fhv")

In [7]:
fhv_df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: long (nullable = true)
 |-- DOlocationID: long (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



#### Taxi Zone Lookup Dataset

In [8]:
zone_lookup_schema = StructType([
    StructField("LocationID", IntegerType(), True),
    StructField("Borough", StringType(), True),
    StructField("Zone", StringType(), True),
    StructField("service_zone", StringType(), True)
])

In [9]:
zone_lookup_df = spark.read\
                    .option("header", True)\
                    .schema(zone_lookup_schema)\
                    .csv("s3a://iobruno-datalake-raw/dtc_ny_taxi_tripdata/zone_lookup/taxi_zone_lookup.csv.gz")

In [10]:
zone_lookup_df.createOrReplaceTempView("zones")

In [11]:
zone_lookup_df.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



### SparkSQL - Joining DataFrames

In [12]:
sdf = spark.sql("""
    WITH t_fhv AS (
        SELECT
            dispatching_base_num, 
            Affiliated_base_number as affiliated_base_num,
            pickup_datetime,
            dropOff_datetime as dropoff_datetime,
            PUlocationID as pickup_location_id,
            DOlocationID as dropoff_location_id
        FROM 
            fhv
    ),
    
    t_zones AS (
        SELECT
            LocationID as location_id,
            Borough as borough,
            Zone as zone,
            service_zone
        FROM 
            zones        
    )    
    
    SELECT 
        f.dispatching_base_num,
        f.affiliated_base_num,

        -- Pickup Data
        f.pickup_datetime,
        pu.zone as pickup_zone,
        pu.service_zone as pickup_service_zone,
        
        -- Dropoff Data
        f.dropoff_datetime,
        do.zone as dropoff_zone,
        do.service_zone as dropoff_service_zone
    FROM 
        t_fhv f    
    INNER JOIN 
        t_zones pu ON f.pickup_location_id  = pu.location_id
    INNER JOIN 
        t_zones do ON f.dropoff_location_id = do.location_id
""")

In [13]:
sdf.show(10, 25, False)



+--------------------+-------------------+-------------------+-------------------------+-------------------+-------------------+-------------------------+--------------------+
|dispatching_base_num|affiliated_base_num|    pickup_datetime|              pickup_zone|pickup_service_zone|   dropoff_datetime|             dropoff_zone|dropoff_service_zone|
+--------------------+-------------------+-------------------+-------------------------+-------------------+-------------------+-------------------------+--------------------+
|              B00254|             B02356|2018-12-31 22:33:03|          Lenox Hill East|        Yellow Zone|2018-12-31 23:37:24|              Cobble Hill|           Boro Zone|
|              B00254|             B00254|2018-12-31 22:03:00|          Lenox Hill West|        Yellow Zone|2018-12-31 22:34:25|    Upper East Side South|         Yellow Zone|
|              B00254|             B00254|2018-12-31 22:45:48|    Upper East Side South|        Yellow Zone|2018-12-31 2

                                                                                

### Spark DataFrame API - Joining DataFrames

In [14]:
fhv = fhv_df.select(
    col("dispatching_base_num"),
    col("Affiliated_base_number").alias("affiliated_base_num"),
    col("pickup_datetime"),
    col("dropOff_datetime").alias("dropoff_datetime"),
    col("PUlocationID").alias("pickup_location_id"),
    col("DOlocationID").alias("dropoff_location_id")
)

In [15]:
zone_lookup = zone_lookup_df.select(
    col("LocationID").alias("location_id"),
    col("Borough").alias("borough"),
    col("Zone").alias("zone"),
    col("service_zone")
)

In [16]:
df = fhv.alias("f")\
        .join(zone_lookup.alias("pu"), col("f.pickup_location_id") == col("pu.location_id"), how="inner")\
        .join(zone_lookup.alias("do"), col("f.pickup_location_id") == col("do.location_id"), how="inner")\
        .select(
            col("f.dispatching_base_num"),
            col("f.affiliated_base_num"),

            # Pickup Data
            col("f.pickup_datetime"),
            col("pu.zone").alias("pickup_zone"),
            col("pu.service_zone").alias("pickup_service_zone"),
            
            # Dropoff Data
            col("f.dropoff_datetime"),
            col("do.zone").alias("dropoff_zone"),
            col("do.service_zone").alias("dropoff_service_zone")                    
        )

In [17]:
df.show(10, 25, False)



+--------------------+-------------------+-------------------+-------------------------+-------------------+-------------------+-------------------------+--------------------+
|dispatching_base_num|affiliated_base_num|    pickup_datetime|              pickup_zone|pickup_service_zone|   dropoff_datetime|             dropoff_zone|dropoff_service_zone|
+--------------------+-------------------+-------------------+-------------------------+-------------------+-------------------+-------------------------+--------------------+
|              B00254|             B02356|2018-12-31 22:33:03|          Lenox Hill East|        Yellow Zone|2018-12-31 23:37:24|          Lenox Hill East|         Yellow Zone|
|              B00254|             B00254|2018-12-31 22:03:00|          Lenox Hill West|        Yellow Zone|2018-12-31 22:34:25|          Lenox Hill West|         Yellow Zone|
|              B00254|             B00254|2018-12-31 22:45:48|    Upper East Side South|        Yellow Zone|2018-12-31 2

                                                                                