In [1]:
from os import environ as env

from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# Location of the GCP service-account key file used to authenticate against GCS.
# Only fall back to the local default when the variable is not already exported,
# so a credentials path configured outside the notebook is not clobbered.
if "GOOGLE_APPLICATION_CREDENTIALS" not in env:
    env["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/iobruno/Vault/credentials/iobruno-training-gcp_terraform-admin.json"

### Set Up Spark Session

In [3]:
# Build (or reuse) the SparkSession for this notebook, running locally on all cores.
# NOTE(review): with master "local[*]" the work presumably runs inside the driver JVM,
# so "spark.executor.memory" and "spark.cores.max" may have no effect here — confirm
# against the Spark configuration docs before relying on them.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "8g")
    .config("spark.cores.max", 8)
    .appName("pyspark-playground")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/03 02:28:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Only log messages at ERROR level from here on.
spark.sparkContext.setLogLevel("ERROR")

In [5]:
# Hand the service-account key file path to the Hadoop configuration of the
# running session, reading it from the GOOGLE_APPLICATION_CREDENTIALS variable.
(
    spark._jsc
    .hadoopConfiguration()
    .set(
        "google.cloud.auth.service.account.json.keyfile",
        env["GOOGLE_APPLICATION_CREDENTIALS"],
    )
)

## Load Datasets from GCS

#### FHV Dataset

In [6]:
# Load the 2019-01 FHV trip data Parquet file from the raw GCS bucket.
fhv_df = spark.read.parquet(
    "gs://iobruno_datalake_raw/dtc_ny_taxi_tripdata/fhv/fhv_tripdata_2019-01.parquet.snappy"
)

                                                                                

In [7]:
# Register the FHV DataFrame as the temp view `fhv` so SQL queries can reference it.
fhv_df.createOrReplaceTempView("fhv")

In [8]:
# Show the column names, types and nullability of the FHV DataFrame.
fhv_df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: long (nullable = true)
 |-- DOlocationID: long (nullable = true)
 |-- SR_Flag: long (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



#### Taxi Zone Lookup Dataset

In [9]:
# Explicit schema for the taxi zone lookup CSV: four columns, all nullable.
zone_lookup_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("LocationID", IntegerType()),
        ("Borough", StringType()),
        ("Zone", StringType()),
        ("service_zone", StringType()),
    )
])

In [10]:
# Load the zone lookup CSV from GCS, treating the first row as a header and
# applying the explicit schema instead of letting Spark infer types.
zone_lookup_df = spark.read.csv(
    "gs://iobruno_datalake_raw/dtc_ny_taxi_tripdata/zone_lookup/taxi_zone_lookup.csv",
    schema=zone_lookup_schema,
    header=True,
)

In [11]:
# Register the zone lookup DataFrame as the temp view `zones` so SQL queries can reference it.
zone_lookup_df.createOrReplaceTempView("zones")

In [12]:
# Show the column names, types and nullability of the zone lookup DataFrame.
zone_lookup_df.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



### Spark SQL - Joining DataFrames

In [13]:
# Join every FHV trip to the zone lookup twice — once on the pickup location id and
# once on the dropoff location id — to attach zone and service-zone names to both ends.
# The two CTEs only rename the source columns of the `fhv` and `zones` temp views.
# NOTE(review): both joins are INNER, so trips whose pickup or dropoff location id is
# NULL or has no match in `zones` are dropped from the result — confirm this is intended.
# NOTE(review): t_fhv projects dropoff_datetime and t_zones projects borough, but the
# final SELECT uses neither (the dropoff section returns dropoff_location_id) — confirm.
df = spark.sql("""
    WITH t_fhv AS (
        SELECT
            dispatching_base_num, 
            Affiliated_base_number as affiliated_base_num,
            pickup_datetime,
            dropOff_datetime as dropoff_datetime,
            PUlocationID as pickup_location_id,
            DOlocationID as dropoff_location_id
        FROM fhv
    ),
    
    t_zones AS (
        SELECT
            LocationID as location_id,
            Borough as borough,
            Zone as zone,
            service_zone as service_zone                
        FROM zones        
    )    
    
    SELECT 
        f.dispatching_base_num,
        f.affiliated_base_num,

        -- Pickup Location
        f.pickup_datetime,
        pu.zone as pickup_zone,
        pu.service_zone as pickup_service_zone,
        
        -- Dropoff Location
        f.dropoff_location_id,
        do.zone as dropoff_zone,
        do.service_zone as dropoff_service_zone
                
    FROM t_fhv f    
    INNER JOIN t_zones pu ON f.pickup_location_id  = pu.location_id
    INNER JOIN t_zones do ON f.dropoff_location_id = do.location_id
""")

In [14]:
# Preview the first 4 joined rows, one field per line, truncating values at 100 characters.
df.show(n=4, truncate=100, vertical=True)



-RECORD 0-------------------------------------
 dispatching_base_num | B00254                
 affiliated_base_num  | B02356                
 pickup_datetime      | 2018-12-31 22:33:03   
 pickup_zone          | Lenox Hill East       
 pickup_service_zone  | Yellow Zone           
 dropoff_location_id  | 52                    
 dropoff_zone         | Cobble Hill           
 dropoff_service_zone | Boro Zone             
-RECORD 1-------------------------------------
 dispatching_base_num | B00254                
 affiliated_base_num  | B00254                
 pickup_datetime      | 2018-12-31 22:03:00   
 pickup_zone          | Lenox Hill West       
 pickup_service_zone  | Yellow Zone           
 dropoff_location_id  | 237                   
 dropoff_zone         | Upper East Side South 
 dropoff_service_zone | Yellow Zone           
-RECORD 2-------------------------------------
 dispatching_base_num | B00254                
 affiliated_base_num  | B00254                
 pickup_datet

                                                                                

In [15]:
# Show the column names, types and nullability of the joined result.
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- affiliated_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- pickup_zone: string (nullable = true)
 |-- pickup_service_zone: string (nullable = true)
 |-- dropoff_location_id: long (nullable = true)
 |-- dropoff_zone: string (nullable = true)
 |-- dropoff_service_zone: string (nullable = true)

