In [1]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkConf
from dotenv import load_dotenv, find_dotenv

In [2]:
# Load variables from the nearest .env file into the process environment.
# find_dotenv() only locates the file; its result has to be handed to
# load_dotenv(), otherwise the lookup has no effect.
load_dotenv(find_dotenv())

# Path to the GCP service-account key file; it is passed to the GCS connector
# via the Spark configuration.
google_credentials = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
if not google_credentials:
    # Fail here with a clear message instead of handing Spark a missing
    # keyfile path and getting an obscure error on the first gs:// read.
    raise RuntimeError(
        "GOOGLE_APPLICATION_CREDENTIALS is not set; define it in the "
        "environment or in a .env file before creating the Spark session."
    )

In [3]:
# Spark properties used to read gs:// paths through the GCS Hadoop connector,
# authenticating with the service-account key file held in google_credentials.
spark_config = [
    # GCS connector jar (note: unpinned "-latest" build).
    ("spark.jars", "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar"),
    # Service-account authentication for the connector.
    ("spark.hadoop.google.cloud.auth.service.account.enable", "true"),
    ("spark.hadoop.google.cloud.auth.service.account.json.keyfile", google_credentials),
    # Adaptive query execution is switched off.
    ("spark.sql.adaptive.enabled", "false"),
]

# Apply each (key, value) pair to a fresh SparkConf, in list order.
spark_conf = SparkConf()
for conf_key, conf_value in spark_config:
    spark_conf.set(conf_key, conf_value)

In [4]:
# Create (or reuse) the Spark session, applying the GCS-enabled configuration.
spark = (
    SparkSession.builder
    .appName('spark_dezoomcamp')
    .master('local[*]')
    .config(conf=spark_conf)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/02/25 13:54:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Read the green-taxi parquet file from GCS and register it as the
# 'green_taxi' temp view so it can be queried with Spark SQL.
df_green = spark.read.parquet(
    'gs://de-zoomcamp-bq-1/green_taxi/green_tripdata_2019-01.parquet'
)
df_green.createOrReplaceTempView('green_taxi')

                                                                                

In [6]:
# Read the yellow-taxi parquet file from GCS and register it as the
# 'yellow_taxi' temp view so it can be queried with Spark SQL.
df_yellow = spark.read.parquet(
    'gs://de-zoomcamp-bq-1/yellow_taxi/yellow_tripdata_2019-01.parquet'
)
df_yellow.createOrReplaceTempView('yellow_taxi')

In [7]:
# Green-taxi revenue and trip count per pickup zone and pickup hour.
#
# - Plain string literal: the query has no placeholders, so no f-prefix.
# - The lower bound is inclusive (>=) so trips that start exactly at
#   2019-01-01 00:00:00 are kept; earlier-dated rows are filtered out.
# - NOTE(review): DECIMAL() is called without precision/scale and the revenue
#   comes out as whole numbers in df_join.show() -- confirm that dropping the
#   fractional part is intended.
query = """
    SELECT
        PULocationID AS zone,
        date_trunc('HOUR', lpep_pickup_datetime) AS pickup_datetime,
        DECIMAL(SUM(total_amount)) AS green_revenue,
        COUNT(1) AS green_number_of_records
    FROM
        green_taxi
    WHERE lpep_pickup_datetime >= '2019-01-01 00:00:00'
    GROUP BY
        zone, pickup_datetime
"""

# Lazily-evaluated DataFrame: one row per (zone, pickup_datetime).
df_green_tmp = spark.sql(query)

In [8]:
# Yellow-taxi revenue and trip count per pickup zone and pickup hour.
#
# - Plain string literal: the query has no placeholders, so no f-prefix.
# - The lower bound is inclusive (>=) so trips that start exactly at
#   2019-01-01 00:00:00 are kept; earlier-dated rows are filtered out.
# - NOTE(review): DECIMAL() is called without precision/scale and the revenue
#   comes out as whole numbers in df_join.show() -- confirm that dropping the
#   fractional part is intended.
query = """
    SELECT
        PULocationID AS zone,
        date_trunc('HOUR', tpep_pickup_datetime) AS pickup_datetime,
        DECIMAL(SUM(total_amount)) AS yellow_revenue,
        COUNT(1) AS yellow_number_of_records
    FROM
        yellow_taxi
    WHERE tpep_pickup_datetime >= '2019-01-01 00:00:00'
    GROUP BY
        zone, pickup_datetime
"""

# Lazily-evaluated DataFrame: one row per (zone, pickup_datetime).
df_yellow_tmp = spark.sql(query)

In [9]:
# Combine the green and yellow hourly aggregates on their shared keys.
# Inner join (the default when `how` is omitted): only (zone, pickup_datetime)
# pairs present in BOTH inputs appear in the result.
df_join = df_green_tmp.join(
    df_yellow_tmp,
    on=['zone', 'pickup_datetime'],
    how='inner',
)

In [10]:
# Trigger the computation and print the first 20 rows (the show() defaults,
# written out explicitly).
df_join.show(n=20, truncate=True)



+----+-------------------+-------------+-----------------------+--------------+------------------------+
|zone|    pickup_datetime|green_revenue|green_number_of_records|yellow_revenue|yellow_number_of_records|
+----+-------------------+-------------+-----------------------+--------------+------------------------+
|  55|2019-01-01 13:00:00|          286|                      5|           156|                       3|
|  43|2019-01-01 16:00:00|           26|                      3|          3300|                     257|
|  33|2019-01-01 20:00:00|          155|                     11|            65|                       3|
| 244|2019-01-01 21:00:00|           96|                      7|           188|                       8|
|  39|2019-01-02 08:00:00|          533|                     15|            45|                       2|
|  37|2019-01-02 09:00:00|           95|                      4|            31|                       1|
| 236|2019-01-02 11:00:00|           44|               

                                                                                