In [1]:
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running in local mode on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

24/02/25 22:58:52 WARN Utils: Your hostname, hp-computer resolves to a loopback address: 127.0.1.1; using 192.168.178.105 instead (on interface wlp0s20f3)
24/02/25 22:58:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/02/25 22:58:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load every green-taxi parquet file found two directory levels below raw/green
# (presumably a year/month layout — TODO confirm against the download step).
df_green = spark.read.parquet("../data/data/raw/green/*/*/")

                                                                                

In [3]:
# Sanity check: preview the first five rows to confirm the data loaded correctly
df_green.show(n=5)

                                                                                

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2019-12-18 16:52:30|  2019-12-18 16:54:39|                 N|       1.0|         264|         264|            5.0|          0.0|        3.5|  0.5|    0.

**Querying the Green Taxi dataframe with SQL:**

In [4]:
# Register the dataframe as a temporary view named "green" so that it can be
# referenced by name from spark.sql() queries in this session.
df_green.createOrReplaceTempView("green")

In [5]:
# Aggregate green-taxi trips picked up on or after 2020-01-01 into
# (pickup hour, pickup zone) buckets: total_amount summed and rounded to
# 2 decimals, plus the number of trips in each bucket.
# `GROUP BY 1, 2` refers to the first two SELECT columns (hour, zone).
df_green_revenue = spark.sql("""
SELECT
    date_trunc('hour', lpep_pickup_datetime) AS hour,
    PULocationID AS zone,

    ROUND(SUM(total_amount), 2) AS amount,
    COUNT(1) AS number_records
FROM
    green
WHERE 
    lpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
""")

In [6]:
# Preview the hourly revenue per zone (show() prints up to 20 rows by default)
df_green_revenue.show()



+-------------------+----+------+--------------+
|               hour|zone|amount|number_records|
+-------------------+----+------+--------------+
|2020-01-01 02:00:00|  24|126.42|             6|
|2020-01-01 06:00:00| 244| 25.26|             2|
|2020-01-01 08:00:00| 112| 66.76|             3|
|2020-01-01 11:00:00| 169|  6.12|             1|
|2020-01-01 12:00:00| 193|  14.3|             2|
|2020-01-01 16:00:00|  25|192.12|            11|
|2020-01-01 20:00:00| 196|  36.4|             3|
|2020-01-02 15:00:00| 228| 96.46|             4|
|2020-01-02 21:00:00| 146| 15.66|             2|
|2020-01-03 00:00:00|  52|  21.3|             1|
|2020-01-03 04:00:00| 179|   7.3|             1|
|2020-01-03 09:00:00| 212|  16.8|             1|
|2020-01-03 11:00:00|  29| 163.8|             7|
|2020-01-03 12:00:00| 122|  10.8|             1|
|2020-01-03 13:00:00| 182| 67.19|             5|
|2020-01-03 14:00:00| 129|318.47|            25|
|2020-01-03 15:00:00|   7|583.32|            32|
|2020-01-03 14:00:00

                                                                                

In [7]:
# Redistribute the aggregated result into 20 partitions and persist it as
# parquet, replacing any output left at this location by a previous run.
(
    df_green_revenue
    .repartition(20)
    .write
    .mode("overwrite")
    .parquet("../data/data/report/revenue/green")
)

[Stage 10:>                                                        (0 + 8) / 20]

24/02/25 22:59:38 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers




24/02/25 22:59:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers




24/02/25 22:59:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                

**Querying the Yellow Taxi dataframe with SQL:**

In [8]:
# Load every yellow-taxi parquet file found two directory levels below raw/yellow
# (presumably a year/month layout — TODO confirm against the download step).
df_yellow = spark.read.parquet("../data/data/raw/yellow/*/*")
# Register it as the temporary view "yellow" for use in spark.sql() queries.
df_yellow.createOrReplaceTempView("yellow")

In [9]:
# Same aggregation as for the green dataset, applied to the "yellow" view:
# trips picked up on or after 2020-01-01 are bucketed by (pickup hour, pickup zone),
# with total_amount summed and rounded to 2 decimals and the trips counted.
# Note the yellow schema names the pickup timestamp tpep_pickup_datetime.
df_yellow_revenue = spark.sql("""
SELECT
    date_trunc('hour', tpep_pickup_datetime) AS hour,
    PULocationID AS zone,

    ROUND(SUM(total_amount), 2) AS amount,
    COUNT(1) AS number_records
FROM
    yellow
WHERE 
    tpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
""")

In [10]:
# Redistribute the aggregated result into 20 partitions and persist it as
# parquet, replacing any output left at this location by a previous run.
(
    df_yellow_revenue
    .repartition(20)
    .write
    .mode("overwrite")
    .parquet("../data/data/report/revenue/yellow")
)



24/02/25 23:00:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers




24/02/25 23:00:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers




24/02/25 23:00:13 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                

### 5.4.3 Joins in Spark

Loading the saved revenue dataframes instead of redoing all the computations that led to them:

In [22]:
# Re-bind both names to the parquet reports written earlier, so the join below
# reads the materialised aggregates instead of re-running the SQL over the raw data.
df_green_revenue = spark.read.parquet("../data/data/report/revenue/green")
df_yellow_revenue = spark.read.parquet("../data/data/report/revenue/yellow")

The two previously created dataframes will now be joined on the columns `hour` and `zone`:

In [23]:
# Both inputs carry `amount` and `number_records`; prefix them per dataset so
# the joined result does not end up with two columns of the same name.
df_green_revenue_tmp = (
    df_green_revenue
    .withColumnRenamed("amount", "green_amount")
    .withColumnRenamed("number_records", "green_number_records")
)

df_yellow_revenue_tmp = (
    df_yellow_revenue
    .withColumnRenamed("amount", "yellow_amount")
    .withColumnRenamed("number_records", "yellow_number_records")
)

In [24]:
# Full outer join on the shared keys: an (hour, zone) pair present in only one
# dataset is kept, with the other dataset's columns left null.
join_keys = ["hour", "zone"]
df_join = df_green_revenue_tmp.join(other=df_yellow_revenue_tmp, how="outer", on=join_keys)

In [25]:
# Preview five joined rows; nulls mark (hour, zone) pairs missing from one dataset
df_join.show(n=5)

[Stage 51:>                                                         (0 + 1) / 1]

+-------------------+----+------------+--------------------+-------------+---------------------+
|               hour|zone|green_amount|green_number_records|yellow_amount|yellow_number_records|
+-------------------+----+------------+--------------------+-------------+---------------------+
|2020-01-01 00:00:00|  50|        null|                null|        18.36|                    2|
|2020-01-01 00:00:00|  79|        null|                null|        17.76|                    1|
|2020-01-01 00:00:00|  87|        null|                null|        23.16|                    1|
|2020-01-01 00:00:00|  90|        null|                null|        14.12|                    1|
|2020-01-01 00:00:00| 107|        null|                null|         14.3|                    1|
+-------------------+----+------------+--------------------+-------------+---------------------+
only showing top 5 rows



                                                                                

In [26]:
# Persist the joined green/yellow revenue as parquet, replacing any output
# left at this location by a previous run.
(
    df_join
    .write
    .mode("overwrite")
    .parquet("../data/data/report/revenue/total")
)

[Stage 56:>                                                         (0 + 8) / 9]

24/02/26 00:08:52 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                

#### Joining one large table and one small table

In [31]:
# Re-bind df_join to the parquet output that was just written, so later steps
# read the materialised join instead of recomputing it from the two revenue reports.
df_join = spark.read.parquet("../data/data/report/revenue/total")

In [32]:
# Load the previously downloaded taxi-zones lookup data.
# NOTE(review): 'zones/' is resolved against the notebook's working directory,
# unlike the other inputs, which live under ../data/data — confirm this is intended.
df_zones = spark.read.parquet('zones/')

In [33]:
# Inner join (the default join type, spelled out here) of the large revenue
# dataframe with the small zones lookup, matching `zone` against `LocationID`.
df_result = df_join.join(
    df_zones,
    on=df_join.zone == df_zones.LocationID,
    how="inner",
)

In [35]:
# Remove the two join-key columns and persist the enriched result as parquet,
# replacing any output left at this location by a previous run.
#
# The keys are dropped by Column reference rather than by name: name-based
# drop() matches case-insensitively under Spark's default
# spark.sql.caseSensitive=false, so drop('zone') would also discard any
# lookup-table column whose name differs only in case (e.g. a `Zone` name
# column — assumed from the taxi-zone lookup schema; confirm with
# df_zones.printSchema()).
# Two chained single-column drops are used because older PySpark versions
# accept only one Column argument per drop() call.
df_result \
    .drop(df_zones.LocationID) \
    .drop(df_join.zone) \
    .write.parquet(
        'tmp/revenue-zones',
        mode="overwrite"
    )

[Stage 64:>                                                         (0 + 8) / 9]

24/02/26 00:36:36 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers




24/02/26 00:36:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                