In [1]:
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse an already running) Spark session in local mode;
# "local[*]" asks Spark for as many worker threads as there are logical cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

25/03/02 16:55:53 WARN Utils: Your hostname, hp-computer resolves to a loopback address: 127.0.1.1; using 192.168.178.155 instead (on interface wlp0s20f3)
25/03/02 16:55:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/03/02 16:55:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/02 16:55:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Read every Parquet file matched by the two-level wildcard under the raw green
# directory into a single DataFrame.
# NOTE(review): the two wildcard levels are presumably year/month folders — confirm.
df_green = spark.read.parquet("../data/data/raw/green/*/*/")

                                                                                

In [3]:
# Sanity check: print the first 5 rows to confirm the data was loaded correctly
df_green.show(5)

                                                                                

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2019-12-18 16:52:30|  2019-12-18 16:54:39|                 N|       1.0|         264|         264|            5.0|          0.0|        3.5|  0.5|    0.

**Querying the Green Taxi dataframe with SQL:**

In [4]:
# Register the DataFrame as the temporary view "green" so that the spark.sql
# queries below can refer to it by name.
df_green.createOrReplaceTempView("green")

In [8]:
# Hourly revenue per pickup zone for green taxis: total_amount is summed and
# rounded to 2 decimals, rows are counted, and only pickups on or after
# 2020-01-01 00:00:00 are kept.
# `GROUP BY 1, 2` groups by the first two SELECT expressions (zone, hour) by position.
df_green_revenue = spark.sql("""
SELECT
    PULocationID AS zone,
    date_trunc('hour', lpep_pickup_datetime) AS hour,

    ROUND(SUM(total_amount), 2) AS amount,
    COUNT(1) AS number_records
FROM
    green
WHERE 
    lpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
""")

In [9]:
# Preview the aggregated revenue table (show() prints up to 20 rows by default)
df_green_revenue.show()



+----+-------------------+------+--------------+
|zone|               hour|amount|number_records|
+----+-------------------+------+--------------+
| 174|2020-01-01 01:00:00|  21.7|             1|
|  41|2020-01-01 03:00:00|556.11|            36|
| 173|2020-01-01 06:00:00| 59.04|             3|
|  75|2020-01-01 08:00:00|127.16|            10|
|   7|2020-01-01 18:00:00|357.84|            28|
|  36|2020-01-01 17:00:00| 18.09|             2|
|  41|2020-01-01 19:00:00|487.93|            33|
|  25|2020-01-01 20:00:00|262.92|            12|
| 179|2020-01-02 00:00:00|139.97|             6|
|  62|2020-01-02 11:00:00| 75.78|             6|
| 181|2020-01-02 18:00:00|179.72|            10|
|  83|2020-01-02 19:00:00| 53.89|             3|
|  92|2020-01-02 19:00:00|160.36|             8|
| 189|2020-01-02 21:00:00| 61.01|             3|
| 243|2020-01-02 23:00:00| 26.56|             2|
| 166|2020-01-03 02:00:00| 20.85|             2|
| 181|2020-01-03 11:00:00|250.37|            13|
| 213|2020-01-03 17:

                                                                                

In [10]:
# Redistribute the aggregated rows into 20 partitions, then write them as
# Parquet, replacing whatever already exists at the target path.
(
    df_green_revenue
    .repartition(20)
    .write
    .parquet("../data/data/report/revenue/green", mode="overwrite")
)



25/03/02 17:08:03 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers




25/03/02 17:08:04 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                

**Querying the Yellow Taxi dataframe with SQL:**

In [11]:
# Read every Parquet file matched by the two-level wildcard under the raw yellow
# directory, then register the result as the temporary view "yellow" so it can
# be queried with spark.sql.
df_yellow = spark.read.parquet("../data/data/raw/yellow/*/*")
df_yellow.createOrReplaceTempView("yellow")

In [12]:
# Hourly revenue per pickup zone for yellow taxis: total_amount is summed and
# rounded to 2 decimals, rows are counted, and only pickups on or after
# 2020-01-01 00:00:00 are kept.
# `GROUP BY 1, 2` groups by the first two SELECT expressions (hour, zone) by position.
df_yellow_revenue = spark.sql("""
SELECT
    date_trunc('hour', tpep_pickup_datetime) AS hour,
    PULocationID AS zone,

    ROUND(SUM(total_amount), 2) AS amount,
    COUNT(1) AS number_records
FROM
    yellow
WHERE 
    tpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
""")

In [14]:
# Redistribute the aggregated rows into 20 partitions, then write them as
# Parquet, replacing whatever already exists at the target path.
(
    df_yellow_revenue
    .repartition(20)
    .write
    .parquet("../data/data/report/revenue/yellow", mode="overwrite")
)



25/03/02 17:28:22 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers




25/03/02 17:28:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                

### 5.4.3 Joins in Spark

Loading the revenue dataframes from disk instead of redoing all the computations that produced them:

In [15]:
# Rebind both names to the Parquet reports written earlier in this notebook;
# from here on they are backed by the files on disk rather than by the SQL queries.
df_green_revenue = spark.read.parquet("../data/data/report/revenue/green")
df_yellow_revenue = spark.read.parquet("../data/data/report/revenue/yellow")

The two previously created dataframes will now be joined on the columns `hour` and `zone`:

In [16]:
# Prefix the metric columns with the taxi colour before joining; otherwise the
# joined result would contain two `amount` and two `number_records` columns.
df_green_revenue_tmp = (
    df_green_revenue
    .withColumnRenamed("amount", "green_amount")
    .withColumnRenamed("number_records", "green_number_records")
)

df_yellow_revenue_tmp = (
    df_yellow_revenue
    .withColumnRenamed("amount", "yellow_amount")
    .withColumnRenamed("number_records", "yellow_number_records")
)

In [19]:
# Full outer join on (hour, zone): pairs that exist for only one taxi colour
# are kept, with nulls in the other colour's columns.
df_join = df_green_revenue_tmp.join(
    df_yellow_revenue_tmp,
    on=["hour", "zone"],
    how="outer"
)

In [20]:
# Preview the first 5 rows of the joined result
df_join.show(n=5)



+-------------------+----+------------+--------------------+-------------+---------------------+
|               hour|zone|green_amount|green_number_records|yellow_amount|yellow_number_records|
+-------------------+----+------------+--------------------+-------------+---------------------+
|2020-01-01 00:00:00|  50|        null|                null|        18.36|                    2|
|2020-01-01 00:00:00|  79|        null|                null|        17.76|                    1|
|2020-01-01 00:00:00|  87|        null|                null|        23.16|                    1|
|2020-01-01 00:00:00|  90|        null|                null|        14.12|                    1|
|2020-01-01 00:00:00| 107|        null|                null|         14.3|                    1|
+-------------------+----+------------+--------------------+-------------+---------------------+
only showing top 5 rows



                                                                                

In [21]:
# Persist the joined green/yellow revenue table as Parquet, replacing any previous output.
df_join.write.parquet("../data/data/report/revenue/total", mode="overwrite")

[Stage 49:>                                                         (0 + 8) / 9]

25/03/02 17:34:51 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                

#### Joining one large table and one small table

In [22]:
# Reload the joined revenue table from the Parquet output written above, so the
# steps below read from disk instead of recomputing the outer join
df_join = spark.read.parquet("../data/data/report/revenue/total")

In [23]:
# Load the previously downloaded taxi-zones lookup data.
# NOTE(review): unlike the "../data/data/..." paths used elsewhere in this notebook,
# 'zones/' is relative to the notebook's working directory — confirm this is intended.
df_zones = spark.read.parquet('zones/')

In [24]:
# Enrich the large revenue table with the small zones lookup table:
# inner join matching the revenue `zone` id against the lookup's `LocationID`.
df_result = df_join.join(
    df_zones,
    on=df_join.zone == df_zones.LocationID,
    how="inner",
)

In [25]:
# Drop the two join-key columns and write the enriched result as Parquet,
# replacing any previous output.
# NOTE(review): Spark resolves column names case-insensitively by default, so if
# df_zones also has a `Zone` column (its schema is not shown here), drop('zone')
# would remove that column too — verify. Dropping by column reference
# (df_join.zone / df_zones.LocationID) would avoid the ambiguity.
df_result \
    .drop('LocationID', 'zone') \
    .write.parquet(
        'tmp/revenue-zones', 
        mode="overwrite"
    )

25/03/02 17:36:41 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                