# Exercise 1
## Part 4 - Convert SQL Queries to Spark

### 0. Imports & Spark Session

In [1]:
import time
import builtins  # <-- IMPORTANT
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    round as spark_round,   # Spark round ONLY for Columns
    count,
    col,
    sum as _sum
)

spark = (
    SparkSession.builder
    .appName("PostgresVsSparkBenchmark")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.7.2")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .config("spark.history.fs.logDirectory", "/tmp/spark-events")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.default.parallelism", "4")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

### 1. JDBC connection config

In [2]:
jdbc_url = "jdbc:postgresql://postgres:5432/postgres"
jdbc_props = {
    "user": "postgres",
    "password": "postgres",
    "driver": "org.postgresql.Driver"
}

### 2. Load data from PostgreSQL

In [3]:
print("\n=== Loading orders from PostgreSQL ===")

start = time.time()

df_orders = spark.read.jdbc(
    url=jdbc_url,
    table="orders",
    properties=jdbc_props
)

# Force materialization
row_count = df_orders.count()

print(f"Rows loaded: {row_count}")
print("Load time:", builtins.round(time.time() - start, 2), "seconds")

# Register temp view
df_orders.createOrReplaceTempView("orders")


=== Loading orders from PostgreSQL ===
Rows loaded: 1000000
Load time: 4.66 seconds


### 3. Query (a): What is the single item with the highest `price_per_unit`?

In [4]:
print("\n=== Query (a): What is the single item with the highest `price_per_unit`? ===")

start = time.time()

q_a = (
    df_orders
    .orderBy("price_per_unit", ascending=False)
    .limit(1)
)

q_a.collect()
q_a.show(truncate=False)
print("Query (a) time:", builtins.round(time.time() - start, 2), "seconds")


=== Query (a): What is the single item with the highest `price_per_unit`? ===
+------+-------------+----------------+--------+--------------+----------+-------+
|id    |customer_name|product_category|quantity|price_per_unit|order_date|country|
+------+-------------+----------------+--------+--------------+----------+-------+
|841292|Emma Brown   |Automotive      |3       |2000.00       |2024-10-11|Italy  |
+------+-------------+----------------+--------+--------------+----------+-------+

Query (a) time: 7.14 seconds


### 4. Query (b): What are the top 3 products category with the highest total quantity sold across all orders?

In [5]:
print("\n=== Query (b): What are the top 3 products category with the highest total quantity sold across all orders? ===")

start = time.time()

q_b = (
    df_orders
    .groupBy("product_category")
    .agg(_sum("quantity").alias("total_quantity"))
    .orderBy(col("total_quantity").desc())
    .limit(3)
)

q_b.collect()
q_b.show(truncate=False)
print("Query (b) time:", builtins.round(time.time() - start, 2), "seconds")


=== Query (b): What are the top 3 products category with the highest total quantity sold across all orders? ===
+----------------+--------------+
|product_category|total_quantity|
+----------------+--------------+
|Health & Beauty |300842        |
|Electronics     |300804        |
|Toys            |300598        |
+----------------+--------------+

Query (b) time: 2.73 seconds


### 5. Query (c): What is the total revenue per product category?y;

In [10]:
print("\n=== Query (c): What is the total revenue per product category? ===")

start = time.time()

q_c = (
    df_orders
    .groupBy("product_category")
    .agg(_sum(col("quantity")*col("price_per_unit")).alias("revenue"))
    .orderBy("revenue", ascending=False)
)

q_c.collect()
q_c.show(truncate=False)
print("Query (c) time:", builtins.round(time.time() - start, 2), "seconds")


=== Query (c): What is the total revenue per product category? ===
+----------------+------------+
|product_category|revenue     |
+----------------+------------+
|Automotive      |306589798.86|
|Electronics     |241525009.45|
|Home & Garden   |78023780.09 |
|Sports          |61848990.83 |
|Health & Beauty |46599817.89 |
|Office Supplies |38276061.64 |
|Fashion         |31566368.22 |
|Toys            |23271039.02 |
|Grocery         |15268355.66 |
|Books           |12731976.04 |
+----------------+------------+

Query (c) time: 3.41 seconds


### 6. Query (d): Which customers have the highest total spending?
Query: 

```sql
SELECT customer_name, SUM(quantity * price_per_unit) AS total_spending
FROM orders
GROUP BY customer_name
ORDER BY total_spending DESC
LIMIT 10;
```

Output: 

In [15]:
print("\n=== Query (d): Which customers have the highest total spending? ===")

start = time.time()

q_d = (
    df_orders
    .groupBy("customer_name")
    .agg(_sum(col("quantity")*col("price_per_unit")).alias("total_spending"))
    .orderBy("total_spending", ascending=False)
    .limit(10)
)

q_d.collect()
q_d.show(truncate=False)
print("Query (d) time:", builtins.round(time.time() - start, 2), "seconds")


=== Query (d): Which customers have the highest total spending? ===
+--------------+--------------+
|customer_name |total_spending|
+--------------+--------------+
|Carol Taylor  |991179.18     |
|Nina Lopez    |975444.95     |
|Daniel Jackson|959344.48     |
|Carol Lewis   |947708.57     |
|Daniel Young  |946030.14     |
|Alice Martinez|935100.02     |
|Ethan Perez   |934841.24     |
|Leo Lee       |934796.48     |
|Eve Young     |933176.86     |
|Ivy Rodriguez |925742.64     |
+--------------+--------------+

Query (d) time: 3.9 seconds


### 8. Cleanup

In [16]:
spark.stop()