## Init Spark Session

In [6]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Locate the PostgreSQL JDBC driver jar. The location can be overridden with
# the POSTGRES_JDBC_JAR environment variable; the default is the original path.
jdbc_driver_path = os.environ.get(
    "POSTGRES_JDBC_JAR", "/home/user/work/jars/postgresql-42.7.3.jar"
)
# Fail fast with a clear message if the jar is missing.
if not os.path.isfile(jdbc_driver_path):
    raise FileNotFoundError(f"The JDBC driver was not found at the specified path: {jdbc_driver_path}")

# Initialize the Spark session with the JDBC driver jar on its classpath.
# NOTE: getOrCreate() reuses an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("PostgreSQL to PySpark")
    .config("spark.jars", jdbc_driver_path)
    .getOrCreate()
)

# PostgreSQL connection settings. Each value can be supplied through the
# environment so credentials do not have to live in the notebook; when a
# variable is unset the original hard-coded value is used.
jdbc_url = os.environ.get(
    "STAGING_JDBC_URL", "jdbc:postgresql://postgres-staging:5432/staging_db"
)
connection_properties = {
    "user": os.environ.get("STAGING_DB_USER", "admin"),
    "password": os.environ.get("STAGING_DB_PASSWORD", "admin"),
    "driver": "org.postgresql.Driver",
}

## Top Selling Category

In [None]:
# Load the source tables from the staging database over JDBC.
order_details = spark.read.jdbc(url=jdbc_url, table="order_details", properties=connection_properties)
orders = spark.read.jdbc(url=jdbc_url, table="orders", properties=connection_properties)
products = spark.read.jdbc(url=jdbc_url, table="products", properties=connection_properties)
categories = spark.read.jdbc(url=jdbc_url, table="categories", properties=connection_properties)

# One row per order line: the order's month, the product's category, and the
# line revenue after discount, i.e. (unitPrice - unitPrice * discount) * quantity.
product_revenue = (
    orders.join(order_details, orders["orderID"] == order_details["orderID"], "inner")
    .join(products, order_details["productID"] == products["productID"], "inner")
    .select(
        F.date_trunc("month", orders["orderDate"]).alias("orderMonth"),
        products["categoryID"],
        order_details["productID"],
        order_details["unitPrice"],
        order_details["quantity"],
        order_details["discount"],
        (
            (order_details["unitPrice"] - (order_details["unitPrice"] * order_details["discount"]))
            * order_details["quantity"]
        ).alias("gross_revenue"),
    )
)

# Total revenue per (month, category). orderMonth is already truncated to the
# month above, so it only needs formatting here, not a second date_trunc.
category_monthly_revenue = product_revenue.groupBy(
    F.date_format(product_revenue["orderMonth"], "yyyy-MM-dd").alias("month"),
    product_revenue["categoryID"],
).agg(
    F.sum(product_revenue["gross_revenue"]).alias("total_gross_revenue")
)

# Keep the top-ranked category of each month. rank() assigns tied rows the same
# rank, so a month with a tie for first place yields more than one row.
ranked_categories = category_monthly_revenue.withColumn(
    "category_rank",
    F.rank().over(Window.partitionBy("month").orderBy(F.desc("total_gross_revenue"))),
).filter(
    F.col("category_rank") == 1
)

# Attach the category name to each monthly winner.
final_result = ranked_categories.join(
    categories, ranked_categories["categoryID"] == categories["categoryID"], "inner"
).select(
    ranked_categories["month"],
    ranked_categories["categoryID"],
    categories["categoryName"],
    ranked_categories["total_gross_revenue"],
)

# Display the result in chronological order.
final_result.orderBy(F.asc("month"), F.desc("total_gross_revenue")).show()
