In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum

In [4]:
# Obtain the Spark session for this notebook, registered under the app name "S2".
spark = (
    SparkSession.builder
    .appName("S2")
    .getOrCreate()
)

In [5]:
# Column names, in the same order as the fields of each tuple in `data`.
cols = ["ProductID", "ProductName", "Category", "Price", "StockQuantity", "Rating"]

# Sample product rows, one tuple per row.
# NOTE: the Chair (201), Table (209) and Couch (203) rows each appear twice,
# so any aggregation over this data counts those rows twice.
data = [
    (101, "Laptop",     "Electronics",   700,  10, 4.5),
    (102, "Smartphone", "Electronics",   500,  20, 4.3),
    (103, "Tablet",     "Electronics",   300,  15, 4.1),
    (201, "Chair",      "Furniture",      50,  50, 4.0),
    (209, "Table",      "Furniture",     150,  30, 4.2),
    (205, "Table",      "Furniture",     150,  30, 4.2),
    (203, "Couch",      "Furniture",     500,  10, 4.6),
    (301, "Shampoo",    "Personal Care",  10, 100, 4.3),
    (302, "Soap",       "Personal Care",   5, 200, 4.4),
    # Repeats of rows already listed above (201, 209, 203).
    (201, "Chair",      "Furniture",      50,  50, 4.0),
    (209, "Table",      "Furniture",     150,  30, 4.2),
    (203, "Couch",      "Furniture",     500,  10, 4.6),
    (303, "Toothpaste", "Personal Care",   2, 300, 4.1),
    (401, "T-shirt",    "Apparel",        20,  50, 4.2),
]

In [6]:
# Build the product DataFrame from the in-memory rows; `cols` supplies the column names.
df = spark.createDataFrame(data, schema=cols)

In [7]:
# Order products by Price (highest first); rows with equal Price are ordered
# by Category in ascending order.
sorted_df = df.orderBy(["Price", "Category"], ascending=[False, True])
sorted_df.show()

+---------+-----------+-------------+-----+-------------+------+
|ProductID|ProductName|     Category|Price|StockQuantity|Rating|
+---------+-----------+-------------+-----+-------------+------+
|      101|     Laptop|  Electronics|  700|           10|   4.5|
|      102| Smartphone|  Electronics|  500|           20|   4.3|
|      203|      Couch|    Furniture|  500|           10|   4.6|
|      203|      Couch|    Furniture|  500|           10|   4.6|
|      103|     Tablet|  Electronics|  300|           15|   4.1|
|      209|      Table|    Furniture|  150|           30|   4.2|
|      209|      Table|    Furniture|  150|           30|   4.2|
|      205|      Table|    Furniture|  150|           30|   4.2|
|      201|      Chair|    Furniture|   50|           50|   4.0|
|      201|      Chair|    Furniture|   50|           50|   4.0|
|      401|    T-shirt|      Apparel|   20|           50|   4.2|
|      301|    Shampoo|Personal Care|   10|          100|   4.3|
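|      302|       Soap|Personal Care|    5|          200|   4.4|
|      303| Toothpaste|Personal Care|    2|          300|   4.1|
+---------+-----------+-------------+-----+-------------+------+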

In [8]:
# Add a TotalSales column to every row, computed as Price * StockQuantity.
total_sales = df.withColumn(
    "TotalSales",
    df["Price"] * df["StockQuantity"],
)
total_sales.show()

+---------+-----------+-------------+-----+-------------+------+----------+
|ProductID|ProductName|     Category|Price|StockQuantity|Rating|TotalSales|
+---------+-----------+-------------+-----+-------------+------+----------+
|      101|     Laptop|  Electronics|  700|           10|   4.5|      7000|
|      102| Smartphone|  Electronics|  500|           20|   4.3|     10000|
|      103|     Tablet|  Electronics|  300|           15|   4.1|      4500|
|      201|      Chair|    Furniture|   50|           50|   4.0|      2500|
|      209|      Table|    Furniture|  150|           30|   4.2|      4500|
|      205|      Table|    Furniture|  150|           30|   4.2|      4500|
|      203|      Couch|    Furniture|  500|           10|   4.6|      5000|
|      301|    Shampoo|Personal Care|   10|          100|   4.3|      1000|
|      302|       Soap|Personal Care|    5|          200|   4.4|      1000|
|      201|      Chair|    Furniture|   50|           50|   4.0|      2500|
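|      209|      Table|    Furniture|  150|           30|   4.2|      4500|
|      203|      Couch|    Furniture|  500|           10|   4.6|      5000|
|      303| Toothpaste|Personal Care|    2|          300|   4.1|       600|
|      401|    T-shirt|      Apparel|   20|           50|   4.2|      1000|
+---------+-----------+-------------+-----+-------------+------+----------+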

In [9]:
# Sum TotalSales within each Category; the result has one row per category.
sales_by_category = (
    total_sales
    .groupBy("Category")
    .agg(_sum("TotalSales").alias("SalesByCategory"))
)
sales_by_category.show()

+-------------+---------------+
|     Category|SalesByCategory|
+-------------+---------------+
|  Electronics|          21500|
|    Furniture|          28500|
|      Apparel|           1000|
|Personal Care|           2600|
+-------------+---------------+



In [13]:
# Per-product totals: rows sharing the same (ProductID, ProductName) pair are
# collapsed into one row holding the summed TotalSales and summed StockQuantity.
sales_qty_by_product = (
    total_sales
    .groupBy("ProductID", "ProductName")
    .agg(
        _sum("TotalSales").alias("SalesByProduct"),
        _sum("StockQuantity").alias("QtyByProduct"),
    )
)
sales_qty_by_product.show()

+---------+-----------+--------------+------------+
|ProductID|ProductName|SalesByProduct|QtyByProduct|
+---------+-----------+--------------+------------+
|      102| Smartphone|         10000|          20|
|      203|      Couch|         10000|          20|
|      209|      Table|          9000|          60|
|      205|      Table|          4500|          30|
|      103|     Tablet|          4500|          15|
|      201|      Chair|          5000|         100|
|      101|     Laptop|          7000|          10|
|      401|    T-shirt|          1000|          50|
|      301|    Shampoo|          1000|         100|
|      303| Toothpaste|           600|         300|
|      302|       Soap|          1000|         200|
+---------+-----------+--------------+------------+



In [14]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()