In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [4]:
spark = SparkSession.builder.appName("PartitioningExample").getOrCreate()

df = spark.read.csv("supermarket_sales.csv", header=True, inferSchema=True)

df.printSchema()
df.show(5)


root
 |-- Invoice ID: string (nullable = true)
 |-- Branch: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Customer type: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Unit price: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Tax 5%: double (nullable = true)
 |-- Total: double (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- Payment: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- gross margin percentage: double (nullable = true)
 |-- gross income: double (nullable = true)
 |-- Rating: double (nullable = true)

+-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+----------+-------------------+-----------+------+-----------------------+------------+------+
| Invoice ID|Branch|     City|Customer type|Gender|        Product line|Unit price|Quantity| Tax 5%|  

In [5]:
partitioned_df = df.repartition("City")
print(f"Number of partitions: {partitioned_df.rdd.getNumPartitions()}")

partitioned_df.show(5)

Number of partitions: 1
+-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+----------+-------------------+-------+------+-----------------------+------------+------+
| Invoice ID|Branch|     City|Customer type|Gender|        Product line|Unit price|Quantity| Tax 5%|   Total|      Date|               Time|Payment|  cogs|gross margin percentage|gross income|Rating|
+-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+----------+-------------------+-------+------+-----------------------+------------+------+
|226-31-3081|     C|Naypyitaw|       Normal|Female|Electronic access...|     15.28|       5|   3.82|   80.22|03/08/2019|2025-03-20 10:29:00|   Cash|  76.4|            4.761904762|        3.82|   9.6|
|699-14-3026|     C|Naypyitaw|       Normal|  Male|Electronic access...|     85.39|       7|29.8865|627.6165| 3/25/2019|2025-03-20 18:30:00|Ewallet|597.73|            4.7619047

In [6]:
range_partition_df = df.repartitionByRange(4, col("Total"))
print(f"Number of partitions: {range_partition_df.rdd.getNumPartitions()}")

range_partition_df.show(5)

Number of partitions: 4
+-----------+------+---------+-------------+------+--------------------+----------+--------+------+-------+----------+-------------------+-----------+------+-----------------------+------------+------+
| Invoice ID|Branch|     City|Customer type|Gender|        Product line|Unit price|Quantity|Tax 5%|  Total|      Date|               Time|    Payment|  cogs|gross margin percentage|gross income|Rating|
+-----------+------+---------+-------------+------+--------------------+----------+--------+------+-------+----------+-------------------+-----------+------+-----------------------+------------+------+
|226-31-3081|     C|Naypyitaw|       Normal|Female|Electronic access...|     15.28|       5|  3.82|  80.22|03/08/2019|2025-03-20 10:29:00|       Cash|  76.4|            4.761904762|        3.82|   9.6|
|665-32-9167|     A|   Yangon|       Member|Female|   Health and beauty|     36.26|       2| 3.626| 76.146|01/10/2019|2025-03-20 17:15:00|Credit card| 72.52|           

In [7]:
filtered_df = partitioned_df.filter(col("City") == "Yangon")

summary_df = filtered_df.groupBy("Product line").sum("Quantity")

sorted_df = summary_df.orderBy(col("sum(Quantity)").desc())

sorted_df.show()

+--------------------+-------------+
|        Product line|sum(Quantity)|
+--------------------+-------------+
|  Home and lifestyle|          371|
|   Sports and travel|          333|
|Electronic access...|          322|
|  Food and beverages|          313|
| Fashion accessories|          263|
|   Health and beauty|          257|
+--------------------+-------------+



In [8]:
spark.stop()
