In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("PartitioningExample")
    .getOrCreate()
)

# Load the sales CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("supermarket_sales.csv")
)

# Inspect the inferred schema, then preview the first five rows.
df.printSchema()
df.show(5)


root
 |-- Invoice ID: string (nullable = true)
 |-- Branch: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Customer type: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Unit price: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Tax 5%: double (nullable = true)
 |-- Total: double (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- Payment: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- gross margin percentage: double (nullable = true)
 |-- gross income: double (nullable = true)
 |-- Rating: double (nullable = true)

+-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+----------+-------------------+-----------+------+-----------------------+------------+------+
| Invoice ID|Branch|     City|Customer type|Gender|        Product line|Unit price|Quantity| Tax 5%|  

In [None]:
# Repartition the sales data using the City column as the partitioning key.
# NOTE(review): the recorded run printed 1 partition - presumably adaptive
# query execution coalesced the small shuffle; confirm, and pass an explicit
# partition count to repartition() if a fixed number is wanted.
partitioned_df = df.repartition(col("City"))

city_partition_count = partitioned_df.rdd.getNumPartitions()
print(f"Number of partitions: {city_partition_count}")

# Preview the first five rows of the repartitioned frame.
partitioned_df.show(5)

Number of partitions: 1


In [None]:
# Range-partition the sales data into four partitions keyed on the Total column.
range_partition_count = 4
range_key = col("Total")
range_partition_df = df.repartitionByRange(range_partition_count, range_key)

actual_partition_count = range_partition_df.rdd.getNumPartitions()
print(f"Number of partitions: {actual_partition_count}")

# Preview the first five rows of the range-partitioned frame.
range_partition_df.show(5)

Number of partitions: 4


In [None]:
# Keep only the Yangon rows of the city-partitioned frame.
is_yangon = col("City") == "Yangon"
filtered_df = partitioned_df.where(is_yangon)

# Total units sold per product line; the aggregate column is named
# "sum(Quantity)", as the recorded output shows.
summary_df = filtered_df.groupBy("Product line").sum("Quantity")

# Best-selling product lines first.
sorted_df = summary_df.orderBy("sum(Quantity)", ascending=False)

sorted_df.show()

+--------------------+-------------+
|        Product line|sum(Quantity)|
+--------------------+-------------+
|  Home and lifestyle|          371|
|   Sports and travel|          333|
|Electronic access...|          322|
|  Food and beverages|          313|
| Fashion accessories|          263|
|   Health and beauty|          257|
+--------------------+-------------+



In [6]:
# Stop the Spark session created at the top of the notebook now that the analysis is done.
spark.stop()
