In [25]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("order_by_partition")
    .getOrCreate()
)

# Sample rows in the order (loc_id, date, street, Surge_Protection).
# Every field is a string, including the ids, the dates and the readings.
data = [
    ("10", "2024-11-23", "Lincoln Rd.", "0.5 Kw"),
    ("25", "2024-11-29", "High Street", "0.4 Kw"),
    ("11", "2023-10-18", "Islington Square", "2.0 kW"),
    ("89", "2024-09-18", "Leytonstone Road", "1.0 kW"),
    ("25", "2025-09-18", "London Ave.", "0.6 Kw"),
    ("11", "2024-10-18", "ST MARK'S HILL", "No Data"),
    ("65", "2023-02-26", "Main St.", "3.0 kW"),
    ("11", "2023-12-30", "Oak St.", "No Data"),
]

# Column names applied positionally to the tuples above.
columns = ["loc_id", "date", "street", "Surge_Protection"]
df = spark.createDataFrame(data).toDF(*columns)

df.show()

25/08/10 15:27:46 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+------+----------+----------------+----------------+
|loc_id|      date|          street|Surge_Protection|
+------+----------+----------------+----------------+
|    10|2024-11-23|     Lincoln Rd.|          0.5 Kw|
|    25|2024-11-29|     High Street|          0.4 Kw|
|    11|2023-10-18|Islington Square|          2.0 kW|
|    89|2024-09-18|Leytonstone Road|          1.0 kW|
|    25|2025-09-18|     London Ave.|          0.6 Kw|
|    11|2024-10-18|  ST MARK'S HILL|         No Data|
|    65|2023-02-26|        Main St.|          3.0 kW|
|    11|2023-12-30|         Oak St.|         No Data|
+------+----------+----------------+----------------+



In [26]:
# Register df as the temporary view 'spc' so the spark.sql queries below can select from it.
df.createOrReplaceTempView('spc')

partition by `loc_id`  
order by `date` DESC, `Surge_Protection` DESC  
ROW_NUMBER() function is used to provide consecutive numbering of the rows  

ROW_NUMBER() is a window function that assigns a sequential integer to each row within a partition of the result set. The row number starts at 1 for the first row in each partition.

In [27]:
# Number the rows inside each loc_id partition, ordered by date descending;
# ties on date are broken by Surge_Protection descending (a string column,
# so it is compared as text).
numbered_rows_sql = """ SELECT *, ROW_NUMBER() OVER (PARTITION BY loc_id ORDER BY date DESC, Surge_Protection DESC) rk FROM spc"""
df1 = spark.sql(numbered_rows_sql)

df1.show(truncate=False)

+------+----------+----------------+----------------+---+
|loc_id|date      |street          |Surge_Protection|rk |
+------+----------+----------------+----------------+---+
|10    |2024-11-23|Lincoln Rd.     |0.5 Kw          |1  |
|11    |2024-10-18|ST MARK'S HILL  |No Data         |1  |
|11    |2023-12-30|Oak St.         |No Data         |2  |
|11    |2023-10-18|Islington Square|2.0 kW          |3  |
|25    |2025-09-18|London Ave.     |0.6 Kw          |1  |
|25    |2024-11-29|High Street     |0.4 Kw          |2  |
|65    |2023-02-26|Main St.        |3.0 kW          |1  |
|89    |2024-09-18|Leytonstone Road|1.0 kW          |1  |
+------+----------+----------------+----------------+---+



The `partition by` clause divides the result set into partitions.  
The ROW_NUMBER() function is applied to each partition separately and reinitializes the row number for each partition.  
The `order by` clause defines the logical order of the rows within each partition of the result set. The `order by` clause is mandatory because the ROW_NUMBER() function is order sensitive.  
`rk` selects which row is taken from each partition: rk = 1 => takes the first row of each partition.

In [28]:
# Keep only the rk = 1 row of every loc_id partition, i.e. the first row
# under the "date DESC, Surge_Protection DESC" ordering.
first_row_per_loc_sql = """ select * 
                    from  (
                        SELECT
                        b.*, ROW_NUMBER() OVER (PARTITION BY loc_id ORDER BY date DESC, Surge_Protection DESC) rk
                        FROM spc b
                        ) WHERE rk = 1 """
df2 = spark.sql(first_row_per_loc_sql)

df2.show(truncate=False)

+------+----------+----------------+----------------+---+
|loc_id|date      |street          |Surge_Protection|rk |
+------+----------+----------------+----------------+---+
|10    |2024-11-23|Lincoln Rd.     |0.5 Kw          |1  |
|11    |2024-10-18|ST MARK'S HILL  |No Data         |1  |
|25    |2025-09-18|London Ave.     |0.6 Kw          |1  |
|65    |2023-02-26|Main St.        |3.0 kW          |1  |
|89    |2024-09-18|Leytonstone Road|1.0 kW          |1  |
+------+----------+----------------+----------------+---+

