In [3]:
import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running in local mode with one worker thread
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows: (Name, Subject, Mark, Status, Attendance), three subjects per student
data_student = [
    # John
    ("John", "Physics", 90, "P", 90),
    ("John", "Chemistry", 70, "P", 80),
    ("John", "Math", 30, "F", 40),
    # Michael
    ("Michael", "Physics", 80, "P", 70),
    ("Michael", "Chemistry", 65, "P", 50),
    ("Michael", "Math", 70, "P", 60),
    # Blessy
    ("Blessy", "Physics", 40, "F", 30),
    ("Blessy", "Chemistry", 55, "P", 50),
    ("Blessy", "Math", 80, "P", 50),
    # Nancy
    ("Nancy", "Physics", 85, "P", 75),
    ("Nancy", "Chemistry", 35, "F", 55),
    ("Nancy", "Math", 75, "P", 50),
    # David
    ("David", "Physics", 30, "F", 40),
    ("David", "Chemistry", 70, "P", 50),
    ("David", "Math", 75, "P", 75),
]

# Build the DataFrame from the sample rows, naming the five columns explicitly
student_columns = ["Name", "Subject", "Mark", "Status", "Attendance"]
df = spark.createDataFrame(data=data_student, schema=student_columns)
df.show()

+-------+---------+----+------+----------+
|   Name|  Subject|Mark|Status|Attendance|
+-------+---------+----+------+----------+
|   John|  Physics|  90|     P|        90|
|   John|Chemistry|  70|     P|        80|
|   John|     Math|  30|     F|        40|
|Michael|  Physics|  80|     P|        70|
|Michael|Chemistry|  65|     P|        50|
|Michael|     Math|  70|     P|        60|
| Blessy|  Physics|  40|     F|        30|
| Blessy|Chemistry|  55|     P|        50|
| Blessy|     Math|  80|     P|        50|
|  Nancy|  Physics|  85|     P|        75|
|  Nancy|Chemistry|  35|     F|        55|
|  Nancy|     Math|  75|     P|        50|
|  David|  Physics|  30|     F|        40|
|  David|Chemistry|  70|     P|        50|
|  David|     Math|  75|     P|        75|
+-------+---------+----+------+----------+



In [7]:
# Number the rows inside each Name group, highest Mark first
from pyspark.sql.window import Window
from pyspark.sql.functions import col,row_number

# row_number() hands out 1, 2, 3, ... in window order. Ordering by Mark alone
# leaves rows with equal marks in an arbitrary relative order, so Subject is
# added as a tie-breaker to keep the numbering deterministic.
Window_dept = Window.partitionBy("Name").orderBy(col("Mark").desc(), col("Subject"))
df2 = df.withColumn("row",row_number().over(Window_dept)).orderBy("Name","row")
df2.show()

+-------+---------+----+------+----------+---+
|   Name|  Subject|Mark|Status|Attendance|row|
+-------+---------+----+------+----------+---+
| Blessy|     Math|  80|     P|        50|  1|
| Blessy|Chemistry|  55|     P|        50|  2|
| Blessy|  Physics|  40|     F|        30|  3|
|  David|     Math|  75|     P|        75|  1|
|  David|Chemistry|  70|     P|        50|  2|
|  David|  Physics|  30|     F|        40|  3|
|   John|  Physics|  90|     P|        90|  1|
|   John|Chemistry|  70|     P|        80|  2|
|   John|     Math|  30|     F|        40|  3|
|Michael|  Physics|  80|     P|        70|  1|
|Michael|     Math|  70|     P|        60|  2|
|Michael|Chemistry|  65|     P|        50|  3|
|  Nancy|  Physics|  85|     P|        75|  1|
|  Nancy|     Math|  75|     P|        50|  2|
|  Nancy|Chemistry|  35|     F|        55|  3|
+-------+---------+----+------+----------+---+



In [8]:
# Top N rows of each Name group; with N = 1 only the row numbered 1 is kept
df3 = df2.where(col("row") <= 1)
df3.show()

+-------+-------+----+------+----------+---+
|   Name|Subject|Mark|Status|Attendance|row|
+-------+-------+----+------+----------+---+
| Blessy|   Math|  80|     P|        50|  1|
|  David|   Math|  75|     P|        75|  1|
|   John|Physics|  90|     P|        90|  1|
|Michael|Physics|  80|     P|        70|  1|
|  Nancy|Physics|  85|     P|        75|  1|
+-------+-------+----+------+----------+---+



In [9]:
# Number the rows inside each Subject group, highest Mark first.
# Some students share the same Mark in a Subject (Chemistry 70, Math 75), and
# row_number() over a window ordered by Mark alone may number such ties
# differently from run to run. Name (descending) is used as the tie-breaker;
# any column that is unique within a Subject would do.
Window_Subject = Window.partitionBy("Subject").orderBy(col("Mark").desc(), col("Name").desc())
# NOTE: the result is displayed sorted by Name, so the rows belonging to one
# Subject group are not adjacent in the output.
df4 = df.withColumn("row",row_number().over(Window_Subject)).orderBy("Name","row")
df4.show()

+-------+---------+----+------+----------+---+
|   Name|  Subject|Mark|Status|Attendance|row|
+-------+---------+----+------+----------+---+
| Blessy|     Math|  80|     P|        50|  1|
| Blessy|Chemistry|  55|     P|        50|  4|
| Blessy|  Physics|  40|     F|        30|  4|
|  David|Chemistry|  70|     P|        50|  2|
|  David|     Math|  75|     P|        75|  3|
|  David|  Physics|  30|     F|        40|  5|
|   John|  Physics|  90|     P|        90|  1|
|   John|Chemistry|  70|     P|        80|  1|
|   John|     Math|  30|     F|        40|  5|
|Michael|  Physics|  80|     P|        70|  3|
|Michael|Chemistry|  65|     P|        50|  3|
|Michael|     Math|  70|     P|        60|  4|
|  Nancy|     Math|  75|     P|        50|  2|
|  Nancy|  Physics|  85|     P|        75|  2|
|  Nancy|Chemistry|  35|     F|        55|  5|
+-------+---------+----+------+----------+---+



In [10]:
# Top N rows of each Subject group; with N = 1 only the row numbered 1 is kept
df5 = df4.where(col("row") <= 1)
df5.show()

+------+---------+----+------+----------+---+
|  Name|  Subject|Mark|Status|Attendance|row|
+------+---------+----+------+----------+---+
|Blessy|     Math|  80|     P|        50|  1|
|  John|Chemistry|  70|     P|        80|  1|
|  John|  Physics|  90|     P|        90|  1|
+------+---------+----+------+----------+---+



In [12]:
# Bottom N rows per Name group: number rows from the lowest Mark upwards.
# Subject is a tie-breaker so that equal marks always receive the same
# row numbers; ordering by Mark alone leaves ties in an arbitrary order.
Window_dept_bottom = Window.partitionBy("Name").orderBy(col("Mark"), col("Subject"))
df6 = df.withColumn("row",row_number().over(Window_dept_bottom)).orderBy("Name","row")
df6.show()

# Keep the lowest-marked row of every Name group (N = 1)
df7 = df6.filter(col("row") <= 1)
df7.show()

+-------+---------+----+------+----------+---+
|   Name|  Subject|Mark|Status|Attendance|row|
+-------+---------+----+------+----------+---+
| Blessy|  Physics|  40|     F|        30|  1|
| Blessy|Chemistry|  55|     P|        50|  2|
| Blessy|     Math|  80|     P|        50|  3|
|  David|  Physics|  30|     F|        40|  1|
|  David|Chemistry|  70|     P|        50|  2|
|  David|     Math|  75|     P|        75|  3|
|   John|     Math|  30|     F|        40|  1|
|   John|Chemistry|  70|     P|        80|  2|
|   John|  Physics|  90|     P|        90|  3|
|Michael|Chemistry|  65|     P|        50|  1|
|Michael|     Math|  70|     P|        60|  2|
|Michael|  Physics|  80|     P|        70|  3|
|  Nancy|Chemistry|  35|     F|        55|  1|
|  Nancy|     Math|  75|     P|        50|  2|
|  Nancy|  Physics|  85|     P|        75|  3|
+-------+---------+----+------+----------+---+

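+-------+---------+----+------+----------+---+
|   Name|  Subject|Mark|Status|Attendance|row|
+-------+---------+----+------+----------+---+
| Blessy|  Physics|  40|     F|        30|  1|
|  David|  Physics|  30|     F|        40|  1|
|   John|     Math|  30|     F|        40|  1|
|Michael|Chemistry|  65|     P|        50|  1|
|  Nancy|Chemistry|  35|     F|        55|  1|
+-------+---------+----+------+----------+---+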

In [13]:
# Bottom N rows per Subject group: number rows from the lowest Mark upwards.
# Some students share the same Mark in a Subject (Chemistry 70, Math 75), so
# Name (descending) is added as a tie-breaker; ordering by Mark alone would let
# row_number() number those ties differently from run to run.
Window_Subject_bottom = Window.partitionBy("Subject").orderBy(col("Mark"), col("Name").desc())
# NOTE: the result is displayed sorted by Name, so the rows belonging to one
# Subject group are not adjacent in the output.
df8 = df.withColumn("row",row_number().over(Window_Subject_bottom)).orderBy("Name","row")
df8.show()

# Keep the lowest-marked row of every Subject group (N = 1)
df9 = df8.filter(col("row") <= 1)
df9.show()

+-------+---------+----+------+----------+---+
|   Name|  Subject|Mark|Status|Attendance|row|
+-------+---------+----+------+----------+---+
| Blessy|  Physics|  40|     F|        30|  2|
| Blessy|Chemistry|  55|     P|        50|  2|
| Blessy|     Math|  80|     P|        50|  5|
|  David|  Physics|  30|     F|        40|  1|
|  David|     Math|  75|     P|        75|  4|
|  David|Chemistry|  70|     P|        50|  5|
|   John|     Math|  30|     F|        40|  1|
|   John|Chemistry|  70|     P|        80|  4|
|   John|  Physics|  90|     P|        90|  5|
|Michael|     Math|  70|     P|        60|  2|
|Michael|  Physics|  80|     P|        70|  3|
|Michael|Chemistry|  65|     P|        50|  3|
|  Nancy|Chemistry|  35|     F|        55|  1|
|  Nancy|     Math|  75|     P|        50|  3|
|  Nancy|  Physics|  85|     P|        75|  4|
+-------+---------+----+------+----------+---+

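+-----+---------+----+------+----------+---+
| Name|  Subject|Mark|Status|Attendance|row|
+-----+---------+----+------+----------+---+
|David|  Physics|  30|     F|        40|  1|
| John|     Math|  30|     F|        40|  1|
|Nancy|Chemistry|  35|     F|        55|  1|
+-----+---------+----+------+----------+---+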

In [14]:
# Bottom 2 rows of each Subject group: keep the rows numbered 1 and 2
df10 = df8.where(col("row") <= 2)
df10.show()

+-------+---------+----+------+----------+---+
|   Name|  Subject|Mark|Status|Attendance|row|
+-------+---------+----+------+----------+---+
| Blessy|  Physics|  40|     F|        30|  2|
| Blessy|Chemistry|  55|     P|        50|  2|
|  David|  Physics|  30|     F|        40|  1|
|   John|     Math|  30|     F|        40|  1|
|Michael|     Math|  70|     P|        60|  2|
|  Nancy|Chemistry|  35|     F|        55|  1|
+-------+---------+----+------+----------+---+

