In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Obtain the Spark session for this notebook (getOrCreate reuses an active
# session when one already exists).
spark = SparkSession.builder.appName("DepartmentEmployee").getOrCreate()

# --- Department ------------------------------------------------------------
# Columns: id (int), name (string); both nullable.
dept_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)
])

dept_data = [
    (1, 'IT'),
    (2, 'Sales')
]

dept_df = spark.createDataFrame(dept_data, dept_schema)

# --- Employee --------------------------------------------------------------
# Columns: id, name, salary, departmentId; departmentId holds a Department id.
emp_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("departmentId", IntegerType(), True)
])

emp_data = [
    (1, 'Joe', 85000, 1),
    (2, 'Henry', 80000, 2),
    (3, 'Sam', 60000, 2),
    (4, 'Max', 90000, 1),
    (5, 'Janet', 69000, 1),
    (6, 'Randy', 85000, 1),
    (7, 'Will', 70000, 1)
]

emp_df = spark.createDataFrame(emp_data, emp_schema)

# Register both frames as temporary views so that SQL in this notebook can
# refer to them as `department` and `employee`; without this those names are
# not defined anywhere on a fresh kernel.
dept_df.createOrReplaceTempView("department")
emp_df.createOrReplaceTempView("employee")

# Show both DataFrames
print("Department Table:")
dept_df.show()

print("Employee Table:")
emp_df.show()


In [0]:
/*
 * Employees whose salary is among the three highest distinct salaries of
 * their department, returned with the department name.
 * Output columns: department, employee, salary.
 */
WITH ranked_employee AS (
    -- DENSE_RANK gives tied salaries the same rank and leaves no gaps, so
    -- rnk <= 3 keeps every employee in the top three salary levels.
    SELECT
        name,
        salary,
        departmentId,
        DENSE_RANK() OVER (PARTITION BY departmentId ORDER BY salary DESC) AS rnk
    FROM employee
)
SELECT
    d.name    AS department,
    re.name   AS employee,
    re.salary AS salary
FROM ranked_employee AS re
INNER JOIN department AS d
    -- Column name spelled exactly as in the table definition (departmentId)
    -- so the query also resolves when identifiers are case-sensitive.
    ON re.departmentId = d.id
WHERE re.rnk <= 3


In [0]:
# Window over each department's employees, ordered from highest salary down.
w1 = (
    Window
    .partitionBy(col("departmentId"))
    .orderBy(col("salary").desc())
)

# Tag every employee row with the dense rank of its salary inside that window
# (equal salaries share a rank; ranks have no gaps).
cte_emp_df = emp_df.withColumn(
    "rnk",
    dense_rank().over(w1),
)

In [0]:
# Keep employees ranked in the top three salary levels of their department,
# then attach the department name.
# Bracket access (df["name"]) is used instead of attribute access so column
# lookups can never collide with DataFrame attributes, and each selected
# column is taken from its owning frame because both frames have `id`/`name`.
result_df = (
    cte_emp_df
    .filter(col("rnk") <= 3)
    .join(dept_df, on=cte_emp_df["departmentId"] == dept_df["id"], how="inner")
    .select(
        dept_df["name"].alias("department"),
        cte_emp_df["name"].alias("employee"),
        cte_emp_df["salary"],
    )
)

# .show() matches how the input tables are printed earlier in the notebook
# and is part of the standard PySpark DataFrame API.
result_df.show()