In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already
# exists. "local[1]" runs Spark in-process with a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Windowing Functions")
    .getOrCreate()
)

In [3]:
# Sample employee records, one tuple per employee.
# Field order: (id, name, age, department, salary)
data = [
    (1, "John", 30, "Sales", 50000.0),
    (2, "Alice", 28, "Marketing", 60000.0),
    (3, "Bob", 32, "Finance", 55000.0),
    (4, "Sarah", 29, "Sales", 52000.0),
    (5, "Mike", 31, "Finance", 58000.0),
]

In [4]:
from pyspark.sql.types import*

In [5]:
# Explicit schema for the employee records: every column is declared
# non-nullable, so a None in any field is rejected when the frame is built.
schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("name", StringType(), False),
        StructField("age", IntegerType(), False),
        StructField("department", StringType(), False),
        StructField("salary", DoubleType(), False),
    ]
)

In [6]:
# Materialise the in-memory records as a DataFrame using the explicit schema.
employeeDF = spark.createDataFrame(data=data, schema=schema)

In [7]:
# Print the DataFrame as a text table to confirm the rows loaded as expected.
employeeDF.show()

+---+-----+---+----------+-------+
| id| name|age|department| salary|
+---+-----+---+----------+-------+
|  1| John| 30|     Sales|50000.0|
|  2|Alice| 28| Marketing|60000.0|
|  3|  Bob| 32|   Finance|55000.0|
|  4|Sarah| 29|     Sales|52000.0|
|  5| Mike| 31|   Finance|58000.0|
+---+-----+---+----------+-------+



In [8]:
from pyspark.sql.functions import*

In [11]:
#Calculate the average salary for each department

In [12]:
# One output row per department, carrying the mean salary as "avg_sal".
(
    employeeDF
    .groupBy("department")
    .agg(avg("salary").alias("avg_sal"))
    .show()
)

+----------+-------+
|department|avg_sal|
+----------+-------+
|     Sales|51000.0|
|   Finance|56500.0|
| Marketing|60000.0|
+----------+-------+



In [13]:
#Add a new column named "bonus" that is 10% of the salary for all employees.

In [15]:
# Keep every existing column ("*") and append "bonus" = 10% of salary,
# expressed as a SQL expression.
(
    employeeDF
    .selectExpr("*", ".1*salary as bonus")
    .show()
)

+---+-----+---+----------+-------+------+
| id| name|age|department| salary| bonus|
+---+-----+---+----------+-------+------+
|  1| John| 30|     Sales|50000.0|5000.0|
|  2|Alice| 28| Marketing|60000.0|6000.0|
|  3|  Bob| 32|   Finance|55000.0|5500.0|
|  4|Sarah| 29|     Sales|52000.0|5200.0|
|  5| Mike| 31|   Finance|58000.0|5800.0|
+---+-----+---+----------+-------+------+



In [16]:
#Group the data by department and find the employee with the highest salary in each department

In [9]:
from pyspark.sql import Window

In [21]:
# Per-department window with the highest salary first, so ranking functions
# applied over it number employees from best-paid downwards.
mywindow = Window.partitionBy("department").orderBy(col("salary").desc())

In [22]:
# "highest_sal" holds each employee's position within their department
# (1 = top salary). row_number() gives every row a distinct position, so if
# two employees tie on salary only one of them receives 1.
high_sal = employeeDF.withColumn(
    "highest_sal",
    row_number().over(mywindow),
)

In [24]:
# Keep only the position-1 row of each department, i.e. its top earner.
(
    high_sal
    .filter("highest_sal==1")
    .show()
)

+---+-----+---+----------+-------+-----------+
| id| name|age|department| salary|highest_sal|
+---+-----+---+----------+-------+-----------+
|  5| Mike| 31|   Finance|58000.0|          1|
|  2|Alice| 28| Marketing|60000.0|          1|
|  4|Sarah| 29|     Sales|52000.0|          1|
+---+-----+---+----------+-------+-----------+



In [25]:
#Find the top 3 departments with the highest total salary.

In [26]:
# Total salary per department, largest first, restricted to the first three
# rows. Without limit(3) the query lists every department rather than the
# top 3 the task asks for. Note: `sum` here is pyspark.sql.functions.sum
# (brought in by the wildcard import), not the Python builtin.
(
    employeeDF
    .groupBy("department")
    .agg(sum("salary").alias("total_sal"))
    .orderBy(desc("total_sal"))
    .limit(3)
    .show()
)

+----------+---------+
|department|total_sal|
+----------+---------+
|   Finance| 113000.0|
|     Sales| 102000.0|
| Marketing|  60000.0|
+----------+---------+



In [27]:
#Find the department with the highest total salary


In [11]:
# Aggregate to one row per department with its summed salary ("total_sal").
total_sal_df = (
    employeeDF
    .groupBy("department")
    .agg(sum("salary").alias("total_sal"))
)

In [12]:
# Global ordering window: largest department total first. There is no
# partitionBy, so Spark evaluates this window over all rows in a single
# partition - acceptable for a handful of departments, costly on large data.
mywindow = Window.orderBy(total_sal_df["total_sal"].desc())

In [14]:
# Number the departments 1..N following the window's ordering.
row_dept = total_sal_df.withColumn(
    "row_number",
    row_number().over(mywindow),
)

In [15]:
# Row 1 is the department with the largest total salary.
(
    row_dept
    .filter("row_number=1")
    .show()
)

+----------+---------+----------+
|department|total_sal|row_number|
+----------+---------+----------+
|   Finance| 113000.0|         1|
+----------+---------+----------+



In [16]:
#Filter the DataFrame to keep only employees aged 30 or above and working in the "Sales" department

In [23]:
# Both conditions must hold: age of at least 30 AND department equal to
# "Sales". Each comparison is parenthesised because & binds tighter than
# >= and == in Python.
(
    employeeDF
    .filter((col("age") >= 30) & (col("department") == 'Sales'))
    .show()
)

+---+----+---+----------+-------+
| id|name|age|department| salary|
+---+----+---+----------+-------+
|  1|John| 30|     Sales|50000.0|
+---+----+---+----------+-------+



In [24]:
#Calculate the difference between each employee's salary and the average salary of their respective department

In [26]:
# Unordered per-department window: aggregates applied over it see every row
# of the employee's department.
mywindow = Window.partitionBy("department")

In [36]:
# Attach the department-wide mean salary to every employee row.
# Note: `employeedf` (all lowercase) is a different variable from the
# original `employeeDF`; the two differ only by case.
employeedf = employeeDF.withColumn(
    "avg_sal",
    avg(col("salary")).over(mywindow),
)

In [37]:
# "diff" = salary minus department average; positive means above average.
(
    employeedf
    .withColumn("diff", col("salary") - col("avg_sal"))
    .show()
)

+---+-----+---+----------+-------+-------+-------+
| id| name|age|department| salary|avg_sal|   diff|
+---+-----+---+----------+-------+-------+-------+
|  3|  Bob| 32|   Finance|55000.0|56500.0|-1500.0|
|  5| Mike| 31|   Finance|58000.0|56500.0| 1500.0|
|  2|Alice| 28| Marketing|60000.0|60000.0|    0.0|
|  1| John| 30|     Sales|50000.0|51000.0|-1000.0|
|  4|Sarah| 29|     Sales|52000.0|51000.0| 1000.0|
+---+-----+---+----------+-------+-------+-------+



In [38]:
#Calculate the sum of salaries for employees whose names start with the letter "J".

In [39]:
# Restrict to names beginning with an upper-case "J" (the match is
# case-sensitive), then total their salaries into a single-row result.
(
    employeeDF
    .filter(col("name").startswith('J'))
    .agg(sum(col("salary")))
    .show()
)

+-----------+
|sum(salary)|
+-----------+
|    50000.0|
+-----------+



In [40]:
#Sort the DataFrame based on the "age" column in ascending order and then by "salary" column in descending order

In [41]:
# Primary key: age ascending (the default direction); ties on age are
# ordered by salary, highest first.
(
    employeeDF
    .orderBy(col("age"), col("salary").desc())
    .show()
)

+---+-----+---+----------+-------+
| id| name|age|department| salary|
+---+-----+---+----------+-------+
|  2|Alice| 28| Marketing|60000.0|
|  4|Sarah| 29|     Sales|52000.0|
|  1| John| 30|     Sales|50000.0|
|  5| Mike| 31|   Finance|58000.0|
|  3|  Bob| 32|   Finance|55000.0|
+---+-----+---+----------+-------+



In [42]:
#Replace the department name "Finance" with "Financial Services" in the DataFrame

In [43]:
employeeDF.withColumn("department",when(col("department")=='Finance','Financial services').otherwise(col("department"))).show()

+---+-----+---+------------------+-------+
| id| name|age|        department| salary|
+---+-----+---+------------------+-------+
|  1| John| 30|             Sales|50000.0|
|  2|Alice| 28|         Marketing|60000.0|
|  3|  Bob| 32|Financial services|55000.0|
|  4|Sarah| 29|             Sales|52000.0|
|  5| Mike| 31|Financial services|58000.0|
+---+-----+---+------------------+-------+



In [44]:
#Calculate the percentage of total salary each employee contributes to their respective department.

In [45]:
# Unordered per-department window, used to total salaries within each
# department.
mywindow = Window.partitionBy("department")

In [46]:
# Attach the department's summed salary ("total_sal") to every employee row.
emp_df = employeeDF.withColumn(
    "total_sal",
    sum("salary").over(mywindow),
)

In [48]:
percentage=(col("total_sal")/col("salary"))*100

In [49]:
emp_df.withColumn("percent",round(percentage,2)).show()

+---+-----+---+----------+-------+---------+-------+
| id| name|age|department| salary|total_sal|percent|
+---+-----+---+----------+-------+---------+-------+
|  3|  Bob| 32|   Finance|55000.0| 113000.0| 205.45|
|  5| Mike| 31|   Finance|58000.0| 113000.0| 194.83|
|  2|Alice| 28| Marketing|60000.0|  60000.0|  100.0|
|  1| John| 30|     Sales|50000.0| 102000.0|  204.0|
|  4|Sarah| 29|     Sales|52000.0| 102000.0| 196.15|
+---+-----+---+----------+-------+---------+-------+

