## Window Functions
### Basic Window Functions
#### Use orderBy() when order matters:
- Ranking Functions (row_number, rank, dense_rank)
- Offset Functions (lead, lag)
- Cumulative Aggregations (sum, avg with rowsBetween)
#### Skip orderBy() when order is irrelevant:
- Partition-wise Aggregates (sum, avg, count)
- Row-Agnostic Aggregations (max, min)

In [0]:
from pyspark.sql import Row

# Sample employee records. Every tuple in employee_records lists its values
# in the same order as employee_fields.
employee_fields = ("employee_name", "employee_id", "department", "salary")
employee_records = [
    ("Alice", 101, "HR", 55000),
    ("Bob", 102, "Finance", 72000),
    ("Charlie", 103, "IT", 95000),
    ("David", 104, "HR", 60000),
    ("Eva", 105, "Finance", 80000),
    ("Frank", 106, "IT", 120000),
    ("Grace", 107, "Marketing", 65000),
    ("Helen", 108, "Marketing", 70000),
    ("Ian", 109, "IT", 105000),
    ("Jane", 110, "HR", 58000),
]

# Build keyword-style Rows so each value is paired with its column name.
data = [
    Row(**dict(zip(employee_fields, record)))
    for record in employee_records
]

# NOTE(review): `spark` is not defined in this cell; presumably it is the
# SparkSession provided by the notebook environment.
df = spark.createDataFrame(data)
df.show()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, rank, dense_rank, lag, lead, sum, avg
# Ranking/offset window: within each department, rows are ordered from the
# highest salary to the lowest.
window_spec = Window.partitionBy("department").orderBy(col("salary").desc())

# Cumulative window: same partitioning and ordering, but with an explicit
# ROWS frame from the start of the partition up to the current row.
# Without it, an ordered window falls back to a RANGE frame, where rows with
# equal salaries are peers and would all receive the same cumulative value.
running_spec = window_spec.rowsBetween(Window.unboundedPreceding, Window.currentRow)

# NOTE: `sum` and `avg` here are the pyspark.sql.functions versions imported
# at the top of this cell, not the Python builtins.
df = (
    df
    # row_number: unique, consecutive number for each row in the partition.
    .withColumn("row_number", row_number().over(window_spec))
    # rank: tied rows share a rank and the following rank(s) are skipped.
    .withColumn("rank", rank().over(window_spec))
    # dense_rank: tied rows share a rank, with no gaps afterwards.
    .withColumn("dense_rank", dense_rank().over(window_spec))
    # lag: salary of the previous row in window order (the next-higher
    # salary here); null for the first row of each partition.
    .withColumn("previous_salary", lag("salary").over(window_spec))
    # lead: salary of the next row in window order (the next-lower salary
    # here); null for the last row of each partition.
    .withColumn("next_salary", lead("salary").over(window_spec))
    # sum: running total from the start of the partition to the current row.
    .withColumn("running_total", sum("salary").over(running_spec))
    # avg: cumulative (expanding) average over the same growing frame. The
    # column name "moving_avg" is kept for compatibility, but the frame is
    # not a fixed-size sliding window.
    .withColumn("moving_avg", avg("salary").over(running_spec))
)

# Show result
df.show()

### With Rows Between

In [0]:

from pyspark.sql.window import Window
from pyspark.sql.functions import col, sum, avg, min, max, count
# Base for every ordered frame below: rows of each department sorted by
# ascending salary. Each example only varies the ROWS frame boundaries.
# NOTE: `sum`, `min` and `max` are the pyspark.sql.functions versions
# imported at the top of this cell, not the Python builtins.
dept_by_salary = Window.partitionBy("department").orderBy("salary")

# 1. Rolling sum over the two preceding rows plus the current row.
window_spec1 = dept_by_salary.rowsBetween(-2, Window.currentRow)
df = df.withColumn("rolling_sum_last_2", sum("salary").over(window_spec1))

# 2. Centered moving average: previous, current, and next row.
window_spec2 = dept_by_salary.rowsBetween(-1, 1)
df = df.withColumn("moving_avg2", avg("salary").over(window_spec2))

# 3. Rolling minimum over the current row and the next two rows.
window_spec3 = dept_by_salary.rowsBetween(Window.currentRow, 2)
df = df.withColumn("rolling_min_next_2", min("salary").over(window_spec3))

# 4. Running maximum: all preceding rows up to and including the current row.
#    Because the window is ordered by ascending salary, this equals the
#    current row's salary.
window_spec4 = dept_by_salary.rowsBetween(Window.unboundedPreceding, Window.currentRow)
df = df.withColumn("running_max", max("salary").over(window_spec4))

# 5. Total number of rows in the partition. The frame spans the whole
#    partition, so ordering cannot affect the result and no orderBy() is
#    applied. count("*") counts rows, whereas count("salary") would skip
#    rows whose salary is null.
window_spec5 = Window.partitionBy("department").rowsBetween(
    Window.unboundedPreceding, Window.unboundedFollowing
)
df = df.withColumn("total_rows", count("*").over(window_spec5))

# Show result
display(df)