In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 46 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 48.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=6475ef07d5bf33fe91bdba863dcdc8de726c3865ba82711f97d66917a9653f5a
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


### Rank, Dense Rank and Percent Rank

In [51]:
from pyspark.sql.window import Window
import pyspark
from pyspark.sql import SparkSession

In [52]:
spark = SparkSession.builder.appName("windows_function").getOrCreate()

In [53]:
# sample data for dataframe
sampleData = (("Ram", 28, "Sales", 3000),
              ("Meena", 33, "Sales", 4600),
              ("Robin", 40, "Sales", 4100),
              ("Kunal", 25, "Finance", 3000),
              ("Ram", 28, "Sales", 3000),
              ("Srishti", 46, "Management", 3300),
              ("Jeny", 26, "Finance", 3900),
              ("Hitesh", 30, "Marketing", 3000),
              ("Kailash", 29, "Marketing", 2000),
              ("Sharad", 39, "Sales", 4100)
              )

In [54]:
# column names for dataframe
columns = ["Employee_Name", "Age","Department", "Salary"]

In [55]:
# create a dataframe
df = spark.createDataFrame(data=sampleData,schema=columns)

In [56]:
# check the df schema
df.printSchema()

root
 |-- Employee_Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)



In [65]:
# creating a window
window = Window.partitionBy('Department').orderBy('age')

In [66]:
df.show()

+-------------+---+----------+------+
|Employee_Name|Age|Department|Salary|
+-------------+---+----------+------+
|          Ram| 28|     Sales|  3000|
|        Meena| 33|     Sales|  4600|
|        Robin| 40|     Sales|  4100|
|        Kunal| 25|   Finance|  3000|
|          Ram| 28|     Sales|  3000|
|      Srishti| 46|Management|  3300|
|         Jeny| 26|   Finance|  3900|
|       Hitesh| 30| Marketing|  3000|
|      Kailash| 29| Marketing|  2000|
|       Sharad| 39|     Sales|  4100|
+-------------+---+----------+------+



<!-- # Windows function main type:
# 1. Analytical Function
# 2. Ranking Function
# 3. Aggregate Function -->

### Windows function main type:
### 1. Analytical Function
### 2. Ranking Function
### 3. Aggregate Function

In [67]:
# Analytical Function
# cume_dist : cumulative distribution
import pyspark.sql.functions as F

df.withColumn('distribution', F.cume_dist().over(window)).show()
### will create a new col call distribution and calculates cumulative distribution on the department col

+-------------+---+----------+------+------------+
|Employee_Name|Age|Department|Salary|distribution|
+-------------+---+----------+------+------------+
|        Kunal| 25|   Finance|  3000|         0.5|
|         Jeny| 26|   Finance|  3900|         1.0|
|      Srishti| 46|Management|  3300|         1.0|
|      Kailash| 29| Marketing|  2000|         0.5|
|       Hitesh| 30| Marketing|  3000|         1.0|
|          Ram| 28|     Sales|  3000|         0.4|
|          Ram| 28|     Sales|  3000|         0.4|
|        Meena| 33|     Sales|  4600|         0.6|
|       Sharad| 39|     Sales|  4100|         0.8|
|        Robin| 40|     Sales|  4100|         1.0|
+-------------+---+----------+------+------------+



In [68]:
# lag function

df.withColumn('Lag', F.lag('Salary',2).over(window)).show()

+-------------+---+----------+------+----+
|Employee_Name|Age|Department|Salary| Lag|
+-------------+---+----------+------+----+
|        Kunal| 25|   Finance|  3000|null|
|         Jeny| 26|   Finance|  3900|null|
|      Srishti| 46|Management|  3300|null|
|      Kailash| 29| Marketing|  2000|null|
|       Hitesh| 30| Marketing|  3000|null|
|          Ram| 28|     Sales|  3000|null|
|          Ram| 28|     Sales|  3000|null|
|        Meena| 33|     Sales|  4600|3000|
|       Sharad| 39|     Sales|  4100|3000|
|        Robin| 40|     Sales|  4100|4600|
+-------------+---+----------+------+----+



In [69]:
# lead function
df.withColumn("Lead", F.lead("salary", 2).over(window)).show()

+-------------+---+----------+------+----+
|Employee_Name|Age|Department|Salary|Lead|
+-------------+---+----------+------+----+
|        Kunal| 25|   Finance|  3000|null|
|         Jeny| 26|   Finance|  3900|null|
|      Srishti| 46|Management|  3300|null|
|      Kailash| 29| Marketing|  2000|null|
|       Hitesh| 30| Marketing|  3000|null|
|          Ram| 28|     Sales|  3000|4600|
|          Ram| 28|     Sales|  3000|4100|
|        Meena| 33|     Sales|  4600|4100|
|       Sharad| 39|     Sales|  4100|null|
|        Robin| 40|     Sales|  4100|null|
+-------------+---+----------+------+----+



In [70]:
# Ranking Function

# sample data for dataframe
sampleData = ((101, "Ram", "Biology", 80),
              (103, "Meena", "Social Science", 78),
              (104, "Robin", "Sanskrit", 58),
              (102, "Kunal", "Phisycs", 89),
              (101, "Ram", "Biology", 80),
              (106, "Srishti", "Maths", 70),
              (108, "Jeny", "Physics", 75),
              (107, "Hitesh", "Maths", 88),
              (109, "Kailash", "Maths", 90),
              (105, "Sharad", "Social Science", 84)
              )
columns = ["Roll_No", "Student_Name", "Subject", "Marks"]

In [75]:
window2 =  Window.partitionBy('Subject').orderBy('Marks')
df2 = spark.createDataFrame(data = sampleData, schema = columns)

In [76]:
df2.printSchema()

root
 |-- Roll_No: long (nullable = true)
 |-- Student_Name: string (nullable = true)
 |-- Subject: string (nullable = true)
 |-- Marks: long (nullable = true)



In [77]:

df.show()

+-------------+---+----------+------+
|Employee_Name|Age|Department|Salary|
+-------------+---+----------+------+
|          Ram| 28|     Sales|  3000|
|        Meena| 33|     Sales|  4600|
|        Robin| 40|     Sales|  4100|
|        Kunal| 25|   Finance|  3000|
|          Ram| 28|     Sales|  3000|
|      Srishti| 46|Management|  3300|
|         Jeny| 26|   Finance|  3900|
|       Hitesh| 30| Marketing|  3000|
|      Kailash| 29| Marketing|  2000|
|       Sharad| 39|     Sales|  4100|
+-------------+---+----------+------+



In [81]:
## row_number function
# generates row number based on the specified partition
df2.withColumn('row_number', F.row_number().over(window2)).show()

+-------+------------+--------------+-----+----------+
|Roll_No|Student_Name|       Subject|Marks|row_number|
+-------+------------+--------------+-----+----------+
|    101|         Ram|       Biology|   80|         1|
|    101|         Ram|       Biology|   80|         2|
|    106|     Srishti|         Maths|   70|         1|
|    107|      Hitesh|         Maths|   88|         2|
|    109|     Kailash|         Maths|   90|         3|
|    102|       Kunal|       Phisycs|   89|         1|
|    108|        Jeny|       Physics|   75|         1|
|    104|       Robin|      Sanskrit|   58|         1|
|    103|       Meena|Social Science|   78|         1|
|    105|      Sharad|Social Science|   84|         2|
+-------+------------+--------------+-----+----------+



In [82]:
# rank()

df2.withColumn('rank', F.rank().over(window2)).show()

+-------+------------+--------------+-----+----+
|Roll_No|Student_Name|       Subject|Marks|rank|
+-------+------------+--------------+-----+----+
|    101|         Ram|       Biology|   80|   1|
|    101|         Ram|       Biology|   80|   1|
|    106|     Srishti|         Maths|   70|   1|
|    107|      Hitesh|         Maths|   88|   2|
|    109|     Kailash|         Maths|   90|   3|
|    102|       Kunal|       Phisycs|   89|   1|
|    108|        Jeny|       Physics|   75|   1|
|    104|       Robin|      Sanskrit|   58|   1|
|    103|       Meena|Social Science|   78|   1|
|    105|      Sharad|Social Science|   84|   2|
+-------+------------+--------------+-----+----+



In [85]:
# percent_rank

df2.withColumn("percent_rank", F.percent_rank().over(window2)).show()

+-------+------------+--------------+-----+------------+
|Roll_No|Student_Name|       Subject|Marks|percent_rank|
+-------+------------+--------------+-----+------------+
|    101|         Ram|       Biology|   80|         0.0|
|    101|         Ram|       Biology|   80|         0.0|
|    106|     Srishti|         Maths|   70|         0.0|
|    107|      Hitesh|         Maths|   88|         0.5|
|    109|     Kailash|         Maths|   90|         1.0|
|    102|       Kunal|       Phisycs|   89|         0.0|
|    108|        Jeny|       Physics|   75|         0.0|
|    104|       Robin|      Sanskrit|   58|         0.0|
|    103|       Meena|Social Science|   78|         0.0|
|    105|      Sharad|Social Science|   84|         1.0|
+-------+------------+--------------+-----+------------+



In [86]:
# dense_rank

df2.withColumn('dense_rank', F.dense_rank().over(window2)).show()

+-------+------------+--------------+-----+----------+
|Roll_No|Student_Name|       Subject|Marks|dense_rank|
+-------+------------+--------------+-----+----------+
|    101|         Ram|       Biology|   80|         1|
|    101|         Ram|       Biology|   80|         1|
|    106|     Srishti|         Maths|   70|         1|
|    107|      Hitesh|         Maths|   88|         2|
|    109|     Kailash|         Maths|   90|         3|
|    102|       Kunal|       Phisycs|   89|         1|
|    108|        Jeny|       Physics|   75|         1|
|    104|       Robin|      Sanskrit|   58|         1|
|    103|       Meena|Social Science|   78|         1|
|    105|      Sharad|Social Science|   84|         2|
+-------+------------+--------------+-----+----------+



In [87]:
# Aggregate Function

# sample data for dataframe
sampleData = (("Ram", "Sales", 3000),
              ("Meena", "Sales", 4600),
              ("Robin", "Sales", 4100),
              ("Kunal", "Finance", 3000),
              ("Ram", "Sales", 3000),
              ("Srishti", "Management", 3300),
              ("Jeny", "Finance", 3900),
              ("Hitesh", "Marketing", 3000),
              ("Kailash", "Marketing", 2000),
              ("Sharad", "Sales", 4100)
              )

In [88]:
# column names 
columns = ["Employee_Name", "Department", "Salary"]
 
# creating the df
df3 = spark.createDataFrame(data=sampleData,
                            schema=columns)

In [89]:
df3.printSchema()

root
 |-- Employee_Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)



In [90]:
df.show()

+-------------+---+----------+------+
|Employee_Name|Age|Department|Salary|
+-------------+---+----------+------+
|          Ram| 28|     Sales|  3000|
|        Meena| 33|     Sales|  4600|
|        Robin| 40|     Sales|  4100|
|        Kunal| 25|   Finance|  3000|
|          Ram| 28|     Sales|  3000|
|      Srishti| 46|Management|  3300|
|         Jeny| 26|   Finance|  3900|
|       Hitesh| 30| Marketing|  3000|
|      Kailash| 29| Marketing|  2000|
|       Sharad| 39|     Sales|  4100|
+-------------+---+----------+------+



In [91]:
window3  = Window.partitionBy("Department")

In [98]:
df3.withColumn('Avg', F.avg('Salary').over(window3)) \
    .withColumn('Sum', F.sum('Salary').over(window3))\
    .withColumn('Min',F.sum('Salary').over(window3))\
    .withColumn('Max',F.max('Salary').over(window3)).show()

+-------------+----------+------+------+-----+-----+----+
|Employee_Name|Department|Salary|   Avg|  Sum|  Min| Max|
+-------------+----------+------+------+-----+-----+----+
|        Kunal|   Finance|  3000|3450.0| 6900| 6900|3900|
|         Jeny|   Finance|  3900|3450.0| 6900| 6900|3900|
|      Srishti|Management|  3300|3300.0| 3300| 3300|3300|
|       Hitesh| Marketing|  3000|2500.0| 5000| 5000|3000|
|      Kailash| Marketing|  2000|2500.0| 5000| 5000|3000|
|          Ram|     Sales|  3000|3760.0|18800|18800|4600|
|        Meena|     Sales|  4600|3760.0|18800|18800|4600|
|        Robin|     Sales|  4100|3760.0|18800|18800|4600|
|          Ram|     Sales|  3000|3760.0|18800|18800|4600|
|       Sharad|     Sales|  4100|3760.0|18800|18800|4600|
+-------------+----------+------+------+-----+-----+----+

