<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_window_funcs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import Row

# Reuse the active Spark session if there is one; otherwise start a new
# session in local mode under the application name "Spark_App1".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Spark_App1")
    .getOrCreate()
)

# The SparkContext is not created separately here; it is taken from the session.
sc = spark.sparkContext

In [3]:
# Load the product catalogue; the first line of the CSV supplies the column names.
# No schema is supplied or inferred, so every column is read as a string.
products_df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('datasets/products.csv')
)

In [4]:
# Preview the catalogue (up to 20 rows, long cell values truncated).
products_df.show(n=20, truncate=True)

+----------+--------+-----+
|   product|category|price|
+----------+--------+-----+
|Samsung TX|  Tablet|  999|
|Samsung JX|  Mobile|  799|
|Redmi Note|  Mobile|  399|
|        Mi|  Mobile|  299|
|      iPad|  Tablet|  789|
|    iPhone|  Mobile|  999|
|  Micromax|  Mobile|  249|
|    Lenovo|  Tablet|  499|
|   OnePlus|  Mobile|  356|
|        Xu|  Tablet|  267|
+----------+--------+-----+



In [5]:
import sys
from pyspark.sql.window import Window
import pyspark.sql.functions as func

# Case 1: rank the products by price inside their own category.
# Partitioning by category yields one window per category (e.g. Tablet and
# Mobile), and each window is ordered independently, most expensive first.
#
# `price` was read from CSV as a string, so ordering on it directly would
# compare text ("999" > "1299"). Order on a numeric cast instead; prices in
# this dataset are whole numbers (use 'double' if fractional prices appear).
win_func1 = Window.partitionBy(products_df['category'])\
  .orderBy(products_df['price'].cast('int').desc())

# Resulting schema:
# DataFrame[product: string, category: string, price: string, product_rank: int]
products_rank_df = products_df.select(
        products_df['product'],
        products_df['category'],
        products_df['price']
      ).withColumn('product_rank', func.rank().over(win_func1))  # rank restarts at 1 in every category

products_rank_df.show()

+----------+--------+-----+------------+
|   product|category|price|product_rank|
+----------+--------+-----+------------+
|    iPhone|  Mobile|  999|           1|
|Samsung JX|  Mobile|  799|           2|
|Redmi Note|  Mobile|  399|           3|
|   OnePlus|  Mobile|  356|           4|
|        Mi|  Mobile|  299|           5|
|  Micromax|  Mobile|  249|           6|
|Samsung TX|  Tablet|  999|           1|
|      iPad|  Tablet|  789|           2|
|    Lenovo|  Tablet|  499|           3|
|        Xu|  Tablet|  267|           4|
+----------+--------+-----+------------+



In [6]:
# Case 2: for every product, the price of the next more expensive product
# in the same category.
# `price` was read from CSV as a string; cast it so that both the ordering
# and max() compare numbers rather than text. Prices in this dataset are
# whole numbers (use 'double' if fractional prices appear).
price_int = products_df['price'].cast('int')

# One window per category, ordered by price descending. The frame covers the
# previous row (-1) up to the current row. The first row of a partition has
# no previous row, so its frame contains the current row only.
win_func2 = Window.partitionBy(products_df['category'])\
  .orderBy(price_int.desc())\
  .rowsBetween(-1, Window.currentRow)

# Maximum price inside the frame. Because rows are sorted by price
# descending, this is the previous row's price (or the row's own price for
# the first row of each category).
price_max = func.max(price_int).over(win_func2)

products_max_df = products_df.select(
          products_df['product'],
          products_df['category'],
          products_df['price'],
          price_max.alias('price_max'))

products_max_df.show()

# Ex. Micromax can be priced anywhere from 249 up to 299. If its price goes
# above 299, the customer will go with the Mi instead.

+----------+--------+-----+---------+
|   product|category|price|price_max|
+----------+--------+-----+---------+
|    iPhone|  Mobile|  999|      999|
|Samsung JX|  Mobile|  799|      999|
|Redmi Note|  Mobile|  399|      799|
|   OnePlus|  Mobile|  356|      399|
|        Mi|  Mobile|  299|      356|
|  Micromax|  Mobile|  249|      299|
|Samsung TX|  Tablet|  999|      999|
|      iPad|  Tablet|  789|      999|
|    Lenovo|  Tablet|  499|      789|
|        Xu|  Tablet|  267|      499|
+----------+--------+-----+---------+



In [7]:
# Case 3: price gap between each product and the most expensive product in
# its category.
# `price` was read from CSV as a string. Cast it explicitly so max() and the
# ordering are numeric rather than lexicographic; 'double' is the type the
# string subtraction already produced, so the displayed values stay the same.
price_dbl = products_df['price'].cast('double')

# The frame spans the whole partition. Use the documented boundary constants
# instead of +/- sys.maxsize.
win_func3 = Window.partitionBy(products_df['category'])\
                  .orderBy(price_dbl.desc())\
                  .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)

# Highest price in the category minus this product's own price.
price_diff = func.max(price_dbl).over(win_func3) - price_dbl

products_diff_df = products_df.select(
          products_df['product'],
          products_df['category'],
          products_df['price'],
          price_diff.alias('price_diff'))

products_diff_df.show()

# Ex. Samsung JX costs $799; the gap to the most expensive Mobile is $200 (999 - 799).
# Ex. OnePlus costs $356; the gap to the most expensive Mobile is $643 (999 - 356).

+----------+--------+-----+----------+
|   product|category|price|price_diff|
+----------+--------+-----+----------+
|    iPhone|  Mobile|  999|       0.0|
|Samsung JX|  Mobile|  799|     200.0|
|Redmi Note|  Mobile|  399|     600.0|
|   OnePlus|  Mobile|  356|     643.0|
|        Mi|  Mobile|  299|     700.0|
|  Micromax|  Mobile|  249|     750.0|
|Samsung TX|  Tablet|  999|       0.0|
|      iPad|  Tablet|  789|     210.0|
|    Lenovo|  Tablet|  499|     500.0|
|        Xu|  Tablet|  267|     732.0|
+----------+--------+-----+----------+

