Documentation:
* [Databricks blog: Introducing Window Functions in Spark SQL](https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html)
* [Pluralsight course clip](https://app.pluralsight.com/course-player?clipId=a7bdf996-3a11-4f8a-9fda-3222c93eef12)

In [3]:
from pyspark.sql import SparkSession
# Get the existing SparkSession, or create a new one named 'windowing'.
spark = (
    SparkSession.builder
    .appName('windowing')
    .getOrCreate()
)

In [4]:
products_file = '../datasets/products.csv'
# inferSchema=True makes Spark read `price` as a number. Without it every CSV
# column is loaded as a string, so ordering by price compares text
# (e.g. "999" sorts above "1200") instead of numeric values.
products = spark.read.csv(products_file, header=True, inferSchema=True)
products.columns

['product', 'category', 'price']

In [5]:
# Preview the first five rows of the products DataFrame.
products.show(n=5)

+----------+--------+-----+
|   product|category|price|
+----------+--------+-----+
|Samsung TX|  Tablet|  999|
|Samsung JX|  Mobile|  799|
|Redmi Note|  Mobile|  399|
|        Mi|  Mobile|  299|
|      iPad|  Tablet|  789|
+----------+--------+-----+
only showing top 5 rows



### Rank products by price within each category

In [6]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col
# One window per category, with rows ordered by price in descending order.
category_window = (
    Window
    .partitionBy('category')
    .orderBy(col('price').desc())
)

In [7]:
import pyspark.sql.functions as func

# Rank expression evaluated over category_window. dense_rank assigns
# consecutive ranks, leaving no gaps after tied values.
rank = (
    func.dense_rank()
    .over(category_window)
)

In [8]:
# Append the per-category rank as a new 'rank' column and display the result.
ranked_products = products.withColumn(colName='rank', col=rank)
ranked_products.show()

+----------+--------+-----+----+
|   product|category|price|rank|
+----------+--------+-----+----+
|    iPhone|  Mobile|  999|   1|
|Samsung JX|  Mobile|  799|   2|
|Redmi Note|  Mobile|  399|   3|
|   OnePlus|  Mobile|  356|   4|
|        Mi|  Mobile|  299|   5|
|  Micromax|  Mobile|  249|   6|
|Samsung TX|  Tablet|  999|   1|
|      iPad|  Tablet|  789|   2|
|    Lenovo|  Tablet|  499|   3|
|        Xu|  Tablet|  267|   4|
+----------+--------+-----+----+



### Most expensive products in each category

In [9]:
import pyspark.sql.functions as func

# Keep only the rank-1 row(s) of each category, then drop the helper column
# so the output has the same columns as `products`.
exp_products = (
    products
    .withColumn('rank', rank)
    .filter(col('rank') == 1)
    .drop('rank')
)
exp_products.show()

+----------+--------+-----+
|   product|category|price|
+----------+--------+-----+
|    iPhone|  Mobile|  999|
|Samsung TX|  Tablet|  999|
+----------+--------+-----+



### Second most expensive product in each category

In [10]:
# Keep only the rank-2 row(s) of each category; the 'rank' column is left in
# the output.
exp2_products = (
    products
    .withColumn('rank', rank)
    .filter(col('rank') == 2)
)
exp2_products.show()

+----------+--------+-----+----+
|   product|category|price|rank|
+----------+--------+-----+----+
|Samsung JX|  Mobile|  799|   2|
|      iPad|  Tablet|  789|   2|
+----------+--------+-----+----+



#### SQL

In [11]:
# Register the DataFrame as a temporary view so it can be queried via spark.sql.
products.createOrReplaceTempView(name='productsView')

In [12]:
# SQL equivalent of the DataFrame ranking. dense_rank() is used so the result
# matches the DataFrame API version; rank() would leave gaps in the numbering
# after tied prices.
spark.sql(
    'SELECT *, dense_rank() OVER (PARTITION BY category ORDER BY price DESC) AS rank '
    'FROM productsView'
).show(5)

+----------+--------+-----+----+
|   product|category|price|rank|
+----------+--------+-----+----+
|    iPhone|  Mobile|  999|   1|
|Samsung JX|  Mobile|  799|   2|
|Redmi Note|  Mobile|  399|   3|
|   OnePlus|  Mobile|  356|   4|
|        Mi|  Mobile|  299|   5|
+----------+--------+-----+----+
only showing top 5 rows



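### Percentage price difference from the previous product
Row frame: `rowsBetween(-1, 0)` spans the previous row and the current row, so the first row of each partition has no predecessor and shows 0.0.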

In [13]:
# Two-row frame inside each category: the previous row (-1) through the
# current row, with rows ordered by price in descending order.
window_frame = (
    Window
    .partitionBy('category')
    .orderBy(col('price').desc())
    .rowsBetween(-1, Window.currentRow)
)

In [15]:
# Within the two-row frame, first() is the previous row's price and last() is
# the current row's price. On the first row of a partition the frame contains
# only that row, so both are equal and the difference is 0.
prev = func.first('price').over(window_frame)
now = func.last('price').over(window_frame)
pdiff = (now - prev) / prev * 100
products.withColumn('percentage_diff', pdiff).show()

+----------+--------+-----+-------------------+
|   product|category|price|    percentage_diff|
+----------+--------+-----+-------------------+
|    iPhone|  Mobile|  999|                0.0|
|Samsung JX|  Mobile|  799| -20.02002002002002|
|Redmi Note|  Mobile|  399| -50.06257822277848|
|   OnePlus|  Mobile|  356|-10.776942355889723|
|        Mi|  Mobile|  299| -16.01123595505618|
|  Micromax|  Mobile|  249|-16.722408026755854|
|Samsung TX|  Tablet|  999|                0.0|
|      iPad|  Tablet|  789| -21.02102102102102|
|    Lenovo|  Tablet|  499|-36.755386565272495|
|        Xu|  Tablet|  267| -46.49298597194389|
+----------+--------+-----+-------------------+



#### No partition by category: the entire dataset is treated as a single window

In [24]:
# No partitionBy: every row belongs to one window ordered by price descending.
# Spark evaluates such a window in a single partition, so this is only
# suitable for small data.
window_frame = (
    Window
    .orderBy(col('price').desc())
    .rowsBetween(-1, Window.currentRow)
)

# first()/last() over the two-row frame give the previous and current price.
# The very first row has no predecessor, so its difference is 0.
prev = func.first('price').over(window_frame)
now = func.last('price').over(window_frame)
pdiff = (now - prev) / prev * 100
products.withColumn('percentage_diff', pdiff).show()

+----------+--------+-----+-------------------+
|   product|category|price|    percentage_diff|
+----------+--------+-----+-------------------+
|Samsung TX|  Tablet|  999|                0.0|
|    iPhone|  Mobile|  999|                0.0|
|Samsung JX|  Mobile|  799| -20.02002002002002|
|      iPad|  Tablet|  789|-1.2515644555694618|
|    Lenovo|  Tablet|  499|-36.755386565272495|
|Redmi Note|  Mobile|  399| -20.04008016032064|
|   OnePlus|  Mobile|  356|-10.776942355889723|
|        Mi|  Mobile|  299| -16.01123595505618|
|        Xu|  Tablet|  267|-10.702341137123746|
|  Micromax|  Mobile|  249| -6.741573033707865|
+----------+--------+-----+-------------------+

