## Import libraries

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,row_number, lead, lag, count, avg
from pyspark.sql.window import Window

## Create spark session object

In [2]:
spark = (SparkSession.builder
         .appName("apply-window-functions")
         .master("spark://spark-master:7077")
         .config("spark.executor.memory", "512m")
         .getOrCreate()
        )

spark.sparkContext.setLogLevel("ERROR")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/22 18:36:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read the data

In [4]:
df = (spark.read.format("csv")
      .option("header", "true")
      .option("nullValue", "null")
      .option("dateFormat", "LLLL d, y")
      .load("../../data/netflix_titles.csv")
     )



                                                                                

In [5]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



## Define a window specification

In [8]:
window_spec = Window.partitionBy("country").orderBy("date_added")

## Using the row_number window function

In [13]:
result = df.withColumn("row_number", row_number().over(window_spec))

result.select("title", "country", "date_added","row_number").show(truncate=False)

+------------------------------------------------------------+-------+------------------+----------+
|title                                                       |country|date_added        |row_number|
+------------------------------------------------------------+-------+------------------+----------+
|Kikoriki                                                    |null   |null              |1         |
|Fit for Fashion                                             |null   | December 14, 2018|2         |
|Lego Friends                                                |null   | February 1, 2019 |3         |
|Nightmare Tenants, Slum Landlords                           |null   | July 12, 2019    |4         |
|Satrangi                                                    |null   |April 1, 2017     |5         |
|Buddha                                                      |null   |April 1, 2018     |6         |
|Fishpeople                                                  |null   |April 1, 2018     |7 

## Use the lag or lead window functions

## Add lead function

In [22]:
df = (df.withColumn("lead_date_added",lead("date_added",1).over(window_spec)
                    .filter("country").isNotNull()
                   ))
                    

df.select("title", "country","date_added", "lead_date_added").show()

                                           
                   

TypeError: 'Column' object is not callable

## Repositioning of the columns

## Adding lag column

In [19]:
new_col_order = ["title","country", "date_added", "lead_date_added", "lag_date_added"]

df = df.withColumn("lag_date_added", lag("date_added").over(window_spec))

df.select(*new_col_order).show()

+--------------------+-------+------------------+------------------+------------------+
|               title|country|        date_added|   lead_date_added|    lag_date_added|
+--------------------+-------+------------------+------------------+------------------+
|            Kikoriki|   null|              null| December 14, 2018|              null|
|     Fit for Fashion|   null| December 14, 2018|  February 1, 2019|              null|
|        Lego Friends|   null|  February 1, 2019|     July 12, 2019| December 14, 2018|
|Nightmare Tenants...|   null|     July 12, 2019|     April 1, 2017|  February 1, 2019|
|            Satrangi|   null|     April 1, 2017|     April 1, 2018|     July 12, 2019|
|              Buddha|   null|     April 1, 2018|     April 1, 2018|     April 1, 2017|
|          Fishpeople|   null|     April 1, 2018|     April 1, 2019|     April 1, 2018|
|Kicko & Super Speedo|   null|     April 1, 2019|     April 1, 2020|     April 1, 2018|
|  Pokémon the Series|   null|  