In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('LazyEvaluation').getOrCreate()

In [3]:
# Lazy Evaluation and Action
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

data = [
    ("Alice", 25, "New York"),
    ("Bob", 30, "San Francisco"),
    ("Charlie", 35, "Chicago")
]

schema = StructType([
    StructField("name", StringType()),
    StructField("age", IntegerType()),
    StructField("city", StringType())
])

df = spark.createDataFrame(data, schema)

In [4]:
# In lazy evaluation, code is not immediately executed 
# It is placed in a plan for optimisation
# Note that only a df object is instantiated
df = df.filter(col("city") == "New York")

In [5]:
# Code is placed in a plan for optimisation
# Note that only a df object is instantiated
df = df.select("city")

In [6]:
# Code in the optimised execution plan is only executed when an action is run
# Note that a job is created
display(df)

DataFrame[city: string]

In [7]:
# You can see the execution plan via explain()
df.explain()

== Physical Plan ==
*(1) Project [city#2]
+- *(1) Filter (isnotnull(city#2) AND (city#2 = New York))
   +- *(1) Scan ExistingRDD[name#0,age#1,city#2]


