In [0]:
import pandas as pd
from pyspark.sql import SparkSession
# Session handle used by every Spark cell below. getOrCreate() hands back the
# session that already exists, or builds a new one under this application name.
spark = (
    SparkSession.builder
    .appName("LazyEvaluationDemo")
    .getOrCreate()
)

In [0]:
# Single source of truth for the demo dataset: both frames below are built from
# the same rows and column names, so the pandas and Spark copies cannot drift
# apart when the sample data is edited.
columns = ['id', 'name', 'age', 'salary']
data = [(1, 'Alice', 25, 50000),
        (2, 'Bob', 30, 60000),
        (3, 'Charlie', 35, 70000),
        (4, 'David', 40, 80000),
        (5, 'Eve', 45, 90000)]

# pandas frame: one row per tuple, labelled with the shared column names.
pandas_df = pd.DataFrame(data, columns=columns)

# Spark frame over the same rows; no explicit schema is passed, so the column
# types are inferred from the Python values.
spark_df = spark.createDataFrame(data, columns)

In [0]:
# Eager side of the comparison: printing the pandas frame shows its rows.
print("Pandas DataFrame (eager evaluation):", pandas_df, sep="\n")
print("\nNotice the data is immediately displayed - evaluation happens right away.")

Pandas DataFrame (eager evaluation):
   id     name  age  salary
0   1    Alice   25   50000
1   2      Bob   30   60000
2   3  Charlie   35   70000
3   4    David   40   80000
4   5      Eve   45   90000

Notice the data is immediately displayed - evaluation happens right away.


In [0]:
# Lazy side of the comparison: printing the Spark frame shows only its column
# names and types, not any rows.
print("\nPySpark DataFrame (lazy evaluation):", spark_df, sep="\n")
print("\nNotice you only see the schema - no data processing has happened yet.")


PySpark DataFrame (lazy evaluation):
DataFrame[id: bigint, name: string, age: bigint, salary: bigint]

Notice you only see the schema - no data processing has happened yet.


In [0]:
print("\nTriggering execution with collect():")

# collect() is the action in this cell: it returns the frame's rows, which are
# then handed to the notebook's display() for rendering.
collected_rows = spark_df.collect()
display(collected_rows)

print("\nNow the data is processed and displayed - execution was triggered by collect().")


Triggering execution with collect():


id,name,age,salary
1,Alice,25,50000
2,Bob,30,60000
3,Charlie,35,70000
4,David,40,80000
5,Eve,45,90000



Now the data is processed and displayed - execution was triggered by collect().


In [0]:
# Explain modes to demonstrate, in the order their plans are printed.
EXPLAIN_MODES = ("simple", "extended", "codegen", "cost", "formatted")

# One header + plan per mode. Looping replaces five copy-pasted print/explain
# pairs; the printed headers are unchanged ("Simple explain mode:", ...).
for position, explain_mode in enumerate(EXPLAIN_MODES):
    # Blank line between sections, but not in front of the very first header.
    separator = "\n" if position else ""
    print(separator + explain_mode.capitalize() + " explain mode:")
    spark_df.explain(mode=explain_mode)

Simple explain mode:
== Physical Plan ==
LocalTableScan [id#10485L, name#10486, age#10487L, salary#10488L]


== Photon Explanation ==
Photon does not fully support the query because:
		Unsupported node: LocalTableScan [id#10485L, name#10486, age#10487L, salary#10488L].

Reference node:
	LocalTableScan [id#10485L, name#10486, age#10487L, salary#10488L]


Extended explain mode:
== Parsed Logical Plan ==
Project [id#10477L AS id#10485L, name#10478 AS name#10486, age#10479L AS age#10487L, salary#10480L AS salary#10488L]
+- LocalRelation [id#10477L, name#10478, age#10479L, salary#10480L]

== Analyzed Logical Plan ==
id: bigint, name: string, age: bigint, salary: bigint
Project [id#10477L AS id#10485L, name#10478 AS name#10486, age#10479L AS age#10487L, salary#10480L AS salary#10488L]
+- LocalRelation [id#10477L, name#10478, age#10479L, salary#10480L]

== Optimized Logical Plan ==
LocalRelation [id#10485L, name#10486, age#10487L, salary#10488L]

== Physical Plan ==
LocalTableScan [id#10485L,

In [0]:
# Build the query: keep rows with age above 30, then project two columns.
# These are transformations only; the resulting frame is just printed below.
transformed_df = (
    spark_df
    .where(spark_df["age"] > 30)
    .select("name", "salary")
)

# Printing the frame shows its column names and types only.
print("\nAfter transformations (still no execution):", transformed_df, sep="\n")

# explain() prints the plan Spark would run for the query.
print("\nExecution plan for transformations:")
transformed_df.explain()

# show() is the action that produces and prints the resulting rows.
print("\nTriggering execution with show():")
transformed_df.show()


After transformations (still no execution):
DataFrame[name: string, salary: bigint]

Execution plan for transformations:
== Physical Plan ==
LocalTableScan [name#10650, salary#10652L]


== Photon Explanation ==
Photon does not fully support the query because:
		Unsupported node: LocalTableScan [name#10650, salary#10652L].

Reference node:
	LocalTableScan [name#10650, salary#10652L]


Triggering execution with show():
+-------+------+
|   name|salary|
+-------+------+
|Charlie| 70000|
|  David| 80000|
|    Eve| 90000|
+-------+------+

