# S1 J4 — Performance (Cache + Partition)

This notebook demonstrates basic performance techniques: partitioning, caching, and inspecting plans.


In [None]:
# Reuse a session that the environment already provides under the name
# `spark`; only build one when that name is not defined.
try:
    spark
except NameError:
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .appName("performance-cache-partition")
        .getOrCreate()
    )


In [None]:
from pyspark.sql import functions as F

# Location of the sample CSV, relative to this notebook.
data_path = "../../data/example.csv"

# Load the CSV, taking column names from the header row and letting Spark
# infer the column types.
raw = spark.read.csv(data_path, header=True, inferSchema=True)

# Normalise types: signup_date becomes a date, spend becomes a double.
silver = raw.withColumn(
    "signup_date", F.to_date(F.col("signup_date"))
).withColumn(
    "spend", F.col("spend").cast("double")
)


In [None]:
# Partitioning example: redistribute rows into 4 partitions keyed on "plan".
# repartition() is a lazy transformation, so declaring it before the prints
# does not change what is reported.
by_plan = silver.repartition(4, "plan")

print("Initial partitions:", silver.rdd.getNumPartitions())
print("After repartition:", by_plan.rdd.getNumPartitions())


In [None]:
# Cache to avoid recomputation across actions.
# cache() only marks the DataFrame for caching; nothing is stored yet.
by_plan.cache()

# Materialize the cache by running an action over the DataFrame.
by_plan.count()

# Confirm cache status.
# spark.catalog.isCached() takes a table/view name, and a PySpark DataFrame
# has no toString() method, so the status is read from the DataFrame itself.
print("Is cached:", by_plan.is_cached)
print("Storage level:", by_plan.storageLevel)


In [None]:
# Per-plan summary: number of rows and total spend rounded to 2 decimals.
agg = by_plan.groupBy("plan").agg(
    F.count("*").alias("users"),
    F.round(F.sum("spend"), 2).alias("total_spend"),
)

# Print the physical plan in "formatted" mode, then the aggregated rows
# without truncating cell values.
agg.explain(mode="formatted")
agg.show(truncate=False)


In [None]:
# Free cache when done: by_plan was cached in an earlier cell, and
# unpersist() drops that cached data once it is no longer needed.
by_plan.unpersist()
