In [1]:
import pyspark
from pyspark.sql import SparkSession

# Notebook-wide Spark entry point, registered under the app name "example-transform".
spark = SparkSession.builder.appName("example-transform").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/02/06 12:20:23 WARN Utils: Your hostname, MA-L-481079, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
26/02/06 12:20:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/06 12:20:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql import functions as F, types as T

data_path = "../../data/example.csv"

# Load the raw CSV: the first line is treated as the header row and column
# types are inferred from the data instead of being declared up front.
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Inline reference table keyed by plan name; each row carries the plan's
# monthly fee and the tier label it maps to.
lookup_columns = ["plan", "monthly_fee", "tier"]
lookup_rows = [
    ("Free", 0, "low"),
    ("Basic", 29, "mid"),
    ("Pro", 79, "mid"),
    ("Enterprise", 199, "high"),
]
lookup = spark.createDataFrame(lookup_rows, schema=lookup_columns)

# Tunable parameters for the derived columns (previously bare literals).
MONTHS_PER_YEAR = 12        # divisor for spend_per_month; presumably `spend` is an annual figure -- confirm
HIGH_VALUE_MIN_SPEND = 500  # rows with spend >= this value get high_value = true

# Transformations: filter, join, derived columns.
#   1. keep rows where is_active is true and spend is positive
#   2. left-join the plan lookup, so rows whose plan has no match are kept
#      with null tier / monthly_fee
#   3. parse signup_date to a date, derive spend_per_month and high_value
#   4. project the final column set in display order
# No ordering is applied, so the row order printed by show() is not guaranteed.
result = (
    df
    # `== True` builds a Spark Column expression (Python's `is True` would not).
    .filter((F.col("is_active") == True) & (F.col("spend") > 0))
    # The lookup is a handful of rows: hint Spark to broadcast it to every
    # executor rather than shuffling `df` by plan for the join.
    .join(F.broadcast(lookup), on="plan", how="left")
    .withColumn("signup_date", F.to_date("signup_date"))
    .withColumn(
        "spend_per_month",
        F.round(F.col("spend") / F.lit(MONTHS_PER_YEAR), 2),
    )
    .withColumn("high_value", F.col("spend") >= HIGH_VALUE_MIN_SPEND)
    .select(
        "name", "age", "city", "signup_date",
        "plan", "tier", "monthly_fee", "spend",
        "spend_per_month", "high_value"
    )
)

# Print the rows without truncating cell contents, then the resulting schema.
result.show(truncate=False)
result.printSchema()


+------+---+---------+-----------+----------+----+-----------+------+---------------+----------+
|name  |age|city     |signup_date|plan      |tier|monthly_fee|spend |spend_per_month|high_value|
+------+---+---------+-----------+----------+----+-----------+------+---------------+----------+
|Amina |29 |Seattle  |2024-01-15 |Pro       |mid |79         |129.5 |10.79          |false     |
|Chen  |41 |San Jose |2022-06-30 |Pro       |mid |79         |560.0 |46.67          |true      |
|Ines  |36 |Portland |2022-02-14 |Pro       |mid |79         |410.25|34.19          |false     |
|Fatima|31 |Denver   |2024-05-05 |Basic     |mid |29         |89.99 |7.5            |false     |
|Jamal |30 |Atlanta  |2024-07-01 |Basic     |mid |29         |59.0  |4.92           |false     |
|Eli   |38 |New York |2021-03-22 |Enterprise|high|199        |1240.0|103.33         |true      |
|Hiro  |45 |San Diego|2020-08-19 |Enterprise|high|199        |2300.1|191.67         |true      |
|Daria |23 |Chicago  |2024-09-