# Self-Contained Data Processing Template

A standalone PySpark scaffold (no spark_fuse imports) that creates a session, logs progress, loads dummy data, transforms and tests it, and writes the results.

## Notebook guidelines

- Keep one primary table per notebook; name notebooks in snake_case.
- Make runs idempotent: deterministic transforms, safe overwrites/merges, repeatable partition logic.
- Functions in snake_case, classes in PascalCase, constants in UPPER_SNAKE_CASE; avoid global state.
- Document inputs/outputs; prefer pure helpers defined inline.


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
import datetime as _dt
import time

# Lightweight helpers
def create_progress_tracker(total_steps: int):
    """Build the mutable state dict that ``log_progress`` updates.

    Args:
        total_steps: Number of steps the run is expected to log.

    Returns:
        A dict with four keys: ``current`` (steps logged so far, starts at 0),
        ``total`` (``total_steps`` converted to float), ``start`` (the
        ``time.perf_counter()`` reading taken at creation) and ``last``
        (``None`` until the first step is logged).
    """
    tracker = {}
    tracker["current"] = 0
    tracker["total"] = float(total_steps)
    tracker["start"] = time.perf_counter()
    tracker["last"] = None
    return tracker

def log_progress(tracker, label: str):
    """Advance ``tracker`` by one step and print a one-line progress report.

    The printed line contains a 10-character bar, ``current/total``, the
    label, the time elapsed since the previous logged step (or since the
    tracker was created, for the first step) and the total elapsed time.

    Args:
        tracker: Dict with ``current``, ``total``, ``start`` and ``last``
            keys, as produced by ``create_progress_tracker``. It is mutated
            in place: ``current`` is incremented and ``last`` is set to the
            current ``time.perf_counter()`` reading.
        label: Human-readable name of the step that just finished.
    """
    now = time.perf_counter()
    # Test for None explicitly: a reading of 0.0 is a valid timestamp and must
    # not be mistaken for "no step logged yet" (which `or` would do).
    last = tracker["start"] if tracker["last"] is None else tracker["last"]
    tracker["current"] += 1
    tracker["last"] = now
    current = int(tracker["current"])
    # A total of 0 falls back to 1 so the ratio below never divides by zero.
    total = tracker["total"] or 1
    elapsed = now - last
    total_elapsed = now - tracker["start"]
    # Clamp to the bar width so logging more steps than planned cannot overflow it.
    filled = max(0, min(10, int(10 * current / total)))
    bar = "#" * filled + "." * (10 - filled)
    print(f"[INFO] [{bar}] {current}/{int(total)} {label} (+{elapsed:.2f}s, total {total_elapsed:.2f}s)")

def ensure_columns(df, required):
    """Verify that every name in ``required`` appears in ``df.columns``.

    Args:
        df: Object exposing a ``columns`` collection.
        required: Iterable of column names that must be present.

    Returns:
        ``df`` itself, unchanged, so the call can be chained.

    Raises:
        ValueError: If one or more required names are absent; the message
            lists them in the order they were requested.
    """
    absent = []
    for name in required:
        if name not in df.columns:
            absent.append(name)
    if not absent:
        return df
    raise ValueError(f"Missing required columns: {absent}")

def preview(df, n=5):
    """Print up to ``n`` rows of ``df`` as plain dicts, then its schema string.

    The rows are fetched with ``df.limit(n).collect()`` and converted with
    ``asDict(recursive=True)``; the schema line uses
    ``df.schema.simpleString()``. Nothing is returned.
    """
    rows = []
    for record in df.limit(n).collect():
        rows.append(record.asDict(recursive=True))
    print(f"rows={rows}\nschema={df.schema.simpleString()}")

# One tracker for the whole notebook: eight progress steps are logged in total.
progress_tracker = create_progress_tracker(total_steps=8)
# Run timestamp in UTC, truncated to whole seconds, as an ISO-8601 string.
# datetime.utcnow() is deprecated, so take an aware "now" in UTC and then drop
# the tzinfo to keep the same offset-less string format as before.
job_ts = _dt.datetime.now(_dt.timezone.utc).replace(microsecond=0, tzinfo=None).isoformat()


  job_ts = _dt.datetime.utcnow().replace(microsecond=0).isoformat()


## Create a Spark session


In [2]:
import importlib.util

print("[INFO] Starting Spark session...")
builder = (
    SparkSession.builder.appName("data-processing-template-standalone")
    .master("local[*]")
    # The demo data is only a handful of rows, so keep the shuffle fan-out small.
    .config("spark.sql.shuffle.partitions", "8")
)

# Optional: enable Delta Lake only when the `delta` Python package is importable.
# Previously the Delta catalog was configured unconditionally, and without the
# matching SQL extension that Delta's setup instructions pair it with.
# NOTE(review): an importable package does not by itself prove the Delta JARs
# are on the Spark classpath - confirm for your environment.
if importlib.util.find_spec("delta") is not None:
    builder = builder.config(
        "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
    ).config(
        "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"
    )

spark = builder.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
log_progress(progress_tracker, "Spark session ready")
# Bare expression so the notebook renders the session summary as the cell output.
spark




[INFO] Starting Spark session...


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/02 09:22:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[INFO] [#.........] 1/8 Spark session ready (+2.46s, total 2.46s)


## Load dummy data


In [3]:
# Orders: one row per order; every field is declared non-nullable.
orders_schema = (
    T.StructType()
    .add("order_id", T.StringType(), nullable=False)
    .add("order_ts", T.StringType(), nullable=False)
    .add("order_total", T.DoubleType(), nullable=False)
    .add("customer_id", T.StringType(), nullable=False)
)
orders_data = [
    ("O-1001", "2024-01-05", 42.50, "C001"),
    ("O-1002", "2024-01-06", 18.00, "C002"),
    ("O-1003", "2024-01-06", 120.75, "C003"),
]
orders_df = spark.createDataFrame(data=orders_data, schema=orders_schema)

# Customers: only the key is required; segment and country may be null.
customers_schema = (
    T.StructType()
    .add("customer_id", T.StringType(), nullable=False)
    .add("segment", T.StringType(), nullable=True)
    .add("country", T.StringType(), nullable=True)
)
customers_data = [
    ("C001", "retail", "US"),
    ("C002", "enterprise", "CA"),
    ("C003", "retail", "UK"),
]
customers_df = spark.createDataFrame(data=customers_data, schema=customers_schema)

print("[INFO] Input data loaded.")
preview(orders_df)
log_progress(progress_tracker, "Input data loaded")


[INFO] Input data loaded.


                                                                                

rows=[{'order_id': 'O-1001', 'order_ts': '2024-01-05', 'order_total': 42.5, 'customer_id': 'C001'}, {'order_id': 'O-1002', 'order_ts': '2024-01-06', 'order_total': 18.0, 'customer_id': 'C002'}, {'order_id': 'O-1003', 'order_ts': '2024-01-06', 'order_total': 120.75, 'customer_id': 'C003'}]
schema=struct<order_id:string,order_ts:string,order_total:double,customer_id:string>
[INFO] [##........] 2/8 Input data loaded (+1.72s, total 4.17s)


## Process data


In [4]:
# Step 1: parse the order timestamp string into a proper date column.
_orders_with_date = orders_df.withColumn("order_date", F.to_date(F.col("order_ts")))
# Step 2: derive the year-month string from that date.
_orders_with_month = _orders_with_date.withColumn(
    "order_month", F.date_format(F.col("order_date"), "yyyy-MM")
)
# Step 3: stamp every row with this run's timestamp string.
curated_orders_df = _orders_with_month.withColumn("processing_ts", F.lit(job_ts))

# Customers need no derivation; just pin the column set and order explicitly.
curated_customers_df = customers_df.select(
    F.col("customer_id"), F.col("segment"), F.col("country")
)
print("[INFO] Curated datasets ready.")
log_progress(progress_tracker, "Curated datasets ready")


[INFO] Curated datasets ready.
[INFO] [###.......] 3/8 Curated datasets ready (+0.05s, total 4.23s)


## Join


In [5]:
# Left join keeps every order even when no customer row matches its customer_id.
_orders_side = curated_orders_df.alias("o")
_customers_side = curated_customers_df.alias("c")
joined_df = _orders_side.join(_customers_side, on=["customer_id"], how="left")
print("[INFO] Join complete.")
preview(joined_df)
log_progress(progress_tracker, "Join complete")


[INFO] Join complete.
rows=[{'customer_id': 'C001', 'order_id': 'O-1001', 'order_ts': '2024-01-05', 'order_total': 42.5, 'order_date': datetime.date(2024, 1, 5), 'order_month': '2024-01', 'processing_ts': '2025-12-02T08:21:59', 'segment': 'retail', 'country': 'US'}, {'customer_id': 'C002', 'order_id': 'O-1002', 'order_ts': '2024-01-06', 'order_total': 18.0, 'order_date': datetime.date(2024, 1, 6), 'order_month': '2024-01', 'processing_ts': '2025-12-02T08:21:59', 'segment': 'enterprise', 'country': 'CA'}, {'customer_id': 'C003', 'order_id': 'O-1003', 'order_ts': '2024-01-06', 'order_total': 120.75, 'order_date': datetime.date(2024, 1, 6), 'order_month': '2024-01', 'processing_ts': '2025-12-02T08:21:59', 'segment': 'retail', 'country': 'UK'}]
schema=struct<customer_id:string,order_id:string,order_ts:string,order_total:double,order_date:date,order_month:string,processing_ts:string,segment:string,country:string>
[INFO] [#####.....] 4/8 Join complete (+0.57s, total 4.80s)


## Data tests


In [6]:
print("[INFO] Running data tests...")
ensure_columns(joined_df, ["order_id", "customer_id", "order_date", "order_month"])

# These checks gate the write step, so they raise AssertionError explicitly
# instead of using `assert`, which is stripped when Python runs with -O.
joined_row_count = joined_df.count()  # counted once, reused by the uniqueness check
if joined_df.filter(F.col("order_id").isNull()).count() != 0:
    raise AssertionError("order_id should be populated")
if joined_df.filter(F.col("customer_id").isNull()).count() != 0:
    raise AssertionError("customer_id should be populated")
# Dropping duplicate order_ids must not remove any rows.
if joined_df.dropDuplicates(["order_id"]).count() != joined_row_count:
    raise AssertionError("order_id should be unique")
invalid_states = joined_df.filter(F.col("order_total") < 0).count()
if invalid_states != 0:
    raise AssertionError(f"Found {invalid_states} negative order totals")
print("[INFO] Data tests passed.")
log_progress(progress_tracker, "In-memory data tests passed")


[INFO] Running data tests...
[INFO] Data tests passed.
[INFO] [######....] 5/8 In-memory data tests passed (+1.07s, total 5.87s)


## Write data


In [7]:
# Destination directory for the enriched orders dataset.
output_path = "/tmp/spark_fuse/orders_enriched_standalone"
print(f"[INFO] Writing data to {output_path} (Parquet)...")
# Overwrite mode replaces whatever is at the path; files are laid out by order_month.
_parquet_writer = joined_df.write.format("parquet").mode("overwrite").partitionBy("order_month")
_parquet_writer.save(output_path)
log_progress(progress_tracker, "Write complete")


[INFO] Writing data to /tmp/spark_fuse/orders_enriched_standalone (Parquet)...
[INFO] [#######...] 6/8 Write complete (+0.96s, total 6.83s)


## Post-write validations


In [8]:
# Read back what was just written and re-run the quality checks on it.
persisted_df = spark.read.parquet(output_path)
ensure_columns(persisted_df, ["order_id", "customer_id", "order_date", "order_month"])

# Raise AssertionError explicitly rather than using `assert`, which is stripped
# when Python runs with -O and would silently skip these validations.
persisted_row_count = persisted_df.count()  # counted once, reused below
if persisted_row_count <= 0:
    raise AssertionError("Persisted dataset is empty")
if persisted_df.filter(F.col("order_id").isNull()).count() != 0:
    raise AssertionError("order_id should be populated")
# Same customer_id check as the pre-write data tests, so both stages agree.
if persisted_df.filter(F.col("customer_id").isNull()).count() != 0:
    raise AssertionError("customer_id should be populated")
# Dropping duplicate order_ids must not remove any rows.
if persisted_df.dropDuplicates(["order_id"]).count() != persisted_row_count:
    raise AssertionError("order_id should be unique")
invalid_persisted_states = persisted_df.filter(F.col("order_total") < 0).count()
if invalid_persisted_states != 0:
    raise AssertionError(f"Found {invalid_persisted_states} negative order totals after write")
print("[INFO] Post-write validations passed.")
log_progress(progress_tracker, "Post-write validations passed")


[INFO] Post-write validations passed.
[INFO] [########..] 7/8 Post-write validations passed (+0.60s, total 7.43s)


## Stop session


In [9]:
# Final step: shut the session down and log the last progress step.
print("[INFO] Stopping Spark session.")
spark.stop()
print("[INFO] Spark session stopped.")
log_progress(tracker=progress_tracker, label="Spark session stopped")


[INFO] Stopping Spark session.
[INFO] Spark session stopped.
[INFO] [##########] 8/8 Spark session stopped (+0.38s, total 7.81s)
