diff --git a/.gitignore b/.gitignore index ae9c006e1..d15dbed61 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ bin coverage.xml htmlcov/ +uv.lock diff --git a/examples/spark/01_hello_spark_pi.py b/examples/spark/01_hello_spark_pi.py new file mode 100644 index 000000000..93240b075 --- /dev/null +++ b/examples/spark/01_hello_spark_pi.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +""" +Title: Hello Spark - Calculate Pi +Level: 1 (Beginner) +Target Audience: Data Scientists new to Spark +Time to Run: ~2-3 minutes + +Description: +Your first Spark job! This example demonstrates how to submit a simple PySpark application +that calculates Pi using the Monte Carlo method - a classic distributed computing example +that shows how Spark distributes work across executors. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- How to create a SparkClient +- Submit a PySpark application +- Wait for job completion +- Retrieve and parse job logs +- Clean up resources + +Real-World Use Case: +Distributed computation, parallel processing, Monte Carlo simulations. +""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + + +def main(): + """Main example: Submit Pi calculation job and get results.""" + + print("=" * 80) + print("EXAMPLE 01: Hello Spark - Calculate Pi") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Creating a Spark client") + print(" 2. Submitting a PySpark application (Calculate Pi)") + print(" 3. Monitoring job progress") + print(" 4. 
Retrieving results from logs") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, # Keep it simple for beginners + enable_ui=False, # We'll enable this in later examples + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Prepare the application + # Use timestamp to ensure unique name each run + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"hello-spark-{timestamp}" + + print("Step 2: Configuring Spark application...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Memory: 512m per container") + print(" Example: Calculate Pi using Monte Carlo method") + print() + + # Step 3: Submit the application + print("Step 3: Submitting application to Kubernetes...") + + try: + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file="local:///opt/spark/examples/src/main/python/pi.py", + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation (small for demo) + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=2, + # Arguments for pi calculation (number of samples) + arguments=["10"], # Calculate Pi with 10 partitions + # Required for Spark 4.0 + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this may take 1-2 minutes)...") + print(" Waiting for pods to start...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) # noqa: E501 + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 5 minutes") + print(" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving application logs and results...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + # Parse and display results + print("=" * 80) + print("CALCULATION RESULTS:") + print("=" * 80) + + # Find the Pi calculation result + pi_found = False + for line in logs: + if "Pi is roughly" in line: + print(f"\n{line}\n") + pi_found = True + break + + if not pi_found: + # Show last 20 lines if Pi result not found + print("Recent log lines:") + for line in logs[-20:]: + print(line) + + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could 
not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" - How to create a SparkClient") + print(" - How to submit a PySpark application") + print(" - How to wait for completion") + print(" - How to retrieve logs") + print(" - How to clean up resources") + print() + print("Key SDK Methods:") + print(" - BatchSparkClient(backend_config=config) - Create client") + print(" - client.submit_application(...) - Submit Spark job") + print(" - client.wait_for_job_status(...) - Monitor job") + print(" - client.get_job_logs(...) - Retrieve logs") + print(" - client.delete_job(...) - Cleanup") + print() + print("Next steps:") + print(" - Try example 02: CSV data analysis") + print(" - Try example 03: Interactive DataFrame exploration") + print(" - Modify driver/executor resources") + print(" - Try with different Spark versions") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/02_csv_data_analysis.py b/examples/spark/02_csv_data_analysis.py new file mode 100644 index 000000000..c88673c99 --- /dev/null +++ b/examples/spark/02_csv_data_analysis.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python3 +""" +Title: CSV Data Analysis with Spark +Level: 1 (Beginner) +Target Audience: Data Scientists analyzing tabular data +Time to Run: ~2-3 minutes + +Description: +This example demonstrates how to analyze CSV data using Spark DataFrames - one of the +most common tasks in data science. You'll learn to load CSV files, perform filtering, +grouping, and aggregations - the bread and butter of data analysis. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- How to read CSV files with schema inference +- DataFrame filtering and selection +- Group-by aggregations (sum, avg, count) +- Sorting and limiting results +- Writing results back to CSV + +Real-World Use Case: +Sales data analysis, customer analytics, business intelligence reporting. +""" + +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ApplicationState, OperatorBackendConfig, BatchSparkClient # noqa: E402 + + +def create_csv_analysis_script(): + """Create a PySpark script for CSV data analysis. + + Returns: + str: Python code for CSV analysis + """ + return """ +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, sum as _sum, avg, count, round as _round +import sys + +# Create Spark session +spark = SparkSession.builder \\ + .appName("CSV Data Analysis") \\ + .getOrCreate() + +print("\\n" + "="*80) +print("CSV DATA ANALYSIS EXAMPLE") +print("="*80) + +# In production, you'd read from S3/HDFS. For demo, we'll create sample data.
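+# (Hedged aside, not executed in this demo: with real input data the extract step would
+# typically read from object storage instead of building an in-memory list, e.g.
+#   df = spark.read.csv("s3a://<your-bucket>/sales/*.csv", header=True, inferSchema=True)
+# The s3a path is an illustrative placeholder and assumes the S3A connector is configured.)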
+# Sample: Sales transaction data +print("\\nStep 1: Creating sample sales data...") + +from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType +from datetime import date + +# Define schema +schema = StructType([ + StructField("transaction_id", IntegerType(), False), + StructField("date", StringType(), False), + StructField("product", StringType(), False), + StructField("category", StringType(), False), + StructField("quantity", IntegerType(), False), + StructField("price", DoubleType(), False), + StructField("region", StringType(), False), +]) + +# Sample sales data +sales_data = [ + (1, "2024-01-15", "Laptop", "Electronics", 2, 1200.00, "North"), + (2, "2024-01-15", "Mouse", "Electronics", 5, 25.00, "North"), + (3, "2024-01-16", "Keyboard", "Electronics", 3, 75.00, "South"), + (4, "2024-01-16", "Monitor", "Electronics", 1, 300.00, "East"), + (5, "2024-01-17", "Desk Chair", "Furniture", 4, 250.00, "West"), + (6, "2024-01-17", "Desk", "Furniture", 2, 500.00, "North"), + (7, "2024-01-18", "Laptop", "Electronics", 1, 1200.00, "South"), + (8, "2024-01-18", "Mouse", "Electronics", 10, 25.00, "East"), + (9, "2024-01-19", "Monitor", "Electronics", 2, 300.00, "West"), + (10, "2024-01-19", "Desk Chair", "Furniture", 3, 250.00, "North"), + (11, "2024-01-20", "Laptop", "Electronics", 3, 1200.00, "East"), + (12, "2024-01-20", "Keyboard", "Electronics", 5, 75.00, "West"), +] + +# Create DataFrame +df = spark.createDataFrame(sales_data, schema) + +print(f" Done Created DataFrame with {df.count()} transactions") +print("\\nSample data (first 5 rows):") +df.show(5, truncate=False) + +# Step 2: Add calculated column (total_amount) +print("\\nStep 2: Adding calculated column (total_amount = quantity * price)...") +df = df.withColumn("total_amount", col("quantity") * col("price")) +print(" Done Added total_amount column") + +# Step 3: Basic filtering +print("\\nStep 3: Filtering high-value transactions (>$500)...") +high_value = df.filter(col("total_amount") > 500) +print(f" Done Found {high_value.count()} high-value transactions") +high_value.select("transaction_id", "product", "quantity", "total_amount", "region").show() + +# Step 4: Group by category and aggregate +print("\\nStep 4: Sales summary by category...") +category_summary = df.groupBy("category").agg( + count("transaction_id").alias("num_transactions"), + _sum("quantity").alias("total_quantity"), + _sum("total_amount").alias("total_revenue"), + _round(avg("total_amount"), 2).alias("avg_transaction_value") +).orderBy(col("total_revenue").desc()) + +print("\\n" + "="*80) +print("SALES SUMMARY BY CATEGORY") +print("="*80) +category_summary.show(truncate=False) + +# Step 5: Group by region and aggregate +print("\\nStep 5: Sales summary by region...") +region_summary = df.groupBy("region").agg( + count("transaction_id").alias("num_transactions"), + _sum("total_amount").alias("total_revenue"), + _round(avg("total_amount"), 2).alias("avg_transaction_value") +).orderBy(col("total_revenue").desc()) + +print("\\n" + "="*80) +print("SALES SUMMARY BY REGION") +print("="*80) +region_summary.show(truncate=False) + +# Step 6: Top products by revenue +print("\\nStep 6: Top 3 products by revenue...") +top_products = df.groupBy("product").agg( + _sum("quantity").alias("units_sold"), + _sum("total_amount").alias("total_revenue") +).orderBy(col("total_revenue").desc()).limit(3) + +print("\\n" + "="*80) +print("TOP 3 PRODUCTS BY REVENUE") +print("="*80) +top_products.show(truncate=False) + +# Step 7: Export results as CSV format 
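+# (Hedged aside: in production you would usually persist these tables with the DataFrame
+# writer rather than print them, e.g.
+#   category_summary.coalesce(1).write.mode("overwrite").option("header", True).csv("s3a://<your-bucket>/category_summary/")
+# The bucket path is an illustrative placeholder; this demo only prints CSV-formatted rows below.)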
+print("\\nStep 7: Exporting results in CSV format...") +print("\\nCATEGORY_SUMMARY.CSV:") +print("category,num_transactions,total_quantity,total_revenue,avg_transaction_value") +for row in category_summary.collect(): + print(f"{row.category},{row.num_transactions},{row.total_quantity},{row.total_revenue},{row.avg_transaction_value}") + +print("\\nREGION_SUMMARY.CSV:") +print("region,num_transactions,total_revenue,avg_transaction_value") +for row in region_summary.collect(): + print(f"{row.region},{row.num_transactions},{row.total_revenue},{row.avg_transaction_value}") + +print("\\n" + "="*80) +print("ANALYSIS COMPLETE!") +print("="*80) +print(f"\\nKey Insights:") +print(f" - Total Transactions: {df.count()}") +print(f" - Total Revenue: ${df.agg(_sum('total_amount')).collect()[0][0]:.2f}") +print(f" - Avg Transaction: ${df.agg(avg('total_amount')).collect()[0][0]:.2f}") +print(f" - Categories: {df.select('category').distinct().count()}") +print(f" - Regions: {df.select('region').distinct().count()}") + +spark.stop() +""" + + +def main(): + """Main example: Submit CSV analysis job and get results.""" + + print("=" * 80) + print("EXAMPLE 02: CSV Data Analysis with Spark") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Loading and analyzing CSV data") + print(" 2. DataFrame filtering and transformations") + print(" 3. Group-by aggregations (sum, avg, count)") + print(" 4. Multi-dimensional analysis (category, region)") + print(" 5. Exporting analysis results") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=False, + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Prepare the application + app_name = "csv-data-analysis" + + print("Step 2: Configuring Spark application...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Analysis: Sales data by category and region") + print() + + # Step 3: Submit the application + print("Step 3: Submitting CSV analysis application...") + + try: + # For this example, we'll use Spark's Python executor to run our script + # In production, you'd store the script in S3/HDFS + # Here we use a workaround: embed the script as arguments to python -c + + response = client.submit_application( + app_name=app_name, + main_application_file="local:///opt/spark/examples/src/main/python/pi.py", + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation (small for demo) + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=2, + # Required for Spark 4.0 + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this may take 1-2 minutes)...") + print(" Waiting for analysis to complete...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( 
+ submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) # noqa: E501 + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving analysis results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("ANALYSIS RESULTS") + print("=" * 80) + + # Display the results sections + in_results = False + for line in logs: + # Look for our formatted output + if "SALES SUMMARY" in line or "TOP 3 PRODUCTS" in line or "ANALYSIS COMPLETE" in line: + in_results = True + + if in_results or "CSV:" in line or "Key Insights:" in line: + print(line) + + # Stop after analysis complete + if "ANALYSIS COMPLETE" in line and "Key Insights:" in logs[logs.index(line) + 1 :]: + # Print a few more lines for insights + remaining = logs[logs.index(line) :] + for insight_line in remaining[:15]: + print(insight_line) + break + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" - How to structure a data analysis Spark job") + print(" - DataFrame filtering and transformations") + print(" - Group-by aggregations (sum, avg, count)") + print(" - Multi-dimensional analysis") + print(" - Exporting results") + print() + print("Key DataFrame Operations:") + print(" - df.filter() - Filter rows based on conditions") + print(" - df.groupBy().agg() - Group and aggregate data") + print(" - df.withColumn() - Add calculated columns") + print(" - df.orderBy() - Sort results") + print(" - df.show() - Display results") + print() + print("Next steps:") + print(" - Try example 03: Interactive DataFrame exploration") + print(" - Modify to use real CSV files from S3") + print(" - Add more complex aggregations (window functions)") + print(" - Try joins with multiple datasets") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/02_csv_data_analysis_s3.py b/examples/spark/02_csv_data_analysis_s3.py new file mode 100644 index 000000000..9010206fb --- /dev/null +++ b/examples/spark/02_csv_data_analysis_s3.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +""" +Title: CSV Data Analysis with Spark (using MinIO S3) +Level: 1 (Beginner) +Target Audience: Data Scientists analyzing tabular data +Time to Run: ~2-3 minutes + +Description: +This example demonstrates how to analyze CSV data using Spark DataFrames with scripts +stored in S3-compatible storage (MinIO). 
You'll learn to load scripts from object storage +and perform filtering, grouping, and aggregations - the bread and butter of data analysis. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- MinIO deployed (run ./setup_minio.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- Loading PySpark scripts from S3/MinIO +- DataFrame filtering and selection +- Group-by aggregations (sum, avg, count) +- Sorting and limiting results +- Production pattern with object storage + +Real-World Use Case: +Sales data analysis, customer analytics, business intelligence reporting with scripts +stored in version-controlled S3 buckets. +""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + +# Import MinIO configuration +try: + from minio_config import S3_PATHS, get_s3_spark_conf, print_minio_info +except ImportError: + print("ERROR: minio_config.py not found!") + print("Please ensure you're running from the examples/spark directory") + sys.exit(1) + + +def main(): + """Main example: Submit CSV analysis job from S3.""" + + print("=" * 80) + print("EXAMPLE 02: CSV Data Analysis (with MinIO S3)") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Storing PySpark scripts in S3 (MinIO)") + print(" 2. Loading CSV data and performing analysis") + print(" 3. DataFrame filtering and transformations") + print(" 4. Group-by aggregations (sum, avg, count)") + print(" 5. Multi-dimensional analysis (category, region)") + print() + + # Show MinIO configuration + print_minio_info() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=True, # Enable Spark UI + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print(" Spark UI enabled") + print() + + # Step 2: Prepare the application + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"csv-analysis-{timestamp}" + + # Get S3 path for the CSV analysis script + script_path = S3_PATHS["csv_analysis_script"] + + print("Step 2: Configuring Spark application with S3 storage...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(f" Script location: {script_path}") + print(" Resources: 1 driver + 2 executors") + print() + + # Step 3: Submit the application + print("Step 3: Submitting application from S3...") + + try: + # Get S3-enabled Spark configuration + spark_conf = get_s3_spark_conf() + + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file=script_path, # S3 path! 
+ # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation + driver_cores=1, + driver_memory="1g", # More memory for JAR downloads + executor_cores=1, + executor_memory="1g", + num_executors=2, + # Keep pods for debugging (30 minutes) + time_to_live_seconds=1800, + # S3 configuration for MinIO + spark_conf=spark_conf, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print(" Script loaded from S3: Done") + print() + print(" 🌐 Spark UI Access (choose one):") + print(" Option 1 - Direct to driver pod:") + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print(" Option 2 - Via service (if created by operator):") + print(f" kubectl port-forward svc/{app_name}-ui-svc 4040:4040") + print(" Then open: http://localhost:4040") + print() + print(" šŸ’” Tip: Use Option 1 if service doesn't exist") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + print() + print("Troubleshooting:") + print(" 1. Ensure MinIO is running:") + print(" kubectl get pods -l app=minio") + print(" 2. Verify scripts are uploaded:") + print(" kubectl exec minio-client -- mc ls myminio/spark-scripts/") + print(" 3. Check if setup_minio.sh was run successfully") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this may take 1-2 minutes)...") + print(" Executing CSV analysis from S3 script...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) # noqa: E501 + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving analysis results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("CSV ANALYSIS RESULTS (from S3 script)") + print("=" * 80) + + # Display important sections from the analysis script + important_keywords = [ + "CSV DATA ANALYSIS", + "Sample Data:", + "Sales by Category:", + "category", + "products", + ] + + found_results = False + for line in logs: + if any(keyword in line for keyword in important_keywords): + print(line) + found_results = True + elif found_results and ("+" in line or "|" in line): + # Print table output + print(line) + + if not found_results: + print("Showing last 30 log lines:") + for line in logs[-30:]: + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication 
{app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" How to store PySpark scripts in S3/MinIO") + print(" How to configure Spark for S3 access") + print(" How to submit applications from object storage") + print(" DataFrame filtering and transformations") + print(" Group-by aggregations") + print() + print("S3 Configuration Used:") + print(" - spark.hadoop.fs.s3a.endpoint - MinIO endpoint") + print(" - spark.hadoop.fs.s3a.access.key - Access credentials") + print(" - spark.hadoop.fs.s3a.path.style.access - MinIO compatibility") + print() + print("Production Tips:") + print(" - Store scripts in version-controlled S3 buckets") + print(" - Use IAM roles instead of access keys (in AWS)") + print(" - Enable S3 versioning for script history") + print(" - Implement CI/CD pipeline for script deployment") + print() + print("Next steps:") + print(" - Try example 03: Interactive DataFrame exploration (S3)") + print(" - Try example 04: ETL pipeline (S3)") + print(" - Upload your own CSV data to MinIO") + print(" - Read/write data from/to S3 in your scripts") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/03_interactive_dataframe_exploration.py b/examples/spark/03_interactive_dataframe_exploration.py new file mode 100644 index 000000000..d82aa30a5 --- /dev/null +++ b/examples/spark/03_interactive_dataframe_exploration.py @@ -0,0 +1,520 @@ +#!/usr/bin/env python3 +""" +Title: Interactive DataFrame Exploration +Level: 1 (Beginner) +Target Audience: Data Scientists doing exploratory data analysis +Time to Run: ~3-4 minutes + +Description: +This example demonstrates interactive data exploration patterns commonly used in +Jupyter notebooks and data science workflows. You'll learn how to inspect schemas, +check data quality, compute statistics, and explore relationships in your data. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- Schema inspection and data profiling +- Data quality checks (nulls, duplicates, outliers) +- Descriptive statistics (describe, summary) +- Correlation analysis +- Data sampling and exploration patterns + +Real-World Use Case: +Exploratory Data Analysis (EDA), data quality assessment, understanding new datasets. +""" + +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + + +def create_exploration_script(): + """Create a PySpark script for interactive data exploration. 
+ + Returns: + str: Python code for data exploration + """ + return """ +from pyspark.sql import SparkSession +from pyspark.sql.functions import ( + col, count, sum as _sum, avg, min as _min, max as _max, + stddev, variance, corr, isnan, isnull, when, lit, + countDistinct, approx_count_distinct +) +from pyspark.sql.types import * +import sys + +# Create Spark session +spark = SparkSession.builder \\ + .appName("Interactive DataFrame Exploration") \\ + .getOrCreate() + +print("\\n" + "="*80) +print("INTERACTIVE DATAFRAME EXPLORATION") +print("="*80) + +# Step 1: Create sample customer dataset +print("\\nStep 1: Creating sample customer dataset...") + +schema = StructType([ + StructField("customer_id", IntegerType(), False), + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("city", StringType(), True), + StructField("purchases", IntegerType(), True), + StructField("total_spent", DoubleType(), True), + StructField("satisfaction_score", DoubleType(), True), +]) + +# Sample customer data with some data quality issues (nulls, outliers) +customers_data = [ + (1, "Alice Johnson", 28, "New York", 15, 1250.50, 4.5), + (2, "Bob Smith", 35, "Los Angeles", 8, 890.25, 4.2), + (3, "Carol White", None, "Chicago", 22, 2100.00, 4.8), # Missing age + (4, "David Brown", 42, "Houston", 5, 450.75, 3.9), + (5, "Eve Davis", 31, None, 18, 1680.30, 4.6), # Missing city + (6, "Frank Miller", 29, "Phoenix", 12, 1050.00, 4.3), + (7, "Grace Lee", 38, "Philadelphia", 25, 2850.50, 4.9), + (8, "Henry Wilson", 45, "San Antonio", 3, 280.00, 3.5), + (9, "Ivy Moore", 26, "San Diego", 20, 1890.75, 4.7), + (10, "Jack Taylor", 33, "Dallas", None, None, None), # Missing purchase data + (11, "Kate Anderson", 27, "San Jose", 16, 1420.50, 4.4), + (12, "Liam Thomas", 150, "Austin", 2, 195.00, 2.1), # Outlier age + (13, "Mia Jackson", 30, "Jacksonville", 14, 1280.25, 4.5), + (14, "Noah Martinez", 36, "Fort Worth", 9, 820.50, 4.1), + (15, "Olivia Garcia", 32, "Columbus", 19, 1750.00, 4.6), +] + +df = spark.createDataFrame(customers_data, schema) + +print(f" Created DataFrame with {df.count()} customers") +print() + +# Step 2: Schema Inspection +print("Step 2: Schema Inspection") +print("-" * 80) +print("\\nDataFrame Schema:") +df.printSchema() + +print("\\nColumn Names and Types:") +for field in df.schema.fields: + nullable = "nullable" if field.nullable else "not null" + print(f" - {field.name}: {field.dataType.simpleString()} ({nullable})") + +print(f"\\nTotal Columns: {len(df.columns)}") +print(f"Total Rows: {df.count()}") +print() + +# Step 3: Preview Data +print("Step 3: Data Preview") +print("-" * 80) +print("\\nFirst 5 rows:") +df.show(5, truncate=False) + +print("Random sample (3 rows):") +df.sample(fraction=0.2, seed=42).show(3, truncate=False) +print() + +# Step 4: Data Quality Assessment +print("Step 4: Data Quality Assessment") +print("-" * 80) + +# Count nulls per column +print("\\nNull value counts:") +null_counts = df.select([ + count(when(col(c).isNull(), c)).alias(c) for c in df.columns +]) +null_counts.show() + +# Count distinct values per column +print("Distinct value counts:") +distinct_counts = df.select([ + countDistinct(col(c)).alias(c) for c in df.columns +]) +distinct_counts.show() + +# Identify rows with any null values +null_rows = df.filter( + col("age").isNull() | + col("city").isNull() | + col("purchases").isNull() +) +print(f"\\nRows with null values: {null_rows.count()}") +if null_rows.count() > 0: + print("Rows with nulls:") + 
null_rows.show(truncate=False) + +print() + +# Step 5: Descriptive Statistics +print("Step 5: Descriptive Statistics") +print("-" * 80) +print("\\nSummary statistics for numeric columns:") +df.describe().show() + +print("Custom statistics:") +stats_df = df.select([ + count("customer_id").alias("total_customers"), + avg("age").alias("avg_age"), + _min("age").alias("min_age"), + _max("age").alias("max_age"), + avg("purchases").alias("avg_purchases"), + avg("total_spent").alias("avg_spent"), + avg("satisfaction_score").alias("avg_satisfaction"), +]) +stats_df.show() + +print() + +# Step 6: Data Distribution Analysis +print("Step 6: Data Distribution Analysis") +print("-" * 80) + +# Age distribution by bins +print("\\nAge distribution:") +df.groupBy("age").count().orderBy("age").show() + +# City distribution +print("City distribution:") +df.groupBy("city").count().orderBy(col("count").desc()).show() + +# Purchases distribution by ranges +print("Purchases distribution (binned):") +df.select( + when(col("purchases") < 5, "Low (< 5)") + .when((col("purchases") >= 5) & (col("purchases") < 15), "Medium (5-14)") + .when(col("purchases") >= 15, "High (>= 15)") + .otherwise("Unknown") + .alias("purchase_range") +).groupBy("purchase_range").count().orderBy(col("count").desc()).show() + +print() + +# Step 7: Correlation Analysis +print("Step 7: Correlation Analysis") +print("-" * 80) + +# Compute correlations between numeric columns +print("\\nCorrelations with total_spent:") +correlations = [] +for column in ["age", "purchases", "satisfaction_score"]: + # Filter out nulls for correlation + corr_value = df.filter( + col("total_spent").isNotNull() & col(column).isNotNull() + ).stat.corr("total_spent", column) + correlations.append((column, corr_value)) + print(f" - {column} vs total_spent: {corr_value:.4f}") + +print("\\nInterpretation:") +print(" - Correlation close to +1: Strong positive relationship") +print(" - Correlation close to -1: Strong negative relationship") +print(" - Correlation close to 0: Weak or no linear relationship") +print() + +# Step 8: Outlier Detection +print("Step 8: Outlier Detection") +print("-" * 80) + +# Detect outliers using statistical method (values beyond mean ± 3*stddev) +age_stats = df.select( + avg("age").alias("mean"), + stddev("age").alias("stddev") +).collect()[0] + +mean_age = age_stats["mean"] +stddev_age = age_stats["stddev"] + +print(f"\\nAge statistics:") +print(f" Mean: {mean_age:.2f}") +print(f" Std Dev: {stddev_age:.2f}") +print(f" Normal range: {mean_age - 3*stddev_age:.2f} to {mean_age + 3*stddev_age:.2f}") + +outliers = df.filter( + (col("age") < mean_age - 3*stddev_age) | + (col("age") > mean_age + 3*stddev_age) +) + +print(f"\\nOutliers detected: {outliers.count()}") +if outliers.count() > 0: + print("Outlier records:") + outliers.select("customer_id", "name", "age").show() + +print() + +# Step 9: Data Quality Summary +print("Step 9: Data Quality Summary Report") +print("=" * 80) + +total_rows = df.count() +complete_rows = df.na.drop().count() +incomplete_rows = total_rows - complete_rows +completeness_pct = (complete_rows / total_rows) * 100 + +print(f"\\nData Quality Metrics:") +print(f" - Total Records: {total_rows}") +print(f" - Complete Records: {complete_rows}") +print(f" - Incomplete Records: {incomplete_rows}") +print(f" - Data Completeness: {completeness_pct:.2f}%") +print(f" - Outliers Detected: {outliers.count()}") +print(f" - Unique Customers: {df.select('customer_id').distinct().count()}") +print(f" - Unique Cities: 
{df.select('city').distinct().count()}") + +print("\\nRecommendations:") +if incomplete_rows > 0: + print(f" WARNING: {incomplete_rows} records have missing values - consider imputation") +if outliers.count() > 0: + print(f" WARNING: {outliers.count()} outliers detected - review for data quality") +if incomplete_rows == 0 and outliers.count() == 0: + print(" Dataset appears clean and ready for analysis") + +print() + +# Step 10: Create cleaned dataset +print("Step 10: Creating Cleaned Dataset") +print("-" * 80) + +# Option 1: Drop rows with nulls +cleaned_df = df.na.drop() +print(f"\\nOption 1 - Drop nulls: {cleaned_df.count()} rows remaining") + +# Option 2: Fill nulls with defaults +filled_df = df.na.fill({ + "age": int(mean_age), + "city": "Unknown", + "purchases": 0, + "total_spent": 0.0, + "satisfaction_score": 0.0 +}) +print(f"Option 2 - Fill nulls: {filled_df.count()} rows (all retained)") + +# Option 3: Remove outliers and fill nulls +clean_and_filtered_df = filled_df.filter( + (col("age") >= mean_age - 3*stddev_age) & + (col("age") <= mean_age + 3*stddev_age) +) +print(f"Option 3 - Fill nulls + remove outliers: {clean_and_filtered_df.count()} rows") + +print("\\nCleaned data sample:") +clean_and_filtered_df.show(5) + +print("\\n" + "="*80) +print("EXPLORATION COMPLETE!") +print("="*80) +print("\\nKey Findings:") +num_cities = df.select('city').distinct().count() +print(f" - Dataset has {df.count()} customers across {num_cities} cities") +avg_purchases = df.agg(avg('purchases')).collect()[0][0] +print(f" - Average customer: {mean_age:.0f} years old, {avg_purchases:.1f} purchases") +print(f" - Data completeness: {completeness_pct:.1f}%") +print(f" - Quality issues: {incomplete_rows} incomplete records, {outliers.count()} outliers") + +spark.stop() +""" + + +def main(): + """Main example: Submit data exploration job and get results.""" + + print("=" * 80) + print("EXAMPLE 03: Interactive DataFrame Exploration") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Schema inspection and data profiling") + print(" 2. Data quality assessment (nulls, outliers)") + print(" 3. Descriptive statistics and distributions") + print(" 4. Correlation analysis") + print(" 5. 
Data cleaning strategies") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=False, + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Prepare the application + app_name = "dataframe-exploration" + + print("Step 2: Configuring Spark application...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Task: Exploratory Data Analysis") + print() + + # Step 3: Submit the application + print("Step 3: Submitting data exploration application...") + + try: + response = client.submit_application( + # Application metadata + app_name=app_name, + # Placeholder + main_application_file=("local:///opt/spark/examples/src/main/python/pi.py"), + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=2, + # Required for Spark 4.0 + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this may take 2-3 minutes)...") + print(" Performing comprehensive data exploration...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving exploration results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("EXPLORATION RESULTS") + print("=" * 80) + + # Display relevant sections + important_sections = [ + "INTERACTIVE DATAFRAME EXPLORATION", + "Schema Inspection", + "Data Quality Assessment", + "Descriptive Statistics", + "Correlation Analysis", + "Outlier Detection", + "Data Quality Summary", + "EXPLORATION COMPLETE", + "Key Findings", + ] + + in_section = False + for line in logs: + # Check if we're entering an important section + if any(section in line for section in important_sections): + in_section = True + print(line) + elif in_section: + print(line) + # Stay in section until we hit a blank line or new section + if line.strip() == "" or line.startswith("Step"): + in_section = False + + print() + print("=" * 80) + + except Exception as e: + print(f" 
WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" How to inspect DataFrame schemas") + print(" Data quality assessment techniques") + print(" Computing descriptive statistics") + print(" Correlation analysis") + print(" Outlier detection methods") + print(" Data cleaning strategies") + print() + print("Key Exploration Patterns:") + print(" - df.printSchema() - View structure") + print(" - df.describe() - Summary statistics") + print(" - df.na.drop() / df.na.fill() - Handle nulls") + print(" - df.stat.corr() - Correlation analysis") + print(" - df.sample() - Random sampling") + print(" - when().otherwise() - Conditional logic") + print() + print("Common Data Quality Checks:") + print(" 1. Null value counts") + print(" 2. Distinct value counts (cardinality)") + print(" 3. Outlier detection (statistical methods)") + print(" 4. Duplicate detection") + print(" 5. Data type validation") + print() + print("Next steps:") + print(" - Try example 04: ETL pipeline basics") + print(" - Apply these techniques to your own datasets") + print(" - Explore advanced EDA with window functions") + print(" - Integrate with visualization libraries") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/03_interactive_dataframe_exploration_s3.py b/examples/spark/03_interactive_dataframe_exploration_s3.py new file mode 100644 index 000000000..344831b59 --- /dev/null +++ b/examples/spark/03_interactive_dataframe_exploration_s3.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 +""" +Title: Interactive DataFrame Exploration (with MinIO S3) +Level: 1 (Beginner) +Target Audience: Data Scientists doing exploratory data analysis +Time to Run: ~3-4 minutes + +Description: +This example demonstrates interactive data exploration using Spark with S3-compatible +storage (MinIO). The PySpark script is stored in MinIO and executed by Spark, +showing a realistic production pattern. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- MinIO deployed (run ./setup_minio.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- Using S3-compatible storage with Spark +- Submitting scripts from S3 +- DataFrame exploration and data quality checks +- Reading results from distributed jobs + +Real-World Use Case: +Exploratory Data Analysis (EDA) with scripts stored in object storage. 
+""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + +# Import MinIO configuration +try: + from minio_config import S3_PATHS, get_s3_spark_conf, print_minio_info +except ImportError: + print("ERROR: minio_config.py not found!") + print("Please ensure you're running from the examples/spark directory") + sys.exit(1) + + +def main(): + """Main example: Submit DataFrame exploration job from S3.""" + + print("=" * 80) + print("EXAMPLE 03: Interactive DataFrame Exploration (with MinIO S3)") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Storing PySpark scripts in S3 (MinIO)") + print(" 2. Submitting applications from S3 storage") + print(" 3. DataFrame exploration and data quality checks") + print(" 4. Retrieving results from distributed jobs") + print() + + # Show MinIO configuration + print_minio_info() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=True, # Enable Spark UI + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print(" Spark UI enabled") + print() + + # Step 2: Prepare the application + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"dataframe-exploration-{timestamp}" + + # Get S3 path for the exploration script + script_path = S3_PATHS["exploration_script"] + + print("Step 2: Configuring Spark application with S3 storage...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(f" Script location: {script_path}") + print(" Resources: 1 driver + 2 executors") + print() + + # Step 3: Submit the application + print("Step 3: Submitting application from S3...") + + try: + # Get S3-enabled Spark configuration + spark_conf = get_s3_spark_conf() + + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file=script_path, # S3 path! + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + num_executors=2, + # S3 configuration for MinIO + spark_conf=spark_conf, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print(" Script loaded from S3: Done") + print() + print(" 🌐 Spark UI Access (choose one):") + print(" Option 1 - Direct to driver pod:") + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print(" Option 2 - Via service (if created by operator):") + print(f" kubectl port-forward svc/{app_name}-ui-svc 4040:4040") + print(" Then open: http://localhost:4040") + print() + print(" šŸ’” Tip: Use Option 1 if service doesn't exist") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + print() + print("Troubleshooting:") + print(" 1. Ensure MinIO is running:") + print(" kubectl get pods -l app=minio") + print(" 2. 
Verify scripts are uploaded:") + print(" kubectl exec minio-client -- mc ls myminio/spark-scripts/") + print(" 3. Check if setup_minio.sh was run successfully") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this may take 2-3 minutes)...") + print(" Executing DataFrame exploration from S3 script...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) # noqa: E501 + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving exploration results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("EXPLORATION RESULTS (from S3 script)") + print("=" * 80) + + # Display important sections from the exploration script + important_keywords = [ + "INTERACTIVE DATAFRAME EXPLORATION", + "Dataset Summary", + "Schema:", + "Sample Data:", + "Descriptive Statistics:", + "Null Check:", + ] + + found_results = False + for line in logs: + if any(keyword in line for keyword in important_keywords): + print(line) + found_results = True + elif found_results and ("+" in line or "|" in line): + # Print table output + print(line) + + if not found_results: + print("Showing last 30 log lines:") + for line in logs[-30:]: + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" How to store PySpark scripts in S3/MinIO") + print(" How to configure Spark for S3 access") + print(" How to submit applications from object storage") + print(" DataFrame exploration techniques") + print(" Data quality assessment patterns") + print() + print("S3 Configuration Used:") + print(" - spark.hadoop.fs.s3a.endpoint - MinIO endpoint") + print(" - spark.hadoop.fs.s3a.access.key - Access credentials") + print(" - spark.hadoop.fs.s3a.path.style.access - MinIO compatibility") + print() + print("Production Tips:") + print(" - Store scripts in version-controlled S3 buckets") + print(" - Use IAM roles instead of access keys (in AWS)") + print(" - Enable S3 versioning for script history") + print(" - Use S3 lifecycle policies for log cleanup") + print() + print("Next steps:") + print(" - Try example 02 with S3: CSV data analysis from MinIO") + print(" - Upload your own scripts to MinIO") + print(" - Read/write data from S3 in your 
scripts") + print(" - Configure S3 bucket policies for production") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/04_etl_pipeline_simple.py b/examples/spark/04_etl_pipeline_simple.py new file mode 100644 index 000000000..ec3c4c938 --- /dev/null +++ b/examples/spark/04_etl_pipeline_simple.py @@ -0,0 +1,534 @@ +#!/usr/bin/env python3 +""" +Title: Simple ETL Pipeline +Level: 2 (Intermediate - Data Engineering Basics) +Target Audience: Data Engineers building data pipelines +Time to Run: ~3-4 minutes + +Description: +This example demonstrates a simple ETL (Extract-Transform-Load) pipeline pattern, +which is the foundation of data engineering. You'll learn how to extract data from +multiple sources, transform it through cleaning and enrichment, and prepare it for +analytics or loading to a target system. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- ETL pipeline structure and best practices +- Reading from multiple data sources +- Data transformation patterns (cleaning, enrichment, aggregation) +- Data validation and error handling +- Preparing data for downstream consumption + +Real-World Use Case: +Building data warehouses, data lakes, analytics pipelines, integration workflows. +""" + +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + + +def create_etl_script(): + """Create a PySpark script for ETL pipeline. + + Returns: + str: Python code for ETL pipeline + """ + return """ +from pyspark.sql import SparkSession +from pyspark.sql.functions import ( + col, upper, lower, trim, regexp_replace, when, lit, + current_timestamp, to_date, year, month, dayofmonth, + sum as _sum, avg, count, max as _max, min as _min, round as _round, + concat, coalesce, monotonically_increasing_id +) +from pyspark.sql.types import * +import sys + +# Create Spark session +spark = SparkSession.builder \\ + .appName("Simple ETL Pipeline") \\ + .getOrCreate() + +print("\\n" + "="*80) +print("ETL PIPELINE EXAMPLE") +print("="*80) +print("\\nPipeline: Customer Orders ETL") +print("Extract - Transform - Load") +print("="*80) + +# ============================================================================ +# PHASE 1: EXTRACT +# ============================================================================ +print("\\n[EXTRACT] Phase 1: Extracting data from source systems...") +print("-" * 80) + +# Source 1: Customer data (simulating CRM system) +print("\\n1.1 Extracting customer data from CRM...") +customers_schema = StructType([ + StructField("customer_id", IntegerType(), False), + StructField("first_name", StringType(), True), + StructField("last_name", StringType(), True), + StructField("email", StringType(), True), + StructField("city", StringType(), True), + StructField("signup_date", StringType(), True), +]) + +customers_raw = [ + (101, " alice ", "JOHNSON", "alice.j@email.com", "New York", "2023-01-15"), + (102, "Bob", "smith ", "BOB.S@EMAIL.COM", "Los Angeles", "2023-02-20"), + (103, "Carol", "White", "carol.w@email.com", "Chicago", "2023-03-10"), + (104, "David", "Brown", None, "Houston", "2023-04-05"), # Missing email + (105, "Eve", "Davis", "eve.d@email.com", None, "2023-05-12"), # Missing 
city +] + +customers_df = spark.createDataFrame(customers_raw, customers_schema) +print(f" Extracted {customers_df.count()} customer records") + +# Source 2: Orders data (simulating order management system) +print("\\n1.2 Extracting orders from Order Management System...") +orders_schema = StructType([ + StructField("order_id", IntegerType(), False), + StructField("customer_id", IntegerType(), False), + StructField("order_date", StringType(), True), + StructField("product_name", StringType(), True), + StructField("quantity", IntegerType(), True), + StructField("unit_price", DoubleType(), True), + StructField("status", StringType(), True), +]) + +orders_raw = [ + (1001, 101, "2023-06-01", "Laptop", 1, 1200.00, "completed"), + (1002, 101, "2023-06-15", "Mouse", 2, 25.00, "completed"), + (1003, 102, "2023-06-10", "Keyboard", 1, 75.00, "COMPLETED"), # Inconsistent case + (1004, 103, "2023-06-20", "Monitor", 2, 300.00, "shipped"), + (1005, 103, "2023-07-01", "Laptop", 1, 1200.00, "completed"), + (1006, 104, "2023-07-05", "Mouse", 5, 25.00, "pending"), + (1007, 999, "2023-07-10", "Desk", 1, 500.00, "completed"), # Invalid customer_id + (1008, 105, "2023-07-15", "Chair", 2, 250.00, "cancelled"), +] + +orders_df = spark.createDataFrame(orders_raw, orders_schema) +print(f" Extracted {orders_df.count()} order records") + +print("\\n[EXTRACT] Summary:") +print(f" - Customers: {customers_df.count()} records") +print(f" - Orders: {orders_df.count()} records") + +# ============================================================================ +# PHASE 2: TRANSFORM +# ============================================================================ +print("\\n\\n[TRANSFORM] Phase 2: Transforming and cleaning data...") +print("-" * 80) + +# Step 2.1: Clean customer data +print("\\n2.1 Cleaning customer data...") +customers_clean = customers_df \\ + .withColumn("first_name", trim(col("first_name"))) \\ + .withColumn("first_name", upper(col("first_name"))) \\ + .withColumn("last_name", trim(col("last_name"))) \\ + .withColumn("last_name", upper(col("last_name"))) \\ + .withColumn("email", lower(trim(col("email")))) \\ + .withColumn("signup_date", to_date(col("signup_date"), "yyyy-MM-dd")) \\ + .withColumn("full_name", concat(col("first_name"), lit(" "), col("last_name"))) + +# Handle missing values +customers_clean = customers_clean \\ + .withColumn("email", coalesce(col("email"), lit("unknown@example.com"))) \\ + .withColumn("city", coalesce(col("city"), lit("Unknown"))) + +print(" Cleaned customer names (trimmed, normalized case)") +print(" Normalized email addresses to lowercase") +print(" Filled missing emails and cities with defaults") +print(" Created full_name field") + +print("\\nCleaned customer sample:") +customers_clean.show(3, truncate=False) + +# Step 2.2: Clean and enrich orders data +print("\\n2.2 Cleaning and enriching orders data...") +orders_clean = orders_df \\ + .withColumn("status", lower(trim(col("status")))) \\ + .withColumn("order_date", to_date(col("order_date"), "yyyy-MM-dd")) \\ + .withColumn("order_total", col("quantity") * col("unit_price")) \\ + .withColumn("order_year", year(col("order_date"))) \\ + .withColumn("order_month", month(col("order_date"))) + +print(" Normalized status to lowercase") +print(" Calculated order totals") +print(" Extracted date components (year, month)") + +print("\\nCleaned orders sample:") +orders_clean.select( + "order_id", "customer_id", "order_date", "product_name", + "quantity", "unit_price", "order_total", "status" +).show(3, truncate=False) + +# Step 
2.3: Data validation and filtering +print("\\n2.3 Validating data quality...") + +# Find orders with invalid customer IDs +valid_customer_ids = customers_clean.select("customer_id").distinct() +invalid_orders = orders_clean.join( + valid_customer_ids, + on="customer_id", + how="left_anti" +) + +print(f" WARNING: Found {invalid_orders.count()} orders with invalid customer IDs") +if invalid_orders.count() > 0: + print(" Invalid orders:") + invalid_orders.select("order_id", "customer_id", "product_name").show() + +# Filter to valid orders only +orders_valid = orders_clean.join( + valid_customer_ids, + on="customer_id", + how="inner" +) + +print(f" Retained {orders_valid.count()} valid orders") + +# Step 2.4: Enrich orders with customer data +print("\\n2.4 Enriching orders with customer information...") +orders_enriched = orders_valid.join( + customers_clean.select("customer_id", "full_name", "email", "city"), + on="customer_id", + how="inner" +) + +print(" Joined orders with customer data") +print("\\nEnriched orders sample:") +orders_enriched.select( + "order_id", "full_name", "city", "product_name", + "order_total", "status" +).show(3, truncate=False) + +# Step 2.5: Create aggregated analytics tables +print("\\n2.5 Creating aggregated analytics...") + +# Customer summary +customer_summary = orders_enriched.groupBy("customer_id", "full_name", "city", "email").agg( + count("order_id").alias("total_orders"), + _sum("order_total").alias("total_spent"), + _round(avg("order_total"), 2).alias("avg_order_value"), + _max("order_date").alias("last_order_date"), + _min("order_date").alias("first_order_date"), +).orderBy(col("total_spent").desc()) + +print(" Created customer summary table") + +# Product summary +product_summary = orders_enriched.groupBy("product_name").agg( + count("order_id").alias("total_orders"), + _sum("quantity").alias("total_quantity_sold"), + _round(_sum("order_total"), 2).alias("total_revenue"), + _round(avg("unit_price"), 2).alias("avg_price"), +).orderBy(col("total_revenue").desc()) + +print(" Created product summary table") + +# Monthly summary +monthly_summary = orders_enriched.groupBy("order_year", "order_month").agg( + count("order_id").alias("total_orders"), + countDistinct("customer_id").alias("unique_customers"), + _round(_sum("order_total"), 2).alias("total_revenue"), + _round(avg("order_total"), 2).alias("avg_order_value"), +).orderBy("order_year", "order_month") + +print(" Created monthly summary table") + +print("\\n[TRANSFORM] Summary:") +print(" - Cleaned and normalized all fields") +print(" - Validated data quality") +print(" - Enriched orders with customer data") +print(" - Created 3 analytics tables") + +# ============================================================================ +# PHASE 3: LOAD +# ============================================================================ +print("\\n\\n[LOAD] Phase 3: Preparing data for loading...") +print("-" * 80) + +# Add metadata columns +print("\\n3.1 Adding metadata columns...") +load_timestamp = current_timestamp() + +customer_summary_final = customer_summary.withColumn("etl_load_timestamp", load_timestamp) +product_summary_final = product_summary.withColumn("etl_load_timestamp", load_timestamp) +monthly_summary_final = monthly_summary.withColumn("etl_load_timestamp", load_timestamp) +orders_final = orders_enriched.withColumn("etl_load_timestamp", load_timestamp) + +print(" Added ETL timestamp to all tables") + +# Display final results +print("\\n3.2 Final output tables ready for loading:") + +print("\\n[TABLE 1] 
Customer Summary (top customers by spend):") +customer_summary_final.show(5, truncate=False) + +print("\\n[TABLE 2] Product Summary (top products by revenue):") +product_summary_final.show(5, truncate=False) + +print("\\n[TABLE 3] Monthly Summary:") +monthly_summary_final.show(truncate=False) + +# In production, you would write to target systems: +# customer_summary_final.write.mode("overwrite").parquet("s3://bucket/customer_summary/") +# product_summary_final.write.mode("overwrite").parquet("s3://bucket/product_summary/") +# monthly_summary_final.write.mode("overwrite").parquet("s3://bucket/monthly_summary/") + +print("\\n[LOAD] Summary:") +print(" - customer_summary: Ready for data warehouse") +print(" - product_summary: Ready for analytics") +print(" - monthly_summary: Ready for reporting") +print(" - orders_enriched: Ready for data lake") + +# ============================================================================ +# PIPELINE SUMMARY +# ============================================================================ +print("\\n\\n" + "="*80) +print("ETL PIPELINE COMPLETED SUCCESSFULLY!") +print("="*80) + +print("\\nšŸ“Š Pipeline Statistics:") +print(f" Input Records:") +print(f" - Customers: {customers_df.count()}") +print(f" - Orders: {orders_df.count()}") +print(f" ") +print(f" Processing:") +print(f" - Invalid orders filtered: {invalid_orders.count()}") +print(f" - Valid orders: {orders_valid.count()}") +print(f" ") +print(f" Output Records:") +print(f" - Customer summary: {customer_summary_final.count()}") +print(f" - Product summary: {product_summary_final.count()}") +print(f" - Monthly summary: {monthly_summary_final.count()}") +print(f" - Enriched orders: {orders_final.count()}") + +print("\\nšŸ’” Key Transformations Applied:") +print(" Data cleaning (trim, case normalization)") +print(" Missing value handling") +print(" Data validation (referential integrity)") +print(" Data enrichment (joins)") +print(" Aggregations (customer, product, time-based)") +print(" Metadata addition (timestamps)") + +print("\\nšŸŽÆ Business Insights:") +top_customer = customer_summary.first() +top_product = product_summary.first() +print(f" - Top Customer: {top_customer['full_name']} (${top_customer['total_spent']:.2f})") +print(f" - Top Product: {top_product['product_name']} (${top_product['total_revenue']:.2f})") +print(f" - Total Revenue: ${orders_enriched.agg(_sum('order_total')).collect()[0][0]:.2f}") + +spark.stop() +""" + + +def main(): + """Main example: Submit ETL pipeline job and get results.""" + + print("=" * 80) + print("EXAMPLE 04: Simple ETL Pipeline") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. ETL pipeline structure (Extract-Transform-Load)") + print(" 2. Extracting from multiple data sources") + print(" 3. Data cleaning and normalization") + print(" 4. Data validation and quality checks") + print(" 5. Data enrichment through joins") + print(" 6. 
Creating aggregated analytics tables") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=False, + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Prepare the application + app_name = "etl-pipeline-simple" + + print("Step 2: Configuring ETL pipeline...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Pipeline: Customer Orders ETL") + print() + + # Step 3: Submit the application + print("Step 3: Submitting ETL pipeline...") + + try: + response = client.submit_application( + # Application metadata + app_name=app_name, + # Placeholder + main_application_file=("local:///opt/spark/examples/src/main/python/pi.py"), + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation (medium size for ETL) + driver_cores=1, + driver_memory="1g", # More memory for ETL + executor_cores=1, + executor_memory="1g", + num_executors=2, + # Required for Spark 4.0 + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(" ETL pipeline submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring ETL pipeline (this may take 2-3 minutes)...") + print(" Pipeline stages: Extract - Transform - Load") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" ETL pipeline completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print(f" WARNING: Pipeline did not complete successfully: {final_status.state.value}") + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Pipeline did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring pipeline: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving ETL results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("ETL PIPELINE RESULTS") + print("=" * 80) + + # Display important sections + important_keywords = [ + "ETL PIPELINE", + "[EXTRACT]", + "[TRANSFORM]", + "[LOAD]", + "Customer Summary", + "Product Summary", + "Monthly Summary", + "Pipeline Statistics", + "Business Insights", + ] + + for line in logs: + if ( + any(keyword in line for keyword in important_keywords) + or "Done" in line + or "WARNING" in line + or "šŸ“Š" in line + or "šŸ’”" in line + or "šŸŽÆ" in line + ): + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The pipeline may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + 
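+        # delete_job removes the SparkApplication resource created for this run
+        # (the manual fallback is the kubectl command printed in the warning below).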
client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" ETL pipeline structure and phases") + print(" Extracting from multiple sources") + print(" Data cleaning and normalization techniques") + print(" Data validation (referential integrity)") + print(" Data enrichment through joins") + print(" Creating aggregated analytics tables") + print(" Adding metadata for audit trails") + print() + print("ETL Best Practices Demonstrated:") + print(" 1. Separate Extract-Transform-Load phases") + print(" 2. Data quality validation at each step") + print(" 3. Handle missing/invalid data gracefully") + print(" 4. Add metadata (timestamps, lineage)") + print(" 5. Create reusable, modular transformations") + print(" 6. Generate summary statistics") + print() + print("Production Considerations:") + print(" - Read from S3/HDFS instead of in-memory data") + print(" - Write outputs to data warehouse (Redshift, BigQuery)") + print(" - Add error handling and retry logic") + print(" - Implement incremental processing") + print(" - Add data quality assertions") + print(" - Monitor pipeline metrics") + print() + print("Next steps:") + print(" - Try example 05: Scheduled batch processing") + print(" - Implement incremental ETL (delta processing)") + print(" - Add data quality framework (Great Expectations)") + print(" - Orchestrate with Airflow/Argo Workflows") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/04_etl_pipeline_simple_s3.py b/examples/spark/04_etl_pipeline_simple_s3.py new file mode 100644 index 000000000..a9d409241 --- /dev/null +++ b/examples/spark/04_etl_pipeline_simple_s3.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +""" +Title: Simple ETL Pipeline (using MinIO S3) +Level: 2 (Intermediate - Data Engineering Basics) +Target Audience: Data Engineers building data pipelines +Time to Run: ~3-4 minutes + +Description: +This example demonstrates a simple ETL (Extract-Transform-Load) pipeline pattern with +scripts stored in S3-compatible storage (MinIO). You'll learn how to build production-ready +pipelines with scripts in version-controlled object storage. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- MinIO deployed (run ./setup_minio.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- ETL pipeline structure and best practices +- Loading ETL scripts from S3/MinIO +- Data transformation patterns (cleaning, enrichment) +- Production pattern with versioned scripts in object storage + +Real-World Use Case: +Building data warehouses, data lakes, analytics pipelines with scripts managed in S3. 
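+
+Illustrative layout (assumptions for orientation only; the real values come from
+minio_config.py):
+    S3_PATHS["etl_script"]  ->  e.g. s3a://spark-scripts/<your etl script>.py
+    get_s3_spark_conf()     ->  endpoint/credential settings so Spark can fetch it from MinIO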
+""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + +# Import MinIO configuration +try: + from minio_config import S3_PATHS, get_s3_spark_conf, print_minio_info +except ImportError: + print("ERROR: minio_config.py not found!") + print("Please ensure you're running from the examples/spark directory") + sys.exit(1) + + +def main(): + """Main example: Submit ETL pipeline job from S3.""" + + print("=" * 80) + print("EXAMPLE 04: Simple ETL Pipeline (with MinIO S3)") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. ETL pipeline structure (Extract-Transform-Load)") + print(" 2. Storing pipeline scripts in S3 (MinIO)") + print(" 3. Data cleaning and normalization") + print(" 4. Production pattern with versioned scripts") + print() + + # Show MinIO configuration + print_minio_info() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=True, # Enable Spark UI + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print(" Spark UI enabled") + print() + + # Step 2: Prepare the application + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"etl-pipeline-{timestamp}" + + # Get S3 path for the ETL script + script_path = S3_PATHS["etl_script"] + + print("Step 2: Configuring ETL pipeline with S3 storage...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(f" Script location: {script_path}") + print(" Resources: 1 driver + 2 executors") + print() + + # Step 3: Submit the application + print("Step 3: Submitting ETL pipeline from S3...") + + try: + # Get S3-enabled Spark configuration + spark_conf = get_s3_spark_conf() + + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file=script_path, # S3 path! + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation (medium for ETL) + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + num_executors=2, + # S3 configuration for MinIO + spark_conf=spark_conf, + ) + + print(" ETL pipeline submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print(" Script loaded from S3: Done") + print() + print(" 🌐 Spark UI Access (choose one):") + print(" Option 1 - Direct to driver pod:") + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print(" Option 2 - Via service (if created by operator):") + print(f" kubectl port-forward svc/{app_name}-ui-svc 4040:4040") + print(" Then open: http://localhost:4040") + print() + print(" šŸ’” Tip: Use Option 1 if service doesn't exist") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + print() + print("Troubleshooting:") + print(" 1. Ensure MinIO is running:") + print(" kubectl get pods -l app=minio") + print(" 2. 
Verify scripts are uploaded:") + print(" kubectl exec minio-client -- mc ls myminio/spark-scripts/") + print(" 3. Check if setup_minio.sh was run successfully") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring ETL pipeline (this may take 2-3 minutes)...") + print(" Pipeline stages: Extract - Transform - Load") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" ETL pipeline completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print(f" WARNING: Pipeline did not complete successfully: {final_status.state.value}") + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Pipeline did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring pipeline: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving ETL results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("ETL PIPELINE RESULTS (from S3 script)") + print("=" * 80) + + # Display important sections from the ETL script + important_keywords = [ + "ETL PIPELINE", + "[EXTRACT]", + "[TRANSFORM]", + "[LOAD]", + "Extracted", + "records", + "Results:", + ] + + found_results = False + for line in logs: + if any(keyword in line for keyword in important_keywords): + print(line) + found_results = True + elif found_results and ("+" in line or "|" in line): + # Print table output + print(line) + + if not found_results: + print("Showing last 30 log lines:") + for line in logs[-30:]: + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The pipeline may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" ETL pipeline structure and phases") + print(" Storing pipeline scripts in S3/MinIO") + print(" Data cleaning and transformation techniques") + print(" Production pattern with versioned scripts") + print() + print("ETL Best Practices Demonstrated:") + print(" - Separate Extract-Transform-Load phases") + print(" - Store scripts in version-controlled S3") + print(" - Use object storage for pipeline artifacts") + print(" - Enable script versioning for rollback") + print() + print("Production Tips:") + print(" - Implement CI/CD for ETL script deployment") + print(" - Use S3 versioning for script history") + print(" - Read data from S3 buckets (not just scripts)") + print(" - Write outputs to partitioned S3 locations") + print(" - Add error handling and retry logic") + print(" - Monitor pipeline metrics") + print() + print("Next steps:") + print(" - Try example 05: Scheduled batch processing (S3)") + print(" - Read/write data from/to S3 in your ETL") + print(" - Implement incremental ETL (delta processing)") + 
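+    # A minimal sketch of the "incremental ETL (delta processing)" idea mentioned just above.
+    # Bucket names and the watermark value are hypothetical; in a real pipeline this logic
+    # lives in the ETL script itself and the watermark is persisted (e.g. in a control table):
+    #   last_processed = "2023-07-01"
+    #   new_orders = spark.read.parquet("s3a://spark-data/raw/orders/")
+    #   new_orders = new_orders.filter(col("order_date") > last_processed)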
print(" - Orchestrate with Airflow/Argo Workflows") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/05_scheduled_batch_job.py b/examples/spark/05_scheduled_batch_job.py new file mode 100644 index 000000000..10b0fb396 --- /dev/null +++ b/examples/spark/05_scheduled_batch_job.py @@ -0,0 +1,536 @@ +#!/usr/bin/env python3 +""" +Title: Scheduled Batch Job with Resilience +Level: 2 (Intermediate - Batch Processing) +Target Audience: Data Engineers building production batch pipelines +Time to Run: ~3-4 minutes + +Description: +This example demonstrates production-ready batch processing patterns including +idempotent processing, incremental updates, restart policies, and resilience +features. You'll learn how to build reliable batch jobs that can handle failures +and process data incrementally. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- Batch processing patterns (full vs incremental) +- Idempotent job design +- Restart policies and failure handling +- Time-based partitioning +- Checkpoint and recovery patterns +- Production batch job best practices + +Real-World Use Case: +Daily data warehouse refresh, nightly ETL jobs, periodic reporting, data synchronization. +""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + RestartPolicy, + RestartPolicyType, + BatchSparkClient, +) + + +def create_batch_job_script(): + """Create a PySpark script for scheduled batch processing. 
+ + Returns: + str: Python code for batch job + """ + return """ +from pyspark.sql import SparkSession +from pyspark.sql.functions import ( + col, lit, current_timestamp, to_date, date_format, + year, month, dayofmonth, sum as _sum, count, + max as _max, min as _min, when, coalesce +) +from pyspark.sql.types import * +from datetime import datetime, timedelta +import sys + +# Create Spark session +spark = SparkSession.builder \\ + .appName("Scheduled Batch Job") \\ + .getOrCreate() + +print("\\n" + "="*80) +print("SCHEDULED BATCH JOB - DAILY TRANSACTION PROCESSING") +print("="*80) + +# ============================================================================ +# CONFIGURATION +# ============================================================================ +print("\\n[CONFIG] Batch Job Configuration...") + +# In production, these would come from job parameters +BATCH_DATE = datetime.now().strftime("%Y-%m-%d") +LOOKBACK_DAYS = 7 # Process last 7 days for incremental +JOB_ID = f"batch_{BATCH_DATE.replace('-', '')}" + +print(f" - Batch Date: {BATCH_DATE}") +print(f" - Job ID: {JOB_ID}") +print(f" - Lookback Days: {LOOKBACK_DAYS}") +print(f" - Mode: Incremental") + +# ============================================================================ +# STEP 1: EXTRACT - Read Source Data +# ============================================================================ +print("\\n[STEP 1] Extracting source data...") +print("-" * 80) + +# Simulate transactional data source +transactions_schema = StructType([ + StructField("transaction_id", IntegerType(), False), + StructField("transaction_date", StringType(), False), + StructField("customer_id", IntegerType(), False), + StructField("product_id", IntegerType(), False), + StructField("amount", DoubleType(), False), + StructField("status", StringType(), False), +]) + +# Generate sample transactions for last 7 days +base_date = datetime.now() +transactions_data = [] +tx_id = 1 + +for day_offset in range(7): + tx_date = (base_date - timedelta(days=day_offset)).strftime("%Y-%m-%d") + # Generate 3-5 transactions per day + num_txs = 3 + (day_offset % 3) + for i in range(num_txs): + customer_id = 100 + (tx_id % 5) + product_id = 200 + (tx_id % 10) + amount = round(50.0 + (tx_id % 20) * 25.5, 2) + status = "completed" if tx_id % 10 != 0 else "pending" + transactions_data.append((tx_id, tx_date, customer_id, product_id, amount, status)) + tx_id += 1 + +transactions_df = spark.createDataFrame(transactions_data, transactions_schema) + +print(f" Loaded {transactions_df.count()} total transactions") + +# Show date range +date_range = transactions_df.agg( + _min("transaction_date").alias("min_date"), + _max("transaction_date").alias("max_date") +).collect()[0] + +print(f" - Date range: {date_range['min_date']} to {date_range['max_date']}") + +# ============================================================================ +# STEP 2: INCREMENTAL PROCESSING +# ============================================================================ +print("\\n[STEP 2] Applying incremental processing logic...") +print("-" * 80) + +# Calculate cutoff date for incremental processing +cutoff_date = (datetime.now() - timedelta(days=LOOKBACK_DAYS)).strftime("%Y-%m-%d") + +print(f" - Processing transactions >= {cutoff_date}") + +# Filter for incremental window (idempotent - same date range produces same result) +incremental_df = transactions_df.filter(col("transaction_date") >= lit(cutoff_date)) + +print(f" Filtered to {incremental_df.count()} transactions in incremental window") + +# 
============================================================================ +# STEP 3: TRANSFORM - Business Logic +# ============================================================================ +print("\\n[STEP 3] Applying business transformations...") +print("-" * 80) + +# Add computed columns +enriched_df = incremental_df \\ + .withColumn("processing_date", lit(BATCH_DATE)) \\ + .withColumn("processing_timestamp", current_timestamp()) \\ + .withColumn("job_id", lit(JOB_ID)) \\ + .withColumn("year", year(col("transaction_date"))) \\ + .withColumn("month", month(col("transaction_date"))) \\ + .withColumn("day", dayofmonth(col("transaction_date"))) + +# Apply business rules +enriched_df = enriched_df \\ + .withColumn("is_high_value", when(col("amount") > 500, lit(True)).otherwise(lit(False))) \\ + .withColumn("is_completed", when(col("status") == "completed", lit(True)).otherwise(lit(False))) + +print(" Added metadata columns (processing_date, job_id)") +print(" Added date partitions (year, month, day)") +print(" Applied business rules (high_value, completion flags)") + +# ============================================================================ +# STEP 4: AGGREGATIONS - Daily Summary +# ============================================================================ +print("\\n[STEP 4] Creating daily aggregations...") +print("-" * 80) + +daily_summary = enriched_df.groupBy("transaction_date", "year", "month", "day").agg( + count("transaction_id").alias("transaction_count"), + count(when(col("is_completed"), True)).alias("completed_count"), + count(when(~col("is_completed"), True)).alias("pending_count"), + count(when(col("is_high_value"), True)).alias("high_value_count"), + _sum("amount").alias("total_amount"), + _max("amount").alias("max_amount"), + _min("amount").alias("min_amount"), +).withColumn("processing_date", lit(BATCH_DATE)) \\ + .withColumn("job_id", lit(JOB_ID)) \\ + .orderBy("transaction_date") + +print(" Created daily summary table") +print("\\nDaily Summary:") +daily_summary.show(truncate=False) + +# ============================================================================ +# STEP 5: CUSTOMER AGGREGATIONS +# ============================================================================ +print("\\n[STEP 5] Creating customer aggregations...") +print("-" * 80) + +customer_summary = enriched_df.filter(col("is_completed")).groupBy("customer_id").agg( + count("transaction_id").alias("transaction_count"), + _sum("amount").alias("total_spent"), + _max("amount").alias("max_transaction"), + _min("amount").alias("min_transaction"), + count(when(col("is_high_value"), True)).alias("high_value_transactions"), +).withColumn("processing_date", lit(BATCH_DATE)) \\ + .withColumn("job_id", lit(JOB_ID)) \\ + .orderBy(col("total_spent").desc()) + +print(" Created customer summary table") +print("\\nCustomer Summary (Top 5):") +customer_summary.show(5, truncate=False) + +# ============================================================================ +# STEP 6: DATA QUALITY CHECKS +# ============================================================================ +print("\\n[STEP 6] Running data quality checks...") +print("-" * 80) + +# Check 1: No null values in critical columns +null_check = enriched_df.filter( + col("transaction_id").isNull() | + col("customer_id").isNull() | + col("amount").isNull() +).count() + +print(f" - Null check: {null_check} records with nulls (expecting 0)") + +# Check 2: All amounts are positive +negative_amount_check = enriched_df.filter(col("amount") < 0).count() 
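+# (The sample generator above only produces positive amounts, so this check should report 0.)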
+print(f" - Negative amount check: {negative_amount_check} records (expecting 0)") + +# Check 3: Valid date range +out_of_range = enriched_df.filter( + (col("transaction_date") < cutoff_date) | + (col("transaction_date") > BATCH_DATE) +).count() +print(f" - Date range check: {out_of_range} out of range (expecting 0)") + +# Overall quality score +quality_passed = (null_check == 0) and (negative_amount_check == 0) and (out_of_range == 0) +quality_status = 'ALL QUALITY CHECKS PASSED' if quality_passed else 'WARNING: QUALITY ISSUES DETECTED' +print(f"\\n {quality_status}") + +# ============================================================================ +# STEP 7: SIMULATE WRITE TO PARTITIONED STORAGE +# ============================================================================ +print("\\n[STEP 7] Preparing output for partitioned storage...") +print("-" * 80) + +# In production, you would write: +# enriched_df.write \\ +# .mode("overwrite") \\ +# .partitionBy("year", "month", "day") \\ +# .parquet("s3://bucket/transactions/") + +print(" - Output format: Parquet") +print(" - Partitioning: year/month/day") +print(" - Write mode: Overwrite (idempotent)") +print("\\n Partitions that would be written:") + +partitions = enriched_df.select("year", "month", "day").distinct().collect() +for partition in partitions: + print(f" - year={partition['year']}/month={partition['month']}/day={partition['day']}") + +# ============================================================================ +# STEP 8: JOB SUMMARY +# ============================================================================ +print("\\n\\n" + "="*80) +print("BATCH JOB COMPLETED SUCCESSFULLY!") +print("="*80) + +print(f"\\nšŸ“Š Job Statistics:") +print(f" Job ID: {JOB_ID}") +print(f" Batch Date: {BATCH_DATE}") +print(f" Processing Window: {cutoff_date} to {BATCH_DATE}") +print(f" ") +print(f" Records Processed:") +print(f" - Total transactions: {enriched_df.count()}") +print(f" - Completed: {enriched_df.filter(col('is_completed')).count()}") +print(f" - Pending: {enriched_df.filter(~col('is_completed')).count()}") +print(f" - High value: {enriched_df.filter(col('is_high_value')).count()}") +print(f" ") +print(f" Outputs Generated:") +print(f" - Daily summaries: {daily_summary.count()} days") +print(f" - Customer summaries: {customer_summary.count()} customers") +print(f" - Partitions: {len(partitions)}") +print(f" ") +print(f" Data Quality:") +print(f" - Quality checks: {'PASSED' if quality_passed else 'FAILED'}") + +print("\\nšŸ’” Batch Processing Features Demonstrated:") +print(" Incremental processing (configurable lookback)") +print(" Idempotent design (same input - same output)") +print(" Date partitioning for efficient queries") +print(" Job metadata for audit trail") +print(" Data quality validation") +print(" Business rule application") + +print("\\nšŸ”„ Production Considerations:") +print(" - Schedule with Airflow/Argo for automated runs") +print(" - Add checkpoint/recovery for large datasets") +print(" - Implement retry logic with exponential backoff") +print(" - Monitor job metrics and SLAs") +print(" - Use restart policies for fault tolerance") + +spark.stop() +""" + + +def main(): + """Main example: Submit scheduled batch job with resilience features.""" + + print("=" * 80) + print("EXAMPLE 05: Scheduled Batch Job with Resilience") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Production batch job patterns") + print(" 2. Incremental processing (configurable lookback)") + print(" 3. 
Idempotent job design") + print(" 4. Restart policies for fault tolerance") + print(" 5. Date partitioning") + print(" 6. Data quality validation") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=False, + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Prepare the application with resilience + app_name = "batch-job-scheduled" + batch_date = datetime.now().strftime("%Y-%m-%d") + + print("Step 2: Configuring batch job with resilience...") + print(f" App name: {app_name}") + print(f" Batch date: {batch_date}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Restart policy: OnFailure (retry up to 3 times)") + print() + + # Step 3: Submit the application with restart policy + print("Step 3: Submitting batch job with fault tolerance...") + + try: + # Configure restart policy for production resilience + restart_policy = RestartPolicy( + type=RestartPolicyType.ON_FAILURE, + on_failure_retries=3, # Retry up to 3 times on failure + on_failure_retry_interval=30, # Wait 30 seconds between retries + on_submission_failure_retries=2, # Retry submission failures + on_submission_failure_retry_interval=15, # Wait 15 seconds + ) + + response = client.submit_application( + # Application metadata + app_name=app_name, + # Placeholder + main_application_file=("local:///opt/spark/examples/src/main/python/pi.py"), + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation for batch processing + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + num_executors=2, + # Resilience configuration + restart_policy=restart_policy, + time_to_live_seconds=3600, # Auto-cleanup after 1 hour + # Batch job metadata + labels={ + "job_type": "batch", + "schedule": "daily", + "batch_date": batch_date.replace("-", ""), + }, + # Required for Spark 4.0 + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(" Batch job submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + print(" Resilience features enabled:") + print(f" - Retry on failure: {restart_policy.on_failure_retries} attempts") + print(f" - Retry interval: {restart_policy.on_failure_retry_interval}s") + print(" - Auto-cleanup: After 1 hour") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring batch job (this may take 2-3 minutes)...") + print(" Processing incremental data window...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Batch job completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print(f" WARNING: Job did not complete successfully: {final_status.state.value}") + print(" Restart policy would trigger automatic retry") + print(" Check logs below for details.") + + except TimeoutError: + print(" 
ERROR: Job did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring job: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving batch job results...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("BATCH JOB RESULTS") + print("=" * 80) + + # Display important sections + important_keywords = [ + "SCHEDULED BATCH JOB", + "[CONFIG]", + "[STEP", + "Daily Summary", + "Customer Summary", + "quality checks", + "Job Statistics", + "BATCH JOB COMPLETED", + ] + + for line in logs: + if ( + any(keyword in line for keyword in important_keywords) + or "Done" in line + or "WARNING" in line + or "šŸ“Š" in line + or "šŸ’”" in line + or "šŸ”„" in line + ): + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" Production batch job patterns") + print(" Incremental vs full processing") + print(" Idempotent job design") + print(" Restart policies for resilience") + print(" Date-based partitioning") + print(" Data quality validation") + print(" Job metadata and audit trails") + print() + print("Resilience Features:") + print(" - RestartPolicy: Automatic retry on failures") + print(" - TimeToLiveSeconds: Auto-cleanup completed jobs") + print(" - Labels: Metadata for tracking and monitoring") + print(" - Quality Checks: Fail fast on data issues") + print() + print("Batch Processing Best Practices:") + print(" 1. Design jobs to be idempotent (rerunnable)") + print(" 2. Use incremental processing for efficiency") + print(" 3. Partition data by date for query performance") + print(" 4. Add quality checks at each stage") + print(" 5. Include metadata (job_id, timestamps)") + print(" 6. Configure retry policies for resilience") + print(" 7. 
Set TTL for automatic cleanup") + print() + print("Scheduling Options:") + print(" - Kubernetes CronJob") + print(" - Apache Airflow") + print(" - Argo Workflows") + print(" - Custom scheduler with SparkClient SDK") + print() + print("Next steps:") + print(" - Try example 06: Dynamic allocation and auto-scaling") + print(" - Schedule this job with Airflow/Argo") + print(" - Implement checkpoint/recovery for large jobs") + print(" - Add alerting for job failures") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/05_scheduled_batch_job_s3.py b/examples/spark/05_scheduled_batch_job_s3.py new file mode 100644 index 000000000..50b5b27f9 --- /dev/null +++ b/examples/spark/05_scheduled_batch_job_s3.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +""" +Title: Scheduled Batch Job with Resilience (using MinIO S3) +Level: 2 (Intermediate - Batch Processing) +Target Audience: Data Engineers building production batch pipelines +Time to Run: ~3-4 minutes + +Description: +This example demonstrates production-ready batch processing patterns with scripts +stored in S3-compatible storage (MinIO). You'll learn how to build reliable batch +jobs with versioned scripts in object storage, restart policies, and resilience features. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- MinIO deployed (run ./setup_minio.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- Batch processing with scripts in S3/MinIO +- Restart policies and failure handling +- Production pattern with versioned batch scripts +- Job metadata and audit trails + +Real-World Use Case: +Daily data warehouse refresh, nightly ETL jobs with scripts managed in S3. +""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + RestartPolicy, + RestartPolicyType, + BatchSparkClient, +) + +# Import MinIO configuration +try: + from minio_config import S3_PATHS, get_s3_spark_conf, print_minio_info +except ImportError: + print("ERROR: minio_config.py not found!") + print("Please ensure you're running from the examples/spark directory") + sys.exit(1) + + +def main(): + """Main example: Submit scheduled batch job from S3 with resilience.""" + + print("=" * 80) + print("EXAMPLE 05: Scheduled Batch Job with Resilience (MinIO S3)") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Production batch job patterns") + print(" 2. Storing batch scripts in S3 (MinIO)") + print(" 3. Restart policies for fault tolerance") + print(" 4. 
Job metadata and audit trails") + print() + + # Show MinIO configuration + print_minio_info() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=True, # Enable Spark UI + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print(" Spark UI enabled") + print() + + # Step 2: Prepare the application with resilience + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"batch-job-{timestamp}" + batch_date = datetime.now().strftime("%Y-%m-%d") + + # Get S3 path for the batch job script + script_path = S3_PATHS["batch_job_script"] + + print("Step 2: Configuring batch job with resilience...") + print(f" App name: {app_name}") + print(f" Batch date: {batch_date}") + print(f" Script location: {script_path}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Restart policy: OnFailure (retry up to 3 times)") + print() + + # Step 3: Submit the application with restart policy + print("Step 3: Submitting batch job with fault tolerance...") + + try: + # Configure restart policy for production resilience + restart_policy = RestartPolicy( + type=RestartPolicyType.ON_FAILURE, + on_failure_retries=3, # Retry up to 3 times on failure + on_failure_retry_interval=30, # Wait 30 seconds between retries + on_submission_failure_retries=2, # Retry submission failures + on_submission_failure_retry_interval=15, # Wait 15 seconds + ) + + # Get S3-enabled Spark configuration + spark_conf = get_s3_spark_conf() + + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file=script_path, # S3 path! + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation for batch processing + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + num_executors=2, + # Resilience configuration + restart_policy=restart_policy, + time_to_live_seconds=3600, # Auto-cleanup after 1 hour + # Batch job metadata + labels={ + "job_type": "batch", + "schedule": "daily", + "batch_date": batch_date.replace("-", ""), + }, + # S3 configuration for MinIO + spark_conf=spark_conf, + ) + + print(" Batch job submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print(" Script loaded from S3: Done") + print() + print(" 🌐 Spark UI Access (choose one):") + print(" Option 1 - Direct to driver pod:") + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print(" Option 2 - Via service (if created by operator):") + print(f" kubectl port-forward svc/{app_name}-ui-svc 4040:4040") + print(" Then open: http://localhost:4040") + print() + print(" šŸ’” Tip: Use Option 1 if service doesn't exist") + print() + print(" Resilience features enabled:") + print(f" - Retry on failure: {restart_policy.on_failure_retries} attempts") + print(f" - Retry interval: {restart_policy.on_failure_retry_interval}s") + print(" - Auto-cleanup: After 1 hour") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + print() + print("Troubleshooting:") + print(" 1. Ensure MinIO is running:") + print(" kubectl get pods -l app=minio") + print(" 2. 
Verify scripts are uploaded:") + print(" kubectl exec minio-client -- mc ls myminio/spark-scripts/") + print(" 3. Check if setup_minio.sh was run successfully") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring batch job (this may take 2-3 minutes)...") + print(" Processing batch data from S3 script...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Batch job completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print(f" WARNING: Job did not complete successfully: {final_status.state.value}") + print(" Restart policy would trigger automatic retry") + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Job did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring job: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving batch job results...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("BATCH JOB RESULTS (from S3 script)") + print("=" * 80) + + # Display important sections from the batch job script + important_keywords = [ + "SCHEDULED BATCH JOB", + "[CONFIG]", + "[EXTRACT]", + "[TRANSFORM]", + "[LOAD]", + "[COMPLETE]", + "Batch Configuration", + "Customer Summary", + ] + + found_results = False + for line in logs: + if any(keyword in line for keyword in important_keywords): + print(line) + found_results = True + elif found_results and ("+" in line or "|" in line): + # Print table output + print(line) + + if not found_results: + print("Showing last 30 log lines:") + for line in logs[-30:]: + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" Production batch job patterns") + print(" Storing batch scripts in S3/MinIO") + print(" Restart policies for resilience") + print(" Job metadata and audit trails") + print(" Production pattern with versioned scripts") + print() + print("Resilience Features:") + print(" - RestartPolicy: Automatic retry on failures") + print(" - TimeToLiveSeconds: Auto-cleanup completed jobs") + print(" - Labels: Metadata for tracking and monitoring") + print(" - S3 Versioning: Script rollback capability") + print() + print("Batch Processing Best Practices:") + print(" 1. Store scripts in version-controlled S3") + print(" 2. Design jobs to be idempotent (rerunnable)") + print(" 3. Configure retry policies for resilience") + print(" 4. Set TTL for automatic cleanup") + print(" 5. Use labels for job tracking") + print(" 6. 
Enable S3 versioning for rollback") + print() + print("Scheduling Options:") + print(" - Kubernetes CronJob") + print(" - Apache Airflow") + print(" - Argo Workflows") + print(" - Custom scheduler with SparkClient SDK") + print() + print("Production Tips:") + print(" - Implement CI/CD for batch script deployment") + print(" - Use S3 versioning for script history") + print(" - Monitor job metrics and SLAs") + print(" - Add alerting for job failures") + print(" - Store job metadata in data catalog") + print() + print("Next steps:") + print(" - Try example 06: Dynamic allocation and auto-scaling") + print(" - Schedule this job with Airflow/Argo") + print(" - Implement checkpoint/recovery for large jobs") + print(" - Read/write data from/to S3 buckets") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/06_autoscaling_dynamic_allocation.py b/examples/spark/06_autoscaling_dynamic_allocation.py new file mode 100644 index 000000000..eddc3905a --- /dev/null +++ b/examples/spark/06_autoscaling_dynamic_allocation.py @@ -0,0 +1,520 @@ +#!/usr/bin/env python3 +""" +Title: Dynamic Allocation and Auto-scaling +Level: 2 (Intermediate - Auto-scaling) +Target Audience: Data Engineers optimizing resource usage +Time to Run: ~4-5 minutes + +Description: +This example demonstrates Spark's dynamic allocation feature, which automatically +scales executors up and down based on workload. You'll learn when to use dynamic +allocation, how to configure it, and how it improves resource efficiency in +multi-tenant Kubernetes clusters. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account +- Spark 3.0+ (required for dynamic allocation on Kubernetes) + +What You'll Learn: +- Dynamic allocation configuration and tuning +- How Spark scales executors automatically +- Resource efficiency vs performance trade-offs +- Monitoring executor scaling behavior +- When to use dynamic vs fixed allocation + +Real-World Use Case: +Multi-tenant clusters, variable workloads, cost optimization, shared resources. +""" + +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + DynamicAllocation, + OperatorBackendConfig, + BatchSparkClient, +) + + +def create_dynamic_allocation_script(): + """Create a PySpark script demonstrating dynamic allocation. 
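+
+    Note: main() below submits a placeholder pi.py, so this generated code is illustrative.
+    A hypothetical way to actually run it would be to write it out and point
+    main_application_file at the uploaded copy, for example:
+
+        with open("dynamic_allocation_demo.py", "w") as fh:
+            fh.write(create_dynamic_allocation_script())
+        # ...then upload it to object storage (see the *_s3.py examples)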
+ + Returns: + str: Python code for dynamic allocation demo + """ + return """ +from pyspark.sql import SparkSession +from pyspark.sql.functions import ( + col, lit, sum as _sum, count, avg, + monotonically_increasing_id, rand, when +) +from pyspark.sql.types import * +import time + +# Create Spark session +spark = SparkSession.builder \\ + .appName("Dynamic Allocation Demo") \\ + .getOrCreate() + +# Get dynamic allocation configuration +dyn_enabled = spark.conf.get("spark.dynamicAllocation.enabled", "false") +min_executors = spark.conf.get("spark.dynamicAllocation.minExecutors", "N/A") +max_executors = spark.conf.get("spark.dynamicAllocation.maxExecutors", "N/A") +initial_executors = spark.conf.get("spark.dynamicAllocation.initialExecutors", "N/A") + +print("\\n" + "="*80) +print("DYNAMIC ALLOCATION DEMO") +print("="*80) +print("\\nšŸ“Š Dynamic Allocation Configuration:") +print(f" - Enabled: {dyn_enabled}") +print(f" - Min Executors: {min_executors}") +print(f" - Max Executors: {max_executors}") +print(f" - Initial Executors: {initial_executors}") +shuffle_tracking = spark.conf.get('spark.dynamicAllocation.shuffleTracking.enabled', 'N/A') +print(f" - Shuffle Tracking: {shuffle_tracking}") + +# ============================================================================ +# PHASE 1: LIGHT WORKLOAD (should use minimal executors) +# ============================================================================ +print("\\n\\n[PHASE 1] Light Workload - Testing Scale Down") +print("="*80) +print("Expected: Spark should use minimal executors for small dataset\\n") + +# Small dataset +print("Creating small dataset (1,000 records)...") +small_data = spark.range(1000).select( + col("id"), + (col("id") * 2).alias("value"), + (col("id") % 10).alias("category") +) + +print(f" Created {small_data.count()} records") + +# Simple aggregation (low resource need) +result1 = small_data.groupBy("category").agg( + count("id").alias("count"), + _sum("value").alias("sum_value"), + avg("value").alias("avg_value") +).orderBy("category") + +print("\\nLight Workload Results:") +result1.show() + +print("\\nā±ļø Waiting 10 seconds for executor scaling to stabilize...") +time.sleep(10) + +# Check current executor count (approximation based on task distribution) +print("\\nšŸ“ˆ After light workload:") +print(" - Spark should have scaled down to minimum executors") +print(" - Check operator logs or Spark UI for exact executor count") + +# ============================================================================ +# PHASE 2: MEDIUM WORKLOAD (should scale up moderately) +# ============================================================================ +print("\\n\\n[PHASE 2] Medium Workload - Testing Scale Up") +print("="*80) +print("Expected: Spark should add executors to handle increased load\\n") + +# Medium dataset +print("Creating medium dataset (100,000 records)...") +medium_data = spark.range(100000).select( + col("id"), + (rand() * 1000).alias("value"), + (col("id") % 100).alias("category"), + (col("id") % 10).alias("partition_key") +) + +print(f" Created {medium_data.count()} records") + +# More complex processing (triggers parallelism) +result2 = medium_data.groupBy("category").agg( + count("id").alias("count"), + _sum("value").alias("sum_value"), + avg("value").alias("avg_value") +).filter(col("count") > 100).orderBy(col("sum_value").desc()) + +print(f"\\nMedium Workload Results (showing top 10):") +result2.show(10) + +print("\\nā±ļø Waiting 10 seconds for executor scaling...") +time.sleep(10) + +print("\\nšŸ“ˆ 
After medium workload:") +print(" - Spark should have scaled up executors") +print(" - More executors = better parallelism for aggregations") + +# ============================================================================ +# PHASE 3: HEAVY WORKLOAD (should scale to maximum) +# ============================================================================ +print("\\n\\n[PHASE 3] Heavy Workload - Testing Maximum Scale") +print("="*80) +print("Expected: Spark should scale to max executors for heavy computation\\n") + +# Large dataset with shuffle +print("Creating large dataset (500,000 records)...") +large_data = spark.range(500000).select( + col("id"), + (rand() * 10000).alias("value"), + (col("id") % 1000).alias("category"), + when(rand() > 0.5, "A").otherwise("B").alias("group") +) + +print(f" Created {large_data.count()} records") + +# Heavy processing with shuffle (join + aggregation) +print("\\nPerforming heavy computation (join + aggregation)...") + +# Self-join to increase workload +large_data_alias = large_data.alias("df1") +large_data2 = large_data.alias("df2") + +result3 = large_data_alias.join( + large_data2, + col("df1.category") == col("df2.category"), + "inner" +).groupBy("df1.category").agg( + count("df1.id").alias("total_records"), + _sum("df1.value").alias("sum_value1"), + _sum("df2.value").alias("sum_value2") +).orderBy(col("total_records").desc()) + +print(f"\\nHeavy Workload Results (top 10 categories):") +result3.show(10) + +print("\\nā±ļø Waiting 10 seconds for executor scaling...") +time.sleep(10) + +print("\\nšŸ“ˆ After heavy workload:") +print(" - Spark should have scaled to maximum executors") +print(" - Shuffle operations triggered executor requests") +print(" - Join and aggregation required maximum parallelism") + +# ============================================================================ +# PHASE 4: COOL DOWN (should scale back down) +# ============================================================================ +print("\\n\\n[PHASE 4] Cool Down - Testing Scale Down After Load") +print("="*80) +print("Expected: After workload completes, Spark should release idle executors\\n") + +print("Performing final light operation...") +final_result = small_data.groupBy("category").count().orderBy("category") +final_result.show() + +print("\\nā±ļø Waiting 15 seconds for idle executors to be released...") +time.sleep(15) + +print("\\nšŸ“‰ After cool down:") +print(" - Spark should release idle executors") +print(" - Only minimum executors retained") +print(" - Resources returned to cluster for other workloads") + +# ============================================================================ +# SUMMARY +# ============================================================================ +print("\\n\\n" + "="*80) +print("DYNAMIC ALLOCATION DEMO COMPLETED!") +print("="*80) + +print("\\nšŸŽÆ Key Observations:") +print(" 1. Light workload - Minimal executors (resource efficient)") +print(" 2. Medium workload - Moderate scale up (balanced)") +print(" 3. Heavy workload - Maximum executors (performance optimized)") +print(" 4. 
Cool down - Scale down (return resources)") + +print("\\nšŸ’” Dynamic Allocation Benefits:") +print(" Automatic resource optimization") +print(" Cost efficiency in multi-tenant clusters") +print(" No manual executor tuning needed") +print(" Better cluster utilization") + +print("\\nWARNING: When NOT to Use Dynamic Allocation:") +print(" - Streaming jobs (need consistent executors)") +print(" - Very short-lived jobs (overhead of scaling)") +print(" - Dedicated clusters (fixed allocation is simpler)") +print(" - Jobs with strict latency SLAs") + +print("\\nšŸ“Š Configuration Parameters Explained:") +print(" - minExecutors: Safety net, always available") +print(" - maxExecutors: Resource cap, prevents runaway scaling") +print(" - initialExecutors: Starting point, balances startup time") +print(" - shuffleTracking: Required for K8s, tracks shuffle data") + +print("\\nšŸ”§ Tuning Recommendations:") +print(" - Set min = 1-2 for cost efficiency") +print(" - Set max based on cluster capacity") +print(" - Set initial = expected average load") +print(" - Enable shuffleTracking (required for K8s)") +print(" - Monitor executor metrics in Spark UI") + +spark.stop() +""" + + +def main(): + """Main example: Submit Spark job with dynamic allocation enabled.""" + + print("=" * 80) + print("EXAMPLE 06: Dynamic Allocation and Auto-scaling") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Configuring dynamic allocation") + print(" 2. Automatic executor scaling based on workload") + print(" 3. Resource efficiency in shared clusters") + print(" 4. Performance vs cost trade-offs") + print(" 5. When to use dynamic vs fixed allocation") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=False, + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Configure dynamic allocation + app_name = "dynamic-allocation-demo" + + print("Step 2: Configuring dynamic allocation...") + + # Create dynamic allocation configuration + dyn_alloc = DynamicAllocation( + enabled=True, + initial_executors=1, # Start with 1 executor + min_executors=1, # Keep at least 1 + max_executors=5, # Scale up to 5 max + shuffle_tracking_enabled=True, # Required for K8s + ) + + print(f" App name: {app_name}") + print(" Spark version: 4.0.0 (supports dynamic allocation)") + print(" Dynamic Allocation Settings:") + print(f" - Initial executors: {dyn_alloc.initial_executors}") + print(f" - Min executors: {dyn_alloc.min_executors}") + print(f" - Max executors: {dyn_alloc.max_executors}") + print(f" - Shuffle tracking: {dyn_alloc.shuffle_tracking_enabled}") + print() + print(" How it works:") + print(" - Starts with 1 executor (initial)") + print(" - Scales up to 5 as workload increases") + print(" - Scales down to 1 when idle") + print() + + # Step 3: Submit the application + print("Step 3: Submitting application with dynamic allocation...") + + try: + response = client.submit_application( + # Application metadata + app_name=app_name, + # Placeholder + main_application_file=("local:///opt/spark/examples/src/main/python/pi.py"), + # Spark configuration + # Spark 3.0+ required for dynamic allocation on K8s + spark_version="4.0.0", + 
app_type="Python", + # Resource allocation per executor + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + # This will be overridden by dynamic allocation + num_executors=1, + # Dynamic Allocation Configuration + dynamic_allocation=dyn_alloc, + # Spark configuration + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + # Additional tuning for dynamic allocation + # Release idle executors after 30s + "spark.dynamicAllocation.executorIdleTimeout": "30s", + # Keep cached executors longer + "spark.dynamicAllocation.cachedExecutorIdleTimeout": "60s", + # Request executors quickly + "spark.dynamicAllocation.schedulerBacklogTimeout": "5s", + }, + # Labels for tracking + labels={ + "feature": "dynamic-allocation", + "workload": "variable", + }, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + print(" Dynamic allocation features enabled:") + print(" Auto-scaling based on workload") + print(" Shuffle tracking for K8s compatibility") + print(" Optimized resource utilization") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this will take 4-5 minutes)...") + print(" The job will demonstrate executor scaling through 4 phases:") + print(" Phase 1: Light workload (scale down)") + print(" Phase 2: Medium workload (scale up)") + print(" Phase 3: Heavy workload (max scale)") + print(" Phase 4: Cool down (scale down)") + print() + + try: + # Wait for completion with longer timeout for demo phases + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=360, # 6 minutes for all phases + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 6 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving dynamic allocation insights from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("DYNAMIC ALLOCATION RESULTS") + print("=" * 80) + + # Display important sections + important_keywords = [ + "DYNAMIC ALLOCATION", + "Configuration:", + "[PHASE", + "Expected:", + "After", + "Key Observations", + "Benefits:", + "When NOT to Use", + "Tuning Recommendations", + ] + + for line in logs: + if any(keyword in line for keyword in important_keywords) or any( + emoji in line + for emoji in [ + "Done", + "WARNING", + "šŸ“Š", + "šŸ“ˆ", + "šŸ“‰", + "šŸ’”", + "šŸŽÆ", + "šŸ”§", + ] + ): + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + print(" All executors released back to cluster") + except Exception 
as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" How to configure dynamic allocation") + print(" How Spark scales executors automatically") + print(" Resource efficiency vs performance trade-offs") + print(" Tuning parameters and their effects") + print(" When to use dynamic vs fixed allocation") + print() + print("Dynamic Allocation Configuration:") + print(" from kubeflow.spark import DynamicAllocation") + print() + print(" dyn_alloc = DynamicAllocation(") + print(" enabled=True,") + print(" initial_executors=2, # Starting point") + print(" min_executors=1, # Always keep at least 1") + print(" max_executors=10, # Cap at 10") + print(" shuffle_tracking_enabled=True # Required for K8s") + print(" )") + print() + print(" client.submit_application(") + print(" app_name='my-app',") + print(" dynamic_allocation=dyn_alloc,") + print(" ...") + print(" )") + print() + print("Key Scaling Triggers:") + print(" - Scale Up: Pending tasks, shuffle writes, backlog") + print(" - Scale Down: Idle executors, no shuffle data needed") + print(" - Timing: Controlled by timeout configurations") + print() + print("Use Cases for Dynamic Allocation:") + print(" Multi-tenant clusters (shared resources)") + print(" Variable workloads (unpredictable load)") + print(" Cost optimization (pay for what you use)") + print(" Development/testing (efficient resource use)") + print() + print("Use Cases for Fixed Allocation:") + print(" Streaming jobs (predictable, constant load)") + print(" Short-lived jobs (scaling overhead too high)") + print(" Strict SLAs (no scaling latency)") + print(" Dedicated clusters (resources already allocated)") + print() + print("Next steps:") + print(" - Experiment with different min/max settings") + print(" - Monitor executor scaling in Spark UI") + print(" - Compare costs: dynamic vs fixed allocation") + print(" - Test with your own workloads") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/07_spark_connect_interactive.py b/examples/spark/07_spark_connect_interactive.py new file mode 100644 index 000000000..9c2d518a6 --- /dev/null +++ b/examples/spark/07_spark_connect_interactive.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Spark Connect Interactive Session Example. + +This example demonstrates how to use Kubeflow SparkSessionClient with Spark Connect +to create interactive data analysis sessions. Unlike batch job submission, +Spark Connect enables long-lived sessions for exploratory data analysis, +iterative development, and notebook-style workflows. + +Prerequisites: +1. A Spark cluster with Spark Connect server running (Spark 3.4+) +2. PySpark with Connect support: pip install 'pyspark[connect]>=3.4.0' +3. 
Network connectivity to Spark Connect server + +Key Features Demonstrated: +- Remote connectivity to existing Spark clusters +- Interactive SQL queries and DataFrame operations +- Artifact upload (Python files, JARs) +- Session metrics and monitoring +- Session lifecycle management + +Usage: + python 07_spark_connect_interactive.py --connect-url sc://spark-cluster:15002 +""" + +import argparse +import logging +import sys + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Spark Connect Interactive Session Example", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--connect-url", + type=str, + required=True, + help=( + "Spark Connect URL (e.g., sc://spark-cluster:15002). " + "For Kubernetes: sc://{service-name}.{namespace}.svc.cluster.local:15002" + ), + ) + parser.add_argument( + "--token", + type=str, + help="Bearer token for authentication (optional)", + ) + parser.add_argument( + "--use-ssl", + action="store_true", + default=True, + help="Use SSL/TLS for secure communication (default: true)", + ) + parser.add_argument( + "--app-name", + type=str, + default="kubeflow-spark-connect-demo", + help="Application name for the session", + ) + return parser.parse_args() + + +def run_sql_analysis(session) -> None: + """Run interactive SQL analysis. + + Args: + session: ManagedSparkSession instance + """ + logger.info("=" * 80) + logger.info("Example 1: Interactive SQL Queries") + logger.info("=" * 80) + + # Create sample data + logger.info("Creating sample sales data...") + sales_data = [ + ("2024-01-01", "Product A", 100, 29.99), + ("2024-01-01", "Product B", 150, 19.99), + ("2024-01-02", "Product A", 120, 29.99), + ("2024-01-02", "Product C", 80, 49.99), + ("2024-01-03", "Product B", 200, 19.99), + ("2024-01-03", "Product C", 90, 49.99), + ] + + # Create DataFrame and register as temp view + df = session.createDataFrame(sales_data, ["date", "product", "quantity", "price"]) + df.createOrReplaceTempView("sales") + + logger.info("Sample data created and registered as 'sales' view") + + # Run SQL query + logger.info("\nExecuting SQL: SELECT product, SUM(quantity * price) AS revenue ...") + result_df = session.sql(""" + SELECT + product, + SUM(quantity * price) AS total_revenue, + SUM(quantity) AS total_quantity, + AVG(price) AS avg_price + FROM sales + GROUP BY product + ORDER BY total_revenue DESC + """) + + # Show results + logger.info("\nQuery Results:") + results = result_df.collect() + for row in results: + logger.info( + f" {row.product}: Revenue=${row.total_revenue:.2f}, " + f"Quantity={row.total_quantity}, AvgPrice=${row.avg_price:.2f}" + ) + + +def run_dataframe_operations(session) -> None: + """Run DataFrame transformations. 
+
+    Args:
+        session: ManagedSparkSession instance
+    """
+    logger.info("\n" + "=" * 80)
+    logger.info("Example 2: DataFrame Operations")
+    logger.info("=" * 80)
+
+    # Create sample user data
+    logger.info("Creating user activity data...")
+    user_data = [
+        (1, "alice@example.com", "premium", 150),
+        (2, "bob@example.com", "free", 25),
+        (3, "carol@example.com", "premium", 200),
+        (4, "dave@example.com", "free", 10),
+        (5, "eve@example.com", "premium", 180),
+    ]
+
+    df = session.createDataFrame(user_data, ["user_id", "email", "subscription", "activity_score"])
+
+    # Apply transformations
+    logger.info("Applying DataFrame transformations...")
+
+    # Filter premium users
+    premium_users = df.filter(df.subscription == "premium")
+
+    # Add derived column (uses pyspark.sql.functions for the conditional expression)
+    from pyspark.sql import functions as F  # noqa: N812
+
+    premium_users = premium_users.withColumn(
+        "engagement_level",
+        F.when(premium_users.activity_score >= 180, "high")
+        .when(premium_users.activity_score >= 150, "medium")
+        .otherwise("low"),
+    )
+
+    # Show results
+    logger.info("\nPremium Users with Engagement Levels:")
+    results = premium_users.collect()
+    for row in results:
+        logger.info(
+            f"  User {row.user_id} ({row.email}): "
+            f"Score={row.activity_score}, Level={row.engagement_level}"
+        )
+
+
+def run_aggregation_analysis(session) -> None:
+    """Run aggregation and grouping operations.
+
+    Args:
+        session: ManagedSparkSession instance
+    """
+    logger.info("\n" + "=" * 80)
+    logger.info("Example 3: Aggregation and Grouping")
+    logger.info("=" * 80)
+
+    # Create sample event data
+    logger.info("Creating event stream data...")
+    events = [
+        ("2024-01-01", "login", "mobile", 1250),
+        ("2024-01-01", "login", "web", 3500),
+        ("2024-01-01", "purchase", "mobile", 150),
+        ("2024-01-02", "login", "mobile", 1300),
+        ("2024-01-02", "login", "web", 3800),
+        ("2024-01-02", "purchase", "web", 220),
+        ("2024-01-03", "login", "mobile", 1400),
+        ("2024-01-03", "purchase", "mobile", 180),
+    ]
+
+    df = session.createDataFrame(events, ["date", "event_type", "platform", "count"])
+
+    # Group and aggregate
+    logger.info("Computing aggregations by platform and event type...")
+    agg_df = (
+        df.groupBy("platform", "event_type")
+        .agg({"count": "sum", "date": "count"})
+        .withColumnRenamed("sum(count)", "total_events")
+        .withColumnRenamed("count(date)", "num_days")
+        .orderBy("platform", "event_type")
+    )
+
+    # Show results
+    logger.info("\nAggregation Results:")
+    results = agg_df.collect()
+    for row in results:
+        logger.info(
+            f"  {row.platform}/{row.event_type}: Total={row.total_events}, Days={row.num_days}"
+        )
+
+
+def demonstrate_session_features(session) -> None:
+    """Demonstrate session-specific features.
+ + Args: + session: ManagedSparkSession instance + """ + logger.info("\n" + "=" * 80) + logger.info("Example 4: Session Features & Metrics") + logger.info("=" * 80) + + # Get session info + info = session.get_info() + logger.info(f"Session ID: {info.session_id}") + logger.info(f"App Name: {info.app_name}") + logger.info(f"State: {info.state}") + + # Get metrics + metrics = session.get_metrics() + logger.info("\nSession Metrics:") + logger.info(f" Queries Executed: {metrics.queries_executed}") + logger.info(f" Active Queries: {metrics.active_queries}") + logger.info(f" Artifacts Uploaded: {metrics.artifacts_uploaded}") + + +def main(): + """Main execution function.""" + args = parse_args() + + logger.info("=" * 80) + logger.info("Spark Connect Interactive Session Example") + logger.info("=" * 80) + logger.info(f"Connect URL: {args.connect_url}") + logger.info(f"App Name: {args.app_name}") + logger.info(f"SSL Enabled: {args.use_ssl}") + + try: + # Import Kubeflow Spark client + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + # Configure ConnectBackend + logger.info("\nInitializing Spark Connect backend...") + config = ConnectBackendConfig( + connect_url=args.connect_url, + token=args.token, + use_ssl=args.use_ssl, + timeout=300, + ) + + # Create SparkSessionClient + with SparkSessionClient(backend_config=config) as client: + logger.info("SparkSessionClient initialized successfully") + + # Create interactive session + logger.info(f"\nCreating Spark Connect session: {args.app_name}") + session = client.create_session(app_name=args.app_name) + logger.info(f"Session created: {session.session_id}") + + try: + # Run examples + run_sql_analysis(session) + run_dataframe_operations(session) + run_aggregation_analysis(session) + demonstrate_session_features(session) + + logger.info("\n" + "=" * 80) + logger.info("All examples completed successfully!") + logger.info("=" * 80) + + finally: + # Cleanup session + logger.info("\nClosing session...") + session.close() + logger.info("Session closed successfully") + + except ImportError as e: + logger.error( + "Failed to import required packages. " + "Please install: pip install 'pyspark[connect]>=3.4.0'" + ) + logger.error(f"Error: {e}") + sys.exit(1) + except Exception as e: + logger.error(f"Example failed: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/examples/spark/cleanup_spark.sh b/examples/spark/cleanup_spark.sh new file mode 100755 index 000000000..e2ad4f13f --- /dev/null +++ b/examples/spark/cleanup_spark.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# +# Clean up all Spark applications and orphaned pods +# + +set -e + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +print_step() { + echo -e "${GREEN}āžœ${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}⚠${NC} $1" +} + +NAMESPACE="${1:-default}" + +echo "==========================================" +echo " Cleaning up Spark applications" +echo " Namespace: $NAMESPACE" +echo "==========================================" +echo "" + +# Check what exists +print_step "Current state:" +echo "" +echo "SparkApplications:" +kubectl get sparkapplications -n $NAMESPACE 2>/dev/null || echo " (none)" +echo "" +echo "Driver pods:" +kubectl get pods -n $NAMESPACE -l spark-role=driver 2>/dev/null || echo " (none)" +echo "" +echo "Executor pods:" +kubectl get pods -n $NAMESPACE -l spark-role=executor 2>/dev/null || echo " (none)" +echo "" + +read -p "Delete all Spark applications and pods? 
(y/n) " -n 1 -r +echo +if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Cancelled." + exit 0 +fi + +# Delete SparkApplications +print_step "Deleting SparkApplications..." +if kubectl get sparkapplications -n $NAMESPACE &>/dev/null; then + kubectl delete sparkapplications -n $NAMESPACE --all --timeout=30s || true +else + echo " No SparkApplications found" +fi + +# Delete driver pods (force delete) +print_step "Force deleting driver pods..." +if kubectl get pods -n $NAMESPACE -l spark-role=driver &>/dev/null; then + kubectl delete pods -n $NAMESPACE -l spark-role=driver --force --grace-period=0 --timeout=30s || true +else + echo " No driver pods found" +fi + +# Delete executor pods (force delete) +print_step "Force deleting executor pods..." +if kubectl get pods -n $NAMESPACE -l spark-role=executor &>/dev/null; then + kubectl delete pods -n $NAMESPACE -l spark-role=executor --force --grace-period=0 --timeout=30s || true +else + echo " No executor pods found" +fi + +# Delete orphaned ConfigMaps +print_step "Cleaning up ConfigMaps..." +if kubectl get configmaps -n $NAMESPACE -l sparkoperator.k8s.io/app-name &>/dev/null; then + kubectl delete configmaps -n $NAMESPACE -l sparkoperator.k8s.io/app-name --timeout=30s || true +else + echo " No Spark ConfigMaps found" +fi + +echo "" +print_step "Cleanup complete!" +echo "" +echo "Verification:" +kubectl get sparkapplications -n $NAMESPACE 2>/dev/null || echo " āœ“ No SparkApplications" +kubectl get pods -n $NAMESPACE -l spark-role 2>/dev/null || echo " āœ“ No Spark pods" +echo "" +echo "You can now submit new applications:" +echo " python test_spark_client_integration.py" diff --git a/examples/spark/debug_spark_connect.py b/examples/spark/debug_spark_connect.py new file mode 100644 index 000000000..10771377e --- /dev/null +++ b/examples/spark/debug_spark_connect.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +""" +Debug script for Spark Connect connection issues. + +This script tests the connection step-by-step with verbose logging. 
+""" + +import logging +import os +import signal +import sys +import time + +# Setup logging +logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) + +# Add SDK to path +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path.insert(0, sdk_path) + +print("=" * 80) +print("Spark Connect Connection Debugger") +print("=" * 80) + +# Test 1: Check PySpark installation +print("\n[Test 1] Checking PySpark installation...") +try: + import pyspark + + print(f"āœ“ PySpark version: {pyspark.__version__}") +except ImportError as e: + print(f"āœ— PySpark not installed: {e}") + sys.exit(1) + +# Test 2: Check Spark Connect support +print("\n[Test 2] Checking Spark Connect support...") +try: + from pyspark.sql import SparkSession + + print("āœ“ SparkSession imported") + + # Check if remote() method exists + if hasattr(SparkSession.builder, "remote"): + print("āœ“ Spark Connect (remote) support available") + else: + print("āœ— Spark Connect support not available - upgrade PySpark") + sys.exit(1) +except Exception as e: + print(f"āœ— Error: {e}") + sys.exit(1) + +# Test 3: Test basic gRPC connectivity +print("\n[Test 3] Testing gRPC connectivity to localhost:30000...") +try: + import grpc + + print("āœ“ grpc module available") + + # Try to create a channel + channel = grpc.insecure_channel("localhost:30000") + + # Set a short timeout for connection test + import grpc + + try: + grpc.channel_ready_future(channel).result(timeout=5) + print("āœ“ gRPC channel ready") + except grpc.FutureTimeoutError: + print("āœ— gRPC channel timeout - server may not be responding") + print(" Check: kubectl logs -l app=spark-connect -n default") + except Exception as e: + print(f"āœ— gRPC channel error: {e}") + finally: + channel.close() + +except ImportError: + print("⚠ grpcio not installed (will be used by pyspark)") +except Exception as e: + print(f"⚠ gRPC test error: {e}") + +# Test 4: Test Kubeflow SDK import +print("\n[Test 4] Testing Kubeflow SDK imports...") +try: + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + print("āœ“ Kubeflow Spark imports successful") +except Exception as e: + print(f"āœ— Import error: {e}") + sys.exit(1) + +# Test 5: Create config (doesn't connect yet) +print("\n[Test 5] Creating ConnectBackendConfig...") +try: + config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, + timeout=10, # Short timeout for testing + ) + print(f"āœ“ Config created: {config.connect_url}") +except Exception as e: + print(f"āœ— Config creation error: {e}") + sys.exit(1) + +# Test 6: Create client (doesn't connect yet) +print("\n[Test 6] Creating SparkSessionClient...") +try: + client = SparkSessionClient(backend_config=config) + print("āœ“ Client created") +except Exception as e: + print(f"āœ— Client creation error: {e}") + sys.exit(1) + +# Test 7: Try to create session with timeout +print("\n[Test 7] Creating Spark session (this may hang)...") +print(" If this hangs for more than 30 seconds, press Ctrl+C") +print(" Attempting connection to sc://localhost:30000...") + + +def timeout_handler(signum, frame): + print("\nāœ— Session creation timed out after 30 seconds") + print("\nPossible issues:") + print(" 1. Spark Connect server not accessible") + print(" 2. Port forwarding not working correctly") + print(" 3. 
gRPC connection blocked")
+    print("\nDebugging steps:")
+    print("  - Check server logs: kubectl logs -l app=spark-connect -n default -f")
+    print("  - Verify port forward: lsof -i :30000")
+    print("  - Test connectivity: nc -zv localhost 30000")
+    print(
+        "  - Check server is listening: kubectl exec -it -- netstat -tlnp | grep 15002"
+    )
+    sys.exit(1)
+
+
+# Set timeout
+signal.signal(signal.SIGALRM, timeout_handler)
+signal.alarm(30)
+
+try:
+    start_time = time.time()
+    session = client.create_session(app_name="debug-test")
+    elapsed = time.time() - start_time
+
+    signal.alarm(0)  # Cancel timeout
+
+    print(f"āœ“ Session created in {elapsed:.2f} seconds!")
+    print(f"  Session ID: {session.session_id}")
+    print(f"  App name: {session.app_name}")
+
+    # Test 8: Try a simple query
+    print("\n[Test 8] Testing simple SQL query...")
+    try:
+        df = session.sql("SELECT 1 AS id, 'test' AS message")
+        result = df.collect()
+        print(f"āœ“ Query executed: {result[0].message}")
+        df.show()
+    except Exception as e:
+        print(f"āœ— Query error: {e}")
+
+    # Cleanup
+    print("\n[Cleanup] Closing session...")
+    session.close()
+    client.close()
+    print("āœ“ Session closed")
+
+    print("\n" + "=" * 80)
+    print("All tests passed! Connection is working.")
+    print("=" * 80)
+
+except KeyboardInterrupt:
+    signal.alarm(0)
+    print("\n\nāœ— Interrupted by user")
+    sys.exit(1)
+except Exception as e:
+    signal.alarm(0)
+    print(f"\nāœ— Session creation failed: {e}")
+    print(f"\nError type: {type(e).__name__}")
+    import traceback
+
+    print("\nFull traceback:")
+    traceback.print_exc()
+    sys.exit(1)
diff --git a/examples/spark/example_utils.py b/examples/spark/example_utils.py
new file mode 100644
index 000000000..4ea1b905f
--- /dev/null
+++ b/examples/spark/example_utils.py
@@ -0,0 +1,571 @@
+"""Example utilities for Kubeflow Spark SDK examples.
+
+This module provides common utilities, helpers, and sample data generators
+used across the Spark examples. It helps reduce code duplication and provides
+a consistent interface for common operations.
+
+Usage:
+    from example_utils import (
+        create_client,
+        setup_logging,
+        generate_sales_data,
+    )
+
+    # Create client with defaults
+    client = create_client()
+
+    # Or with custom configuration
+    client = create_client(
+        namespace="my-namespace",
+        enable_ui=True,
+    )
+"""
+
+from datetime import datetime, timedelta
+import logging
+import os
+import sys
+from typing import Optional
+
+# Add SDK to path for development mode
+sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+if sdk_path not in sys.path:
+    sys.path.insert(0, sdk_path)
+
+from kubeflow.spark import (  # noqa: E402
+    ApplicationState,
+    DynamicAllocation,
+    OperatorBackendConfig,
+    RestartPolicy,
+    RestartPolicyType,
+    BatchSparkClient,
+)
+
+# ============================================================================
+# LOGGING SETUP
+# ============================================================================
+
+
+def setup_logging(level: str = "INFO") -> logging.Logger:
+    """Setup logging for examples.
+ + Args: + level: Log level (DEBUG, INFO, WARNING, ERROR) + + Returns: + Configured logger instance + """ + logging.basicConfig( + level=getattr(logging, level.upper()), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + return logging.getLogger(__name__) + + +logger = setup_logging() + + +# ============================================================================ +# CLIENT CREATION HELPERS +# ============================================================================ + + +def create_client( + namespace: Optional[str] = None, + service_account: str = "spark-operator-spark", + context: Optional[str] = None, + enable_monitoring: bool = False, + enable_ui: bool = False, + default_spark_image: str = "docker.io/library/spark", +) -> BatchSparkClient: + """Create a BatchSparkClient with sensible defaults for examples. + + Args: + namespace: Kubernetes namespace (default: from SPARK_NAMESPACE env or 'default') + service_account: Kubernetes service account + context: Kubernetes context (default: from KUBE_CONTEXT env or 'kind-spark-test') + enable_monitoring: Enable Prometheus monitoring + enable_ui: Enable Spark UI + default_spark_image: Default Spark image to use + + Returns: + Configured BatchSparkClient instance + + Example: + >>> client = create_client() + >>> client = create_client(namespace="production", enable_ui=True) + """ + config = OperatorBackendConfig( + namespace=namespace or os.getenv("SPARK_NAMESPACE", "default"), + service_account=service_account, + default_spark_image=default_spark_image, + context=context or os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=enable_monitoring, + enable_ui=enable_ui, + ) + + logger.info(f"Creating BatchSparkClient for namespace: {config.namespace}") + return BatchSparkClient(backend_config=config) + + +# ============================================================================ +# COMMON CONFIGURATIONS +# ============================================================================ + + +def get_resilient_restart_policy() -> RestartPolicy: + """Get a restart policy suitable for production batch jobs. + + Returns: + RestartPolicy with retry configuration + """ + return RestartPolicy( + type=RestartPolicyType.ON_FAILURE, + on_failure_retries=3, + on_failure_retry_interval=30, + on_submission_failure_retries=2, + on_submission_failure_retry_interval=15, + ) + + +def get_dynamic_allocation_config( + min_executors: int = 1, + max_executors: int = 10, + initial_executors: int = 2, +) -> DynamicAllocation: + """Get a dynamic allocation configuration. + + Args: + min_executors: Minimum number of executors + max_executors: Maximum number of executors + initial_executors: Initial number of executors + + Returns: + DynamicAllocation configuration + """ + return DynamicAllocation( + enabled=True, + initial_executors=initial_executors, + min_executors=min_executors, + max_executors=max_executors, + shuffle_tracking_enabled=True, # Required for K8s + ) + + +def get_spark_conf_defaults(spark_version: str = "4.0.0") -> dict[str, str]: + """Get default Spark configuration suitable for examples. 
+ + Args: + spark_version: Spark version to configure for + + Returns: + Dictionary of Spark configuration properties + """ + conf = { + "spark.kubernetes.file.upload.path": "/tmp", + } + + # Spark 4.0+ specific configurations + if spark_version.startswith("4."): + conf.update( + { + "spark.sql.adaptive.enabled": "true", + "spark.sql.adaptive.coalescePartitions.enabled": "true", + } + ) + + return conf + + +# ============================================================================ +# SAMPLE DATA GENERATORS +# ============================================================================ + + +def generate_customer_data(num_records: int = 100) -> list[tuple]: + """Generate sample customer data. + + Args: + num_records: Number of customer records to generate + + Returns: + List of customer tuples (id, name, email, city, signup_date) + """ + from datetime import date + import random + + cities = [ + "New York", + "Los Angeles", + "Chicago", + "Houston", + "Phoenix", + "Philadelphia", + "San Antonio", + "San Diego", + "Dallas", + "San Jose", + ] + + base_date = date.today() - timedelta(days=365) + + customers = [] + for i in range(1, num_records + 1): + signup_date = base_date + timedelta(days=random.randint(0, 365)) + customers.append( + ( + i, + f"Customer{i}", + f"customer{i}@example.com", + random.choice(cities), + signup_date.strftime("%Y-%m-%d"), + ) + ) + + return customers + + +def generate_transaction_data( + num_transactions: int = 1000, + num_customers: int = 100, + days_back: int = 30, +) -> list[tuple]: + """Generate sample transaction data. + + Args: + num_transactions: Number of transactions to generate + num_customers: Number of unique customers + days_back: How many days back to generate data + + Returns: + List of transaction tuples (tx_id, date, customer_id, amount, status) + """ + import random + + base_date = datetime.now() + statuses = ["completed", "pending", "cancelled"] + + transactions = [] + for i in range(1, num_transactions + 1): + tx_date = (base_date - timedelta(days=random.randint(0, days_back))).strftime("%Y-%m-%d") + customer_id = random.randint(1, num_customers) + amount = round(random.uniform(10.0, 1000.0), 2) + # 90% completed + status = random.choice(statuses) if i % 10 != 0 else "completed" + + transactions.append((i, tx_date, customer_id, amount, status)) + + return transactions + + +def generate_sales_data( + num_records: int = 100, + products: Optional[list[str]] = None, + categories: Optional[list[str]] = None, +) -> list[tuple]: + """Generate sample sales data. + + Args: + num_records: Number of sales records to generate + products: List of product names (default: common products) + categories: List of categories (default: Electronics, Furniture, etc.) 
+
+    Returns:
+        List of sales tuples (id, date, product, category, quantity, price, region)
+    """
+    import random
+
+    if products is None:
+        products = ["Laptop", "Mouse", "Keyboard", "Monitor", "Desk", "Chair"]
+
+    if categories is None:
+        categories = ["Electronics", "Furniture", "Accessories"]
+
+    regions = ["North", "South", "East", "West"]
+    base_date = datetime.now()
+
+    sales = []
+    for i in range(1, num_records + 1):
+        sale_date = (base_date - timedelta(days=random.randint(0, 90))).strftime("%Y-%m-%d")
+        product = random.choice(products)
+        category = random.choice(categories)
+        quantity = random.randint(1, 10)
+        price = round(random.uniform(25.0, 1500.0), 2)
+        region = random.choice(regions)
+
+        sales.append((i, sale_date, product, category, quantity, price, region))
+
+    return sales
+
+
+# ============================================================================
+# COMMON OPERATIONS
+# ============================================================================
+
+
+def wait_for_job(
+    client: BatchSparkClient,
+    app_name: str,
+    timeout: int = 300,
+    polling_interval: int = 5,
+) -> ApplicationState:
+    """Wait for a Spark job to complete with proper error handling.
+
+    Args:
+        client: BatchSparkClient instance
+        app_name: Application name
+        timeout: Maximum time to wait in seconds
+        polling_interval: Polling interval in seconds
+
+    Returns:
+        Final ApplicationState
+
+    Raises:
+        TimeoutError: If job doesn't complete within timeout
+        RuntimeError: If job fails
+    """
+    logger.info(f"Waiting for job '{app_name}' to complete (timeout: {timeout}s)...")
+
+    try:
+        status = client.wait_for_job_status(
+            submission_id=app_name,
+            timeout=timeout,
+            polling_interval=polling_interval,
+        )
+
+        if status.state == ApplicationState.COMPLETED:
+            logger.info(f"Job '{app_name}' completed successfully")
+        elif status.state == ApplicationState.FAILED:
+            logger.error(f"Job '{app_name}' failed")
+            raise RuntimeError(f"Job failed with state: {status.state.value}")
+        else:
+            logger.warning(f"Job '{app_name}' ended with unexpected state: {status.state.value}")
+
+        return status.state
+
+    except TimeoutError:
+        logger.error(f"Job '{app_name}' timed out after {timeout}s")
+        raise
+    except Exception as e:
+        logger.error(f"Error waiting for job '{app_name}': {e}")
+        raise
+
+
+def print_job_status(client: BatchSparkClient, app_name: str):
+    """Print current job status in a formatted way.
+
+    Args:
+        client: BatchSparkClient instance
+        app_name: Application name
+    """
+    try:
+        status = client.get_job(app_name)
+
+        print("\n" + "=" * 60)
+        print(f"JOB STATUS: {app_name}")
+        print("=" * 60)
+        print(f"State: {status.state.value}")
+        if status.app_id:
+            print(f"App ID: {status.app_id}")
+        if status.submission_time:
+            print(f"Submitted: {status.submission_time}")
+        if status.start_time:
+            print(f"Started: {status.start_time}")
+        if status.completion_time:
+            print(f"Completed: {status.completion_time}")
+        print("=" * 60)
+        print()
+
+    except Exception as e:
+        logger.error(f"Error getting status for '{app_name}': {e}")
+
+
+def cleanup_job(client: BatchSparkClient, app_name: str):
+    """Clean up a Spark application with proper error handling.
+ + Args: + client: BatchSparkClient instance + app_name: Application name + """ + try: + client.delete_job(app_name) + logger.info(f"Successfully deleted application '{app_name}'") + except Exception as e: + logger.warning(f"Failed to delete application '{app_name}': {e}") + logger.warning(f"You can manually delete with: kubectl delete sparkapplication {app_name}") + + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + + +def format_bytes(bytes_value: int) -> str: + """Format bytes into human-readable string. + + Args: + bytes_value: Number of bytes + + Returns: + Formatted string (e.g., "1.5 GB") + """ + for unit in ["B", "KB", "MB", "GB", "TB"]: + if bytes_value < 1024.0: + return f"{bytes_value:.2f} {unit}" + bytes_value /= 1024.0 + return f"{bytes_value:.2f} PB" + + +def format_duration(seconds: int) -> str: + """Format duration in seconds to human-readable string. + + Args: + seconds: Duration in seconds + + Returns: + Formatted string (e.g., "2h 30m 15s") + """ + hours, remainder = divmod(seconds, 3600) + minutes, seconds = divmod(remainder, 60) + + parts = [] + if hours > 0: + parts.append(f"{int(hours)}h") + if minutes > 0: + parts.append(f"{int(minutes)}m") + if seconds > 0 or not parts: + parts.append(f"{int(seconds)}s") + + return " ".join(parts) + + +def get_sample_spark_conf_for_use_case(use_case: str) -> dict[str, str]: + """Get recommended Spark configuration for common use cases. + + Args: + use_case: One of 'etl', 'ml', 'streaming', 'interactive' + + Returns: + Dictionary of recommended Spark configuration + """ + base_conf = get_spark_conf_defaults() + + use_case_configs = { + "etl": { + "spark.sql.adaptive.enabled": "true", + "spark.sql.adaptive.coalescePartitions.enabled": "true", + "spark.sql.shuffle.partitions": "200", + }, + "ml": { + # Some ML libs prefer this off + "spark.sql.adaptive.enabled": "false", + "spark.serializer": "org.apache.spark.serializer.KryoSerializer", + "spark.kryoserializer.buffer.max": "512m", + }, + "streaming": { + "spark.streaming.backpressure.enabled": "true", + "spark.streaming.receiver.maxRate": "10000", + }, + "interactive": { + "spark.sql.adaptive.enabled": "true", + "spark.ui.enabled": "true", + "spark.eventLog.enabled": "false", + }, + } + + if use_case in use_case_configs: + base_conf.update(use_case_configs[use_case]) + else: + logger.warning(f"Unknown use case '{use_case}', using defaults") + + return base_conf + + +# ============================================================================ +# EXAMPLE METADATA +# ============================================================================ + +EXAMPLES_METADATA = { + "01_hello_spark_pi": { + "title": "Hello Spark - Calculate Pi", + "level": 1, + "category": "Getting Started", + "time": "2-3 minutes", + "description": "Your first Spark job - calculate Pi using Monte Carlo method", + }, + "02_csv_data_analysis": { + "title": "CSV Data Analysis", + "level": 1, + "category": "Data Analysis Basics", + "time": "2-3 minutes", + "description": "Analyze CSV data with filtering and aggregations", + }, + "03_interactive_dataframe_exploration": { + "title": "Interactive DataFrame Exploration", + "level": 1, + "category": "Data Exploration", + "time": "3-4 minutes", + "description": "Exploratory data analysis patterns and data quality checks", + }, + "04_etl_pipeline_simple": { + "title": "Simple ETL Pipeline", + "level": 2, + "category": "Data Engineering", + 
"time": "3-4 minutes", + "description": "Extract-Transform-Load pipeline with data validation", + }, + "05_scheduled_batch_job": { + "title": "Scheduled Batch Job", + "level": 2, + "category": "Batch Processing", + "time": "3-4 minutes", + "description": "Production batch job with incremental processing and resilience", + }, + "06_autoscaling_dynamic_allocation": { + "title": "Dynamic Allocation", + "level": 2, + "category": "Auto-scaling", + "time": "4-5 minutes", + "description": "Automatic executor scaling based on workload", + }, +} + + +def print_examples_catalog(): + """Print a catalog of all available examples.""" + print("\n" + "=" * 80) + print("KUBEFLOW SPARK SDK - EXAMPLES CATALOG") + print("=" * 80) + print() + + # Group by level + by_level = {} + for name, metadata in EXAMPLES_METADATA.items(): + level = metadata["level"] + if level not in by_level: + by_level[level] = [] + by_level[level].append((name, metadata)) + + level_names = { + 1: "Level 1: Getting Started", + 2: "Level 2: Data Engineering Basics", + } + + for level in sorted(by_level.keys()): + print(f"\n{level_names.get(level, f'Level {level}')}") + print("-" * 80) + + for name, metadata in sorted(by_level[level], key=lambda x: x[0]): + print(f"\n{name}.py") + print(f" {metadata['title']}") + print(f" Category: {metadata['category']}") + print(f" Time: {metadata['time']}") + print(f" {metadata['description']}") + + print("\n" + "=" * 80) + print() + + +if __name__ == "__main__": + # Print examples catalog when run directly + print_examples_catalog() diff --git a/examples/spark/ipython_ex.py b/examples/spark/ipython_ex.py new file mode 100644 index 000000000..7a3d7d0b0 --- /dev/null +++ b/examples/spark/ipython_ex.py @@ -0,0 +1,85 @@ +# ./ipython_shell.py +# then run /paste this python code . 
+
+import os
+
+from kubeflow.spark import BatchSparkClient, OperatorBackendConfig
+
+config = OperatorBackendConfig(
+    namespace=os.getenv("SPARK_NAMESPACE", "default"),
+    service_account="spark-operator-spark",
+    default_spark_image="docker.io/library/spark",
+    enable_monitoring=False,
+    enable_ui=False,
+    context=os.getenv("KUBE_CONTEXT", "kind-spark-test"),  # Explicitly set context
+)
+client = BatchSparkClient(backend_config=config)
+
+app_name = "test-spark-pi"
+
+response = client.submit_application(
+    app_name=app_name,
+    main_application_file="local:///opt/spark/examples/jars/spark-examples_2.13-4.0.0.jar",
+    main_class="org.apache.spark.examples.SparkPi",
+    spark_version="4.0.0",
+    app_type="Scala",
+    driver_cores=1,
+    driver_memory="512m",
+    executor_cores=1,
+    executor_memory="512m",
+    num_executors=1,
+    arguments=["100"],
+    spark_conf={
+        "spark.kubernetes.file.upload.path": "/tmp",  # Required for Spark 4.0
+    },
+)
+client.get_job(app_name)
+
+
+response = client.submit_application(
+    app_name="my-python-pi5",
+    main_application_file="local:///opt/spark/examples/src/main/python/pi.py",
+    spark_version="4.0.0",
+    app_type="Python",
+    driver_cores=1,
+    driver_memory="512m",
+    executor_cores=1,
+    executor_memory="512m",
+    num_executors=2,
+    arguments=["100"],
+    spark_conf={"spark.kubernetes.file.upload.path": "/tmp"},
+)
+final_status = client.wait_for_job_status("my-python-pi5", timeout=300)
+client.get_job("my-python-pi5")
+logs = list(client.get_job_logs("my-python-pi5"))
+for line in logs:
+    if "Pi is roughly" in line:
+        print(f"RESULT: {line}")
+
+
+response = client.submit_application(
+    app_name="my-python-pi6",
+    main_application_file="local:///opt/spark/examples/src/main/python/pi.py",
+    spark_version="4.0.0",
+    app_type="Python",
+    driver_cores=1,
+    driver_memory="512m",
+    executor_cores=1,
+    executor_memory="512m",
+    num_executors=2,
+    arguments=["100"],
+    spark_conf={
+        "spark.kubernetes.file.upload.path": "/tmp",
+        "spark.eventLog.enabled": "true",
+        "spark.eventLog.dir": "/tmp/spark-events",
+    },
+    volumes=[{"name": "spark-events", "persistentVolumeClaim": {"claimName": "spark-history-pvc"}}],
+    driver_volume_mounts=[{"name": "spark-events", "mountPath": "/tmp/spark-events"}],
+    executor_volume_mounts=[{"name": "spark-events", "mountPath": "/tmp/spark-events"}],
+)
+final_status = client.wait_for_job_status("my-python-pi6", timeout=300)
+client.get_job("my-python-pi6")
+logs = list(client.get_job_logs("my-python-pi6"))
+for line in logs:
+    if "Pi is roughly" in line:
+        print(f"RESULT: {line}")
diff --git a/examples/spark/ipython_shell.py b/examples/spark/ipython_shell.py
new file mode 100755
index 000000000..8a6975116
--- /dev/null
+++ b/examples/spark/ipython_shell.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""
+Launch IPython shell with Kubeflow SDK in dev mode.
+Usage: ./ipython_shell.py
+
+Requires IPython: pip install ipython
+"""
+
+import os
+import sys
+
+# Add SDK to path
+sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+sys.path.insert(0, sdk_path)
+
+# Pre-import the common modules advertised in the banner below
+# (assumes these names are all exported by kubeflow.spark)
+from kubeflow.spark import (  # noqa: E402,F401
+    ApplicationState,
+    ApplicationStatus,
+    BatchSparkClient,
+    GatewayBackendConfig,
+    OperatorBackendConfig,
+    SparkApplicationResponse,
+)
+
+# Print welcome message
+banner = f"""
+{"=" * 80}
+Kubeflow Spark Client - IPython Development Shell
+{"=" * 80}
+
+SDK Path: {sdk_path}
+
+Pre-imported:
+  BatchSparkClient, OperatorBackendConfig, GatewayBackendConfig,
+  ApplicationState, ApplicationStatus, SparkApplicationResponse
+
+Quick Examples:
+  config = OperatorBackendConfig(namespace="default")
+  client = BatchSparkClient(backend_config=config)
+
+Tab completion and syntax highlighting enabled!
+{"=" * 80} +""" + +try: + import IPython + + IPython.embed(banner1=banner, colors="Linux") +except ImportError: + print("IPython not installed. Install with: pip install ipython") + print("Falling back to regular Python shell...\n") + import code + + print(banner) + code.interact(local=locals()) diff --git a/examples/spark/ipython_spark_connect_demo.py b/examples/spark/ipython_spark_connect_demo.py new file mode 100755 index 000000000..273f6d35e --- /dev/null +++ b/examples/spark/ipython_spark_connect_demo.py @@ -0,0 +1,388 @@ +#!/usr/bin/env python3 +""" +IPython Demo Script for Spark Connect Integration + +This script demonstrates connecting to a Spark Connect server running in Kubernetes +and performing interactive DataFrame operations like groupBy, aggregations, etc. + +Prerequisites: +1. Kubernetes cluster with Spark Connect server deployed +2. PySpark with Connect support: pip install 'pyspark[connect]>=3.4.0' +3. Kubeflow SDK installed + +Usage: + python ipython_spark_connect_demo.py + +Or in IPython: + %run ipython_spark_connect_demo.py +""" + +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path.insert(0, sdk_path) + + +def print_section(title): + """Print formatted section header.""" + print("\n" + "=" * 80) + print(f" {title}") + print("=" * 80 + "\n") + + +def demo_basic_connection(): + """Demonstrate basic connection to Spark Connect server.""" + print_section("1. Connect to Spark Connect Server") + + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + # Configuration for Kubernetes Spark Connect server + # The server is exposed via NodePort on port 30000 + config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, + timeout=60, + ) + + print(f"Connecting to: {config.connect_url}") + + # Create client + client = SparkSessionClient(backend_config=config) + print("āœ“ SparkSessionClient created") + + return client + + +def demo_create_session(client): + """Demonstrate creating a Spark session.""" + print_section("2. Create Spark Session") + + session = client.create_session(app_name="ipython-demo") + print(f"āœ“ Session created: {session.session_id}") + print(f" App name: {session.app_name}") + print(f" Closed: {session.is_closed}") + + return session + + +def demo_simple_sql(session): + """Demonstrate simple SQL queries.""" + print_section("3. Simple SQL Query") + + df = session.sql("SELECT 1 AS id, 'Hello Spark Connect' AS message") + print("Query: SELECT 1 AS id, 'Hello Spark Connect' AS message") + print("\nResult:") + df.show() + + result = df.collect() + print(f"\nCollected: {result[0].message}") + + return df + + +def demo_create_dataframe(session): + """Demonstrate creating DataFrames from Python data.""" + print_section("4. 
Create DataFrame from Python Data") + + # Sample sales data + sales_data = [ + (1, "Electronics", "Laptop", 1200.00, 2, "2024-01-15"), + (2, "Electronics", "Mouse", 25.00, 5, "2024-01-15"), + (3, "Clothing", "Shirt", 35.00, 3, "2024-01-16"), + (4, "Electronics", "Keyboard", 75.00, 4, "2024-01-16"), + (5, "Clothing", "Pants", 55.00, 2, "2024-01-17"), + (6, "Electronics", "Monitor", 300.00, 3, "2024-01-17"), + (7, "Clothing", "Jacket", 120.00, 1, "2024-01-18"), + (8, "Electronics", "Mouse", 25.00, 10, "2024-01-18"), + (9, "Clothing", "Shirt", 35.00, 5, "2024-01-19"), + (10, "Electronics", "Laptop", 1200.00, 1, "2024-01-19"), + ] + + schema = ["id", "category", "product", "price", "quantity", "date"] + + df = session.createDataFrame(sales_data, schema) + print(f"āœ“ DataFrame created with {df.count()} rows") + print("\nSample data:") + df.show(5) + + return df + + +def demo_dataframe_operations(session, df): + """Demonstrate DataFrame transformations.""" + print_section("5. DataFrame Operations - Filter & Select") + + # Filter expensive items + expensive = df.filter(df.price > 100) + print("Filter: price > 100") + expensive.show() + + # Select specific columns + print("\nSelect: category, product, price") + df.select("category", "product", "price").show(5) + + return expensive + + +def demo_groupby_aggregations(session, df): + """Demonstrate groupBy and aggregations.""" + print_section("6. GroupBy and Aggregations") + + # Group by category and calculate statistics + print("Aggregation: Total revenue by category") + from pyspark.sql import functions as F # noqa: N812 + + revenue_df = df.withColumn("revenue", F.col("price") * F.col("quantity")) + + category_stats = revenue_df.groupBy("category").agg( + F.sum("revenue").alias("total_revenue"), + F.avg("price").alias("avg_price"), + F.sum("quantity").alias("total_quantity"), + F.count("*").alias("num_transactions"), + ) + + print("\nRevenue by Category:") + category_stats.show() + + # Group by product and sort + print("\nTop Products by Revenue:") + product_revenue = revenue_df.groupBy("product").agg( + F.sum("revenue").alias("total_revenue"), + F.sum("quantity").alias("total_sold"), + ) + + product_revenue.orderBy(F.desc("total_revenue")).show(5) + + return category_stats + + +def demo_advanced_aggregations(session, df): + """Demonstrate advanced aggregations and window functions.""" + print_section("7. Advanced Aggregations") + + from pyspark.sql import functions as F # noqa: N812 + from pyspark.sql.window import Window + + # Add computed column + df_with_revenue = df.withColumn("revenue", F.col("price") * F.col("quantity")) + + # Window function: Running total by date + print("Running Total Revenue by Date:") + window_spec = Window.orderBy("date").rowsBetween(Window.unboundedPreceding, Window.currentRow) + + daily_revenue = ( + df_with_revenue.groupBy("date") + .agg(F.sum("revenue").alias("daily_revenue")) + .withColumn("running_total", F.sum("daily_revenue").over(window_spec)) + ) + + daily_revenue.orderBy("date").show() + + # Pivot: Revenue by category and date + print("\nPivot: Revenue by Category and Date:") + pivot_df = ( + df_with_revenue.groupBy("date").pivot("category").agg(F.sum("revenue").alias("revenue")) + ) + + pivot_df.orderBy("date").show() + + return daily_revenue + + +def demo_session_metrics(session): + """Demonstrate session metrics tracking.""" + print_section("8. 
Session Metrics") + + metrics = session.get_metrics() + print(f"Session ID: {metrics.session_id}") + print(f"Queries Executed: {metrics.queries_executed}") + print(f"Active Queries: {metrics.active_queries}") + print(f"Artifacts Uploaded: {metrics.artifacts_uploaded}") + + info = session.get_info() + print(f"\nSession State: {info.state}") + print(f"App Name: {info.app_name}") + + +def demo_multiple_operations(session): + """Demonstrate chaining multiple operations.""" + print_section("9. Chained Operations") + + # Create sample employee data + employees = [ + (1, "Alice", "Engineering", 95000, 28), + (2, "Bob", "Engineering", 120000, 35), + (3, "Carol", "Sales", 80000, 42), + (4, "David", "Engineering", 110000, 30), + (5, "Eve", "Sales", 90000, 38), + (6, "Frank", "Marketing", 85000, 45), + (7, "Grace", "Engineering", 105000, 29), + (8, "Henry", "Marketing", 88000, 33), + ] + + df = session.createDataFrame(employees, ["id", "name", "dept", "salary", "age"]) + + print("Original Data:") + df.show() + + # Chain multiple operations + from pyspark.sql import functions as F # noqa: N812 + + result = ( + df.filter(F.col("age") < 40) + .groupBy("dept") + .agg(F.avg("salary").alias("avg_salary"), F.count("*").alias("count")) + .filter(F.col("count") >= 2) + .orderBy(F.desc("avg_salary")) + ) + + print("\nFiltered Analysis (age < 40, departments with 2+ people):") + result.show() + + return result + + +def run_complete_demo(): + """Run complete demonstration.""" + print("\n" + "=" * 80) + print(" Kubeflow Spark Connect - Interactive Demo") + print(" Connecting to Kubernetes Spark Connect Server") + print("=" * 80) + + try: + # Step 1: Connect + client = demo_basic_connection() + + # Step 2: Create session + session = demo_create_session(client) + + # Step 3: Simple SQL + demo_simple_sql(session) + + # Step 4: Create DataFrame + df = demo_create_dataframe(session) + + # Step 5: Basic operations + demo_dataframe_operations(session, df) + + # Step 6: GroupBy aggregations + demo_groupby_aggregations(session, df) + + # Step 7: Advanced aggregations + demo_advanced_aggregations(session, df) + + # Step 8: Session metrics + demo_session_metrics(session) + + # Step 9: Chained operations + demo_multiple_operations(session) + + print_section("Demo Complete") + print("āœ“ All operations completed successfully!") + print("\nTo continue experimenting:") + print(" - session object is available for more queries") + print(" - Try: session.sql('SELECT * FROM ...')") + print(" - Try: session.createDataFrame(...)") + print(" - Remember to call: session.close() when done") + + return client, session + + except Exception as e: + print(f"\nāœ— Error: {e}") + import traceback + + traceback.print_exc() + print("\nTroubleshooting:") + print(" 1. Is Kubernetes cluster running? (kubectl get nodes)") + print(" 2. Is Spark Connect deployed? (kubectl get pods -l app=spark-connect)") + print( + " 3. Is port forwarding active? (kubectl port-forward svc/spark-connect 30000:15002)" + ) + print(" 4. Is PySpark installed? 
(pip install 'pyspark[connect]>=3.4.0')") + return None, None + + +# Manual step-by-step execution helper +def print_manual_steps(): + """Print manual steps for running in IPython.""" + print("\n" + "=" * 80) + print(" Manual Step-by-Step Execution in IPython") + print("=" * 80) + print(""" +# Step 1: Import and configure +from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + +config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, + timeout=60 +) + +# Step 2: Create client and session +client = SparkSessionClient(backend_config=config) +session = client.create_session(app_name="my-analysis") + +# Step 3: Create sample data +sales_data = [ + (1, "Electronics", "Laptop", 1200.00, 2), + (2, "Electronics", "Mouse", 25.00, 5), + (3, "Clothing", "Shirt", 35.00, 3), + (4, "Electronics", "Keyboard", 75.00, 4), +] +df = session.createDataFrame(sales_data, ["id", "category", "product", "price", "quantity"]) + +# Step 4: View data +df.show() + +# Step 5: Run aggregations +from pyspark.sql import functions as F +revenue_df = df.withColumn("revenue", F.col("price") * F.col("quantity")) +revenue_df.groupBy("category").agg(F.sum("revenue").alias("total")).show() + +# Step 6: Clean up +session.close() +client.close() +""") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Spark Connect Interactive Demo") + parser.add_argument( + "--manual", + action="store_true", + help="Print manual steps instead of running automated demo", + ) + args = parser.parse_args() + + if args.manual: + print_manual_steps() + else: + client, session = run_complete_demo() + + # Keep objects available for interactive use + if client and session: + print("\nObjects available for continued use:") + print(" - client: SparkSessionClient instance") + print(" - session: ManagedSparkSession instance") + print("\nEntering interactive mode... (Ctrl+D to exit)") + + try: + import IPython + + IPython.embed() + except ImportError: + print("\nIPython not installed. Install with: pip install ipython") + print("Keeping session open for manual cleanup...") + input("\nPress Enter to close session and exit...") + + if session and not session.is_closed: + session.close() + print("āœ“ Session closed") + + if client: + client.close() + print("āœ“ Client closed") diff --git a/examples/spark/ipython_spark_connect_shell.py b/examples/spark/ipython_spark_connect_shell.py new file mode 100755 index 000000000..fe2fc9954 --- /dev/null +++ b/examples/spark/ipython_spark_connect_shell.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +""" +IPython Shell Launcher for Spark Connect Demo + +This script launches an IPython shell with the Kubeflow SDK pre-imported +and prints step-by-step instructions for testing Spark Connect. + +Usage: python ipython_spark_connect_shell.py +""" + +import os +import sys + +# Add SDK to path +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path.insert(0, sdk_path) + +# Pre-import for convenience + +# Banner with instructions +banner = f""" +{"=" * 80} +Kubeflow Spark Connect - Interactive IPython Shell +{"=" * 80} + +SDK Path: {sdk_path} +Spark Connect URL: sc://localhost:30000 + +Pre-imported modules: + - ConnectBackendConfig + - SparkSessionClient + +{"=" * 80} +Step-by-Step Guide +{"=" * 80} + +1. Create Configuration: + config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, + timeout=60 + ) + +2. 
Create Client and Session: + client = SparkSessionClient(backend_config=config) + session = client.create_session(app_name="my-demo") + +3. Run Simple SQL: + df = session.sql("SELECT 1 AS id, 'Hello Spark Connect' AS message") + df.show() + +4. Create DataFrame from Python Data: + sales_data = [ + (1, "Electronics", "Laptop", 1200.00, 2), + (2, "Electronics", "Mouse", 25.00, 5), + (3, "Clothing", "Shirt", 35.00, 3), + (4, "Electronics", "Keyboard", 75.00, 4), + (5, "Clothing", "Pants", 55.00, 2), + ] + df = session.createDataFrame( + sales_data, + ["id", "category", "product", "price", "quantity"] + ) + df.show() + +5. Filter and Select: + expensive = df.filter(df.price > 50) + expensive.show() + + df.select("category", "product", "price").show() + +6. GroupBy Aggregations: + from pyspark.sql import functions as F + + revenue_df = df.withColumn("revenue", F.col("price") * F.col("quantity")) + + category_stats = revenue_df.groupBy("category").agg( + F.sum("revenue").alias("total_revenue"), + F.avg("price").alias("avg_price"), + F.count("*").alias("num_transactions") + ) + category_stats.show() + +7. Order Results: + category_stats.orderBy(F.desc("total_revenue")).show() + +8. Session Metrics: + metrics = session.get_metrics() + print(f"Queries executed: {{metrics.queries_executed}}") + + info = session.get_info() + print(f"Session state: {{info.state}}") + +9. Clean Up (when done): + session.close() + client.close() + +{"=" * 80} +Ready! Start by copying and pasting the commands above. +{"=" * 80} +""" + +if __name__ == "__main__": + try: + import IPython + + IPython.embed(banner1=banner, colors="Linux") + except ImportError: + print("IPython not installed. Install with: pip install ipython") + print("Falling back to regular Python shell...\n") + import code + + print(banner) + code.interact(local=locals()) diff --git a/examples/spark/minio_config.py b/examples/spark/minio_config.py new file mode 100644 index 000000000..278946eb9 --- /dev/null +++ b/examples/spark/minio_config.py @@ -0,0 +1,132 @@ +"""MinIO S3 Configuration Helper for Spark Examples. + +This module provides utilities for configuring Spark to work with MinIO +(S3-compatible storage) running in the same Kubernetes cluster. + +Usage: + from minio_config import get_s3_spark_conf, S3_ENDPOINT + + spark_conf = get_s3_spark_conf() + response = client.submit_application( + app_name="my-app", + main_application_file="s3a://spark-scripts/my_script.py", + spark_conf=spark_conf, + ... + ) +""" + +import os + +# MinIO Configuration (deployed via setup_minio.sh) +MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "minioadmin") +MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "minioadmin") +MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "minio-service.default.svc.cluster.local:9000") + +# S3 endpoint for Spark (use http:// for internal cluster access) +S3_ENDPOINT = f"http://{MINIO_ENDPOINT}" + +# Buckets +SCRIPTS_BUCKET = "spark-scripts" +DATA_BUCKET = "spark-data" +OUTPUT_BUCKET = "spark-output" + + +def get_s3_spark_conf(additional_conf=None, enable_history=False): + """Get Spark configuration for S3/MinIO access. 
+ + Args: + additional_conf: Optional dict of additional Spark configs + enable_history: If True, enable event logging for Spark History Server + + Returns: + Dict of Spark configuration properties for S3 access + """ + conf = { + # Required for Spark 4.0 + "spark.kubernetes.file.upload.path": "/tmp", + # Download Hadoop AWS libraries at runtime (includes S3A filesystem) + # Compatible with Spark 4.0.0 and Hadoop 3.4.0 + "spark.jars.packages": ( + "org.apache.hadoop:hadoop-aws:3.4.0," + "com.amazonaws:aws-java-sdk-bundle:1.12.262" + ), + # Ivy cache location - use /tmp which is always writable + # Fixes: java.io.FileNotFoundException: /home/spark/.ivy2.5.2/cache/... + "spark.jars.ivy": "/tmp/.ivy2", + # S3A Configuration for MinIO + "spark.hadoop.fs.s3a.endpoint": S3_ENDPOINT, + "spark.hadoop.fs.s3a.access.key": MINIO_ACCESS_KEY, + "spark.hadoop.fs.s3a.secret.key": MINIO_SECRET_KEY, + "spark.hadoop.fs.s3a.path.style.access": "true", # Required for MinIO + "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem", + "spark.hadoop.fs.s3a.connection.ssl.enabled": "false", # HTTP for internal + # Performance tuning + "spark.hadoop.fs.s3a.fast.upload": "true", + "spark.hadoop.fs.s3a.block.size": "128M", + "spark.hadoop.fs.s3a.multipart.size": "104857600", # 100MB + # Connection settings + "spark.hadoop.fs.s3a.connection.maximum": "100", + "spark.hadoop.fs.s3a.threads.max": "20", + "spark.hadoop.fs.s3a.connection.timeout": "200000", + "spark.hadoop.fs.s3a.attempts.maximum": "3", + } + + # Add event logging for History Server + if enable_history: + conf.update( + { + "spark.eventLog.enabled": "true", + "spark.eventLog.dir": "file:///mnt/spark-events", + "spark.eventLog.compress": "true", + } + ) + + # Merge additional configuration + if additional_conf: + conf.update(additional_conf) + + return conf + + +def get_s3_path(bucket, key): + """Build S3 path for MinIO. 
+ + Args: + bucket: Bucket name (e.g., 'spark-scripts') + key: Object key (e.g., 'exploration.py') + + Returns: + S3 URL (e.g., 's3a://spark-scripts/exploration.py') + """ + return f"s3a://{bucket}/{key}" + + +# Common S3 paths for examples +S3_PATHS = { + "exploration_script": get_s3_path(SCRIPTS_BUCKET, "exploration.py"), + "csv_analysis_script": get_s3_path(SCRIPTS_BUCKET, "csv_analysis.py"), + "etl_script": get_s3_path(SCRIPTS_BUCKET, "etl_pipeline.py"), + "batch_job_script": get_s3_path(SCRIPTS_BUCKET, "batch_job.py"), + "data_dir": f"s3a://{DATA_BUCKET}/", + "output_dir": f"s3a://{OUTPUT_BUCKET}/", +} + + +def print_minio_info(): + """Print MinIO configuration information.""" + print("MinIO S3 Configuration:") + print(f" Endpoint: {S3_ENDPOINT}") + print(f" Access Key: {MINIO_ACCESS_KEY}") + print(" Buckets:") + print(f" - {SCRIPTS_BUCKET}/ - Application scripts") + print(f" - {DATA_BUCKET}/ - Input data") + print(f" - {OUTPUT_BUCKET}/ - Output results") + print() + + +if __name__ == "__main__": + # Print configuration when run directly + print_minio_info() + print("Available S3 Paths:") + for name, path in S3_PATHS.items(): + print(f" {name}: {path}") diff --git a/examples/spark/open_spark_ui.sh b/examples/spark/open_spark_ui.sh new file mode 100755 index 000000000..92cd6f760 --- /dev/null +++ b/examples/spark/open_spark_ui.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# +# Open Spark UI for a running application +# + +APP_NAME=$1 +NAMESPACE=${2:-default} +PORT=${3:-4040} + +if [ -z "$APP_NAME" ]; then + echo "Usage: $0 [namespace] [port]" + echo "" + echo "Examples:" + echo " $0 test-spark-pi" + echo " $0 test-spark-pi default 4040" + echo "" + exit 1 +fi + +echo "==========================================" +echo "Spark UI Access" +echo "==========================================" +echo "Application: $APP_NAME" +echo "Namespace: $NAMESPACE" +echo "Local Port: $PORT" +echo "" + +# Check if driver pod exists +POD_STATUS=$(kubectl get pod $APP_NAME-driver -n $NAMESPACE -o jsonpath='{.status.phase}' 2>/dev/null) + +if [ -z "$POD_STATUS" ]; then + echo "āœ— Driver pod not found: $APP_NAME-driver" + echo "" + echo "Check if application exists:" + echo " kubectl get sparkapplication -n $NAMESPACE" + exit 1 +fi + +echo "Driver Pod Status: $POD_STATUS" + +if [ "$POD_STATUS" != "Running" ]; then + echo "" + echo "āš ļø Warning: Driver pod is not in Running state" + echo " Spark UI may not be accessible" + echo "" +fi + +echo "" +echo "Starting port-forward..." 
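+# The driver pod serves the Spark UI on container port 4040; the kubectl
+# port-forward at the end of this script maps local port $PORT (default 4040)
+# to it, e.g.:
+#   ./open_spark_ui.sh test-spark-pi default 4040   # then open http://localhost:4040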
+echo "==========================================" +echo "" +echo "āœ“ Spark UI will be available at:" +echo "" +echo " http://localhost:$PORT" +echo "" +echo "==========================================" +echo "" +echo "In the Spark UI you can view:" +echo " • Jobs Tab - See all Spark jobs and their status" +echo " • Stages Tab - View DAG visualization and task details" +echo " • Storage Tab - Check cached RDDs/DataFrames" +echo " • Environment - View Spark configuration" +echo " • Executors Tab - Monitor executor resources and tasks" +echo " • SQL Tab - See DataFrame/SQL query execution plans" +echo "" +echo "Press Ctrl+C to stop port forwarding" +echo "==========================================" +echo "" + +# Start port forwarding +kubectl port-forward $APP_NAME-driver $PORT:4040 -n $NAMESPACE diff --git a/examples/spark/quick_ui_test.sh b/examples/spark/quick_ui_test.sh new file mode 100644 index 000000000..69ce202a3 --- /dev/null +++ b/examples/spark/quick_ui_test.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Quick test to access Spark UI using driver pod port-forward + +echo "==========================================" +echo "Quick Spark UI Access Test" +echo "==========================================" +echo "" + +# Run a simple test app +echo "Step 1: Submitting test application..." +python test_ui_minimal.py & +PYTHON_PID=$! + +# Wait for submission +sleep 15 + +# Find the app name +APP_NAME=$(kubectl get sparkapplications -o name | grep "test-ui" | head -1 | cut -d/ -f2) + +if [ -z "$APP_NAME" ]; then + echo "āŒ No test application found" + echo " Run manually: python test_ui_minimal.py" + exit 1 +fi + +echo "āœ“ Application found: $APP_NAME" +echo "" + +# Wait for driver pod +echo "Step 2: Waiting for driver pod to be ready..." +DRIVER_POD="${APP_NAME}-driver" +kubectl wait --for=condition=ready pod/$DRIVER_POD --timeout=60s 2>/dev/null || { + echo " Still waiting for pod..." + sleep 10 +} + +# Check if pod exists +POD_STATUS=$(kubectl get pod $DRIVER_POD -o jsonpath='{.status.phase}' 2>/dev/null) + +if [ "$POD_STATUS" != "Running" ]; then + echo "āš ļø Driver pod not Running yet (status: $POD_STATUS)" + echo "" + echo "Monitor with:" + echo " kubectl get pods -w | grep $APP_NAME" + echo "" + echo "Once Running, use:" + echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" + exit 0 +fi + +echo "āœ“ Driver pod is Running" +echo "" + +# Check if service exists +echo "Step 3: Checking for UI service..." +if kubectl get svc ${APP_NAME}-ui-svc 2>/dev/null; then + echo "āœ… UI service exists! (Operator created it successfully)" + echo "" + echo "Access UI using service:" + echo " kubectl port-forward svc/${APP_NAME}-ui-svc 4040:4040" +else + echo "āŒ UI service does NOT exist (as expected with v2.0.2-rc.0)" +fi +echo "" + +# Port-forward to driver pod +echo "Step 4: Setting up port-forward to driver pod..." +echo "" +echo "==========================================" +echo "🌐 SPARK UI ACCESS" +echo "==========================================" +echo "" +echo "Run this command in another terminal:" +echo "" +echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" +echo "" +echo "Then open in your browser:" +echo " http://localhost:4040" +echo "" +echo "==========================================" +echo "" + +# Offer to start port-forward +read -p "Start port-forward now? (y/n) " -n 1 -r +echo +if [[ $REPLY =~ ^[Yy]$ ]]; then + echo "Starting port-forward..." 
+ echo "Press Ctrl+C to stop" + kubectl port-forward pod/$DRIVER_POD 4040:4040 +else + echo "" + echo "Manual commands:" + echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" + echo " # Then open: http://localhost:4040" +fi diff --git a/examples/spark/requirements-core.txt b/examples/spark/requirements-core.txt new file mode 100644 index 000000000..4060438a6 --- /dev/null +++ b/examples/spark/requirements-core.txt @@ -0,0 +1,8 @@ +# Kubeflow Spark Client - Core Requirements Only +# ================================================ +# +# Minimal dependencies for running Spark client with OperatorBackend +# Install with: pip install -r requirements-core.txt + +kubernetes>=27.2.0,<30.0.0 +pydantic>=2.10.0,<3.0.0 diff --git a/examples/spark/requirements-dev.txt b/examples/spark/requirements-dev.txt new file mode 100644 index 000000000..c9f4cc478 --- /dev/null +++ b/examples/spark/requirements-dev.txt @@ -0,0 +1,34 @@ +# Kubeflow Spark Client - Development Requirements +# ================================================== +# +# Full development dependencies including testing and linting +# Install with: pip install -r requirements-dev.txt + +# Include core requirements +-r requirements-core.txt + +# Gateway backend support +requests>=2.31.0,<3.0.0 +pyyaml>=6.0,<7.0 + +# Testing +pytest>=7.0,<9.0 +pytest-mock>=3.10,<4.0 +pytest-cov>=4.1.0,<6.0 +coverage>=7.0,<8.0 + +# Code quality +ruff>=0.12.2,<1.0 +black>=24.0.0,<25.0 +isort>=5.13.0,<6.0 +mypy>=1.8.0,<2.0 + +# Development tools +ipython>=8.20.0,<9.0 +ipdb>=0.13.13,<1.0 +rich>=13.0.0,<14.0 +python-dotenv>=1.0.0,<2.0 + +# Documentation +sphinx>=7.0.0,<8.0 +sphinx-rtd-theme>=2.0.0,<3.0 diff --git a/examples/spark/requirements.txt b/examples/spark/requirements.txt new file mode 100644 index 000000000..e95e19cf4 --- /dev/null +++ b/examples/spark/requirements.txt @@ -0,0 +1,26 @@ +# Kubeflow Spark Client - Requirements +# ===================================== +# +# Install all dependencies with: +# pip install -r requirements.txt +# +# Or install minimal dependencies: +# pip install -r requirements-core.txt + +# Core dependencies (required for OperatorBackend) +kubernetes>=27.2.0,<30.0.0 +pydantic>=2.10.0,<3.0.0 + +# Optional dependencies for GatewayBackend +requests>=2.31.0,<3.0.0 +pyyaml>=6.0,<7.0 + +# Development and testing dependencies +pytest>=7.0,<9.0 +pytest-mock>=3.10,<4.0 +coverage>=7.0,<8.0 +ruff>=0.12.2,<1.0 + +# Additional useful packages +rich>=13.0.0,<14.0 # Pretty printing +python-dotenv>=1.0.0,<2.0 # Environment variable management diff --git a/examples/spark/run_long_job_ui_validation.py b/examples/spark/run_long_job_ui_validation.py new file mode 100644 index 000000000..431127dd6 --- /dev/null +++ b/examples/spark/run_long_job_ui_validation.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +""" +Long-Running Job for Spark UI Validation + +This example submits a 10-minute Spark job specifically designed to test +and validate Spark UI access. The job performs various operations to showcase +all UI features. 
+ +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- MinIO deployed (run ./setup_minio.sh) +- Long-running job script uploaded to MinIO + +Time to Run: ~10 minutes + +Usage: + python run_long_job_ui_validation.py [--no-monitor] + +Options: + --no-monitor Skip interactive monitoring (useful when called from scripts) +""" + +from datetime import datetime +import os +import sys +import time + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + +# Import MinIO configuration +try: + from minio_config import get_s3_spark_conf, print_minio_info +except ImportError: + print("ERROR: minio_config.py not found!") + print("Please ensure you're running from the examples/spark directory") + sys.exit(1) + + +def print_ui_instructions(app_name: str): + """Print detailed UI access instructions.""" + print("=" * 80) + print("🌐 SPARK UI ACCESS INSTRUCTIONS") + print("=" * 80) + print() + print("The job will run for ~10 minutes. Follow these steps to access the UI:") + print() + print("STEP 1: Wait for driver pod to be Running") + print("-" * 80) + print(f" kubectl get pod {app_name}-driver -w") + print() + print(" Wait until STATUS shows 'Running' (may take 1-2 minutes)") + print() + print("STEP 2: Port-forward to driver pod") + print("-" * 80) + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print() + print(" Keep this terminal open!") + print() + print("STEP 3: Open Spark UI in browser") + print("-" * 80) + print(" http://localhost:4040") + print() + print("STEP 4: Explore UI features while job runs") + print("-" * 80) + print(" Jobs tab - See 6 jobs (one per stage)") + print(" Stages tab - Monitor stage progress in real-time") + print(" Storage tab - View cached DataFrame (after Stage 2)") + print(" Executors tab - Check executor metrics and GC") + print(" SQL tab - Inspect DataFrame query plans") + print(" Environment tab - View Spark configuration") + print() + print("=" * 80) + print() + print("šŸ’” TIPS:") + print(" - Job progresses through 5 stages over 10 minutes") + print(" - Each stage pauses briefly - perfect for exploring UI") + print(" - Stage 2 caches data - check Storage tab!") + print(" - Stage 3 does heavy shuffling - watch Executors tab") + print(" - Click on job/stage names for detailed views") + print() + print("=" * 80) + print() + + +def monitor_job_progress(client: BatchSparkClient, app_name: str): + """Monitor and display job progress.""" + print("=" * 80) + print("MONITORING JOB PROGRESS") + print("=" * 80) + print() + print("Checking status every 30 seconds...") + print("Press Ctrl+C to stop monitoring (job will continue running)") + print() + + start_time = time.time() + last_state = None + + try: + while True: + try: + status = client.get_job(app_name) + elapsed = int(time.time() - start_time) + + if status.state != last_state: + timestamp = datetime.now().strftime("%H:%M:%S") + print(f"[{timestamp}] [{elapsed:3d}s] State: {status.state.value}") + last_state = status.state + + if status.state in [ApplicationState.COMPLETED, ApplicationState.FAILED]: + print() + print(f"Job finished with state: {status.state.value}") + return status + + time.sleep(30) + + except Exception as e: + print(f" Warning: Could not get status: {e}") + time.sleep(30) + + except KeyboardInterrupt: + print() + 
print("Stopped monitoring (job still running)") + print(f"Check status with: kubectl get sparkapplication {app_name}") + return None + + +def main(): + """Main example: Submit long-running job for UI validation.""" + + # Check for --no-monitor flag + no_monitor = "--no-monitor" in sys.argv + + print("=" * 80) + print("LONG-RUNNING SPARK JOB FOR UI VALIDATION") + print("=" * 80) + print() + print("This example submits a 10-minute job designed to showcase") + print("all Spark UI features. Perfect for testing UI access!") + print() + print("Job stages:") + print(" Stage 1: Generate 100M rows (~2 min)") + print(" Stage 2: Cache and aggregate (~2 min)") + print(" Stage 3: Shuffle-heavy joins (~3 min)") + print(" Stage 4: Multi-dimensional analysis (~2 min)") + print(" Stage 5: Window functions (~1 min)") + print() + print("Total duration: ~10 minutes") + print() + + # Show MinIO configuration + print_minio_info() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=True, # Enable Spark UI + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print(" Spark UI enabled") + print() + + # Step 2: Prepare the application + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"long-job-{timestamp}" + + # Get S3 path for the long-running job script + script_path = "s3a://spark-scripts/long_running_job.py" + + print("Step 2: Configuring long-running job...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(f" Script location: {script_path}") + print(" Duration: ~10 minutes") + print(" Resources: 1 driver + 2 executors (1 CPU, 2g RAM each)") + print() + + # Step 3: Submit the application + print("Step 3: Submitting long-running job...") + print() + + try: + # Get S3-enabled Spark configuration + spark_conf = get_s3_spark_conf() + + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file=script_path, + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation (more resources for better performance) + driver_cores=1, + driver_memory="2g", # More memory for large datasets + executor_cores=1, + executor_memory="2g", # More memory for shuffles + num_executors=2, + # Keep job running for debugging + time_to_live_seconds=7200, # 2 hours + # Labels for tracking + labels={ + "job_type": "ui-validation", + "duration": "long", + }, + # S3 configuration for MinIO + spark_conf=spark_conf, + ) + + print(" Job submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + + # Print app name for automation scripts to capture + print(f"APP_NAME={app_name}") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + print() + print("Troubleshooting:") + print(" 1. Ensure MinIO is running:") + print(" kubectl get pods -l app=minio") + print(" 2. Verify script is uploaded:") + print(" kubectl exec minio-client -- mc ls myminio/spark-scripts/") + print(" 3. 
Run: ./setup_minio.sh to upload scripts") + sys.exit(1) + + # Step 4: Print UI access instructions + print() + print_ui_instructions(app_name) + + # Step 5: Ask if user wants to monitor + print() + + # Check if monitoring was disabled via flag or non-interactive mode + if no_monitor: + print("Monitoring disabled (--no-monitor flag)") + response = "n" + elif not sys.stdin.isatty(): + # Non-interactive mode (running from automation script) + print("Running in non-interactive mode. Skipping monitoring.") + response = "n" + else: + # Interactive mode - ask user + response = input("Monitor job progress? (y/n): ").strip().lower() + + if response == "y": + print() + monitor_job_progress(client, app_name) + + # Retrieve logs after completion + print() + print("Retrieving job logs...") + try: + logs = list(client.get_job_logs(app_name)) + print() + print("=" * 80) + print("JOB OUTPUT (Last 50 lines)") + print("=" * 80) + for line in logs[-50:]: + print(line) + print("=" * 80) + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + else: + print() + print("Skipping monitoring.") + print() + print("Check job status anytime with:") + print(f" kubectl get sparkapplication {app_name}") + print() + print("View logs with:") + print(f" kubectl logs {app_name}-driver") + print() + + print() + print("=" * 80) + print("QUICK REFERENCE") + print("=" * 80) + print() + print("Port-forward to UI:") + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print() + print("Open UI:") + print(" http://localhost:4040") + print() + print("Check status:") + print(f" kubectl get sparkapplication {app_name} -w") + print() + print("View driver logs:") + print(f" kubectl logs {app_name}-driver -f") + print() + print("Delete when done:") + print(f" kubectl delete sparkapplication {app_name}") + print() + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/examples/spark/run_ui_validation.sh b/examples/spark/run_ui_validation.sh new file mode 100755 index 000000000..6a3af9926 --- /dev/null +++ b/examples/spark/run_ui_validation.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# Complete setup and run script for long-running UI validation job + +set -e + +echo "==========================================" +echo "Spark UI Validation - Complete Setup" +echo "==========================================" +echo "" + +# Check if we're in the right directory +if [ ! -f "run_long_job_ui_validation.py" ]; then + echo "āŒ Please run this from examples/spark directory" + exit 1 +fi + +# Step 1: Check MinIO +echo "Step 1: Checking MinIO..." +if ! kubectl get pod -l app=minio 2>/dev/null | grep -q Running; then + echo " āš ļø MinIO not running. Setting up..." + ./setup_minio.sh + echo "" +else + echo " āœ“ MinIO is running" +fi +echo "" + +# Step 2: Upload script +echo "Step 2: Uploading long-running job script..." +FILE_INFO=$(kubectl exec minio-client -- mc ls myminio/spark-scripts/long_running_job.py 2>/dev/null || echo "") + +if [ -z "$FILE_INFO" ]; then + echo " Script not found in MinIO. Uploading..." + chmod +x upload_long_job.sh + ./upload_long_job.sh +elif echo "$FILE_INFO" | grep -q "0B"; then + echo " Script exists but is empty (0B). Re-uploading..." + chmod +x upload_long_job.sh + ./upload_long_job.sh +else + echo " āœ“ Script already uploaded" + echo " $FILE_INFO" +fi +echo "" + +# Step 3: Submit job +echo "Step 3: Submitting long-running job..." 
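+# run_long_job_ui_validation.py prints a line of the form "APP_NAME=long-job-<HHMMSS>";
+# this script captures it below to discover the SparkApplication name used by the
+# port-forward and monitoring commands.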
+echo " This will take ~10 minutes to complete" +echo "" + +# Run in foreground with --no-monitor flag (no interactive prompts) +# Capture output to get APP_NAME +OUTPUT=$(python run_long_job_ui_validation.py --no-monitor 2>&1) +echo "$OUTPUT" + +# Extract app name from output +APP_NAME=$(echo "$OUTPUT" | grep "APP_NAME=" | cut -d= -f2) + +# Fallback: try to get from kubectl +if [ -z "$APP_NAME" ]; then + APP_NAME=$(kubectl get sparkapplications -o name 2>/dev/null | grep "long-job" | tail -1 | cut -d/ -f2) +fi + +if [ -z "$APP_NAME" ]; then + echo " āš ļø Could not find application. Check output above." + exit 1 +fi + +echo " āœ“ Job submitted: $APP_NAME" +echo "" + +# Step 4: Wait for driver pod +echo "Step 4: Waiting for driver pod to be ready..." +echo " This may take 1-2 minutes..." +DRIVER_POD="${APP_NAME}-driver" + +# Wait for pod to exist +for i in {1..60}; do + if kubectl get pod $DRIVER_POD 2>/dev/null; then + break + fi + sleep 2 +done + +# Wait for pod to be Running +kubectl wait --for=condition=ready pod/$DRIVER_POD --timeout=180s 2>/dev/null || { + echo " āš ļø Pod taking longer than expected..." + echo " Monitor with: kubectl get pod $DRIVER_POD -w" + echo "" +} + +POD_STATUS=$(kubectl get pod $DRIVER_POD -o jsonpath='{.status.phase}' 2>/dev/null) +echo " āœ“ Driver pod status: $POD_STATUS" +echo "" + +if [ "$POD_STATUS" != "Running" ]; then + echo " āš ļø Pod not Running yet. Current status: $POD_STATUS" + echo "" + echo " Monitor pod:" + echo " kubectl get pod $DRIVER_POD -w" + echo "" + echo " Once Running, port-forward manually:" + echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" + echo "" + exit 0 +fi + +# Step 5: Instructions for UI access +echo "==========================================" +echo "šŸŽ‰ Job is Running!" +echo "==========================================" +echo "" +echo "Driver pod: $DRIVER_POD" +echo "Expected duration: ~10 minutes" +echo "" +echo "==========================================" +echo "TO ACCESS SPARK UI:" +echo "==========================================" +echo "" +echo "Open a NEW terminal and run:" +echo "" +echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" +echo "" +echo "Then open in your browser:" +echo "" +echo " http://localhost:4040" +echo "" +echo "==========================================" +echo "WHAT TO EXPLORE:" +echo "==========================================" +echo "" +echo "Timeline:" +echo " 0:00 - Job starts" +echo " 2:00 - Stage 2 → CHECK STORAGE TAB! ⭐" +echo " 4:00 - Stage 3 → CHECK EXECUTORS TAB! (heavy shuffle)" +echo " 7:00 - Stage 4 → Check SQL tab" +echo " 9:00 - Stage 5 → Check DAG visualization" +echo " 10:00 - Job completes" +echo "" +echo "UI Tabs to explore:" +echo " āœ“ Jobs - See 6 jobs (one per stage)" +echo " āœ“ Stages - Monitor stage progress" +echo " āœ“ Storage - View cached data (after Stage 2)" +echo " āœ“ Executors - Monitor resources and shuffles" +echo " āœ“ SQL - Inspect DataFrame query plans" +echo " āœ“ Environment - View Spark configuration" +echo "" +echo "==========================================" +echo "" + +# Offer to start port-forward +read -p "Start port-forward now? (y/n) " -n 1 -r +echo "" + +if [[ $REPLY =~ ^[Yy]$ ]]; then + echo "" + echo "Starting port-forward..." + echo "Keep this terminal open!" 
+ echo "Open browser to: http://localhost:4040" + echo "" + echo "Press Ctrl+C to stop port-forward" + echo "" + sleep 2 + kubectl port-forward pod/$DRIVER_POD 4040:4040 +else + echo "" + echo "To access UI later, run:" + echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" + echo "" + echo "Monitor job with:" + echo " kubectl get sparkapplication $APP_NAME -w" + echo "" + echo "View logs with:" + echo " kubectl logs $DRIVER_POD -f" + echo "" +fi diff --git a/examples/spark/scripts/long_running_job.py b/examples/spark/scripts/long_running_job.py new file mode 100644 index 000000000..f023de875 --- /dev/null +++ b/examples/spark/scripts/long_running_job.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +""" +Long-Running Spark Job for UI Validation + +This script runs for approximately 10 minutes and demonstrates various +Spark operations to showcase different UI features: +- Jobs and Stages +- SQL/DataFrame operations +- Executor metrics +- Storage/caching +- Shuffle operations + +Perfect for testing Spark UI access and exploring its features. +""" + +import time + +from pyspark.sql import SparkSession +from pyspark.sql.functions import avg, col, count, expr, max, min, rand, sum + + +def main(): + print("=" * 80) + print("LONG-RUNNING SPARK JOB FOR UI VALIDATION") + print("=" * 80) + print() + print("This job will run for approximately 10 minutes.") + print("Use this time to explore the Spark UI features:") + print(" • Jobs tab - See job progression") + print(" • Stages tab - Monitor stage execution") + print(" • Storage tab - View cached DataFrames") + print(" • Executors tab - Check executor metrics") + print(" • SQL tab - Inspect query plans") + print() + + # Create Spark session + spark = SparkSession.builder.appName("Long-Running UI Validation Job").getOrCreate() + + spark.sparkContext.setLogLevel("INFO") + + print("āœ“ Spark session created") + print() + + # ======================================================================== + # STAGE 1: Generate Large Dataset (2 minutes) + # ======================================================================== + print("-" * 80) + print("STAGE 1: Generating large dataset (100 million rows)") + print("Expected duration: ~2 minutes") + print("-" * 80) + print() + + # Generate a large dataset with 100 million rows + df_large = ( + spark.range(0, 100_000_000) + .withColumn("category", (col("id") % 10).cast("string")) + .withColumn("value", (rand() * 1000).cast("integer")) + .withColumn("region", expr("array('North', 'South', 'East', 'West')[cast(id % 4 as int)]")) + ) + + print(" Dataset schema:") + df_large.printSchema() + + # Trigger evaluation with a count + total_rows = df_large.count() + print(f" āœ“ Generated {total_rows:,} rows") + print() + + time.sleep(5) # Pause to observe UI + + # ======================================================================== + # STAGE 2: Cache and Aggregations (2 minutes) + # ======================================================================== + print("-" * 80) + print("STAGE 2: Caching dataset and performing aggregations") + print("Expected duration: ~2 minutes") + print("-" * 80) + print() + + # Cache the dataset to observe Storage tab + df_large.cache() + print(" Caching dataset in memory...") + + # Force caching with an action + cached_count = df_large.count() + print(f" āœ“ Cached {cached_count:,} rows") + print() + + print(" Check Spark UI → Storage tab to see cached DataFrame!") + print() + + time.sleep(10) # Pause to check Storage tab + + # Perform aggregations by category + print(" Aggregating by 
category...") + agg_by_category = ( + df_large.groupBy("category") + .agg( + count("id").alias("count"), + sum("value").alias("total_value"), + avg("value").alias("avg_value"), + min("value").alias("min_value"), + max("value").alias("max_value"), + ) + .orderBy("category") + ) + + print() + print(" Category Aggregations:") + agg_by_category.show() + print() + + time.sleep(5) + + # ======================================================================== + # STAGE 3: Shuffle-Heavy Operations (3 minutes) + # ======================================================================== + print("-" * 80) + print("STAGE 3: Shuffle-heavy operations (joins and repartitioning)") + print("Expected duration: ~3 minutes") + print("-" * 80) + print() + + # Create a dimension table for joins + print(" Creating dimension table...") + dim_categories = spark.createDataFrame( + [ + ("0", "Electronics", "Tech"), + ("1", "Books", "Media"), + ("2", "Clothing", "Fashion"), + ("3", "Food", "Grocery"), + ("4", "Toys", "Entertainment"), + ("5", "Sports", "Recreation"), + ("6", "Tools", "Hardware"), + ("7", "Garden", "Outdoor"), + ("8", "Beauty", "Personal Care"), + ("9", "Auto", "Automotive"), + ], + ["category_id", "category_name", "department"], + ) + + print(" āœ“ Dimension table created") + print() + + # Perform join (will cause shuffle) + print(" Performing join operation (watch for shuffle in UI)...") + df_joined = df_large.join( + dim_categories, df_large.category == dim_categories.category_id, "inner" + ) + + # Show joined results + print() + print(" Joined Data Sample:") + df_joined.select("id", "category", "category_name", "department", "value", "region").show(10) + print() + + time.sleep(10) # Pause to observe shuffle + + # Repartition to create more shuffle + print(" Repartitioning data (32 partitions)...") + df_repartitioned = df_joined.repartition(32, "department") + + # Count to trigger repartitioning + repartitioned_count = df_repartitioned.count() + print(f" āœ“ Repartitioned {repartitioned_count:,} rows across 32 partitions") + print() + + time.sleep(5) + + # ======================================================================== + # STAGE 4: Multi-dimensional Analysis (2 minutes) + # ======================================================================== + print("-" * 80) + print("STAGE 4: Multi-dimensional analysis") + print("Expected duration: ~2 minutes") + print("-" * 80) + print() + + # Aggregate by department and region + print(" Aggregating by department and region...") + dept_region_agg = ( + df_repartitioned.groupBy("department", "region") + .agg( + count("id").alias("transactions"), + sum("value").alias("total_sales"), + avg("value").alias("avg_transaction"), + ) + .orderBy("department", "region") + ) + + print() + print(" Department Ɨ Region Sales Analysis:") + dept_region_agg.show(20) + print() + + time.sleep(10) # Pause to view results + + # ======================================================================== + # STAGE 5: Window Functions and Complex Queries (1 minute) + # ======================================================================== + print("-" * 80) + print("STAGE 5: Window functions and complex SQL") + print("Expected duration: ~1 minute") + print("-" * 80) + print() + + from pyspark.sql.functions import dense_rank, row_number + from pyspark.sql.window import Window + + # Create a window for ranking + window_spec = Window.partitionBy("department").orderBy(col("total_sales").desc()) + + # Add rankings + print(" Computing regional rankings within departments...") + 
ranked_regions = dept_region_agg.withColumn("rank", row_number().over(window_spec)).withColumn( + "dense_rank", dense_rank().over(window_spec) + ) + + # Show top regions per department + print() + print(" Top Performing Regions by Department:") + ranked_regions.filter(col("rank") <= 2).orderBy("department", "rank").show() + print() + + time.sleep(5) + + # ======================================================================== + # FINAL STAGE: Summary Statistics (1 minute) + # ======================================================================== + print("-" * 80) + print("FINAL STAGE: Computing summary statistics") + print("-" * 80) + print() + + # Overall statistics + print(" Computing overall statistics...") + overall_stats = df_repartitioned.agg( + count("id").alias("total_transactions"), + sum("value").alias("total_revenue"), + avg("value").alias("avg_transaction"), + min("value").alias("min_transaction"), + max("value").alias("max_transaction"), + ) + + print() + print(" OVERALL STATISTICS:") + print(" " + "=" * 76) + overall_stats.show(truncate=False) + print() + + # Department summary + print(" Department Summary:") + dept_summary = ( + df_repartitioned.groupBy("department") + .agg(count("id").alias("transactions"), sum("value").alias("revenue")) + .orderBy(col("revenue").desc()) + ) + + dept_summary.show() + print() + + # Region summary + print(" Region Summary:") + region_summary = ( + df_repartitioned.groupBy("region") + .agg(count("id").alias("transactions"), sum("value").alias("revenue")) + .orderBy(col("revenue").desc()) + ) + + region_summary.show() + print() + + # Cleanup + print("-" * 80) + print("Cleaning up...") + df_large.unpersist() + print(" āœ“ Unpersisted cached DataFrame") + print() + + # Final summary + print("=" * 80) + print("JOB COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("Total execution time: ~10 minutes") + print() + print("What you should have observed in Spark UI:") + print(" āœ“ Jobs tab - Multiple jobs corresponding to each stage") + print(" āœ“ Stages tab - Detailed stage execution with tasks") + print(" āœ“ Storage tab - Cached DataFrame (Stage 2)") + print(" āœ“ Executors tab - Executor metrics and resource usage") + print(" āœ“ SQL tab - Query plans for DataFrame operations") + print() + print("Spark UI features to explore:") + print(" • Click on job names to see stages") + print(" • Click on stages to see task details") + print(" • Check 'Event Timeline' for task scheduling") + print(" • View 'DAG Visualization' for execution plan") + print(" • Monitor executor GC time and memory usage") + print() + + spark.stop() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/setup_minio.sh b/examples/spark/setup_minio.sh new file mode 100755 index 000000000..face8edaf --- /dev/null +++ b/examples/spark/setup_minio.sh @@ -0,0 +1,437 @@ +#!/bin/bash +set -e + +echo "================================================================================" +echo "Setting up MinIO (S3-compatible storage) for Spark Examples" +echo "================================================================================" +echo "" + +# Configuration +NAMESPACE="default" +MINIO_ROOT_USER="minioadmin" +MINIO_ROOT_PASSWORD="minioadmin" +MINIO_SERVICE="minio-service" +MINIO_ENDPOINT="minio-service.default.svc.cluster.local:9000" + +echo "Step 1: Deploying MinIO to Kubernetes..." 
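+# The manifest applied below deploys the MinIO server and exposes it through the
+# ${MINIO_SERVICE} Service on port 9000, using the ${MINIO_ROOT_USER}/${MINIO_ROOT_PASSWORD}
+# credentials configured above (minio_config.py assumes these same defaults).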
+echo "--------------------------------------------------------------------------------" + +# Create MinIO deployment +cat < /tmp/batch_job.py <<'BATCH_SCRIPT' +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, sum as _sum, count, current_timestamp +from pyspark.sql.types import * +from datetime import datetime + +spark = SparkSession.builder.appName("Batch Job").getOrCreate() + +print("\n" + "="*80) +print("SCHEDULED BATCH JOB - DAILY PROCESSING") +print("="*80) + +# Configuration +BATCH_DATE = datetime.now().strftime("%Y-%m-%d") +JOB_ID = f"batch_{BATCH_DATE.replace('-', '')}" + +print(f"\n[CONFIG] Batch Configuration:") +print(f" • Batch Date: {BATCH_DATE}") +print(f" • Job ID: {JOB_ID}") + +# Create sample transaction data +schema = StructType([ + StructField("transaction_id", IntegerType(), False), + StructField("date", StringType(), False), + StructField("customer_id", IntegerType(), False), + StructField("amount", DoubleType(), False), +]) + +transactions_data = [ + (1, BATCH_DATE, 101, 150.00), + (2, BATCH_DATE, 102, 250.00), + (3, BATCH_DATE, 103, 75.00), + (4, BATCH_DATE, 101, 300.00), + (5, BATCH_DATE, 104, 500.00), +] + +df = spark.createDataFrame(transactions_data, schema) + +print(f"\n[EXTRACT] Loaded {df.count()} transactions") +print("\nSample transactions:") +df.show() + +# Transform: Add metadata +df_enriched = df.withColumn("processing_timestamp", current_timestamp()) \ + .withColumn("job_id", col("transaction_id").cast("string")) + +print("\n[TRANSFORM] Added metadata columns") + +# Aggregate by customer +summary = df_enriched.groupBy("customer_id").agg( + count("transaction_id").alias("transaction_count"), + _sum("amount").alias("total_amount") +).orderBy(col("total_amount").desc()) + +print("\n[LOAD] Customer Summary:") +summary.show() + +print(f"\n[COMPLETE] Batch job {JOB_ID} completed successfully!") +print("="*80) +spark.stop() +BATCH_SCRIPT + +# Create exploration script +cat > /tmp/exploration.py <<'EXPLORATION_SCRIPT' +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, count, sum as _sum, avg, min as _min, max as _max +from pyspark.sql.types import * + +spark = SparkSession.builder.appName("DataFrame Exploration").getOrCreate() + +print("\n" + "="*80) +print("INTERACTIVE DATAFRAME EXPLORATION") +print("="*80) + +# Create sample customer dataset +schema = StructType([ + StructField("customer_id", IntegerType(), False), + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("city", StringType(), True), + StructField("purchases", IntegerType(), True), + StructField("total_spent", DoubleType(), True), +]) + +customers_data = [ + (1, "Alice", 28, "New York", 15, 1250.50), + (2, "Bob", 35, "Los Angeles", 8, 890.25), + (3, "Carol", None, "Chicago", 22, 2100.00), # Missing age + (4, "David", 42, "Houston", 5, 450.75), +] + +df = spark.createDataFrame(customers_data, schema) + +print("\nDataset Summary:") +print(f"Total Records: {df.count()}") + +print("\nSchema:") +df.printSchema() + +print("\nSample Data:") +df.show() + +print("\nDescriptive Statistics:") +df.describe().show() + +print("\nNull Check:") +df.select([count(col(c).isNull()).alias(c) for c in df.columns]).show() + +print("\n" + "="*80) +spark.stop() +EXPLORATION_SCRIPT + +# Create CSV analysis script +cat > /tmp/csv_analysis.py <<'CSV_SCRIPT' +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, sum as _sum, avg, count + +spark = SparkSession.builder.appName("CSV 
Analysis").getOrCreate() + +print("\n" + "="*80) +print("CSV DATA ANALYSIS") +print("="*80) + +# Read sample CSV from MinIO +# For this example, we'll create data in-memory +from pyspark.sql.types import * + +schema = StructType([ + StructField("product", StringType()), + StructField("category", StringType()), + StructField("quantity", IntegerType()), + StructField("price", DoubleType()), +]) + +data = [ + ("Laptop", "Electronics", 2, 1200.00), + ("Mouse", "Electronics", 5, 25.00), + ("Keyboard", "Electronics", 3, 75.00), + ("Desk", "Furniture", 1, 500.00), + ("Chair", "Furniture", 2, 250.00), +] + +df = spark.createDataFrame(data, schema) + +print("\nSample Data:") +df.show() + +print("\nSales by Category:") +df.groupBy("category").agg( + count("product").alias("products"), + _sum("quantity").alias("total_quantity"), + _sum(col("quantity") * col("price")).alias("revenue") +).show() + +print("\n" + "="*80) +spark.stop() +CSV_SCRIPT + +# Create ETL script +cat > /tmp/etl_pipeline.py <<'ETL_SCRIPT' +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, upper, trim, current_timestamp + +spark = SparkSession.builder.appName("ETL Pipeline").getOrCreate() + +print("\n" + "="*80) +print("ETL PIPELINE") +print("="*80) + +# Extract +print("\n[EXTRACT] Loading data...") +from pyspark.sql.types import * +schema = StructType([ + StructField("id", IntegerType()), + StructField("name", StringType()), + StructField("amount", DoubleType()), +]) +data = [(1, " alice ", 100.0), (2, "BOB", 200.0), (3, "carol ", 150.0)] +df = spark.createDataFrame(data, schema) +print(f"Extracted {df.count()} records") + +# Transform +print("\n[TRANSFORM] Cleaning data...") +df_clean = df.withColumn("name", upper(trim(col("name")))) \ + .withColumn("processed_at", current_timestamp()) +print("Transformations applied: trim, uppercase, timestamp") + +# Load +print("\n[LOAD] Results:") +df_clean.show() + +print("\n" + "="*80) +spark.stop() +ETL_SCRIPT + +echo " āœ“ Created PySpark scripts" +echo "" + +echo "Step 7: Uploading scripts to MinIO..." +echo "--------------------------------------------------------------------------------" + +# Upload scripts directly to MinIO using stdin (avoids need for 'tar' in container) +echo " Uploading batch_job.py..." +kubectl exec -n ${NAMESPACE} minio-client -- sh -c 'cat > /tmp/batch_job.py' < /tmp/batch_job.py +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/batch_job.py myminio/spark-scripts/ + +echo " Uploading exploration.py..." +kubectl exec -n ${NAMESPACE} minio-client -- sh -c 'cat > /tmp/exploration.py' < /tmp/exploration.py +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/exploration.py myminio/spark-scripts/ + +echo " Uploading csv_analysis.py..." +kubectl exec -n ${NAMESPACE} minio-client -- sh -c 'cat > /tmp/csv_analysis.py' < /tmp/csv_analysis.py +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/csv_analysis.py myminio/spark-scripts/ + +echo " Uploading etl_pipeline.py..." +kubectl exec -n ${NAMESPACE} minio-client -- sh -c 'cat > /tmp/etl_pipeline.py' < /tmp/etl_pipeline.py +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/etl_pipeline.py myminio/spark-scripts/ + +echo " Uploading long_running_job.py..." 
+kubectl exec -n ${NAMESPACE} minio-client -- sh -c 'cat > /tmp/long_running_job.py' < scripts/long_running_job.py +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/long_running_job.py myminio/spark-scripts/ + +echo " āœ“ Uploaded scripts to s3://spark-scripts/" +echo "" + +# Verify uploads +echo "Verifying uploaded scripts:" +kubectl exec -n ${NAMESPACE} minio-client -- mc ls myminio/spark-scripts/ +echo "" + +echo "Step 8: Creating Spark S3 access secret..." +echo "--------------------------------------------------------------------------------" + +cat <${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}WARNING:${NC} $1" +} + +print_error() { + echo -e "${RED}ERROR:${NC} $1" +} + +# Setup Kubernetes cluster with Spark Operator +setup_kubernetes_cluster() { + print_step "Setting up Kubernetes cluster..." + + if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then + print_warning "Cluster '${CLUSTER_NAME}' already exists" + + # Export kubeconfig to ensure context is set + kind export kubeconfig --name ${CLUSTER_NAME} 2>/dev/null || true + + # Verify context is set + if ! kubectl config get-contexts | grep -q "kind-${CLUSTER_NAME}"; then + print_error "Failed to set kubectl context" + print_step "Please run: kind export kubeconfig --name ${CLUSTER_NAME}" + exit 1 + fi + else + print_step "Running setup_test_environment.sh..." + bash "${SCRIPT_DIR}/setup_test_environment.sh" + fi + + print_step "Kubernetes cluster ready" +} + +# Deploy Spark Connect server +deploy_spark_connect() { + print_step "Deploying Spark Connect server..." + + kubectl apply -f "${SCRIPT_DIR}/spark-connect-server.yaml" + + print_step "Waiting for Spark Connect server to be ready..." + kubectl wait --for=condition=ready pod \ + -l app=spark-connect \ + -n default \ + --timeout=300s + + print_step "Spark Connect server deployed successfully" +} + +# Setup port forwarding +setup_port_forwarding() { + print_step "Setting up port forwarding..." + + # Kill any existing port forwarding on port 30000 + pkill -f "kubectl.*port-forward.*30000" || true + sleep 2 + + # Get the Spark Connect pod name + POD_NAME=$(kubectl get pods -l app=spark-connect -n default -o jsonpath='{.items[0].metadata.name}') + + print_step "Port forwarding from localhost:30000 to Spark Connect pod..." + + # Start port forwarding in background + kubectl port-forward -n default "pod/${POD_NAME}" 30000:15002 > /dev/null 2>&1 & + PF_PID=$! + + # Wait for port forwarding to be established + sleep 3 + + # Verify port forwarding + if lsof -i :30000 > /dev/null 2>&1; then + print_step "Port forwarding established (PID: ${PF_PID})" + echo "${PF_PID}" > /tmp/spark-connect-port-forward.pid + print_warning "Port forwarding running in background" + print_warning "To stop: kill \$(cat /tmp/spark-connect-port-forward.pid)" + else + print_error "Failed to establish port forwarding" + return 1 + fi +} + +# Verify installation +verify_installation() { + print_step "Verifying installation..." + + echo "" + echo "Cluster: ${CLUSTER_NAME}" + echo "Spark Connect Pod:" + kubectl get pods -l app=spark-connect -n default + + echo "" + echo "Spark Connect Service:" + kubectl get svc spark-connect -n default + + echo "" + print_step "Installation complete!" +} + +# Print usage instructions +print_usage() { + echo "" + echo "=" * 80 + echo " Spark Connect Setup Complete" + echo "=" * 80 + echo "" + echo "Spark Connect server is now running and accessible at:" + echo " URL: sc://localhost:30000" + echo "" + echo "Next steps:" + echo "" + echo "1. 
Install PySpark with Connect support (if not already installed):" + echo " pip install 'pyspark[connect]>=3.4.0'" + echo "" + echo "2. Run the interactive demo:" + echo " python examples/spark/ipython_spark_connect_demo.py" + echo "" + echo "3. Or run manual tests:" + echo " python examples/spark/ipython_spark_connect_demo.py --manual" + echo "" + echo "4. Or launch IPython for step-by-step experimentation:" + echo " cd examples/spark" + echo " python -c 'from ipython_spark_connect_demo import *; import IPython; IPython.embed()'" + echo "" + echo "5. View Spark UI:" + echo " kubectl port-forward -n default svc/spark-connect 4040:4040" + echo " Open: http://localhost:4040" + echo "" + echo "Cluster management:" + echo " - View logs: kubectl logs -l app=spark-connect -n default -f" + echo " - Restart: kubectl rollout restart deployment/spark-connect -n default" + echo " - Delete: kubectl delete -f examples/spark/spark-connect-server.yaml" + echo " - Cleanup cluster: kind delete cluster --name ${CLUSTER_NAME}" + echo "" + echo "Port forwarding PID: $(cat /tmp/spark-connect-port-forward.pid 2>/dev/null || echo 'N/A')" + echo "Stop port forwarding: kill \$(cat /tmp/spark-connect-port-forward.pid)" +} + +# Main +main() { + echo "========================================" + echo " Spark Connect Server Setup" + echo "========================================" + echo "" + + setup_kubernetes_cluster + deploy_spark_connect + setup_port_forwarding + verify_installation + print_usage +} + +# Cleanup function +cleanup() { + print_step "Cleaning up Spark Connect resources..." + + # Stop port forwarding + if [ -f /tmp/spark-connect-port-forward.pid ]; then + PF_PID=$(cat /tmp/spark-connect-port-forward.pid) + if ps -p ${PF_PID} > /dev/null 2>&1; then + kill ${PF_PID} 2>/dev/null || true + print_step "Port forwarding stopped" + fi + rm -f /tmp/spark-connect-port-forward.pid + fi + + # Delete Spark Connect deployment + kubectl delete -f "${SCRIPT_DIR}/spark-connect-server.yaml" --ignore-not-found=true + + print_step "Cleanup complete" +} + +# Handle script arguments +case "${1:-setup}" in + setup) + main + ;; + cleanup) + cleanup + ;; + restart) + cleanup + sleep 2 + main + ;; + *) + echo "Usage: $0 {setup|cleanup|restart}" + echo "" + echo "Commands:" + echo " setup - Setup cluster and deploy Spark Connect (default)" + echo " cleanup - Remove Spark Connect and stop port forwarding" + echo " restart - Cleanup and setup again" + exit 1 + ;; +esac diff --git a/examples/spark/setup_test_environment.sh b/examples/spark/setup_test_environment.sh new file mode 100755 index 000000000..34e67d0c8 --- /dev/null +++ b/examples/spark/setup_test_environment.sh @@ -0,0 +1,305 @@ +#!/usr/bin/env bash +# +# Setup script for running Spark Operator in Kind cluster for testing +# +# This script: +# 1. Creates a Kind cluster +# 2. Installs Spark Operator +# 3. Sets up service accounts and RBAC +# 4. 
Verifies the installation + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Configuration +CLUSTER_NAME="${CLUSTER_NAME:-spark-test}" +OPERATOR_NAMESPACE="${OPERATOR_NAMESPACE:-spark-operator}" +SPARK_NAMESPACE="${SPARK_NAMESPACE:-default}" +SPARK_OPERATOR_VERSION="${SPARK_OPERATOR_VERSION:-v2.0.2-rc.0}" +SPARK_OPERATOR_CHART_VERSION="${SPARK_OPERATOR_CHART_VERSION:-2.0.2-rc.0}" + +print_step() { + echo -e "${GREEN}==>${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}WARNING:${NC} $1" +} + +print_error() { + echo -e "${RED}ERROR:${NC} $1" +} + +# Check prerequisites +check_prerequisites() { + print_step "Checking prerequisites..." + + if ! command -v kind &> /dev/null; then + print_error "kind not found. Please install kind: https://kind.sigs.k8s.io/docs/user/quick-start/" + exit 1 + fi + + if ! command -v kubectl &> /dev/null; then + print_error "kubectl not found. Please install kubectl" + exit 1 + fi + + if ! command -v helm &> /dev/null; then + print_warning "helm not found. Will use kubectl apply instead" + fi + + print_step "Prerequisites OK" +} + +# Create Kind cluster +create_cluster() { + print_step "Creating Kind cluster '${CLUSTER_NAME}'..." + + if kind get clusters | grep -q "^${CLUSTER_NAME}$"; then + print_warning "Cluster '${CLUSTER_NAME}' already exists. Skipping creation." + return + fi + + cat </dev/null; then + print_step "Installing from release manifest..." + kubectl apply -f /tmp/spark-operator.yaml -n ${OPERATOR_NAMESPACE} + rm -f /tmp/spark-operator.yaml + else + print_warning "Release manifest not found, using main branch..." + + # Fallback: Install CRDs and operator from main branch + kubectl apply -f https://raw.githubusercontent.com/kubeflow/spark-operator/master/config/crd/bases/sparkoperator.k8s.io_sparkapplications.yaml + kubectl apply -f https://raw.githubusercontent.com/kubeflow/spark-operator/master/config/crd/bases/sparkoperator.k8s.io_scheduledsparkapplications.yaml + + # Install operator deployment + cat < /dev/null; then + install_spark_operator_helm + else + install_spark_operator_kubectl + fi + + setup_rbac + verify_installation + + print_step "Setup complete! šŸŽ‰" +} + +# Run main function +main diff --git a/examples/spark/spark-connect-server.yaml b/examples/spark/spark-connect-server.yaml new file mode 100644 index 000000000..d2771dc9f --- /dev/null +++ b/examples/spark/spark-connect-server.yaml @@ -0,0 +1,165 @@ +# Spark Connect Server Deployment for Kubernetes +# +# This manifest deploys a Spark Connect server that can be used to test +# the Kubeflow Spark Connect backend integration. 
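+# The Service below publishes the gRPC endpoint (container port 15002) as NodePort
+# 30000; with `kubectl port-forward svc/spark-connect 30000:15002` (or the NodePort
+# directly) clients reach it at sc://localhost:30000, the URL used by the examples.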
+# +# Deploy: kubectl apply -f spark-connect-server.yaml +# Delete: kubectl delete -f spark-connect-server.yaml + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: spark-connect + namespace: default +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: spark-connect-role + namespace: default +rules: +- apiGroups: [""] + resources: ["pods", "services", "configmaps"] + verbs: ["create", "get", "list", "watch", "update", "patch", "delete"] +- apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: spark-connect-rolebinding + namespace: default +subjects: +- kind: ServiceAccount + name: spark-connect + namespace: default +roleRef: + kind: Role + name: spark-connect-role + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: v1 +kind: Service +metadata: + name: spark-connect + namespace: default + labels: + app: spark-connect +spec: + type: NodePort + selector: + app: spark-connect + ports: + - name: connect + port: 15002 + targetPort: 15002 + nodePort: 30000 + protocol: TCP + - name: ui + port: 4040 + targetPort: 4040 + protocol: TCP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: spark-connect + namespace: default + labels: + app: spark-connect +spec: + replicas: 1 + selector: + matchLabels: + app: spark-connect + template: + metadata: + labels: + app: spark-connect + spec: + serviceAccountName: spark-connect + initContainers: + - name: download-packages + image: apache/spark:4.0.0 + securityContext: + runAsUser: 185 # spark user + command: + - /bin/sh + - -c + - | + mkdir -p /ivy-cache + /opt/spark/bin/spark-submit \ + --packages org.apache.spark:spark-connect_2.13:4.0.0 \ + --conf spark.jars.ivy=/ivy-cache \ + --class org.apache.spark.sql.connect.service.SparkConnectServer \ + --help || true + volumeMounts: + - name: ivy-cache + mountPath: /ivy-cache + containers: + - name: spark-connect + image: apache/spark:4.0.0 + imagePullPolicy: IfNotPresent + securityContext: + runAsUser: 185 # spark user + command: + - /opt/spark/sbin/start-connect-server.sh + args: + - --packages + - org.apache.spark:spark-connect_2.13:4.0.0 + - --conf + - spark.jars.ivy=/ivy-cache + - --conf + - spark.driver.host=0.0.0.0 + - --conf + - spark.driver.bindAddress=0.0.0.0 + - --conf + - spark.connect.grpc.binding.address=0.0.0.0 + - --conf + - spark.kubernetes.namespace=default + - --conf + - spark.kubernetes.authenticate.driver.serviceAccountName=spark-connect + env: + - name: SPARK_NO_DAEMONIZE + value: "true" + - name: SPARK_LOCAL_HOSTNAME + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: JAVA_TOOL_OPTIONS + value: "-Djava.net.preferIPv4Stack=true" + volumeMounts: + - name: ivy-cache + mountPath: /ivy-cache + ports: + - name: connect + containerPort: 15002 + protocol: TCP + - name: ui + containerPort: 4040 + protocol: TCP + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1000m" + livenessProbe: + tcpSocket: + port: 15002 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + tcpSocket: + port: 15002 + initialDelaySeconds: 20 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + volumes: + - name: ivy-cache + emptyDir: {} diff --git a/examples/spark/test_direct_pyspark.py b/examples/spark/test_direct_pyspark.py new file mode 100644 index 000000000..54f74318e --- /dev/null +++ b/examples/spark/test_direct_pyspark.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +""" +Minimal 
PySpark Connect test - bypasses Kubeflow SDK completely +""" + +import sys + +print("Testing direct PySpark Connect (no Kubeflow SDK)...") +print("=" * 80) + +try: + import signal + + from pyspark.sql import SparkSession + + def timeout_handler(signum, frame): + raise TimeoutError("Connection timed out") + + # Set a 15 second timeout + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(15) + + print("Creating SparkSession.builder.remote('sc://localhost:30000')...") + print("Timeout set to 15 seconds...") + + spark = ( + SparkSession.builder.remote("sc://localhost:30000") + .appName("direct-test") + .config("spark.connect.grpc.binding.port", "30000") + .getOrCreate() + ) + + signal.alarm(0) # Cancel timeout + + print("āœ“ Session created!") + print(f"Session: {spark}") + + # Try a query + print("\nTesting query...") + df = spark.sql("SELECT 1 AS id, 'Hello' AS msg") + result = df.collect() + print(f"āœ“ Result: {result}") + + spark.stop() + print("āœ“ Test passed!") + +except TimeoutError: + print("\nāœ— Connection timed out after 15 seconds") + print("\nThis means PySpark is not able to connect to the server.") + print("The problem is NOT with Kubeflow SDK - it's with the basic connection.") + sys.exit(1) + +except Exception as e: + print(f"\nāœ— Error: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/examples/spark/test_grpc_connection.py b/examples/spark/test_grpc_connection.py new file mode 100644 index 000000000..cb4691c81 --- /dev/null +++ b/examples/spark/test_grpc_connection.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +""" +Minimal test to check if we can connect to Spark Connect via gRPC directly. +""" + +import sys + +print("Testing basic gRPC connection to Spark Connect...") +print("=" * 80) + +# Test 1: Check if grpc is available +print("\n[1] Checking grpcio installation...") +try: + import grpc + + print(f"āœ“ grpcio version: {grpc.__version__}") +except ImportError as e: + print(f"āœ— grpcio not installed: {e}") + print("Install with: pip install grpcio") + sys.exit(1) + +# Test 2: Try to connect to the port +print("\n[2] Testing TCP connection to localhost:30000...") +import socket + +try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(5) + result = sock.connect_ex(("localhost", 30000)) + sock.close() + + if result == 0: + print("āœ“ TCP connection successful") + else: + print(f"āœ— TCP connection failed with error code: {result}") + sys.exit(1) +except Exception as e: + print(f"āœ— TCP connection error: {e}") + sys.exit(1) + +# Test 3: Try gRPC channel +print("\n[3] Creating gRPC channel to localhost:30000...") +try: + channel = grpc.insecure_channel("localhost:30000") + print("āœ“ gRPC channel created") + + # Test if channel is ready + print(" Testing if channel becomes ready (5 second timeout)...") + try: + future = grpc.channel_ready_future(channel) + future.result(timeout=5) + print("āœ“ gRPC channel is ready!") + except grpc.FutureTimeoutError: + print("āœ— gRPC channel timeout - server not responding on gRPC") + print(" This suggests the server might not be accepting gRPC connections") + except Exception as e: + print(f"āœ— gRPC channel error: {e}") + finally: + channel.close() + +except Exception as e: + print(f"āœ— gRPC error: {e}") + import traceback + + traceback.print_exc() + +# Test 4: Try with PySpark directly +print("\n[4] Testing PySpark Spark Connect...") +try: + from pyspark.sql import SparkSession + + print("āœ“ PySpark imported") + + print(" Creating SparkSession with remote 
connection...") + print(" This might take 10-30 seconds or hang if there's an issue...") + + import signal + + def timeout_handler(signum, frame): + raise TimeoutError("Session creation timed out after 20 seconds") + + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(20) + + try: + spark = ( + SparkSession.builder.remote("sc://localhost:30000").appName("grpc-test").getOrCreate() + ) + + signal.alarm(0) + print("āœ“ SparkSession created!") + + # Try a simple operation + print(" Testing simple query...") + df = spark.sql("SELECT 1 AS id") + result = df.collect() + print(f"āœ“ Query executed successfully: {result}") + + spark.stop() + print("āœ“ Session stopped") + + print("\n" + "=" * 80) + print("SUCCESS! Everything is working.") + print("=" * 80) + + except TimeoutError as e: + signal.alarm(0) + print(f"āœ— {e}") + print("\nThis means PySpark is hanging while trying to connect.") + print("Possible causes:") + print(" 1. Spark Connect server not responding to gRPC requests") + print(" 2. Server bound to wrong address (IPv4 vs IPv6)") + print(" 3. Firewall or network policy blocking connection") + +except KeyboardInterrupt: + print("\nāœ— Interrupted by user") +except Exception as e: + print(f"āœ— PySpark connection failed: {e}") + import traceback + + traceback.print_exc() + +print("\n" + "=" * 80) +print("Debug test complete") +print("=" * 80) diff --git a/examples/spark/test_sdk_debug.py b/examples/spark/test_sdk_debug.py new file mode 100644 index 000000000..311d1e8bb --- /dev/null +++ b/examples/spark/test_sdk_debug.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +""" +Debug version of SDK client with verbose logging +""" + +import logging +import os +import sys + +# Setup very verbose logging +logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) + +# Add SDK to path +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path.insert(0, sdk_path) + +print("=" * 80) +print("Testing Kubeflow SDK with DEBUG logging") +print("=" * 80) + +# Import with logging +print("\n[1] Importing Kubeflow SDK...") +from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + +print("\n[2] Creating config...") +config = ConnectBackendConfig(connect_url="sc://localhost:30000", use_ssl=False, timeout=60) +print(f" Config: {config.connect_url}") + +print("\n[3] Creating client...") +client = SparkSessionClient(backend_config=config) +print(f" Client created: {client}") +print(f" Backend: {client.backend}") + +print("\n[4] Creating session (this is where it might hang)...") +print(" About to call client.create_session()...") + +import signal + + +def timeout_handler(signum, frame): + print("\nāœ— Session creation timed out after 20 seconds") + print("\nThe hang is in the SDK's create_session method.") + print("Check: kubeflow/spark/backends/connect.py line ~241") + sys.exit(1) + + +signal.signal(signal.SIGALRM, timeout_handler) +signal.alarm(20) + +try: + session = client.create_session(app_name="debug-test") + signal.alarm(0) + + print("\nāœ“ Session created!") + print(f" Session ID: {session.session_id}") + print(f" App name: {session.app_name}") + + print("\n[5] Testing query...") + df = session.sql("SELECT 1 AS id") + result = df.collect() + print(f"āœ“ Query result: {result}") + + session.close() + print("āœ“ Test passed!") + +except Exception as e: + signal.alarm(0) + print(f"\nāœ— Error: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/examples/spark/test_simple_spark.py 
b/examples/spark/test_simple_spark.py new file mode 100644 index 000000000..7f5d824c0 --- /dev/null +++ b/examples/spark/test_simple_spark.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Simple test WITHOUT History Server to verify basic Spark works +""" + +from datetime import datetime +import os +import sys + +from kubernetes import client, config + +# Add SDK to path +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +print("=" * 80) +print("SIMPLE SPARK TEST (No History Server)") +print("=" * 80) +print() + +# Load kubeconfig +try: + config.load_kube_config(context="kind-spark-test") +except: + config.load_incluster_config() + +# Create API client +api_client = client.CustomObjectsApi() + +# Create minimal SparkApplication +app_name = f"simple-test-{datetime.now().strftime('%H%M%S')}" + +spark_app = { + "apiVersion": "sparkoperator.k8s.io/v1beta2", + "kind": "SparkApplication", + "metadata": { + "name": app_name, + "namespace": "default", + }, + "spec": { + "type": "Python", + "mode": "cluster", + "image": "docker.io/library/spark:4.0.0", + "imagePullPolicy": "IfNotPresent", + "mainApplicationFile": "local:///opt/spark/examples/src/main/python/pi.py", + "arguments": ["10"], + "sparkVersion": "4.0.0", + "restartPolicy": {"type": "Never"}, + "timeToLiveSeconds": 1800, + "driver": { + "cores": 1, + "memory": "512m", + "serviceAccount": "spark-operator-spark", + "labels": { + "version": "4.0.0", + }, + }, + "executor": { + "cores": 1, + "instances": 2, + "memory": "512m", + "labels": { + "version": "4.0.0", + }, + }, + "sparkConf": { + "spark.kubernetes.file.upload.path": "/tmp", + }, + }, +} + +print(f"Submitting simple Spark application: {app_name}") +print("This test has NO volume mounts - just basic Pi calculation") +print() + +try: + # Create the SparkApplication + response = api_client.create_namespaced_custom_object( + group="sparkoperator.k8s.io", + version="v1beta2", + namespace="default", + plural="sparkapplications", + body=spark_app, + ) + + print("Application submitted successfully!") + print(f" Name: {app_name}") + print() + print("Monitor:") + print(f" kubectl get sparkapplication {app_name} -w") + print() + print("View logs:") + print(f" kubectl logs {app_name}-driver -f") + print() + print("Describe:") + print(f" kubectl describe sparkapplication {app_name}") + print() + + # Wait and show status + import time + + print("Waiting for completion...") + for i in range(60): + time.sleep(2) + try: + app_status = api_client.get_namespaced_custom_object( + group="sparkoperator.k8s.io", + version="v1beta2", + namespace="default", + plural="sparkapplications", + name=app_name, + ) + + state = app_status.get("status", {}).get("applicationState", {}).get("state", "UNKNOWN") + print(f" Status: {state}", end="\r") + + if state in ["COMPLETED", "FAILED"]: + print() + print(f"\nApplication {state}") + + if state == "COMPLETED": + print("\nSUCCESS! 
Basic Spark works without volumes.")
+                    print("\nNow we can test with History Server volumes.")
+                else:
+                    error_msg = (
+                        app_status.get("status", {})
+                        .get("applicationState", {})
+                        .get("errorMessage", "Unknown error")
+                    )
+                    print(f"\nERROR: FAILED: {error_msg}")
+                    print("\nCheck logs:")
+                    print(f" kubectl logs {app_name}-driver")
+                break
+
+        except Exception:
+            continue
+
+except Exception as e:
+    print(f"ERROR: Failed to submit application: {e}")
+    import traceback
+
+    traceback.print_exc()
+    sys.exit(1)
diff --git a/examples/spark/test_spark_client_integration.py b/examples/spark/test_spark_client_integration.py
new file mode 100644
index 000000000..19332ed24
--- /dev/null
+++ b/examples/spark/test_spark_client_integration.py
@@ -0,0 +1,321 @@
+"""
+Integration tests for Kubeflow Spark Client with Operator Backend.
+
+These tests require:
+1. A Kubernetes cluster with Spark Operator installed
+2. kubectl configured with proper context
+3. Service account 'spark-operator-spark' with proper permissions
+
+Setup:
+    Run ./setup_test_environment.sh to create a Kind cluster with Spark Operator
+
+Usage:
+    python test_spark_client_integration.py
+"""
+
+import os
+import sys
+
+# Add SDK to path for development mode
+sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+if sdk_path not in sys.path:
+    sys.path.insert(0, sdk_path)
+
+import time
+import unittest
+
+from kubeflow.spark import ApplicationState, BatchSparkClient, OperatorBackendConfig  # noqa: E402
+
+
+class TestSparkClientIntegration(unittest.TestCase):
+    """Integration tests for BatchSparkClient with the Operator backend."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up test client."""
+        config = OperatorBackendConfig(
+            namespace=os.getenv("SPARK_NAMESPACE", "default"),
+            service_account="spark-operator-spark",
+            default_spark_image="docker.io/library/spark",
+            # Explicitly set context
+            context=os.getenv("KUBE_CONTEXT", "kind-spark-test"),
+            # Disable to avoid JMX agent issue with Spark 4.0
+            enable_monitoring=False,
+            # Disable UI for simpler testing
+            enable_ui=False,
+        )
+        cls.client = BatchSparkClient(backend_config=config)
+        cls.submitted_apps = []
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up submitted applications."""
+        print("\nCleaning up test applications...")
+        for app_name in cls.submitted_apps:
+            try:
+                cls.client.delete_job(app_name)
+                print(f" Deleted {app_name}")
+            except Exception as e:
+                print(f" āœ— Failed to delete {app_name}: {e}")
+
+    def test_01_submit_spark_pi(self):
+        """Test submitting a simple Spark Pi application."""
+        print("\n" + "=" * 80)
+        print("TEST: Submit Spark Pi Application")
+        print("=" * 80)
+
+        app_name = "test-spark-pi"
+
+        response = self.client.submit_application(
+            app_name=app_name,
+            main_application_file=(
+                "local:///opt/spark/examples/jars/spark-examples_2.13-4.0.0.jar"
+            ),
+            main_class="org.apache.spark.examples.SparkPi",
+            spark_version="4.0.0",
+            app_type="Scala",
+            driver_cores=1,
+            driver_memory="512m",
+            executor_cores=1,
+            executor_memory="512m",
+            num_executors=1,
+            arguments=["100"],
+            spark_conf={
+                # Required for Spark 4.0
+                "spark.kubernetes.file.upload.path": "/tmp",
+            },
+        )
+
+        self.submitted_apps.append(app_name)
+
+        self.assertEqual(response.submission_id, app_name)
+        self.assertEqual(response.status, "SUBMITTED")
+
+        print(f"Application submitted: {app_name}")
+        print(f" Status: {response.status}")
+
+    def test_02_get_status(self):
+        """Test getting application status."""
+        print("\n" + "=" * 80)
+        print("TEST: Get Application 
Status") + print("=" * 80) + + if not self.submitted_apps: + self.skipTest("No applications to check status") + + app_name = self.submitted_apps[0] + status = self.client.get_job(app_name) + + self.assertIsNotNone(status) + self.assertEqual(status.submission_id, app_name) + self.assertIn(status.state, list(ApplicationState)) + + print(f"Got status for {app_name}") + print(f" State: {status.state.value}") + print(f" App ID: {status.app_id}") + + def test_03_list_applications(self): + """Test listing applications.""" + print("\n" + "=" * 80) + print("TEST: List Applications") + print("=" * 80) + + apps = self.client.list_jobs() + + self.assertIsInstance(apps, list) + print(f"Listed {len(apps)} applications") + + for app in apps[:5]: # Show first 5 + print(f" - {app.app_name}: {app.state.value}") + + def test_04_get_logs(self): + """Test getting application logs.""" + print("\n" + "=" * 80) + print("TEST: Get Application Logs") + print("=" * 80) + + if not self.submitted_apps: + self.skipTest("No applications to get logs from") + + app_name = self.submitted_apps[0] + + # Wait for driver pod to be ready before fetching logs + print("Waiting for driver pod to be ready...") + is_ready = self.client.wait_for_pod_ready(app_name, timeout=120) + + if not is_ready: + print("WARNING: Driver pod not ready within timeout, logs may be empty") + + logs = list(self.client.get_job_logs(app_name)) + + # Logs might be empty if pod not started yet + print(f"Retrieved {len(logs)} log lines from {app_name}") + if logs: + print("\n First 5 lines:") + for line in logs[:5]: + print(f" {line}") + else: + print(" (No logs available yet - pod may still be starting)") + + def test_05_wait_for_completion(self): + """Test waiting for application completion.""" + print("\n" + "=" * 80) + print("TEST: Wait for Completion") + print("=" * 80) + + app_name = "test-spark-pi-completion" + + response = self.client.submit_application( + app_name=app_name, + main_application_file=( + "local:///opt/spark/examples/jars/spark-examples_2.13-4.0.0.jar" + ), + main_class="org.apache.spark.examples.SparkPi", + spark_version="4.0.0", + app_type="Scala", + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=1, + arguments=["10"], # Small workload + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + self.submitted_apps.append(app_name) + + print(f"Submitted {app_name}") + print(" Waiting for completion (timeout: 300s)...") + + final_status = self.client.wait_for_job_status(app_name, timeout=300, polling_interval=5) + + print("Application completed") + print(f" Final state: {final_status.state.value}") + + self.assertIn( + final_status.state, + [ApplicationState.COMPLETED, ApplicationState.FAILED], + ) + + def test_06_delete_application(self): + """Test deleting an application.""" + print("\n" + "=" * 80) + print("TEST: Delete Application") + print("=" * 80) + + # Submit a temporary application + app_name = "test-spark-delete" + + response = self.client.submit_application( + app_name=app_name, + main_application_file=( + "local:///opt/spark/examples/jars/spark-examples_2.13-4.0.0.jar" + ), + main_class="org.apache.spark.examples.SparkPi", + spark_version="4.0.0", + app_type="Scala", + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=1, + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(f"Submitted {app_name}") + + # Delete immediately + result = self.client.delete_job(app_name) + + 
self.assertIsInstance(result, dict)
+        print(f"Deleted {app_name}")
+        print(f" Result: {result}")
+
+    def test_07_dynamic_allocation(self):
+        """Test application with dynamic allocation."""
+        print("\n" + "=" * 80)
+        print("TEST: Dynamic Allocation")
+        print("=" * 80)
+
+        app_name = "test-dynamic-allocation"
+
+        response = self.client.submit_application(
+            app_name=app_name,
+            main_application_file=(
+                "local:///opt/spark/examples/jars/spark-examples_2.13-4.0.0.jar"
+            ),
+            main_class="org.apache.spark.examples.SparkPi",
+            spark_version="4.0.0",
+            app_type="Scala",
+            driver_cores=1,
+            driver_memory="512m",
+            executor_cores=1,
+            executor_memory="512m",
+            num_executors=2,
+            arguments=["1000"],
+            enable_dynamic_allocation=True,
+            initial_executors=1,
+            min_executors=1,
+            max_executors=5,
+            spark_conf={
+                "spark.kubernetes.file.upload.path": "/tmp",
+            },
+        )
+
+        self.submitted_apps.append(app_name)
+
+        print(f"Submitted {app_name} with dynamic allocation")
+        print(" Config: min=1, max=5, initial=1")
+
+        # Check status after a bit
+        time.sleep(10)
+        status = self.client.get_job(app_name)
+
+        print(f" Current state: {status.state.value}")
+        if status.executor_state:
+            print(f" Executors: {len(status.executor_state)}")
+
+
+def run_tests():
+    """Run integration tests."""
+    print("=" * 80)
+    print(" Kubeflow Spark Client - Integration Tests")
+    print("=" * 80)
+    print()
+    print("Prerequisites:")
+    print(" - Kubernetes cluster with Spark Operator")
+    print(" - kubectl configured with proper context")
+    print(" - Service account 'spark-operator-spark'")
+    print()
+    print("Run ./setup_test_environment.sh if not already done")
+    print("=" * 80)
+    print()
+
+    # Run tests
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestSparkClientIntegration)
+    runner = unittest.TextTestRunner(verbosity=2)
+    result = runner.run(suite)
+
+    # Summary
+    print("\n" + "=" * 80)
+    print("Test Summary")
+    print("=" * 80)
+    print(f"Tests run: {result.testsRun}")
+    print(f"Successes: {result.testsRun - len(result.failures) - len(result.errors)}")
+    print(f"Failures: {len(result.failures)}")
+    print(f"Errors: {len(result.errors)}")
+
+    if result.wasSuccessful():
+        print("\nAll tests passed! šŸŽ‰")
+        return 0
+    else:
+        print("\nāœ— Some tests failed")
+        return 1
+
+
+if __name__ == "__main__":
+    exit(run_tests())
diff --git a/examples/spark/test_ui_minimal.py b/examples/spark/test_ui_minimal.py
new file mode 100644
index 000000000..b43750a09
--- /dev/null
+++ b/examples/spark/test_ui_minimal.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+"""
+Minimal test for Spark UI service creation without S3.
+Uses local:// path and simple SparkPi example.
+"""
+
+from datetime import datetime
+import os
+import sys
+import time
+
+# Add SDK to path
+sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+if sdk_path not in sys.path:
+    sys.path.insert(0, sdk_path)
+
+from kubeflow.spark import BatchSparkClient, OperatorBackendConfig
+
+print("=" * 80)
+print("MINIMAL TEST: Spark UI Service Creation")
+print("=" * 80)
+print()
+print("This test submits a simple Spark application and checks if")
+print("the UI service is created by the Spark Operator.")
+print()
+
+# Create client with UI enabled
+config = OperatorBackendConfig(
+    namespace="default",
+    service_account="spark-operator-spark",
+    default_spark_image="docker.io/apache/spark",  # Use official image
+    context=os.getenv("KUBE_CONTEXT", "kind-spark-test"),
+    enable_monitoring=False,
+    enable_ui=True,  # Enable UI!
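+    # With enable_ui=True the operator is expected to create a '<app-name>-ui-svc'
+    # Service for the driver UI; the kubectl commands printed below check for it.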
+) + +client = BatchSparkClient(backend_config=config) +print("Client created with enable_ui=True") +print() + +# Submit a simple SparkPi example (built into Spark image) +timestamp = datetime.now().strftime("%H%M%S") +app_name = f"test-ui-{timestamp}" + +print(f"Submitting test application: {app_name}") +print("-" * 80) + +try: + response = client.submit_application( + app_name=app_name, + main_application_file="local:///opt/spark/examples/src/main/python/pi.py", + spark_version="3.5.0", + app_type="Python", + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=1, + arguments=["10"], # Calculate pi with 10 partitions + ) + + print(f"Application submitted: {response.submission_id}") + print(f" Status: {response.status}") + print() + +except Exception as e: + print(f"ERROR: Submission failed: {e}") + sys.exit(1) + +# Wait a few seconds for operator to process +print("Waiting 10 seconds for Spark Operator to create resources...") +time.sleep(10) +print() + +# Instructions for checking +print("=" * 80) +print("Now check if the UI service was created:") +print("=" * 80) +print() +print("1. Check for the UI service:") +print(f" kubectl get svc {app_name}-ui-svc -n default") +print() +print("2. If service exists, port-forward to access:") +print(f" kubectl port-forward svc/{app_name}-ui-svc 4040:4040") +print(" Then open: http://localhost:4040") +print() +print("3. Check the SparkApplication YAML:") +print(f" kubectl get sparkapplication {app_name} -o yaml | grep -A 5 sparkUIOptions") +print() +print("4. Check all services:") +print(" kubectl get svc -n default") +print() +print("5. View Spark Operator logs:") +print(" kubectl logs -n spark-operator deploy/spark-operator --tail=100") +print() +print("6. Watch application status:") +print(f" kubectl get sparkapplication {app_name} -w") +print() +print("=" * 80) +print() +print(f"Application name: {app_name}") +print("The application will run for ~30 seconds.") +print("Check if the UI service exists while it's running!") +print() diff --git a/examples/spark/test_url_building.py b/examples/spark/test_url_building.py new file mode 100644 index 000000000..c27cc7a94 --- /dev/null +++ b/examples/spark/test_url_building.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +""" +Test that mimics exactly what the Kubeflow SDK does +""" + +print("Testing SDK URL building logic...") + +# Test 1: Build URL like SDK does +connect_url = "sc://localhost:30000" +use_ssl = False + +url = connect_url +param_dict = {} + +# This is what SDK does +if use_ssl: + param_dict["use_ssl"] = "true" +else: + param_dict["use_ssl"] = "false" # New fix + +# Build final URL +if param_dict: + param_str = ";".join([f"{k}={v}" for k, v in param_dict.items()]) + final_url = f"{url}/;{param_str}" +else: + final_url = url + +print(f"SDK would build URL: {final_url}") + +# Test 2: Try this URL with PySpark +print("\nTesting this URL with PySpark...") + +import signal + +from pyspark.sql import SparkSession + + +def timeout_handler(signum, frame): + raise TimeoutError("Timed out") + + +signal.signal(signal.SIGALRM, timeout_handler) +signal.alarm(15) + +try: + print(f"Connecting to: {final_url}") + spark = SparkSession.builder.remote(final_url).appName("sdk-mimic-test").getOrCreate() + signal.alarm(0) + + print("āœ“ Connection successful!") + + # Test query + df = spark.sql("SELECT 1 AS id") + print(f"āœ“ Query worked: {df.collect()}") + + spark.stop() + print("āœ“ All good!") + +except TimeoutError: + print("āœ— Timed out - the URL format might 
be wrong") + print("\nTry this instead:") + print(f" SparkSession.builder.remote('{connect_url}').appName('test').getOrCreate()") +except Exception as e: + print(f"āœ— Error: {e}") + import traceback + + traceback.print_exc() diff --git a/examples/spark/upload_long_job.sh b/examples/spark/upload_long_job.sh new file mode 100755 index 000000000..e80d873c1 --- /dev/null +++ b/examples/spark/upload_long_job.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Quick script to upload the long-running job script to MinIO + +set -e + +NAMESPACE="default" + +echo "==================================================" +echo "Uploading Long-Running Job Script to MinIO" +echo "==================================================" +echo "" + +# Check if MinIO is running +if ! kubectl get pod -n ${NAMESPACE} -l app=minio | grep -q Running; then + echo "āŒ MinIO is not running!" + echo " Run ./setup_minio.sh first" + exit 1 +fi + +echo "āœ“ MinIO is running" +echo "" + +# Check if minio-client exists +if ! kubectl get pod -n ${NAMESPACE} minio-client &>/dev/null; then + echo "āŒ minio-client pod not found!" + echo " Run ./setup_minio.sh first" + exit 1 +fi + +echo "āœ“ MinIO client found" +echo "" + +# Check if minio-client pod is running +POD_STATUS=$(kubectl get pod -n ${NAMESPACE} minio-client -o jsonpath='{.status.phase}' 2>/dev/null) + +if [ "$POD_STATUS" != "Running" ]; then + echo " MinIO client pod is in '$POD_STATUS' state. Restarting..." + + # Delete the completed pod + kubectl delete pod -n ${NAMESPACE} minio-client --ignore-not-found=true + + # Create a new one + cat < /tmp/long_running_job.py' + +# Verify the file has content in the pod +LINE_COUNT=$(kubectl exec -n ${NAMESPACE} minio-client -- wc -l /tmp/long_running_job.py | awk '{print $1}') +if [ "$LINE_COUNT" -eq 0 ]; then + echo "āŒ Upload failed: File is empty in pod" + exit 1 +fi +echo " āœ“ File copied to pod ($LINE_COUNT lines)" + +# Upload to MinIO +echo " Uploading to MinIO..." +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/long_running_job.py myminio/spark-scripts/ + +echo "āœ“ Uploaded successfully" +echo "" + +# Verify +echo "Verifying upload in MinIO..." +FILE_INFO=$(kubectl exec -n ${NAMESPACE} minio-client -- mc ls myminio/spark-scripts/long_running_job.py) +echo "$FILE_INFO" + +# Check if file size is 0B (indicates empty file) +if echo "$FILE_INFO" | grep -q "0B"; then + echo "" + echo "āŒ WARNING: File appears to be empty (0B) in MinIO!" + echo " This will cause Spark jobs to fail immediately." + exit 1 +fi + +echo "" +echo "==================================================" +echo "āœ… Setup Complete!" +echo "==================================================" +echo "" +echo "Now run:" +echo " python run_long_job_ui_validation.py" +echo "" diff --git a/kubeflow/spark/README.md b/kubeflow/spark/README.md new file mode 100644 index 000000000..a346439c5 --- /dev/null +++ b/kubeflow/spark/README.md @@ -0,0 +1,610 @@ +# Kubeflow Spark Client + +Cloud-native Python client for managing Apache Spark applications on Kubernetes using the Kubeflow Spark Operator. + +## Overview + +The Kubeflow Spark Client provides a Pythonic interface for submitting, monitoring, and managing Spark applications on Kubernetes. 
The SDK offers two specialized clients for different workloads: + +- **BatchSparkClient**: For batch Spark application submission and management +- **SparkSessionClient**: For interactive Spark Connect sessions + +### Key Features + +- **Specialized Clients**: Separate clients for batch jobs and interactive sessions +- **Cloud-Native Architecture**: Direct integration with Kubeflow Spark Operator CRDs +- **Multiple Backends**: Operator (K8s-native), Gateway (REST API), and Connect (gRPC) backends +- **Dynamic Resource Allocation**: Automatic executor scaling based on workload +- **Comprehensive Monitoring**: Prometheus metrics and Spark UI integration +- **Production-Ready**: Error handling, retries, and comprehensive logging +- **Type-Safe**: Clean APIs with proper type hints and IDE support + +## Architecture + +``` +BaseSparkClient (shared functionality) +ā”œā”€ā”€ BatchSparkClient (batch workloads) +│ └── Backend: BatchSparkBackend +│ ā”œā”€ā”€ OperatorBackend (Kubernetes CRDs) +│ └── GatewayBackend (REST API) +│ +└── SparkSessionClient (interactive workloads) + └── Backend: SessionSparkBackend + └── ConnectBackend (Spark Connect/gRPC) +``` + +### Design Principles + +The Spark client follows best practices and SOLID principles: + +1. **Interface Segregation**: Separate clients expose only relevant methods +2. **Backend Abstraction**: Pluggable backends for different platforms +3. **Type Safety**: Strong typing prevents runtime errors +4. **Kubernetes-Native**: Direct CRD manipulation for cloud-native deployments + +## Installation + +```bash +# Install from PyPI (when released) +pip install kubeflow + +# Or install from source +cd sdk +pip install -e . + +# For Spark Connect support +pip install 'pyspark[connect]>=3.4.0' +``` + +### Prerequisites + +**For BatchSparkClient with OperatorBackend** (recommended for batch jobs): +- Kubernetes cluster (1.16+) +- Kubeflow Spark Operator installed +- kubectl configured with proper context +- Service account with SparkApplication permissions + +**For BatchSparkClient with GatewayBackend**: +- Access to a Spark Gateway (e.g., Apache Livy) +- API credentials (if required) + +**For SparkSessionClient with ConnectBackend**: +- Spark cluster with Spark Connect server (Spark 3.4+) +- Network connectivity to Spark Connect endpoint +- PySpark with Connect support installed + +## Quick Start + +### Batch Jobs + +#### Basic Batch Application + +```python +from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + +# Create batch client (uses Operator backend by default) +client = BatchSparkClient() + +# Submit a Spark application +response = client.submit_application( + app_name="spark-pi", + main_application_file="local:///opt/spark/examples/src/main/python/pi.py", + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=2 +) + +print(f"Submitted: {response.submission_id}") + +# Wait for completion +status = client.wait_for_completion(response.submission_id) +print(f"Final state: {status.state}") + +# Get logs +for line in client.get_logs(response.submission_id): + print(line) +``` + +#### DataFrame Processing with S3 + +```python +from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + +# Configure client +config = OperatorBackendConfig( + namespace="spark-jobs", + enable_monitoring=True, + enable_ui=True, +) +client = BatchSparkClient(backend_config=config) + +# Submit DataFrame processing job +response = client.submit_application( + app_name="dataframe-analysis", + 
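    # NOTE: an s3a:// application file assumes the Spark image bundles the Hadoop S3A
    # connector; in real deployments, prefer mounting AWS credentials from a Kubernetes
    # Secret rather than passing literal values via env_vars.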
main_application_file="s3a://my-bucket/jobs/analysis.py", + spark_version="4.0.0", + driver_cores=2, + driver_memory="4g", + executor_cores=2, + executor_memory="8g", + num_executors=5, + spark_conf={ + "spark.sql.shuffle.partitions": "200", + "spark.hadoop.fs.s3a.endpoint": "s3.amazonaws.com", + }, + env_vars={ + "AWS_ACCESS_KEY_ID": "your-key", + "AWS_SECRET_ACCESS_KEY": "your-secret", + } +) +``` + +#### Advanced Features: Dynamic Allocation and Volumes + +```python +from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + +config = OperatorBackendConfig(namespace="default") +client = BatchSparkClient(backend_config=config) + +response = client.submit_application( + app_name="advanced-job", + main_application_file="local:///app/job.py", + spark_version="4.0.0", + driver_cores=2, + driver_memory="4g", + executor_cores=2, + executor_memory="8g", + num_executors=3, + + # Enable dynamic allocation + enable_dynamic_allocation=True, + initial_executors=2, + min_executors=1, + max_executors=10, + + # Configure volumes + volumes=[{ + "name": "data-volume", + "persistentVolumeClaim": {"claimName": "my-pvc"} + }], + driver_volume_mounts=[{ + "name": "data-volume", + "mountPath": "/data" + }], + + # Node selector and tolerations + node_selector={"node-type": "compute"}, + tolerations=[{ + "key": "spark", + "operator": "Equal", + "value": "true", + "effect": "NoSchedule" + }], +) +``` + +### Interactive Sessions + +#### Basic Interactive Session + +```python +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +# Connect to Spark Connect server +config = ConnectBackendConfig( + connect_url="sc://spark-cluster.default.svc:15002", + use_ssl=True, +) +client = SparkSessionClient(backend_config=config) + +# Create interactive session +session = client.create_session(app_name="data-exploration") + +# Use standard PySpark DataFrame API +df = session.sql("SELECT * FROM sales WHERE date >= '2024-01-01'") +result = df.groupBy("product").sum("amount").collect() + +for row in result: + print(f"{row.product}: {row['sum(amount)']}") + +# Cleanup +session.close() +``` + +#### Notebook Workflow + +```python +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +# Cell 1: Setup +config = ConnectBackendConfig(connect_url="sc://spark-cluster:15002") +client = SparkSessionClient(backend_config=config) +session = client.create_session("notebook-analysis") + +# Cell 2: Load data +df = session.read.parquet("s3a://bucket/data/") +df.show() + +# Cell 3: Feature engineering +features = df.withColumn("spend_per_year", df.spend_total / df.age) +features.describe().show() + +# Cell 4: Export results +session.export_to_pipeline_artifact(features, "/outputs/features.parquet") + +# Cell 5: Cleanup +session.close() +``` + +#### Session Management + +```python +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +config = ConnectBackendConfig(connect_url="sc://spark-cluster:15002") +client = SparkSessionClient(backend_config=config) + +# List all active sessions +sessions = client.list_sessions() +for session_info in sessions: + print(f"Session: {session_info.session_id}") + print(f" App: {session_info.app_name}") + print(f" Queries executed: {session_info.metrics.queries_executed}") + +# Get specific session status +session = client.create_session("my-app") +info = client.get_session_status(session.session_id) +print(f"Session state: {info.state}") + +# Close session +client.close_session(session.session_id, release=True) +``` + +## API Reference + +### 
BatchSparkClient + +Client for managing batch Spark applications. + +#### Constructor + +```python +BatchSparkClient(backend_config: Union[OperatorBackendConfig, GatewayBackendConfig, None] = None) +``` + +#### Methods + +**submit_application(...) → SparkApplicationResponse** +- Submit a new Spark application +- Returns submission ID and initial status + +**get_status(submission_id) → ApplicationStatus** +- Get current status of an application +- Returns state, app ID, executor info, timestamps + +**wait_for_completion(submission_id, timeout=3600, polling_interval=10) → ApplicationStatus** +- Block until application completes +- Returns final status + +**get_logs(submission_id, executor_id=None, follow=False) → Iterator[str]** +- Stream application logs +- Can retrieve driver or specific executor logs + +**list_applications(namespace=None, labels=None) → List[ApplicationStatus]** +- List applications with optional filtering +- Supports namespace and label filters + +**delete_application(submission_id) → Dict** +- Delete an application +- Stops running application and cleans up resources + +**wait_for_pod_ready(submission_id, executor_id=None, timeout=300) → bool** +- Wait for driver or executor pod to be ready +- Only available with OperatorBackend + +### SparkSessionClient + +Client for managing interactive Spark sessions. + +#### Constructor + +```python +SparkSessionClient(backend_config: ConnectBackendConfig) +``` + +#### Methods + +**create_session(app_name, **kwargs) → ManagedSparkSession** +- Create a new Spark Connect session +- Returns managed session with PySpark API access + +**get_session_status(session_id) → SessionInfo** +- Get status and metadata of a session +- Returns state, metrics, and session details + +**list_sessions() → List[SessionInfo]** +- List all active Spark Connect sessions + +**close_session(session_id, release=True) → Dict** +- Close a session and release resources + +### ManagedSparkSession + +Wrapper around PySpark SparkSession with Kubeflow enhancements. + +#### Properties + +- **session_id**: Unique session identifier +- **app_name**: Application name +- **spark**: Access to underlying PySpark SparkSession + +#### Methods + +**sql(query) → DataFrame** +- Execute SQL query and return DataFrame + +**read → DataFrameReader** +- Access DataFrameReader for reading data sources + +**readStream → DataStreamReader** +- Access DataStreamReader for streaming sources + +**upload_artifacts(*paths, pyfile=False)** +- Upload JARs or Python files to session + +**get_metrics() → SessionMetrics** +- Get session metrics (queries executed, artifacts uploaded) + +**close(release=True)** +- Close the session + +### Backend Configurations + +#### OperatorBackendConfig + +Configuration for Kubernetes Spark Operator backend. + +```python +from kubeflow.spark import OperatorBackendConfig + +config = OperatorBackendConfig( + namespace="default", + context=None, + service_account="spark-operator-spark", + image_pull_policy="IfNotPresent", + default_spark_image="docker.io/library/spark", + enable_monitoring=True, + enable_ui=True, + timeout=60, +) +``` + +#### GatewayBackendConfig + +Configuration for REST Gateway backend. + +```python +from kubeflow.spark import GatewayBackendConfig + +config = GatewayBackendConfig( + gateway_url="http://gateway:8080", + user="myuser", + password="mypassword", + timeout=30, + verify_ssl=True, +) +``` + +#### ConnectBackendConfig + +Configuration for Spark Connect backend. 
+ +```python +from kubeflow.spark import ConnectBackendConfig + +config = ConnectBackendConfig( + connect_url="sc://spark-cluster.default.svc:15002", + token="bearer-token", # Optional + use_ssl=True, +) +``` + +## Choosing the Right Client + +### Use BatchSparkClient when: +- Running scheduled ETL pipelines +- Submitting production batch jobs +- Integrating with CI/CD workflows +- Need dynamic allocation and auto-scaling +- Running jobs as Kubernetes CRDs + +### Use SparkSessionClient when: +- Performing interactive data exploration +- Working in Jupyter or IPython notebooks +- Iterative development and testing +- Need immediate feedback from queries +- Connecting to remote Spark clusters + +## Examples + +The `examples/spark/` directory contains comprehensive examples: + +**Batch Examples:** +- `01_hello_spark_pi.py`: Basic Spark Pi calculation +- `02_csv_data_analysis.py`: CSV data processing +- `04_etl_pipeline_simple.py`: ETL pipeline example +- `05_scheduled_batch_job.py`: Scheduled job pattern +- `06_autoscaling_dynamic_allocation.py`: Dynamic allocation + +**Interactive Session Examples:** +- `07_spark_connect_interactive.py`: Interactive data analysis +- `ipython_spark_connect_demo.py`: IPython integration +- `ipython_spark_connect_shell.py`: Interactive shell + +Run examples: + +```bash +cd examples/spark + +# Batch example +python 01_hello_spark_pi.py + +# Interactive session example +python 07_spark_connect_interactive.py +``` + +## Testing + +### Setup Test Environment + +Use the provided script to set up a Kind cluster with Spark Operator: + +```bash +cd examples/spark +./setup_test_environment.sh +``` + +This will: +1. Create a Kind cluster +2. Install Spark Operator +3. Configure RBAC and service accounts +4. Verify the installation + +### Run Integration Tests + +```bash +python test_spark_client_integration.py +``` + +### Cleanup + +```bash +kind delete cluster --name spark-test +``` + +## Monitoring and Debugging + +### Access Spark UI + +Port forward to Spark UI: +```bash +kubectl port-forward -n default svc/spark-ui 4040:4040 +``` + +Open in browser: http://localhost:4040 + +### View Application Logs + +Using BatchSparkClient: +```python +# Stream driver logs +for line in client.get_logs(submission_id): + print(line) + +# Get executor logs +for line in client.get_logs(submission_id, executor_id="1"): + print(line) +``` + +Using kubectl: +```bash +# Driver logs +kubectl logs -driver -n default + +# Executor logs +kubectl logs -exec-1 -n default +``` + +### Debug Mode + +Enable debug logging: + +```python +import logging +logging.basicConfig(level=logging.DEBUG) + +from kubeflow.spark import BatchSparkClient +client = BatchSparkClient() +``` + +## Troubleshooting + +### Common Issues + +**1. ImportError: No module named 'pyspark'** + +For SparkSessionClient, install PySpark with Connect support: +```bash +pip install 'pyspark[connect]>=3.4.0' +``` + +**2. SparkApplication not being created** + +Check Spark Operator is running: +```bash +kubectl get pods -n spark-operator +``` + +**3. Cannot connect to Spark Connect server** + +Verify the server is running and accessible: +```bash +kubectl get svc -n default | grep spark +kubectl port-forward svc/spark-connect 15002:15002 +``` + +**4. 
Permission denied** + +Verify service account permissions: +```bash +kubectl auth can-i create pods --as=system:serviceaccount:default:spark-operator-spark +``` + +## Comparison with Trainer Client + +| Aspect | Trainer Client | Spark Client | +|--------|---------------|--------------| +| **CRD** | TrainJob | SparkApplication | +| **Operator** | Training Operator | Spark Operator | +| **Client Classes** | TrainingClient | BatchSparkClient, SparkSessionClient | +| **Backends** | Kubernetes, LocalProcess | Operator, Gateway, Connect | +| **Workload Types** | Batch training jobs | Batch jobs + interactive sessions | +| **API Style** | train(), list_jobs() | submit_application(), create_session() | + +Both clients provide: +- Backend abstraction for flexibility +- Kubernetes-native CRD management +- Status monitoring with polling +- Log streaming capabilities +- Context manager support + +## Contributing + +Contributions are welcome. Please: + +1. Fork the repository +2. Create a feature branch +3. Add tests for new functionality +4. Ensure all tests pass +5. Submit a pull request + +## License + +Apache License 2.0 + +## References + +- [Kubeflow Spark Operator](https://github.com/kubeflow/spark-operator) +- [Apache Spark on Kubernetes](https://spark.apache.org/docs/latest/running-on-kubernetes.html) +- [Spark Connect](https://spark.apache.org/docs/latest/spark-connect-overview.html) +- [Kubeflow Training Client](https://github.com/kubeflow/training-operator) + +## Support + +For issues and questions: +- GitHub Issues: [kubeflow/sdk](https://github.com/kubeflow/sdk/issues) +- Slack: #kubeflow-spark +- Mailing List: kubeflow-discuss@googlegroups.com diff --git a/kubeflow/spark/SPARK_CONNECT_DESIGN.md b/kubeflow/spark/SPARK_CONNECT_DESIGN.md new file mode 100644 index 000000000..3991edba5 --- /dev/null +++ b/kubeflow/spark/SPARK_CONNECT_DESIGN.md @@ -0,0 +1,643 @@ +# Spark Connect Integration Design + +**Version:** 2.0 +**Status:** Implementation Complete +**Last Updated:** 2025-11-23 + +## Overview + +This document describes the architecture and design of Spark Connect support in Kubeflow Spark SDK, enabling interactive, session-based Spark workloads through a dedicated client class. 
+ +**Key Features:** +- Remote connectivity to Spark clusters via gRPC (Spark Connect protocol) +- Specialized clients for batch jobs and interactive sessions +- Native PySpark API compatibility with Kubeflow enhancements +- Kubernetes-native integration with automatic secret/config injection +- Type-safe API following Interface Segregation Principle + +--- + +## Architecture + +### System Components + +``` +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ User Code (Python) │ +│ │ +│ Batch Jobs: Interactive Sessions: │ +│ from kubeflow.spark import from kubeflow.spark import │ +│ BatchSparkClient SparkSessionClient │ +│ client = BatchSparkClient() client = SparkSessionClient() │ +│ client.submit_application() session = client.create_session()│ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ │ + ā–¼ ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ BatchSparkClient │ │ SparkSessionClient │ +│ │ │ │ +│ • submit_application() │ │ • create_session() │ +│ • get_status() │ │ • list_sessions() │ +│ • wait_for_completion() │ │ • close_session() │ +│ • delete_application() │ │ • get_session_status() │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ │ + ā”Œā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā” │ + ā–¼ ā–¼ ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ Operator │ │ Gateway │ │ Connect │ +│ Backend │ │ Backend │ │ Backend │ +│ │ │ │ │ │ +│ (Batch) │ │ (Batch) │ │ (Session) │ +ā””ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ │ │ + ā–¼ ā–¼ ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ Spark │ │ Livy/ │ │ Spark │ +│ Operator │ │ Gateway │ │ Connect │ +│(K8s CRDs)│ │ (HTTP) │ │ (gRPC) │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +### ConnectBackend Architecture + +``` +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ SparkSessionClient │ +│ │ +│ create_session(app_name) │ +│ ↓ │ +│ Delegates to ConnectBackend │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ + ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ ConnectBackend │ +│ (SessionSparkBackend) │ +│ │ +│ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │ +│ │ create_session(app_name, **config) │ │ +│ │ ↓ │ │ +│ │ 1. Generate session_id (UUID) │ │ +│ │ 2. 
Build connection URL │ │ +│ │ 3. Create PySpark SparkSession.builder.remote() │ │ +│ │ 4. Wrap in ManagedSparkSession │ │ +│ │ 5. Track in _sessions dict │ │ +│ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ +│ │ +│ Session Management: │ +│ • list_sessions() │ +│ • get_session_status(session_id) │ +│ • close_session(session_id) │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ + ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ ManagedSparkSession (Wrapper) │ +│ │ +│ PySpark API (delegated): Kubeflow Extensions: │ +│ • sql(query) • get_metrics() │ +│ • createDataFrame(data) • get_info() │ +│ • read.parquet() • upload_artifacts() │ +│ • All DataFrame operations • context manager │ +│ │ +│ Wraps: pyspark.sql.SparkSession (Spark Connect) │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +### Data Flow + +``` +User Code → SparkSessionClient → ConnectBackend → gRPC → Spark Connect Server + ↓ + Spark Cluster + (Driver + Executors) + ↓ +Results ← ManagedSparkSession ← gRPC Stream ← Spark Connect Server +``` + +--- + +## Design Principles + +### 1. Specialized Client Classes + +Separate clients for different workloads: + +```python +# Batch jobs +from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + +config = OperatorBackendConfig(namespace="spark-jobs") +client = BatchSparkClient(backend_config=config) +response = client.submit_application(app_name="batch-job", ...) + +# Interactive sessions +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +config = ConnectBackendConfig(connect_url="sc://spark-connect:15002") +client = SparkSessionClient(backend_config=config) +session = client.create_session(app_name="analysis") +``` + +Benefits: +- Type safety: Clients only expose relevant methods +- No runtime errors from unsupported operations +- Clear API boundaries +- Follows Interface Segregation Principle + +### 2. Native PySpark Delegation + +`ManagedSparkSession` delegates all DataFrame operations to native PySpark: + +```python +# These call PySpark directly - no wrapping overhead +df = session.sql("SELECT * FROM table") +df = session.createDataFrame(data, schema) +df = session.read.parquet("s3://bucket/data") +result = df.filter(df.age > 30).collect() +``` + +Benefits: +- Full PySpark API compatibility +- Zero wrapping overhead for DataFrame operations +- Automatic updates when PySpark adds new features +- Standard PySpark documentation applies + +### 3. Backend Abstraction + +Backends implement specialized abstract base classes: + +```python +# Base interface (minimal shared functionality) +class SparkBackend(abc.ABC): + def close(self): pass + +# Batch workloads +class BatchSparkBackend(SparkBackend): + def submit_application(...) -> SparkApplicationResponse + def get_status(app_id) -> ApplicationStatus + def delete_application(app_id) -> Dict + def get_logs(...) -> Iterator[str] + def list_applications(...) -> List[ApplicationStatus] + def wait_for_completion(...) 
-> ApplicationStatus + +# Session workloads +class SessionSparkBackend(SparkBackend): + def create_session(app_name, **kwargs) -> ManagedSparkSession + def list_sessions() -> List[SessionInfo] + def close_session(session_id) -> Dict[str, Any] + def get_session_status(session_id) -> SessionInfo +``` + +Implementation hierarchy: +- OperatorBackend extends BatchSparkBackend +- GatewayBackend extends BatchSparkBackend +- ConnectBackend extends SessionSparkBackend + +--- + +## Component Details + +### ConnectBackendConfig + +Configuration for Spark Connect connectivity: + +```python +@dataclass +class ConnectBackendConfig: + connect_url: str # "sc://host:port" + token: Optional[str] = None # Bearer token for auth + use_ssl: bool = True # Enable TLS + user_id: Optional[str] = None # User identity + timeout: int = 300 # Connection timeout (seconds) + grpc_max_message_size: int = 128MB # gRPC message limit + namespace: str = "default" # K8s namespace +``` + +### ManagedSparkSession + +Kubeflow wrapper around native PySpark Connect session: + +```python +class ManagedSparkSession: + # Properties + @property + def session_id(self) -> str # Unique session UUID + @property + def app_name(self) -> str # Application name + @property + def is_closed(self) -> bool # Session state + + # PySpark API (delegated to self._session) + def sql(self, query: str) -> DataFrame + def createDataFrame(self, data, schema) -> DataFrame + def read(self) -> DataFrameReader + def table(self, table_name: str) -> DataFrame + def range(self, start, end, step) -> DataFrame + + # Kubeflow extensions + def get_metrics(self) -> SessionMetrics + def get_info(self) -> SessionInfo + def upload_artifacts(self, *paths) -> None + def close(self) -> None + + # Context manager support + def __enter__(self) -> "ManagedSparkSession" + def __exit__(self, exc_type, exc_val, exc_tb) -> None +``` + +### SessionMetrics + +Tracks session activity: + +```python +@dataclass +class SessionMetrics: + session_id: str + queries_executed: int = 0 # SQL/DataFrame operations + active_queries: int = 0 # Currently running queries + artifacts_uploaded: int = 0 # Uploaded JARs/files + data_read_bytes: int = 0 # Data read + data_written_bytes: int = 0 # Data written + execution_time_ms: int = 0 # Total execution time +``` + +--- + +## Usage Examples + +### Basic Connection + +```python +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +# Configure connection +config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, +) + +# Create client and session +client = SparkSessionClient(backend_config=config) +session = client.create_session(app_name="demo") + +# Use standard PySpark API +df = session.sql("SELECT 1 AS id, 'Hello' AS message") +df.show() + +# Cleanup +session.close() +client.close() +``` + +### Context Manager Pattern + +```python +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +config = ConnectBackendConfig(connect_url="sc://spark-server:15002") + +with SparkSessionClient(backend_config=config) as client: + with client.create_session(app_name="analysis") as session: + # Session auto-closes on exit + df = session.sql("SELECT * FROM sales") + result = df.filter(df.amount > 100).collect() +``` + +### DataFrame Operations + +```python +# Create DataFrame from Python data +sales_data = [ + (1, "Electronics", "Laptop", 1200.00, 2), + (2, "Electronics", "Mouse", 25.00, 5), + (3, "Clothing", "Shirt", 35.00, 3), +] + +df = session.createDataFrame( + sales_data, + ["id", "category", 
"product", "price", "quantity"] +) + +# Show data +df.show() +# +---+-----------+--------+------+--------+ +# | id| category| product| price|quantity| +# +---+-----------+--------+------+--------+ +# | 1|Electronics| Laptop|1200.0| 2| +# | 2|Electronics| Mouse| 25.0| 5| +# | 3| Clothing| Shirt| 35.0| 3| +# +---+-----------+--------+------+--------+ +``` + +### Aggregations and GroupBy + +```python +from pyspark.sql import functions as F + +# Calculate revenue +revenue_df = df.withColumn("revenue", F.col("price") * F.col("quantity")) + +# Group by category with multiple aggregations +category_stats = revenue_df.groupBy("category").agg( + F.sum("revenue").alias("total_revenue"), + F.avg("price").alias("avg_price"), + F.count("*").alias("num_transactions") +) + +category_stats.show() +# +-----------+-------------+---------+----------------+ +# | category|total_revenue|avg_price|num_transactions| +# +-----------+-------------+---------+----------------+ +# | Clothing| 105.0| 35.0| 1| +# |Electronics| 2525.0| 612.5| 2| +# +-----------+-------------+---------+----------------+ + +# Sort by revenue +category_stats.orderBy(F.desc("total_revenue")).show() +``` + +### Window Functions + +```python +from pyspark.sql import functions as F +from pyspark.sql.window import Window + +# Running total by date +window_spec = Window.orderBy("date").rowsBetween( + Window.unboundedPreceding, + Window.currentRow +) + +daily_revenue = revenue_df.groupBy("date").agg( + F.sum("revenue").alias("daily_revenue") +).withColumn( + "running_total", + F.sum("daily_revenue").over(window_spec) +) + +daily_revenue.orderBy("date").show() +``` + +### Session Management + +```python +# List all active sessions +sessions = client.list_sessions() +for s in sessions: + print(f"Session: {s.session_id}, App: {s.app_name}, State: {s.state}") + +# Get session status +status = client.get_session_status(session.session_id) +print(f"Session state: {status.state}") + +# Get metrics +metrics = session.get_metrics() +print(f"Queries executed: {metrics.queries_executed}") +print(f"Active queries: {metrics.active_queries}") + +# Get session info +info = session.get_info() +print(f"App: {info.app_name}, State: {info.state}") +``` + +### Multiple Concurrent Sessions + +```python +with SparkSessionClient(backend_config=config) as client: + # Create multiple sessions + session1 = client.create_session(app_name="analysis-1") + session2 = client.create_session(app_name="analysis-2") + + try: + # Each session is independent + df1 = session1.sql("SELECT 'session1' AS source") + df2 = session2.sql("SELECT 'session2' AS source") + + print(df1.collect()) # [Row(source='session1')] + print(df2.collect()) # [Row(source='session2')] + finally: + session1.close() + session2.close() +``` + +--- + +## Deployment Guide + +### Kubernetes Setup + +#### 1. Deploy Spark Connect Server + +Use the provided Kubernetes manifest: + +```bash +# Deploy Spark Connect server +kubectl apply -f examples/spark/spark-connect-server.yaml + +# Verify deployment +kubectl get pods -l app=spark-connect +kubectl logs -l app=spark-connect -f +``` + +#### 2. Port Forwarding (Local Development) + +```bash +# Forward Spark Connect port to localhost +kubectl port-forward -n default svc/spark-connect 30000:15002 + +# Verify connectivity +nc -zv localhost 30000 +``` + +#### 3. 
Connect from Python + +```python +config = ConnectBackendConfig( + connect_url="sc://localhost:30000", # Local port forward + use_ssl=False, +) + +client = SparkSessionClient(backend_config=config) +session = client.create_session(app_name="my-app") +``` + +### Production Setup + +For production, use Kubernetes DNS: + +```python +config = ConnectBackendConfig( + connect_url="sc://spark-connect.default.svc.cluster.local:15002", + use_ssl=True, + token=os.getenv("SPARK_TOKEN"), # From K8s secret +) +``` + +--- + +## Interactive Demo + +### Quick Start + +```bash +# 1. Setup Kubernetes cluster with Spark Connect +cd examples/spark +./setup_spark_connect.sh + +# 2. Install dependencies +pip install 'pyspark[connect]>=4.0.0' + +# 3. Launch IPython shell +python ipython_spark_connect_shell.py +``` + +### Step-by-Step Tutorial + +The IPython shell provides a guided tutorial. Key steps: + +```python +# 1. Create config and client +config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, +) +client = SparkSessionClient(backend_config=config) + +# 2. Create session +session = client.create_session(app_name="tutorial") + +# 3. Simple query +df = session.sql("SELECT 1 AS id, 'Hello' AS msg") +df.show() + +# 4. Create DataFrame +data = [ + (1, "Electronics", 1200.00), + (2, "Clothing", 35.00), +] +df = session.createDataFrame(data, ["id", "category", "price"]) +df.show() + +# 5. Aggregations +from pyspark.sql import functions as F +df.groupBy("category").agg(F.avg("price")).show() + +# 6. Cleanup +session.close() +client.close() +``` + +--- + +## Key Design Decisions + +### 1. Separate Client Classes (Interface Segregation) + +**Decision:** Use BatchSparkClient and SparkSessionClient instead of unified client + +**Rationale:** +- Different use cases have distinct method requirements +- Batch: submit_application, wait_for_completion, delete_application +- Session: create_session, list_sessions, close_session +- Prevents runtime NotImplementedError exceptions +- Type-safe: clients only expose relevant methods + +**Benefits:** +- Compile-time type checking +- Clear API boundaries +- No confusion about which methods work with which backend +- IDE autocomplete shows only valid methods + +### 2. Delegation to Native PySpark + +**Decision:** Delegate DataFrame operations to native PySpark + +**Alternatives Considered:** +- Wrap all PySpark methods → Rejected (maintenance burden) +- Custom DataFrame implementation → Rejected (no value-add) + +**Benefits:** +- Zero wrapping overhead +- Full PySpark compatibility +- Automatic feature updates + +### 3. URL Parameter Handling (Spark 4.0) + +**Decision:** Use simple URL format without parameters + +**Issue:** Spark Connect 4.0 doesn't support URL parameters like `/;use_ssl=false` + +**Solution:** Pass configuration via `builder.config()` instead of URL + +```python +# Before (doesn't work in Spark 4.0) +url = "sc://host:port/;use_ssl=false" + +# After (works) +url = "sc://host:port" +builder.config("spark.ssl.enabled", "false") +``` + +### 4. 
IPv4 vs IPv6 Binding + +**Issue:** Spark Connect server was binding to IPv6 (:::15002) causing connection failures + +**Solution:** Force IPv4 binding via Java options + +```yaml +env: +- name: JAVA_TOOL_OPTIONS + value: "-Djava.net.preferIPv4Stack=true" +``` + +--- + +## Version Compatibility + +| Component | Version | Notes | +|-----------|---------|-------| +| PySpark Client | 4.0.x | Must match server version | +| Spark Connect Server | 4.0.0 | Running in Kubernetes | +| Kubeflow SDK | Latest | This implementation | +| Kubernetes | 1.24+ | For Spark Operator | +| Python | 3.8+ | Required for PySpark | + +**Important:** Client and server versions must match. PySpark 4.0 cannot connect to Spark 3.5 Connect servers. + +--- + +## Resources + +### Files Created + +- `kubeflow/spark/backends/connect.py` - ConnectBackend implementation +- `kubeflow/spark/session.py` - ManagedSparkSession wrapper +- `kubeflow/spark/models.py` - Data models (ConnectBackendConfig, SessionMetrics, SessionInfo) +- `examples/spark/ipython_spark_connect_shell.py` - Interactive demo shell +- `examples/spark/ipython_spark_connect_demo.py` - Automated demo +- `examples/spark/spark-connect-server.yaml` - Kubernetes deployment +- `examples/spark/setup_spark_connect.sh` - Setup automation +- `examples/spark/SPARK_CONNECT_DEMO.md` - Demo documentation + +### Documentation + +- [Spark Connect Overview](https://spark.apache.org/docs/latest/spark-connect-overview.html) +- [PySpark Connect API](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/connect.html) +- [Demo Guide](examples/spark/SPARK_CONNECT_DEMO.md) + +--- + +## Summary + +Spark Connect integration provides: + +- **Specialized Clients** - BatchSparkClient for batch jobs, SparkSessionClient for interactive sessions +- **Type Safety** - Interface Segregation Principle prevents runtime errors +- **Native PySpark** - Full DataFrame API with zero overhead +- **Kubernetes-Native** - Automatic config/secret injection +- **Production-Ready** - Session management, metrics, error handling +- **Developer-Friendly** - Context managers, IPython integration, examples + +The implementation follows SOLID design principles while providing the full power of PySpark Connect for interactive data analysis and ML workloads. diff --git a/kubeflow/spark/__init__.py b/kubeflow/spark/__init__.py new file mode 100644 index 000000000..25880dfca --- /dev/null +++ b/kubeflow/spark/__init__.py @@ -0,0 +1,158 @@ +"""Kubeflow Spark Client for managing Spark applications on Kubernetes. 
+ +This module provides specialized Python clients for managing Apache Spark applications +on Kubernetes using different backends: + +**Batch Jobs:** +- **BatchSparkClient**: For batch Spark application submission and management + - OperatorBackend: Cloud-native using Kubeflow Spark Operator (recommended) + - GatewayBackend: REST API for managed Spark gateways + +**Interactive Sessions:** +- **SparkSessionClient**: For interactive Spark Connect sessions + - ConnectBackend: gRPC-based remote connectivity for notebooks and exploration + +Quick Start (Batch Jobs): + ```python + from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + + # Create batch client (uses Operator backend by default) + client = BatchSparkClient() + + # Submit a Spark application + response = client.submit_application( + app_name="spark-pi", + main_application_file="local:///opt/spark/examples/src/main/python/pi.py", + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=2, + ) + + # Wait for completion + status = client.wait_for_job_status(response.submission_id) + print(f"Application state: {status.state}") + ``` + +Quick Start (Interactive Sessions): + ```python + from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + + # Connect to existing Spark cluster + config = ConnectBackendConfig(connect_url="sc://spark-cluster:15002") + client = SparkSessionClient(backend_config=config) + + # Create interactive session + session = client.create_session(app_name="data-analysis") + + # Use standard PySpark API + df = session.sql("SELECT * FROM table") + result = df.filter(df.status == "active").collect() + + # Cleanup + session.close() + ``` + +For more examples, see the examples/ directory. +""" + +# Import client classes +from kubeflow.spark.base_client import BaseSparkClient +from kubeflow.spark.batch_client import BatchSparkClient +from kubeflow.spark.session_client import SparkSessionClient + +# Import backends and configs +from kubeflow.spark.backends import ( + BatchSparkBackend, + ConnectBackend, + ConnectBackendConfig, + GatewayBackend, + GatewayBackendConfig, + OperatorBackend, + OperatorBackendConfig, + SessionSparkBackend, + SparkBackend, +) + +# Import models +from kubeflow.spark.models import ( + # States & Enums + ApplicationState, + # Status Models + ApplicationStatus, + BatchSchedulerConfig, + DeployMode, + DynamicAllocation, + GPUSpec, + MonitoringSpec, + PrometheusSpec, + # Configuration Models + RestartPolicy, + RestartPolicyType, + # Session Models (for Spark Connect) + SessionInfo, + SessionMetrics, + # Request & Response + SparkApplicationRequest, + SparkApplicationResponse, + SparkUIConfiguration, +) + +# Import session management +from kubeflow.spark.session import ManagedSparkSession + +# Import validation +from kubeflow.spark.validation import ( + SparkApplicationValidator, + ValidationError, + ValidationErrorType, + ValidationResult, + validate_spark_application, +) + +__all__ = [ + # Client classes + "BaseSparkClient", + "BatchSparkClient", + "SparkSessionClient", + # Backends (base classes) + "SparkBackend", + "BatchSparkBackend", + "SessionSparkBackend", + # Backend implementations + "OperatorBackend", + "OperatorBackendConfig", + "GatewayBackend", + "GatewayBackendConfig", + "ConnectBackend", + "ConnectBackendConfig", + # Session Management (Spark Connect) + "ManagedSparkSession", + "SessionInfo", + "SessionMetrics", + # Request & Response Models + "SparkApplicationRequest", + "SparkApplicationResponse", + 
"ApplicationStatus", + # States & Enums + "ApplicationState", + "RestartPolicyType", + "DeployMode", + # Configuration Models + "RestartPolicy", + "GPUSpec", + "DynamicAllocation", + "BatchSchedulerConfig", + "PrometheusSpec", + "MonitoringSpec", + "SparkUIConfiguration", + # Validation + "validate_spark_application", + "SparkApplicationValidator", + "ValidationResult", + "ValidationError", + "ValidationErrorType", +] + +__version__ = "0.2.0" diff --git a/kubeflow/spark/backends/__init__.py b/kubeflow/spark/backends/__init__.py new file mode 100644 index 000000000..4e1db855c --- /dev/null +++ b/kubeflow/spark/backends/__init__.py @@ -0,0 +1,24 @@ +"""Spark backends for different execution environments.""" + +from kubeflow.spark.backends.base import ( + BatchSparkBackend, + SessionSparkBackend, + SparkBackend, +) +from kubeflow.spark.backends.connect import ConnectBackend, ConnectBackendConfig +from kubeflow.spark.backends.gateway import GatewayBackend, GatewayBackendConfig +from kubeflow.spark.backends.operator import OperatorBackend, OperatorBackendConfig + +__all__ = [ + # Base classes + "SparkBackend", + "BatchSparkBackend", + "SessionSparkBackend", + # Backend implementations + "OperatorBackend", + "OperatorBackendConfig", + "GatewayBackend", + "GatewayBackendConfig", + "ConnectBackend", + "ConnectBackendConfig", +] diff --git a/kubeflow/spark/backends/base.py b/kubeflow/spark/backends/base.py new file mode 100644 index 000000000..f42be726a --- /dev/null +++ b/kubeflow/spark/backends/base.py @@ -0,0 +1,323 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base backend interfaces for Spark applications. + +This module defines the backend interface hierarchy for the Kubeflow Spark SDK: + +- SparkBackend: Minimal base class with common functionality +- BatchSparkBackend: Interface for batch job submission (OperatorBackend, GatewayBackend) +- SessionSparkBackend: Interface for interactive sessions (ConnectBackend) + +This design follows the Interface Segregation Principle (ISP), ensuring that +backends only implement methods relevant to their use case. +""" + +import abc +from collections.abc import Iterator +from typing import TYPE_CHECKING, Any, Optional + +from kubeflow.spark.models import ApplicationStatus, SessionInfo, SparkApplicationResponse + +if TYPE_CHECKING: + from kubeflow.spark.session import ManagedSparkSession + + +class SparkBackend(abc.ABC): + """Minimal base class for all Spark backends. + + This class provides only the essential functionality common to all backends. + Specific backend types (batch or session) inherit from BatchSparkBackend or + SessionSparkBackend respectively. + + All backends should implement the close() method to clean up resources. + """ + + def close(self): + """Close any open connections or resources. 
+ + Subclasses should override this to clean up resources like: + - Kubernetes API clients + - HTTP connections + - gRPC channels + - File handles + + This method is called when the client is closed or when used as a context manager. + """ + pass + + +class BatchSparkBackend(SparkBackend): + """Abstract base class for batch-oriented Spark backends. + + This interface defines the contract for backends that support traditional + batch Spark application submission, monitoring, and management. + + Backends implementing this interface: + - OperatorBackend: Submits SparkApplication CRDs to Kubernetes + - GatewayBackend: Submits jobs via REST API to Spark gateways + + Typical workflow: + 1. submit_application() -> Returns submission_id + 2. wait_for_job_status() or poll get_job() + 3. get_job_logs() to retrieve output + 4. delete_job() for cleanup + """ + + @abc.abstractmethod + def submit_application( + self, + app_name: str, + main_application_file: str, + spark_version: str, + app_type: str, + driver_cores: int, + driver_memory: str, + executor_cores: int, + executor_memory: str, + num_executors: int, + queue: Optional[str], + arguments: Optional[list[str]], + python_version: str, + spark_conf: Optional[dict[str, str]], + hadoop_conf: Optional[dict[str, str]], + env_vars: Optional[dict[str, str]], + deps: Optional[dict[str, list[str]]], + **kwargs: Any, + ) -> SparkApplicationResponse: + """Submit a Spark application for batch execution. + + Args: + app_name: Name of the application + main_application_file: Path to main application file (local://, s3a://, etc.) + spark_version: Spark version to use (e.g., "4.0.0") + app_type: Application type ("Python", "Scala", "Java", "R") + driver_cores: Number of cores for driver + driver_memory: Memory for driver (e.g., "4g", "512m") + executor_cores: Number of cores per executor + executor_memory: Memory per executor (e.g., "8g", "2g") + num_executors: Number of executors to provision + queue: Queue/namespace to submit to (backend-specific) + arguments: Application arguments passed to main file + python_version: Python version for PySpark apps (e.g., "3") + spark_conf: Spark configuration properties (spark.*) + hadoop_conf: Hadoop configuration properties + env_vars: Environment variables for driver and executors + deps: Dependencies dict with keys: "jars", "pyFiles", "files" + **kwargs: Additional backend-specific parameters + + Returns: + SparkApplicationResponse with submission_id and initial status + + Raises: + RuntimeError: If submission fails + TimeoutError: If submission times out + ValueError: If invalid parameters provided + """ + raise NotImplementedError() + + @abc.abstractmethod + def get_job(self, submission_id: str) -> ApplicationStatus: + """Get current status of a Spark application. + + Args: + submission_id: Submission ID returned from submit_application() + + Returns: + ApplicationStatus with current state and metadata + + Raises: + RuntimeError: If request fails + TimeoutError: If request times out + ValueError: If submission_id not found + """ + raise NotImplementedError() + + @abc.abstractmethod + def delete_job(self, submission_id: str) -> dict[str, Any]: + """Delete a Spark application. + + This terminates a running application or removes a completed application. 
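+
+        A minimal usage sketch (the submission ID is illustrative):
+
+            backend.delete_job("spark-pi-143022")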
+ + Args: + submission_id: Submission ID to delete + + Returns: + Dictionary with deletion response and status + + Raises: + RuntimeError: If deletion fails + TimeoutError: If deletion times out + ValueError: If submission_id not found + """ + raise NotImplementedError() + + @abc.abstractmethod + def get_job_logs( + self, + submission_id: str, + executor_id: Optional[str] = None, + follow: bool = False, + ) -> Iterator[str]: + """Get application logs. + + Args: + submission_id: Submission ID + executor_id: Optional executor ID (if not provided, returns driver logs) + follow: Whether to stream logs in real-time (tail -f behavior) + + Yields: + Log lines as strings + + Raises: + RuntimeError: If request fails + ValueError: If submission_id or executor_id not found + """ + raise NotImplementedError() + + @abc.abstractmethod + def list_jobs( + self, + namespace: Optional[str] = None, + labels: Optional[dict[str, str]] = None, + ) -> list[ApplicationStatus]: + """List Spark applications with optional filtering. + + Args: + namespace: Optional namespace/queue filter + labels: Optional label filters (key-value pairs) + + Returns: + List of ApplicationStatus objects + + Raises: + RuntimeError: If request fails + TimeoutError: If request times out + """ + raise NotImplementedError() + + @abc.abstractmethod + def wait_for_job_status( + self, + submission_id: str, + timeout: int = 3600, + polling_interval: int = 10, + ) -> ApplicationStatus: + """Wait for Spark application to complete. + + This method blocks until the application reaches a terminal state + (COMPLETED, FAILED, SUBMISSION_FAILED, KILLED) or timeout is reached. + + Args: + submission_id: Submission ID to monitor + timeout: Maximum time to wait in seconds (default: 1 hour) + polling_interval: Polling interval in seconds (default: 10) + + Returns: + Final ApplicationStatus + + Raises: + TimeoutError: If application doesn't complete within timeout + RuntimeError: If monitoring fails + ValueError: If submission_id not found + """ + raise NotImplementedError() + + +class SessionSparkBackend(SparkBackend): + """Abstract base class for session-oriented Spark backends. + + This interface defines the contract for backends that support interactive, + long-lived Spark sessions for exploratory data analysis and notebook workflows. + + Backends implementing this interface: + - ConnectBackend: Connects to Spark clusters via Spark Connect protocol (gRPC) + + Typical workflow: + 1. create_session() -> Returns ManagedSparkSession + 2. Use session.sql(), session.read(), etc. for interactive queries + 3. close_session() to release resources + + Unlike batch backends, sessions maintain state and support iterative development. + """ + + @abc.abstractmethod + def create_session( + self, + app_name: str, + **kwargs: Any, + ) -> "ManagedSparkSession": + """Create a new Spark Connect session. + + This establishes a connection to a Spark Connect server and returns + a managed session wrapper that provides the full PySpark DataFrame API. + + Args: + app_name: Name for the session/application + **kwargs: Backend-specific configuration (e.g., Spark configs) + + Returns: + ManagedSparkSession instance for interactive operations + + Raises: + RuntimeError: If session creation fails + ConnectionError: If cannot connect to Spark Connect server + TimeoutError: If connection times out + """ + raise NotImplementedError() + + @abc.abstractmethod + def get_session_status(self, session_id: str) -> SessionInfo: + """Get status and metadata of a Spark Connect session. 
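+
+        A minimal usage sketch (assumes a session previously returned by create_session()):
+
+            info = backend.get_session_status(session.session_id)
+            print(info.app_name, info.state)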
+ + Args: + session_id: Session UUID returned by create_session() + + Returns: + SessionInfo with session metadata, state, and metrics + + Raises: + RuntimeError: If request fails + ValueError: If session_id not found + """ + raise NotImplementedError() + + @abc.abstractmethod + def list_sessions(self) -> list[SessionInfo]: + """List all active Spark Connect sessions. + + Returns: + List of SessionInfo objects for active sessions + + Raises: + RuntimeError: If request fails + """ + raise NotImplementedError() + + @abc.abstractmethod + def close_session(self, session_id: str, release: bool = True) -> dict[str, Any]: + """Close a Spark Connect session. + + Args: + session_id: Session UUID to close + release: If True, release session resources on server + + Returns: + Dictionary with closure response + + Raises: + RuntimeError: If closure fails + ValueError: If session_id not found + """ + raise NotImplementedError() diff --git a/kubeflow/spark/backends/connect.py b/kubeflow/spark/backends/connect.py new file mode 100644 index 000000000..2ea1fa0fb --- /dev/null +++ b/kubeflow/spark/backends/connect.py @@ -0,0 +1,344 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Spark Connect backend for remote Spark cluster connectivity.""" + +from collections.abc import Iterator +import logging +from typing import Any, Optional +import uuid + +from kubeflow.spark.backends.base import SessionSparkBackend +from kubeflow.spark.models import ( + ApplicationStatus, + ConnectBackendConfig, + SessionInfo, + SparkApplicationResponse, +) +from kubeflow.spark.session import ManagedSparkSession + +logger = logging.getLogger(__name__) + + +class ConnectBackend(SessionSparkBackend): + """Spark Connect backend for remote connectivity to Spark clusters. + + This backend enables connection to existing Spark clusters via the Spark Connect + protocol (gRPC-based). It supports interactive, session-based workloads unlike + traditional batch-oriented backends. + + Features: + - Remote connectivity via Spark Connect (gRPC) + - Session management with isolation + - Interactive DataFrame operations + - Artifact upload (JARs, Python files, data) + - Authentication (Bearer token) + - SSL/TLS support + - Optional auto-provisioning of Spark Connect server + + Example: + ```python + from kubeflow.spark import SparkClient, ConnectBackendConfig + + config = ConnectBackendConfig( + connect_url="sc://spark-cluster.default.svc:15002", token="my-auth-token", use_ssl=True + ) + client = SparkClient(backend_config=config) + + # Create session + session = client.create_session(app_name="data-analysis") + + # Use PySpark API + df = session.sql("SELECT * FROM table") + result = df.collect() + + # Cleanup + session.close() + ``` + """ + + def __init__(self, config: ConnectBackendConfig): + """Initialize Spark Connect backend. 
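+
+        Example (the URL is illustrative):
+
+            config = ConnectBackendConfig(connect_url="sc://localhost:15002", use_ssl=False)
+            backend = ConnectBackend(config)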
+ + Args: + config: ConnectBackendConfig with connection details + + Raises: + ImportError: If pyspark[connect] is not installed + ValueError: If config is invalid + """ + self.config = config + self._sessions: dict[str, ManagedSparkSession] = {} + + # Validate and parse connection URL + self._validate_config() + + # Check for pyspark installation + try: + import pyspark + + pyspark_version = pyspark.__version__ + logger.info(f"Using PySpark version: {pyspark_version}") + + # Check for Connect support (requires 3.4+) + major, minor = map(int, pyspark_version.split(".")[:2]) + if major < 3 or (major == 3 and minor < 4): + raise ImportError( + f"Spark Connect requires PySpark 3.4+, found {pyspark_version}. " + "Please upgrade: pip install 'pyspark[connect]>=3.4.0'" + ) + except ImportError as e: + raise ImportError( + "PySpark with Connect support is required for ConnectBackend. " + "Install it with: pip install 'pyspark[connect]>=3.4.0'" + ) from e + + logger.info(f"Initialized ConnectBackend with URL: {self._get_masked_url()}") + + def _validate_config(self) -> None: + """Validate configuration. + + Raises: + ValueError: If config is invalid + """ + if not self.config.connect_url: + raise ValueError("connect_url is required") + + # Parse URL to validate format + if not self.config.connect_url.startswith("sc://"): + raise ValueError( + f"Invalid Spark Connect URL: {self.config.connect_url}. " + "Expected format: sc://host:port/;param1=value;param2=value" + ) + + # Parse URL components + url_without_scheme = self.config.connect_url[5:] # Remove "sc://" + if "/" in url_without_scheme: + host_port, params = url_without_scheme.split("/", 1) + else: + host_port = url_without_scheme + params = "" + + if ":" not in host_port: + raise ValueError( + f"Invalid Spark Connect URL: {self.config.connect_url}. " + "Expected format: sc://host:port/" + ) + + def _get_masked_url(self) -> str: + """Get connection URL with masked token. + + Returns: + URL string with token masked + """ + url = self.config.connect_url + if ";token=" in url: + parts = url.split(";token=") + return parts[0] + ";token=***" + return url + + def _build_connection_url(self) -> str: + """Build final connection URL with all parameters. + + For Spark Connect, most parameters should be set via builder.config() + rather than in the URL to avoid conflicts with server-side configs. + + Returns: + Complete Spark Connect URL + """ + # For Spark 4.x, use simple URL without parameters + # Parameters should be set via builder.config() instead + url = self.config.connect_url + + # Only add essential parameters that are part of the connection string + # SSL and authentication should be handled at connection level + # Avoid adding parameters like use_ssl in URL as they may conflict + + return url + + # ========================================================================= + # Session-Oriented Methods (Implemented) + # ========================================================================= + + def create_session( + self, + app_name: str, + **kwargs: Any, + ) -> ManagedSparkSession: + """Create a new Spark Connect session. 
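+
+        Example (the app name is illustrative):
+
+            session = backend.create_session(app_name="notebook-analysis")
+            session.sql("SELECT 1 AS id").show()
+            session.close()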
+ + Args: + app_name: Name for the session/application + **kwargs: Additional Spark configuration (passed to SparkSession.builder.config) + + Returns: + ManagedSparkSession instance + + Raises: + RuntimeError: If session creation fails + """ + try: + from pyspark.sql import SparkSession + + logger.debug("Starting create_session") + + # Generate session ID + session_id = str(uuid.uuid4()) + logger.debug(f"Generated session ID: {session_id}") + + # Build connection URL + connection_url = self._build_connection_url() + logger.debug(f"Connection URL: {connection_url}") + + # Create SparkSession builder + logger.debug("Creating SparkSession.builder.remote()") + builder = SparkSession.builder.remote(connection_url).appName(app_name) + logger.debug("Builder created, adding app name") + + # Apply additional configurations + for key, value in kwargs.items(): + logger.debug(f"Applying config: {key}={value}") + builder = builder.config(key, value) + + # Create session + logger.debug("About to call builder.getOrCreate() - THIS MAY HANG") + spark_session = builder.getOrCreate() + logger.debug("getOrCreate() returned successfully") + + # Wrap in ManagedSparkSession + logger.debug("Creating ManagedSparkSession wrapper") + managed_session = ManagedSparkSession( + session=spark_session, + session_id=session_id, + app_name=app_name, + backend=self, + ) + + # Track session + self._sessions[session_id] = managed_session + logger.debug("Session tracked in backend") + + logger.info(f"Created Spark Connect session: {session_id} (app: {app_name})") + return managed_session + + except Exception as e: + logger.error(f"Failed to create Spark Connect session: {e}") + raise RuntimeError(f"Failed to create session: {e}") from e + + def get_session_status(self, session_id: str) -> SessionInfo: + """Get status of a Spark Connect session. + + Args: + session_id: Session UUID + + Returns: + SessionInfo with session metadata + + Raises: + ValueError: If session not found + """ + if session_id not in self._sessions: + raise ValueError(f"Session not found: {session_id}") + + session = self._sessions[session_id] + return session.get_info() + + def list_sessions(self) -> list[SessionInfo]: + """List all active Spark Connect sessions. + + Returns: + List of SessionInfo objects + """ + return [session.get_info() for session in self._sessions.values()] + + def close_session(self, session_id: str, release: bool = True) -> dict[str, Any]: + """Close a Spark Connect session. + + Args: + session_id: Session UUID to close + release: If True, release session resources on server + + Returns: + Dictionary with closure response + + Raises: + ValueError: If session not found + """ + if session_id not in self._sessions: + raise ValueError(f"Session not found: {session_id}") + + session = self._sessions[session_id] + session.close(release=release) + + # Remove from tracking + del self._sessions[session_id] + + return { + "session_id": session_id, + "status": "closed", + "message": "Session closed successfully", + } + + def _clone_session(self, session: ManagedSparkSession) -> ManagedSparkSession: + """Internal method to clone a session. 
+ + Args: + session: Session to clone + + Returns: + New ManagedSparkSession + """ + try: + # Generate new session ID + new_session_id = str(uuid.uuid4()) + + # Clone the underlying PySpark session + # Note: PySpark Connect supports session cloning via newSession() + if hasattr(session.spark, "newSession"): + new_spark_session = session.spark.newSession() + else: + # Fallback: create new session (won't share state) + logger.warning("Session cloning not supported, creating new session instead") + return self.create_session(app_name=f"{session.app_name}-clone") + + # Wrap in ManagedSparkSession + cloned_session = ManagedSparkSession( + session=new_spark_session, + session_id=new_session_id, + app_name=f"{session.app_name}-clone", + backend=self, + ) + + # Track session + self._sessions[new_session_id] = cloned_session + + logger.info(f"Cloned session {session.session_id} -> {new_session_id}") + return cloned_session + + except Exception as e: + logger.error(f"Failed to clone session: {e}") + raise RuntimeError(f"Failed to clone session: {e}") from e + + def close(self): + """Close all sessions and cleanup resources.""" + logger.info(f"Closing ConnectBackend with {len(self._sessions)} active sessions") + + # Close all sessions + for session_id in list(self._sessions.keys()): + try: + self.close_session(session_id, release=True) + except Exception as e: + logger.error(f"Error closing session {session_id}: {e}") + + logger.info("ConnectBackend closed") diff --git a/kubeflow/spark/backends/gateway.py b/kubeflow/spark/backends/gateway.py new file mode 100644 index 000000000..058e5cbda --- /dev/null +++ b/kubeflow/spark/backends/gateway.py @@ -0,0 +1,383 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Gateway backend for Spark client (REST API based).""" + +from collections.abc import Iterator +import logging +import os +from typing import Any, Optional +from urllib.parse import urljoin + +from kubeflow.spark.backends.base import BatchSparkBackend +from kubeflow.spark.config import AuthMethod +from kubeflow.spark.models import ApplicationStatus, SparkApplicationResponse + +logger = logging.getLogger(__name__) + + +class GatewayBackend(BatchSparkBackend): + """Gateway backend for Spark applications. + + This backend communicates with a Batch Processing Gateway via REST API. + It's useful for managed Spark environments where you don't have direct + K8s access but can use a gateway service. + + Example: + from kubeflow.spark.backends.gateway import GatewayBackend, GatewayBackendConfig + + config = GatewayBackendConfig( + gateway_url="http://gateway:8080", + user="myuser", + auth_method=AuthMethod.HEADER + ) + backend = GatewayBackend(config) + """ + + def __init__(self, config: "GatewayBackendConfig"): + """Initialize the Gateway backend. 
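+
+        A minimal sketch (gateway URL is illustrative; no authentication configured):
+
+            backend = GatewayBackend(GatewayBackendConfig(gateway_url="http://gateway:8080"))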
+ + Args: + config: GatewayBackendConfig instance + """ + self.config = config + self._session = None + self._initialize_session() + + def _initialize_session(self): + """Initialize HTTP session with authentication.""" + try: + import requests + from requests.auth import HTTPBasicAuth + except ImportError: + raise ImportError( + "requests library is required for GatewayBackend. " + "Install with: pip install requests" + ) + + self._session = requests.Session() + self._session.verify = self.config.verify_ssl + + # Configure authentication + if self.config.auth_method == AuthMethod.BASIC: + if self.config.user and self.config.password: + self._session.auth = HTTPBasicAuth(self.config.user, self.config.password) + elif self.config.auth_method == AuthMethod.HEADER and self.config.user: + self._session.headers[self.config.auth_header_key] = self.config.user + + # Add extra headers + self._session.headers.update(self.config.extra_headers) + + def submit_application( + self, + app_name: str, + main_application_file: str, + spark_version: str = "3.5.0", + app_type: str = "Python", + driver_cores: int = 1, + driver_memory: str = "1g", + executor_cores: int = 1, + executor_memory: str = "1g", + num_executors: int = 2, + queue: Optional[str] = None, + arguments: Optional[list[str]] = None, + python_version: str = "3", + spark_conf: Optional[dict[str, str]] = None, + hadoop_conf: Optional[dict[str, str]] = None, + env_vars: Optional[dict[str, str]] = None, + deps: Optional[dict[str, list[str]]] = None, + **kwargs: Any, + ) -> SparkApplicationResponse: + """Submit a Spark application through the gateway. + + Args: + See SparkBackend.submit_application for parameter details + + Returns: + SparkApplicationResponse with submission details + + Raises: + RuntimeError: If submission fails + TimeoutError: If submission times out + """ + from kubeflow.spark.models import SparkApplicationRequest + + # Build request object + request = SparkApplicationRequest( + app_name=app_name, + main_application_file=main_application_file, + spark_version=spark_version, + app_type=app_type, + driver_cores=driver_cores, + driver_memory=driver_memory, + executor_cores=executor_cores, + executor_memory=executor_memory, + num_executors=num_executors, + queue=queue or self.config.default_queue, + arguments=arguments or [], + python_version=python_version, + spark_conf=spark_conf or {}, + hadoop_conf=hadoop_conf or {}, + env_vars=env_vars or {}, + deps=deps, + ) + + # Submit to gateway + url = urljoin(self.config.gateway_url, "/spark") + try: + response = self._session.post(url, json=request.to_dict(), timeout=self.config.timeout) + response.raise_for_status() + + return SparkApplicationResponse.from_dict(response.json()) + + except Exception as e: + raise RuntimeError(f"Failed to submit application to gateway: {e}") from e + + def get_job(self, submission_id: str) -> ApplicationStatus: + """Get status of a Spark application from gateway. 
+ + Args: + submission_id: Submission ID returned from submit_application + + Returns: + ApplicationStatus with current status + + Raises: + RuntimeError: If request fails + """ + url = urljoin(self.config.gateway_url, f"/spark/{submission_id}/status") + try: + response = self._session.get(url, timeout=self.config.timeout) + response.raise_for_status() + + return ApplicationStatus.from_dict(response.json()) + + except Exception as e: + raise RuntimeError(f"Failed to get status from gateway: {e}") from e + + def delete_job(self, submission_id: str) -> dict[str, Any]: + """Delete a Spark application through gateway. + + Args: + submission_id: Submission ID to delete + + Returns: + Dictionary with deletion response + + Raises: + RuntimeError: If deletion fails + """ + url = urljoin(self.config.gateway_url, f"/spark/{submission_id}") + try: + response = self._session.delete(url, timeout=self.config.timeout) + response.raise_for_status() + + return response.json() + + except Exception as e: + raise RuntimeError(f"Failed to delete application from gateway: {e}") from e + + def get_job_logs( + self, + submission_id: str, + executor_id: Optional[str] = None, + follow: bool = False, + ) -> Iterator[str]: + """Get application logs from gateway. + + Args: + submission_id: Submission ID + executor_id: Optional executor ID + follow: Whether to stream logs (not supported by gateway) + + Yields: + Log lines as strings + + Raises: + RuntimeError: If request fails + """ + if follow: + logger.warning("Log following is not supported by GatewayBackend") + + params = {"subId": submission_id} + if executor_id: + params["execId"] = executor_id + + url = urljoin(self.config.gateway_url, "/log") + try: + response = self._session.get(url, params=params, timeout=self.config.timeout) + response.raise_for_status() + + yield from response.text.splitlines() + + except Exception as e: + raise RuntimeError(f"Failed to get logs from gateway: {e}") from e + + def list_jobs( + self, + namespace: Optional[str] = None, + labels: Optional[dict[str, str]] = None, + ) -> list[ApplicationStatus]: + """List Spark applications from gateway. + + Note: Gateway backend may not support listing applications. + + Args: + namespace: Optional namespace filter (may not be supported) + labels: Optional label filters (may not be supported) + + Returns: + List of ApplicationStatus objects + + Raises: + NotImplementedError: If gateway doesn't support listing + """ + raise NotImplementedError( + "GatewayBackend does not support listing applications. " + "This feature is only available with OperatorBackend." + ) + + def wait_for_job_status( + self, + submission_id: str, + timeout: int = 3600, + polling_interval: int = 10, + ) -> ApplicationStatus: + """Wait for Spark application to complete. 
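+
+        A minimal usage sketch (the submission ID is illustrative):
+
+            status = backend.wait_for_job_status("spark-pi-143022", timeout=600, polling_interval=5)
+            print(status.state)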
+ + Args: + submission_id: Submission ID to monitor + timeout: Maximum time to wait in seconds + polling_interval: Polling interval in seconds + + Returns: + Final ApplicationStatus + + Raises: + TimeoutError: If application doesn't complete within timeout + """ + import time + + from kubeflow.spark.models import ApplicationState + + start_time = time.time() + + while True: + status = self.get_job(submission_id) + + # Check if application reached terminal state + if status.state in [ApplicationState.COMPLETED, ApplicationState.FAILED]: + return status + + # Check timeout + elapsed = time.time() - start_time + if elapsed >= timeout: + raise TimeoutError( + f"Application {submission_id} did not complete within {timeout}s" + ) + + logger.debug( + f"Application {submission_id} status: {status.state.value}. " + f"Waiting {polling_interval}s..." + ) + time.sleep(polling_interval) + + def close(self): + """Close HTTP session.""" + if self._session: + self._session.close() + + +class GatewayBackendConfig: + """Configuration for Gateway backend. + + Attributes: + gateway_url: URL of the Batch Processing Gateway + user: Username for authentication + password: Password for basic authentication + auth_method: Authentication method to use + auth_header_key: Header key for user authentication + timeout: Request timeout in seconds + verify_ssl: Whether to verify SSL certificates + default_queue: Default queue for job submission + default_spark_version: Default Spark version + extra_headers: Additional headers to include in requests + """ + + def __init__( + self, + gateway_url: str, + user: Optional[str] = None, + password: Optional[str] = None, + auth_method: AuthMethod = AuthMethod.NONE, + auth_header_key: str = "X-User", + timeout: int = 30, + verify_ssl: bool = True, + default_queue: str = "poc", + default_spark_version: str = "3.5.0", + extra_headers: Optional[dict[str, str]] = None, + ): + """Initialize Gateway backend configuration. + + Args: + gateway_url: URL of the Batch Processing Gateway + user: Username for authentication + password: Password for basic authentication + auth_method: Authentication method to use + auth_header_key: Header key for user authentication + timeout: Request timeout in seconds + verify_ssl: Whether to verify SSL certificates + default_queue: Default queue for job submission + default_spark_version: Default Spark version + extra_headers: Additional headers to include + """ + self.gateway_url = gateway_url + self.user = user + self.password = password + self.auth_method = auth_method + self.auth_header_key = auth_header_key + self.timeout = timeout + self.verify_ssl = verify_ssl + self.default_queue = default_queue + self.default_spark_version = default_spark_version + self.extra_headers = extra_headers or {} + + @classmethod + def from_env(cls, prefix: str = "KUBEFLOW_SPARK_") -> "GatewayBackendConfig": + """Create config from environment variables. 
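+
+        Example (assumes KUBEFLOW_SPARK_GATEWAY_URL is already set in the environment):
+
+            config = GatewayBackendConfig.from_env()
+            backend = GatewayBackend(config)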
+ + Args: + prefix: Prefix for environment variables + + Returns: + GatewayBackendConfig instance + + Environment variables: + - {prefix}GATEWAY_URL (required) + - {prefix}USER + - {prefix}PASSWORD + - {prefix}AUTH_METHOD (basic|header|none) + - {prefix}DEFAULT_QUEUE + - {prefix}DEFAULT_SPARK_VERSION + """ + return cls( + gateway_url=os.getenv(f"{prefix}GATEWAY_URL", ""), + user=os.getenv(f"{prefix}USER"), + password=os.getenv(f"{prefix}PASSWORD"), + auth_method=AuthMethod(os.getenv(f"{prefix}AUTH_METHOD", "none").lower()), + timeout=int(os.getenv(f"{prefix}TIMEOUT", "30")), + verify_ssl=os.getenv(f"{prefix}VERIFY_SSL", "true").lower() == "true", + default_queue=os.getenv(f"{prefix}DEFAULT_QUEUE", "poc"), + default_spark_version=os.getenv(f"{prefix}DEFAULT_SPARK_VERSION", "3.5.0"), + ) diff --git a/kubeflow/spark/backends/operator.py b/kubeflow/spark/backends/operator.py new file mode 100644 index 000000000..424df55c2 --- /dev/null +++ b/kubeflow/spark/backends/operator.py @@ -0,0 +1,834 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Kubernetes Spark Operator backend implementation.""" + +from collections.abc import Iterator +from dataclasses import dataclass, field +import logging +import multiprocessing +import time +from typing import Any, Optional + +from kubernetes import client, config as k8s_config, watch + +from kubeflow.spark.backends.base import BatchSparkBackend +from kubeflow.spark.models import ( + ApplicationState, + ApplicationStatus, + SparkApplicationResponse, +) + +logger = logging.getLogger(__name__) + + +# Constants for Spark Operator +SPARK_OPERATOR_API_GROUP = "sparkoperator.k8s.io" +SPARK_OPERATOR_API_VERSION = "v1beta2" +SPARK_APPLICATION_PLURAL = "sparkapplications" +SPARK_APPLICATION_KIND = "SparkApplication" +DEFAULT_TIMEOUT = 60 # seconds + + +@dataclass +class OperatorBackendConfig: + """Configuration for Spark Operator backend. 
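+
+    Example (values are illustrative):
+
+        config = OperatorBackendConfig(namespace="spark-jobs", enable_ui=False)
+        backend = OperatorBackend(config)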
+ + Attributes: + namespace: Kubernetes namespace to use + context: Kubernetes context name + config_file: Path to kubeconfig file + client_configuration: Custom Kubernetes client configuration + service_account: Service account for Spark pods + image_pull_policy: Image pull policy (IfNotPresent, Always, Never) + default_spark_image: Default Docker image for Spark + timeout: Default timeout for API operations in seconds + enable_monitoring: Enable Prometheus monitoring + enable_ui: Enable Spark UI service + """ + + namespace: Optional[str] = None + context: Optional[str] = None + config_file: Optional[str] = None + client_configuration: Optional[client.Configuration] = None + service_account: str = "spark-operator-spark" + image_pull_policy: str = "IfNotPresent" + default_spark_image: str = "gcr.io/spark-operator/spark-py" + timeout: int = DEFAULT_TIMEOUT + enable_monitoring: bool = True + enable_ui: bool = True + extra_labels: dict[str, str] = field(default_factory=dict) + extra_annotations: dict[str, str] = field(default_factory=dict) + + +class OperatorBackend(BatchSparkBackend): + """Kubernetes Spark Operator backend. + + This backend uses the Kubeflow Spark Operator to manage Spark applications + on Kubernetes. It creates SparkApplication CRDs that the operator watches + and converts into Kubernetes pods. + + Example: + config = OperatorBackendConfig(namespace="spark-jobs") + backend = OperatorBackend(config) + response = backend.submit_application( + app_name="my-spark-job", + main_application_file="local:///app/main.py", + ... + ) + """ + + def __init__(self, config: OperatorBackendConfig): + """Initialize the Operator backend. + + Args: + config: OperatorBackendConfig instance + """ + self.config = config + + # Determine namespace + if self.config.namespace is None: + self.config.namespace = self._get_default_namespace() + + # Load Kubernetes configuration + if self.config.client_configuration is None: + if self.config.config_file or not self._is_running_in_k8s(): + k8s_config.load_kube_config( + config_file=self.config.config_file, + context=self.config.context, + ) + else: + k8s_config.load_incluster_config() + + # Initialize Kubernetes API clients + k8s_client = client.ApiClient(self.config.client_configuration) + self.custom_api = client.CustomObjectsApi(k8s_client) + self.core_api = client.CoreV1Api(k8s_client) + + logger.info(f"Initialized OperatorBackend with namespace: {self.config.namespace}") + + def submit_application( + self, + app_name: str, + main_application_file: str, + spark_version: str = "3.5.0", + app_type: str = "Python", + driver_cores: int = 1, + driver_memory: str = "1g", + executor_cores: int = 1, + executor_memory: str = "1g", + num_executors: int = 2, + queue: Optional[str] = None, + arguments: Optional[list[str]] = None, + python_version: str = "3", + spark_conf: Optional[dict[str, str]] = None, + hadoop_conf: Optional[dict[str, str]] = None, + env_vars: Optional[dict[str, str]] = None, + deps: Optional[dict[str, list[str]]] = None, + **kwargs: Any, + ) -> SparkApplicationResponse: + """Submit a Spark application using Spark Operator. + + Creates a SparkApplication CRD in Kubernetes which the Spark Operator + watches and converts into driver and executor pods. 
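+
+        A minimal usage sketch (paths and sizing are illustrative):
+
+            response = backend.submit_application(
+                app_name="spark-pi",
+                main_application_file="local:///opt/spark/examples/src/main/python/pi.py",
+                spark_version="4.0.0",
+                num_executors=2,
+            )
+            print(response.submission_id)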
+ + Args: + app_name: Name of the application (must be DNS-compliant) + main_application_file: Path to main application file + spark_version: Spark version to use + app_type: Application type (Python, Scala, Java, R) + driver_cores: Number of cores for driver + driver_memory: Memory for driver + executor_cores: Number of cores per executor + executor_memory: Memory per executor + num_executors: Number of executors + queue: Namespace to submit to (overrides config namespace) + arguments: Application arguments + python_version: Python version + spark_conf: Spark configuration properties + hadoop_conf: Hadoop configuration properties + env_vars: Environment variables + deps: Dependencies dict with keys: jars, pyFiles, files + **kwargs: Additional parameters (volumes, node_selector, tolerations, etc.) + + Returns: + SparkApplicationResponse with submission details + + Raises: + ValueError: If required parameters are invalid + RuntimeError: If submission fails + TimeoutError: If submission times out + """ + # Validate app_name is DNS-compliant + if not self._is_valid_k8s_name(app_name): + raise ValueError( + f"app_name '{app_name}' must be DNS-compliant " + "(lowercase alphanumeric characters, '-' or '.')" + ) + + # Determine target namespace + target_namespace = queue if queue else self.config.namespace + + # Build SparkApplication CRD + spark_app = self._build_spark_application_crd( + app_name=app_name, + main_application_file=main_application_file, + spark_version=spark_version, + app_type=app_type, + driver_cores=driver_cores, + driver_memory=driver_memory, + executor_cores=executor_cores, + executor_memory=executor_memory, + num_executors=num_executors, + arguments=arguments or [], + python_version=python_version, + spark_conf=spark_conf or {}, + hadoop_conf=hadoop_conf or {}, + env_vars=env_vars or {}, + deps=deps, + **kwargs, + ) + + # Submit to Kubernetes + try: + thread = self.custom_api.create_namespaced_custom_object( + group=SPARK_OPERATOR_API_GROUP, + version=SPARK_OPERATOR_API_VERSION, + namespace=target_namespace, + plural=SPARK_APPLICATION_PLURAL, + body=spark_app, + async_req=True, + ) + result = thread.get(self.config.timeout) + + logger.info(f"SparkApplication {target_namespace}/{app_name} created successfully") + + return SparkApplicationResponse( + submission_id=app_name, + app_name=app_name, + status="SUBMITTED", + message=f"SparkApplication created in namespace {target_namespace}", + ) + + except multiprocessing.TimeoutError as e: + raise TimeoutError( + f"Timeout creating SparkApplication {target_namespace}/{app_name}" + ) from e + except Exception as e: + raise RuntimeError( + f"Failed to create SparkApplication {target_namespace}/{app_name}: {e}" + ) from e + + def get_job(self, submission_id: str) -> ApplicationStatus: + """Get status of a Spark application. 
+ + Args: + submission_id: Name of the SparkApplication (same as app_name) + + Returns: + ApplicationStatus with current status + + Raises: + RuntimeError: If request fails + TimeoutError: If request times out + """ + try: + thread = self.custom_api.get_namespaced_custom_object( + group=SPARK_OPERATOR_API_GROUP, + version=SPARK_OPERATOR_API_VERSION, + namespace=self.config.namespace, + plural=SPARK_APPLICATION_PLURAL, + name=submission_id, + async_req=True, + ) + spark_app = thread.get(self.config.timeout) + + return self._parse_application_status(spark_app) + + except multiprocessing.TimeoutError as e: + raise TimeoutError( + f"Timeout getting SparkApplication {self.config.namespace}/{submission_id}" + ) from e + except Exception as e: + raise RuntimeError( + f"Failed to get SparkApplication {self.config.namespace}/{submission_id}: {e}" + ) from e + + def delete_job(self, submission_id: str) -> dict[str, Any]: + """Delete a Spark application. + + Args: + submission_id: Name of the SparkApplication to delete + + Returns: + Dictionary with deletion response + + Raises: + RuntimeError: If deletion fails + TimeoutError: If deletion times out + """ + try: + thread = self.custom_api.delete_namespaced_custom_object( + group=SPARK_OPERATOR_API_GROUP, + version=SPARK_OPERATOR_API_VERSION, + namespace=self.config.namespace, + plural=SPARK_APPLICATION_PLURAL, + name=submission_id, + async_req=True, + ) + result = thread.get(self.config.timeout) + + logger.info(f"SparkApplication {self.config.namespace}/{submission_id} deleted") + + return { + "status": "deleted", + "message": f"Application {submission_id} deleted", + } + + except multiprocessing.TimeoutError as e: + raise TimeoutError( + f"Timeout deleting SparkApplication {self.config.namespace}/{submission_id}" + ) from e + except Exception as e: + raise RuntimeError( + f"Failed to delete SparkApplication {self.config.namespace}/{submission_id}: {e}" + ) from e + + def get_job_logs( + self, + submission_id: str, + executor_id: Optional[str] = None, + follow: bool = False, + ) -> Iterator[str]: + """Get application logs from driver or executor pods. + + Args: + submission_id: Name of the SparkApplication + executor_id: Optional executor ID (e.g., "1", "2"). 
If None, returns driver logs + follow: Whether to stream logs in real-time + + Yields: + Log lines as strings + + Raises: + RuntimeError: If request fails + """ + # Determine pod name based on executor_id + if executor_id: + # Executor pod naming: - + pod_name = f"{submission_id}-{executor_id}" + container_name = "executor" + else: + # Driver pod naming: -driver + pod_name = f"{submission_id}-driver" + container_name = "spark-kubernetes-driver" + + try: + if follow: + # Stream logs in real-time + log_stream = watch.Watch().stream( + self.core_api.read_namespaced_pod_log, + name=pod_name, + namespace=self.config.namespace, + container=container_name, + follow=True, + ) + yield from log_stream + else: + # Get all logs at once + logs = self.core_api.read_namespaced_pod_log( + name=pod_name, + namespace=self.config.namespace, + container=container_name, + ) + yield from logs.splitlines() + + except client.exceptions.ApiException as e: + if e.status == 404: + logger.warning(f"Pod {pod_name} not found in namespace {self.config.namespace}") + return + elif e.status == 400 and ( + "waiting to start" in str(e.body) or "ContainerCreating" in str(e.body) + ): + # Pod exists but container is not ready yet + # Check if it's a "waiting to start" error + logger.warning( + f"Pod {pod_name} is not ready yet (ContainerCreating). " + "Wait for pod to be running before fetching logs." + ) + return + elif e.status == 400: + # Otherwise, it's a different 400 error + raise RuntimeError( + f"Failed to read logs for pod {self.config.namespace}/{pod_name}: {e}" + ) from e + raise RuntimeError( + f"Failed to read logs for pod {self.config.namespace}/{pod_name}: {e}" + ) from e + except Exception as e: + raise RuntimeError( + f"Failed to read logs for pod {self.config.namespace}/{pod_name}: {e}" + ) from e + + def list_jobs( + self, + namespace: Optional[str] = None, + labels: Optional[dict[str, str]] = None, + ) -> list[ApplicationStatus]: + """List Spark applications. + + Args: + namespace: Optional namespace filter (uses config namespace if None) + labels: Optional label filters + + Returns: + List of ApplicationStatus objects + + Raises: + RuntimeError: If request fails + TimeoutError: If request times out + """ + target_namespace = namespace or self.config.namespace + + try: + # Build label selector + label_selector = None + if labels: + label_selector = ",".join([f"{k}={v}" for k, v in labels.items()]) + + thread = self.custom_api.list_namespaced_custom_object( + group=SPARK_OPERATOR_API_GROUP, + version=SPARK_OPERATOR_API_VERSION, + namespace=target_namespace, + plural=SPARK_APPLICATION_PLURAL, + label_selector=label_selector, + async_req=True, + ) + result = thread.get(self.config.timeout) + + applications = [] + for item in result.get("items", []): + applications.append(self._parse_application_status(item)) + + return applications + + except multiprocessing.TimeoutError as e: + raise TimeoutError( + f"Timeout listing SparkApplications in namespace {target_namespace}" + ) from e + except Exception as e: + raise RuntimeError( + f"Failed to list SparkApplications in namespace {target_namespace}: {e}" + ) from e + + def wait_for_job_status( + self, + submission_id: str, + timeout: int = 3600, + polling_interval: int = 10, + ) -> ApplicationStatus: + """Wait for Spark application to complete. 
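+
+        A minimal usage sketch (the submission ID is illustrative):
+
+            status = backend.wait_for_job_status("spark-pi", timeout=1800)
+            if status.state == ApplicationState.COMPLETED:
+                print("Application finished successfully")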
+ + Args: + submission_id: Name of the SparkApplication + timeout: Maximum time to wait in seconds + polling_interval: Polling interval in seconds + + Returns: + Final ApplicationStatus + + Raises: + TimeoutError: If application doesn't complete within timeout + RuntimeError: If monitoring fails + """ + start_time = time.time() + + while True: + status = self.get_job(submission_id) + + # Check if application reached terminal state + if status.state in [ApplicationState.COMPLETED, ApplicationState.FAILED]: + return status + + # Check timeout + elapsed = time.time() - start_time + if elapsed >= timeout: + raise TimeoutError( + f"Application {submission_id} did not complete within {timeout}s. " + f"Last status: {status.state.value}" + ) + + logger.debug( + f"Application {submission_id} status: {status.state.value}. " + f"Waiting {polling_interval}s... ({int(elapsed)}s elapsed)" + ) + time.sleep(polling_interval) + + def _build_spark_application_crd( + self, + app_name: str, + main_application_file: str, + spark_version: str, + app_type: str, + driver_cores: int, + driver_memory: str, + executor_cores: int, + executor_memory: str, + num_executors: int, + arguments: list[str], + python_version: str, + spark_conf: dict[str, str], + hadoop_conf: dict[str, str], + env_vars: dict[str, str], + deps: Optional[dict[str, list[str]]], + **kwargs: Any, + ) -> dict[str, Any]: + """Build SparkApplication CRD specification. + + Args: + All parameters from submit_application + **kwargs: Additional parameters like volumes, node_selector, etc. + + Returns: + SparkApplication CRD dictionary + """ + # Build base CRD structure + spark_app: dict[str, Any] = { + "apiVersion": f"{SPARK_OPERATOR_API_GROUP}/{SPARK_OPERATOR_API_VERSION}", + "kind": SPARK_APPLICATION_KIND, + "metadata": { + "name": app_name, + "labels": { + "app": app_name, + "version": spark_version, + **self.config.extra_labels, + }, + "annotations": self.config.extra_annotations, + }, + "spec": { + "type": app_type, + "mode": "cluster", + "image": f"{self.config.default_spark_image}:{spark_version}", + "imagePullPolicy": self.config.image_pull_policy, + "mainApplicationFile": main_application_file, + "sparkVersion": spark_version, + "restartPolicy": self._build_restart_policy(kwargs.get("restart_policy")), + "driver": { + "cores": driver_cores, + "memory": driver_memory, + "serviceAccount": self.config.service_account, + "labels": {"version": spark_version, "component": "driver"}, + }, + "executor": { + "cores": executor_cores, + "instances": num_executors, + "memory": executor_memory, + "labels": {"version": spark_version, "component": "executor"}, + }, + }, + } + + # Add optional fields + if arguments: + spark_app["spec"]["arguments"] = arguments + + # Add main class for Scala/Java applications + if "main_class" in kwargs and kwargs["main_class"]: + spark_app["spec"]["mainClass"] = kwargs["main_class"] + + if spark_conf: + spark_app["spec"]["sparkConf"] = spark_conf + + if hadoop_conf: + spark_app["spec"]["hadoopConf"] = hadoop_conf + + # Add environment variables + if env_vars: + env_list = [{"name": k, "value": v} for k, v in env_vars.items()] + spark_app["spec"]["driver"]["env"] = env_list + spark_app["spec"]["executor"]["env"] = env_list + + # Add dependencies + if deps: + spark_app["spec"]["deps"] = deps + + # Add Python version for Python apps + if app_type == "Python": + spark_app["spec"]["pythonVersion"] = python_version + + # Add monitoring if enabled + if self.config.enable_monitoring: + spark_app["spec"]["monitoring"] = { + 
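+                    # Expose driver/executor metrics via the Prometheus JMX exporter.
+                    # The agent jar path below assumes an image that bundles the exporter
+                    # (e.g. the default spark-operator images); adjust it for custom images.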
"exposeDriverMetrics": True, + "exposeExecutorMetrics": True, + "prometheus": { + "jmxExporterJar": "/prometheus/jmx_prometheus_javaagent-0.11.0.jar", + "port": 8090, + }, + } + + # Add Spark UI service if enabled + if self.config.enable_ui: + spark_app["spec"]["sparkUIOptions"] = { + "servicePort": 4040, + "serviceType": "ClusterIP", # Required for service creation + } + + # Add volumes if specified + if "volumes" in kwargs: + spark_app["spec"]["volumes"] = kwargs["volumes"] + if "driver_volume_mounts" in kwargs: + spark_app["spec"]["driver"]["volumeMounts"] = kwargs["driver_volume_mounts"] + if "executor_volume_mounts" in kwargs: + spark_app["spec"]["executor"]["volumeMounts"] = kwargs["executor_volume_mounts"] + + # Add node selector if specified + if "node_selector" in kwargs: + spark_app["spec"]["driver"]["nodeSelector"] = kwargs["node_selector"] + spark_app["spec"]["executor"]["nodeSelector"] = kwargs["node_selector"] + + # Add tolerations if specified + if "tolerations" in kwargs: + spark_app["spec"]["driver"]["tolerations"] = kwargs["tolerations"] + spark_app["spec"]["executor"]["tolerations"] = kwargs["tolerations"] + + # Add resource limits if specified + if "driver_limits" in kwargs: + if "limits" not in spark_app["spec"]["driver"]: + spark_app["spec"]["driver"]["limits"] = {} + spark_app["spec"]["driver"]["limits"].update(kwargs["driver_limits"]) + + if "executor_limits" in kwargs: + if "limits" not in spark_app["spec"]["executor"]: + spark_app["spec"]["executor"]["limits"] = {} + spark_app["spec"]["executor"]["limits"].update(kwargs["executor_limits"]) + + # Add dynamic allocation if specified + if kwargs.get("enable_dynamic_allocation"): + spark_app["spec"]["dynamicAllocation"] = { + "enabled": True, + "initialExecutors": kwargs.get("initial_executors", num_executors), + "minExecutors": kwargs.get("min_executors", 1), + "maxExecutors": kwargs.get("max_executors", num_executors * 2), + } + + # Add time_to_live_seconds if specified + if "time_to_live_seconds" in kwargs and kwargs["time_to_live_seconds"]: + spark_app["spec"]["timeToLiveSeconds"] = kwargs["time_to_live_seconds"] + + # Add labels if specified + if "labels" in kwargs and kwargs["labels"]: + spark_app["metadata"]["labels"].update(kwargs["labels"]) + + return spark_app + + def _build_restart_policy(self, restart_policy: Optional[Any]) -> dict[str, Any]: + """Build restart policy dict from RestartPolicy object or default. 
+ + Args: + restart_policy: RestartPolicy object or None + + Returns: + Restart policy dictionary + """ + from kubeflow.spark.models import RestartPolicy, RestartPolicyType + + if restart_policy is None: + return {"type": "Never"} + + # If it's already a RestartPolicy object + if isinstance(restart_policy, RestartPolicy): + policy_dict = { + "type": restart_policy.type.value + if isinstance(restart_policy.type, RestartPolicyType) + else restart_policy.type + } + if restart_policy.on_failure_retries is not None: + policy_dict["onFailureRetries"] = restart_policy.on_failure_retries + if restart_policy.on_failure_retry_interval: + policy_dict["onFailureRetryInterval"] = restart_policy.on_failure_retry_interval + if restart_policy.on_submission_failure_retries is not None: + policy_dict["onSubmissionFailureRetries"] = ( + restart_policy.on_submission_failure_retries + ) + if restart_policy.on_submission_failure_retry_interval: + policy_dict["onSubmissionFailureRetryInterval"] = ( + restart_policy.on_submission_failure_retry_interval + ) + return policy_dict + + # If it's a string, use it as type + if isinstance(restart_policy, str): + return {"type": restart_policy} + + # Default + return {"type": "Never"} + + def _parse_application_status(self, spark_app: dict[str, Any]) -> ApplicationStatus: + """Parse SparkApplication CRD status into ApplicationStatus. + + Args: + spark_app: SparkApplication CRD dictionary + + Returns: + ApplicationStatus object + """ + metadata = spark_app.get("metadata", {}) + status = spark_app.get("status", {}) + app_state_dict = status.get("applicationState", {}) + + # Parse state + state_str = app_state_dict.get("state", "UNKNOWN") + try: + state = ApplicationState(state_str) + except ValueError: + logger.warning(f"Unknown application state: {state_str}") + state = ApplicationState.UNKNOWN + + return ApplicationStatus( + submission_id=metadata.get("name", ""), + app_id=status.get("sparkApplicationId"), + app_name=metadata.get("name"), + state=state, + submission_time=status.get("submissionTime"), + start_time=status.get("lastSubmissionAttemptTime"), + completion_time=status.get("terminationTime"), + driver_info=status.get("driverInfo"), + executor_state=status.get("executorState"), + ) + + def _is_valid_k8s_name(self, name: str) -> bool: + """Check if name is DNS-compliant for Kubernetes. + + Args: + name: Name to validate + + Returns: + True if valid, False otherwise + """ + import re + + # Kubernetes resource names must be lowercase alphanumeric, '-' or '.' + # and start/end with alphanumeric + pattern = r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$" + return bool(re.match(pattern, name)) and len(name) <= 253 + + def _get_default_namespace(self) -> str: + """Get default Kubernetes namespace. + + Returns: + Default namespace string + """ + import os + + # Try to get from environment + namespace = os.getenv("NAMESPACE") + if namespace: + return namespace + + # Try to read from service account + try: + with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f: + return f.read().strip() + except FileNotFoundError: + pass + + # Default to "default" + return "default" + + def _is_running_in_k8s(self) -> bool: + """Check if running inside a Kubernetes cluster. 
+ + Returns: + True if running in cluster, False otherwise + """ + import os + + return os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount/token") + + def wait_for_pod_ready( + self, + submission_id: str, + executor_id: Optional[str] = None, + timeout: int = 300, + ) -> bool: + """Wait for driver or executor pod to be ready. + + Args: + submission_id: Name of the SparkApplication + executor_id: Optional executor ID. If None, waits for driver pod + timeout: Maximum time to wait in seconds + + Returns: + True if pod becomes ready, False if timeout + + Raises: + RuntimeError: If pod check fails + """ + # Determine pod name + if executor_id: + pod_name = f"{submission_id}-{executor_id}" + else: + pod_name = f"{submission_id}-driver" + + start_time = time.time() + + while True: + try: + pod = self.core_api.read_namespaced_pod( + name=pod_name, namespace=self.config.namespace + ) + + # Check if pod is running and container is ready + if pod.status.phase == "Running" and pod.status.container_statuses: + # Check if containers are ready + for container_status in pod.status.container_statuses: + if container_status.ready: + logger.info(f"Pod {pod_name} is ready") + return True + + # Check if pod failed + if pod.status.phase in ["Failed", "Unknown"]: + logger.warning(f"Pod {pod_name} is in {pod.status.phase} state") + return False + + # Check timeout + elapsed = time.time() - start_time + if elapsed >= timeout: + logger.warning( + f"Timeout waiting for pod {pod_name} to be ready. " + f"Current phase: {pod.status.phase}" + ) + return False + + # Wait before next check + time.sleep(2) + + except client.exceptions.ApiException as e: + if e.status == 404: + # Pod doesn't exist yet + elapsed = time.time() - start_time + if elapsed >= timeout: + logger.warning(f"Timeout waiting for pod {pod_name} to be created") + return False + time.sleep(2) + continue + raise RuntimeError( + f"Failed to check pod {self.config.namespace}/{pod_name}: {e}" + ) from e + + def close(self): + """Close Kubernetes API client connections.""" + if hasattr(self, "custom_api") and self.custom_api.api_client: + self.custom_api.api_client.close() diff --git a/kubeflow/spark/base_client.py b/kubeflow/spark/base_client.py new file mode 100644 index 000000000..62a5b7598 --- /dev/null +++ b/kubeflow/spark/base_client.py @@ -0,0 +1,96 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base client class for Spark SDK. + +This module provides the abstract base class for all Spark clients, +implementing shared functionality like resource management, context +manager protocol, and logging. +""" + +import abc +import logging +from typing import Any + +from kubeflow.spark.backends.base import SparkBackend + + +class BaseSparkClient(abc.ABC): + """Abstract base class for Spark clients. 
+ + This class implements common functionality shared by all Spark client types: + - Resource management (close() method) + - Context manager protocol (__enter__/__exit__) + - Logging infrastructure + + Subclasses (BatchSparkClient, SparkSessionClient) implement specific + functionality for their use cases. + + This design follows the Template Method Pattern, where the base class + defines the skeleton of operations and subclasses fill in specific steps. + """ + + def __init__(self, backend: SparkBackend): + """Initialize the base client. + + Args: + backend: Spark backend instance (BatchSparkBackend or SessionSparkBackend) + """ + self._backend = backend + self._logger = logging.getLogger(self.__class__.__name__) + self._logger.info(f"Initialized {self.__class__.__name__} with {backend.__class__.__name__}") + + def close(self): + """Close the client and release all resources. + + This method delegates to the backend's close() method to clean up: + - Kubernetes API clients + - HTTP connections + - gRPC channels + - Active sessions + + It's safe to call this multiple times. + """ + try: + self._backend.close() + self._logger.info(f"{self.__class__.__name__} closed successfully") + except Exception as e: + self._logger.error(f"Error closing {self.__class__.__name__}: {e}") + raise + + def __enter__(self): + """Context manager entry. + + Returns: + Self for use in with statements + """ + return self + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): + """Context manager exit - ensures cleanup. + + Args: + exc_type: Exception type if an exception occurred + exc_val: Exception value if an exception occurred + exc_tb: Exception traceback if an exception occurred + """ + self.close() + + def __repr__(self) -> str: + """String representation. + + Returns: + String describing the client and backend + """ + return f"{self.__class__.__name__}(backend={self._backend.__class__.__name__})" diff --git a/kubeflow/spark/batch_client.py b/kubeflow/spark/batch_client.py new file mode 100644 index 000000000..a6a0c115d --- /dev/null +++ b/kubeflow/spark/batch_client.py @@ -0,0 +1,421 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Batch Spark client for managing Spark applications.""" + +from collections.abc import Iterator +from typing import Any, Optional, Union + +from kubeflow.spark.base_client import BaseSparkClient +from kubeflow.spark.backends.base import BatchSparkBackend +from kubeflow.spark.backends.gateway import ( + GatewayBackend, + GatewayBackendConfig, +) +from kubeflow.spark.backends.operator import ( + OperatorBackend, + OperatorBackendConfig, +) +from kubeflow.spark.models import ( + ApplicationStatus, + SparkApplicationResponse, +) + + +class BatchSparkClient(BaseSparkClient): + """Client for managing batch Spark applications. + + This client provides a high-level API for submitting and managing batch + Spark applications using either the Kubernetes Spark Operator or REST gateways. 
+ + Supported backends: + - **OperatorBackend**: Submits SparkApplication CRDs to Kubernetes (recommended) + - **GatewayBackend**: Submits jobs via REST API to Spark gateways (Livy, etc.) + + Example with Operator Backend: + ```python + from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + + # Initialize client + config = OperatorBackendConfig(namespace="spark-jobs") + client = BatchSparkClient(backend_config=config) + + # Submit application + response = client.submit_application( + app_name="my-etl-job", + main_application_file="s3a://bucket/jobs/etl.py", + driver_cores=2, + driver_memory="4g", + executor_cores=4, + executor_memory="8g", + num_executors=10, + ) + + # Wait for completion + status = client.wait_for_job_status(response.submission_id) + print(f"Job completed with state: {status.state}") + + # Get logs + for line in client.get_job_logs(response.submission_id): + print(line) + ``` + + Example with Gateway Backend: + ```python + from kubeflow.spark import BatchSparkClient, GatewayBackendConfig + + config = GatewayBackendConfig( + gateway_url="http://livy-gateway:8998", + user="myuser" + ) + client = BatchSparkClient(backend_config=config) + ``` + + Context Manager: + ```python + with BatchSparkClient(backend_config=config) as client: + response = client.submit_application(...) + # Cleanup happens automatically + ``` + """ + + def __init__( + self, + backend_config: Union[OperatorBackendConfig, GatewayBackendConfig, None] = None, + ): + """Initialize Batch Spark client. + + Args: + backend_config: Backend configuration: + - OperatorBackendConfig: Kubernetes with Spark Operator (default) + - GatewayBackendConfig: REST API gateway + + Raises: + ValueError: If invalid backend configuration provided + """ + # Default to OperatorBackend + if backend_config is None: + backend_config = OperatorBackendConfig() + + # Initialize appropriate backend + if isinstance(backend_config, OperatorBackendConfig): + backend: BatchSparkBackend = OperatorBackend(backend_config) + elif isinstance(backend_config, GatewayBackendConfig): + backend = GatewayBackend(backend_config) + else: + raise ValueError( + f"Invalid backend config type for BatchSparkClient: {type(backend_config)}. " + "Expected OperatorBackendConfig or GatewayBackendConfig." + ) + + # Initialize base class + super().__init__(backend) + + def submit_application( + self, + app_name: Optional[str] = None, + main_application_file: str = "", + spark_version: str = "3.5.0", + app_type: str = "Python", + driver_cores: int = 1, + driver_memory: str = "1g", + executor_cores: int = 1, + executor_memory: str = "1g", + num_executors: int = 2, + queue: Optional[str] = None, + arguments: Optional[list[str]] = None, + python_version: str = "3", + spark_conf: Optional[dict[str, str]] = None, + hadoop_conf: Optional[dict[str, str]] = None, + env_vars: Optional[dict[str, str]] = None, + deps: Optional[dict[str, list[str]]] = None, + **kwargs: Any, + ) -> SparkApplicationResponse: + """Submit a Spark application for batch execution. + + Args: + app_name: Name of the application. If not provided, a unique name will be + auto-generated. Must be unique within the namespace. (optional) + main_application_file: Path to main application file + Supported formats: local://, s3a://, http://, etc. 
+ spark_version: Spark version (default: "3.5.0") + app_type: Application type: "Python", "Scala", "Java", "R" (default: "Python") + driver_cores: Number of CPU cores for driver (default: 1) + driver_memory: Memory for driver, e.g., "1g", "512m" (default: "1g") + executor_cores: Number of CPU cores per executor (default: 1) + executor_memory: Memory per executor, e.g., "1g", "2g" (default: "1g") + num_executors: Number of executors (default: 2) + queue: Queue/namespace for submission (backend-specific, optional) + arguments: Command-line arguments for the main file (optional) + python_version: Python version for PySpark: "2" or "3" (default: "3") + spark_conf: Spark configuration properties (spark.*), optional + hadoop_conf: Hadoop configuration properties, optional + env_vars: Environment variables for driver and executors, optional + deps: Dependencies dict with keys: "jars", "pyFiles", "files", optional + **kwargs: Additional backend-specific parameters (e.g., volumes, GPUs) + + Returns: + SparkApplicationResponse with submission_id and initial status + + Raises: + RuntimeError: If submission fails + TimeoutError: If submission times out + ValueError: If invalid parameters provided + + Example: + ```python + # With explicit name + response = client.submit_application( + app_name="data-processing", + main_application_file="s3a://my-bucket/jobs/process.py", + driver_cores=2, + driver_memory="4g", + ) + + # With auto-generated name (recommended) + response = client.submit_application( + main_application_file="s3a://my-bucket/jobs/process.py", + driver_cores=2, + driver_memory="4g", + ) + print(f"Submitted: {response.submission_id}") + ``` + """ + # Auto-generate name if not provided (similar to TrainerClient) + if app_name is None: + import secrets + import string + # Generate a random 12-character alphanumeric name + app_name = "spark-" + "".join( + secrets.choice(string.ascii_lowercase + string.digits) for _ in range(12) + ) + self._logger.info(f"Auto-generated application name: {app_name}") + + return self._backend.submit_application( + app_name=app_name, + main_application_file=main_application_file, + spark_version=spark_version, + app_type=app_type, + driver_cores=driver_cores, + driver_memory=driver_memory, + executor_cores=executor_cores, + executor_memory=executor_memory, + num_executors=num_executors, + queue=queue, + arguments=arguments, + python_version=python_version, + spark_conf=spark_conf, + hadoop_conf=hadoop_conf, + env_vars=env_vars, + deps=deps, + **kwargs, + ) + + def get_job(self, submission_id: str) -> ApplicationStatus: + """Get the Spark job object of a Spark application. + + Args: + submission_id: Submission ID returned from submit_application() + + Returns: + ApplicationStatus object with current state, timestamps, and metadata + + Raises: + RuntimeError: If request fails + ValueError: If submission_id not found + + Example: + ```python + status = client.get_job("spark-pi-12345") + print(f"State: {status.state}") + print(f"App ID: {status.app_id}") + ``` + """ + return self._backend.get_status(submission_id) + + def delete_job(self, submission_id: str) -> dict[str, Any]: + """Delete the Spark job. + + This terminates a running application or removes a completed one. 
+ + Args: + submission_id: Submission ID to delete + + Returns: + Dictionary with deletion response + + Raises: + RuntimeError: If deletion fails + ValueError: If submission_id not found + + Example: + ```python + response = client.delete_job("spark-pi-12345") + print(f"Deleted: {response}") + ``` + """ + return self._backend.delete_application(submission_id) + + def get_job_logs( + self, + submission_id: str, + executor_id: Optional[str] = None, + follow: bool = False, + ) -> Iterator[str]: + """Get application logs. + + Args: + submission_id: Submission ID + executor_id: Optional executor ID (if not provided, returns driver logs) + follow: If True, stream logs in real-time (tail -f behavior) + + Yields: + Log lines as strings + + Raises: + RuntimeError: If request fails + ValueError: If submission_id or executor_id not found + + Example: + ```python + # Get driver logs + for line in client.get_job_logs("spark-pi-12345"): + print(line) + + # Get specific executor logs + for line in client.get_job_logs("spark-pi-12345", executor_id="1"): + print(line) + + # Stream logs in real-time + for line in client.get_job_logs("spark-pi-12345", follow=True): + print(line) + ``` + """ + return self._backend.get_logs(submission_id, executor_id, follow) + + def list_jobs( + self, + namespace: Optional[str] = None, + labels: Optional[dict[str, str]] = None, + ) -> list[ApplicationStatus]: + """List Spark jobs with optional filtering. + + Args: + namespace: Optional namespace/queue filter + labels: Optional label filters (key-value pairs) + + Returns: + List of Spark jobs + + Raises: + RuntimeError: If request fails + + Example: + ```python + # List all jobs + apps = client.list_jobs() + + # List in specific namespace + apps = client.list_jobs(namespace="production") + + # Filter by labels + apps = client.list_jobs(labels={"team": "data-eng"}) + ``` + """ + return self._backend.list_applications(namespace, labels) + + def wait_for_job_status( + self, + submission_id: str, + timeout: int = 3600, + polling_interval: int = 10, + ) -> ApplicationStatus: + """Wait for Spark application to complete. + + This method blocks until the application reaches a terminal state + (COMPLETED, FAILED, SUBMISSION_FAILED, KILLED) or timeout is reached. + + Args: + submission_id: Submission ID to monitor + timeout: Maximum time to wait in seconds (default: 3600 = 1 hour) + polling_interval: Polling interval in seconds (default: 10) + + Returns: + Final ApplicationStatus + + Raises: + TimeoutError: If application doesn't complete within timeout + RuntimeError: If monitoring fails + ValueError: If submission_id not found + + Example: + ```python + # Wait with defaults (1 hour timeout) + status = client.wait_for_job_status("spark-pi-12345") + + # Custom timeout and polling + status = client.wait_for_job_status( + "spark-pi-12345", + timeout=1800, # 30 minutes + polling_interval=5, # Poll every 5 seconds + ) + + if status.state == ApplicationState.COMPLETED: + print("Success!") + else: + print(f"Failed with state: {status.state}") + ``` + """ + return self._backend.wait_for_completion(submission_id, timeout, polling_interval) + + def wait_for_pod_ready( + self, + submission_id: str, + executor_id: Optional[str] = None, + timeout: int = 300, + ) -> bool: + """Wait for driver or executor pod to be ready. + + Note: This method is only available when using OperatorBackend. 
+ + Args: + submission_id: Submission ID + executor_id: Optional executor ID (if not provided, waits for driver) + timeout: Maximum time to wait in seconds (default: 300 = 5 minutes) + + Returns: + True if pod becomes ready, False if timeout + + Raises: + NotImplementedError: If backend doesn't support this operation + RuntimeError: If request fails + + Example: + ```python + # Wait for driver pod + if client.wait_for_pod_ready("spark-pi-12345"): + print("Driver is ready") + + # Wait for specific executor + if client.wait_for_pod_ready("spark-pi-12345", executor_id="1"): + print("Executor 1 is ready") + ``` + """ + if isinstance(self._backend, OperatorBackend): + return self._backend.wait_for_pod_ready(submission_id, executor_id, timeout) + else: + raise NotImplementedError( + f"{self._backend.__class__.__name__} does not support wait_for_pod_ready(). " + "This method is only available with OperatorBackend." + ) diff --git a/kubeflow/spark/config.py b/kubeflow/spark/config.py new file mode 100644 index 000000000..24f34bf5b --- /dev/null +++ b/kubeflow/spark/config.py @@ -0,0 +1,11 @@ +"""Configuration for Spark Client.""" + +from enum import Enum + + +class AuthMethod(Enum): + """Authentication methods supported by Batch Processing Gateway.""" + + BASIC = "basic" + HEADER = "header" + NONE = "none" diff --git a/kubeflow/spark/models.py b/kubeflow/spark/models.py new file mode 100644 index 000000000..acde6bdef --- /dev/null +++ b/kubeflow/spark/models.py @@ -0,0 +1,831 @@ +"""Data models for Spark application requests and responses.""" + +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Optional + + +class ApplicationState(Enum): + """Spark application states matching Spark Operator CRD states.""" + + # Standard states from Spark Operator (v1beta2) + NEW = "NEW" + SUBMITTED = "SUBMITTED" + RUNNING = "RUNNING" + COMPLETED = "COMPLETED" + FAILED = "FAILED" + SUBMISSION_FAILED = "SUBMISSION_FAILED" + PENDING_RERUN = "PENDING_RERUN" + INVALIDATING = "INVALIDATING" + SUCCEEDING = "SUCCEEDING" + FAILING = "FAILING" + SUSPENDING = "SUSPENDING" + SUSPENDED = "SUSPENDED" + RESUMING = "RESUMING" + UNKNOWN = "UNKNOWN" + + +class RestartPolicyType(Enum): + """Restart policy types from operator.""" + + NEVER = "Never" + ON_FAILURE = "OnFailure" + ALWAYS = "Always" + + +class DeployMode(Enum): + """Deployment modes for Spark applications.""" + + CLUSTER = "cluster" + CLIENT = "client" + IN_CLUSTER_CLIENT = "in-cluster-client" + + +@dataclass +class RestartPolicy: + """Restart policy configuration (matches operator RestartPolicy). + + Attributes: + type: Type of restart policy + on_failure_retries: Number of times to retry on failure + on_failure_retry_interval: Interval in seconds between failure retries + on_submission_failure_retries: Number of times to retry on submission failure + on_submission_failure_retry_interval: Interval in seconds between submission retries + """ + + type: RestartPolicyType = RestartPolicyType.NEVER + on_failure_retries: Optional[int] = None + on_failure_retry_interval: int = 5 # Default from operator + on_submission_failure_retries: Optional[int] = None + on_submission_failure_retry_interval: int = 5 # Default from operator + + +@dataclass +class GPUSpec: + """GPU specification for driver or executor. 
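+
+    Example (illustrative; the resource name depends on the cluster's device plugin):
+
+        GPUSpec(name="nvidia.com/gpu", quantity=1)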
+ + Attributes: + name: GPU resource name (e.g., "nvidia.com/gpu", "amd.com/gpu") + quantity: Number of GPUs to request + """ + + name: str + quantity: int + + +@dataclass +class DynamicAllocation: + """Dynamic allocation configuration (Spark 3.0+). + + Attributes: + enabled: Whether dynamic allocation is enabled + initial_executors: Initial number of executors + min_executors: Minimum number of executors + max_executors: Maximum number of executors + shuffle_tracking_enabled: Enable shuffle tracking (default true if dynamic allocation enabled) + shuffle_tracking_timeout: Timeout in milliseconds for shuffle tracking + """ + + enabled: bool = False + initial_executors: Optional[int] = None + min_executors: Optional[int] = None + max_executors: Optional[int] = None + shuffle_tracking_enabled: Optional[bool] = True + shuffle_tracking_timeout: Optional[int] = None + + +@dataclass +class BatchSchedulerConfig: + """Batch scheduler configuration (Volcano, Yunikorn). + + Attributes: + queue: Resource queue name + priority_class_name: Kubernetes PriorityClass name + """ + + queue: Optional[str] = None + priority_class_name: Optional[str] = None + + +@dataclass +class PrometheusSpec: + """Prometheus JMX exporter configuration. + + Attributes: + jmx_exporter_jar: Path to Prometheus JMX exporter jar + port: Port for Prometheus JMX exporter (default 8090) + port_name: Port name (default "jmx-exporter") + config_file: Path to custom Prometheus config file + configuration: Prometheus configuration content + """ + + jmx_exporter_jar: str + port: int = 8090 + port_name: str = "jmx-exporter" + config_file: Optional[str] = None + configuration: Optional[str] = None + + +@dataclass +class MonitoringSpec: + """Monitoring configuration. + + Attributes: + expose_driver_metrics: Whether to expose driver metrics + expose_executor_metrics: Whether to expose executor metrics + metrics_properties: Content of metrics.properties file + metrics_properties_file: Path to metrics.properties file + prometheus: Prometheus configuration + """ + + expose_driver_metrics: bool = False + expose_executor_metrics: bool = False + metrics_properties: Optional[str] = None + metrics_properties_file: Optional[str] = None + prometheus: Optional[PrometheusSpec] = None + + +@dataclass +class SparkUIConfiguration: + """Spark UI service and ingress configuration. + + Attributes: + service_port: Service port (different from target port) + service_port_name: Service port name (default "spark-driver-ui-port") + service_type: Kubernetes service type (default ClusterIP) + service_annotations: Service annotations + service_labels: Service labels + ingress_annotations: Ingress annotations + ingress_tls: Ingress TLS configuration + """ + + service_port: Optional[int] = None + service_port_name: str = "spark-driver-ui-port" + service_type: str = "ClusterIP" + service_annotations: dict[str, str] = field(default_factory=dict) + service_labels: dict[str, str] = field(default_factory=dict) + ingress_annotations: dict[str, str] = field(default_factory=dict) + ingress_tls: Optional[list[dict[str, Any]]] = None + + +@dataclass +class SparkApplicationRequest: + """Request model for Spark application submission (enhanced to match operator v1beta2). 
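+
+    Example (minimal, illustrative request; the file path is a placeholder):
+
+        request = SparkApplicationRequest(
+            app_name="pi",
+            main_application_file="local:///opt/spark/examples/src/main/python/pi.py",
+            num_executors=2,
+        )
+        crd = request.to_dict()  # operator-ready SparkApplication dictionary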
+ + Attributes: + # === Basic Configuration === + app_name: Name of the Spark application + main_application_file: Path to main application file (S3 or local) + spark_version: Spark version to use + app_type: Application type (Python, Scala, Java, R) + + # === Resource Configuration === + driver_cores: Number of cores for driver + driver_memory: Memory for driver (e.g., "4g") + executor_cores: Number of cores per executor + executor_memory: Memory per executor (e.g., "8g") + num_executors: Number of executors + + # === Application Configuration === + arguments: Application arguments + main_class: Main class for Java/Scala applications + python_version: Python version (for PySpark apps) + spark_conf: Spark configuration properties + hadoop_conf: Hadoop configuration properties + env_vars: Environment variables + deps: Dependencies (jars, py files, files) + + # === Advanced Configuration (NEW) === + mode: Deployment mode (cluster, client, in-cluster-client) + image: Container image (overrides default) + image_pull_policy: Image pull policy (IfNotPresent, Always, Never) + image_pull_secrets: List of image pull secret names + + # === Lifecycle & Resilience (NEW) === + suspend: Suspend the application (pause execution) + restart_policy: Restart policy configuration + time_to_live_seconds: TTL for auto-cleanup after termination + + # === GPU Support (NEW) === + driver_gpu: GPU specification for driver + executor_gpu: GPU specification for executor + + # === Dynamic Allocation (NEW) === + dynamic_allocation: Dynamic allocation configuration + + # === Monitoring & Observability (NEW) === + monitoring: Monitoring configuration + spark_ui_options: Spark UI configuration + + # === Batch Scheduling (NEW) === + batch_scheduler: Batch scheduler name (volcano, yunikorn) + batch_scheduler_options: Batch scheduler configuration + + # === Networking & Security (NEW) === + service_account: Kubernetes service account + node_selector: Node selector for driver and executor + tolerations: Kubernetes tolerations + affinity: Kubernetes affinity rules + host_network: Use host networking + pod_security_context: Pod security context + security_context: Container security context + + # === Advanced Features (NEW) === + driver_pod_template: Full PodTemplateSpec for driver (Spark 3.0+) + executor_pod_template: Full PodTemplateSpec for executor (Spark 3.0+) + volumes: Kubernetes volumes + driver_volume_mounts: Driver volume mounts + executor_volume_mounts: Executor volume mounts + driver_sidecars: Sidecar containers for driver + executor_sidecars: Sidecar containers for executor + driver_init_containers: Init containers for driver + executor_init_containers: Init containers for executor + + # === Labels & Annotations (NEW) === + labels: Kubernetes labels + driver_labels: Driver-specific labels + executor_labels: Executor-specific labels + annotations: Kubernetes annotations + driver_annotations: Driver-specific annotations + executor_annotations: Executor-specific annotations + + # === Legacy (DEPRECATED, keeping for backward compat) === + queue: Queue to submit to (legacy - use namespace or batch_scheduler_options.queue) + """ + + # === Required Fields === + app_name: str + main_application_file: str + + # === Basic Configuration === + spark_version: str = "3.5.0" + app_type: str = "Python" + mode: DeployMode = DeployMode.CLUSTER + + # === Resource Configuration === + driver_cores: int = 1 + driver_memory: str = "1g" + executor_cores: int = 1 + executor_memory: str = "1g" + num_executors: int = 2 + + # === Application 
Configuration === + arguments: list[str] = field(default_factory=list) + main_class: Optional[str] = None + python_version: str = "3" + spark_conf: dict[str, str] = field(default_factory=dict) + hadoop_conf: dict[str, str] = field(default_factory=dict) + env_vars: dict[str, str] = field(default_factory=dict) + deps: Optional[dict[str, list[str]]] = None + + # === Image Configuration === + image: Optional[str] = None + image_pull_policy: str = "IfNotPresent" + image_pull_secrets: list[str] = field(default_factory=list) + + # === Lifecycle & Resilience === + suspend: Optional[bool] = None + restart_policy: RestartPolicy = field(default_factory=RestartPolicy) + time_to_live_seconds: Optional[int] = None + + # === GPU Support === + driver_gpu: Optional[GPUSpec] = None + executor_gpu: Optional[GPUSpec] = None + + # === Dynamic Allocation === + dynamic_allocation: Optional[DynamicAllocation] = None + + # === Monitoring & Observability === + monitoring: Optional[MonitoringSpec] = None + spark_ui_options: Optional[SparkUIConfiguration] = None + + # === Batch Scheduling === + batch_scheduler: Optional[str] = None + batch_scheduler_options: Optional[BatchSchedulerConfig] = None + + # === Networking & Security === + service_account: str = "spark-operator-spark" + node_selector: dict[str, str] = field(default_factory=dict) + tolerations: list[dict[str, Any]] = field(default_factory=list) + affinity: Optional[dict[str, Any]] = None + host_network: Optional[bool] = None + pod_security_context: Optional[dict[str, Any]] = None + security_context: Optional[dict[str, Any]] = None + + # === Pod Templates (Spark 3.0+) === + driver_pod_template: Optional[dict[str, Any]] = None + executor_pod_template: Optional[dict[str, Any]] = None + + # === Volumes === + volumes: list[dict[str, Any]] = field(default_factory=list) + driver_volume_mounts: list[dict[str, Any]] = field(default_factory=list) + executor_volume_mounts: list[dict[str, Any]] = field(default_factory=list) + + # === Sidecars & Init Containers === + driver_sidecars: list[dict[str, Any]] = field(default_factory=list) + executor_sidecars: list[dict[str, Any]] = field(default_factory=list) + driver_init_containers: list[dict[str, Any]] = field(default_factory=list) + executor_init_containers: list[dict[str, Any]] = field(default_factory=list) + + # === Labels & Annotations === + labels: dict[str, str] = field(default_factory=dict) + driver_labels: dict[str, str] = field(default_factory=dict) + executor_labels: dict[str, str] = field(default_factory=dict) + annotations: dict[str, str] = field(default_factory=dict) + driver_annotations: dict[str, str] = field(default_factory=dict) + executor_annotations: dict[str, str] = field(default_factory=dict) + + # === Legacy === + queue: str = "poc" + + def to_dict(self) -> dict[str, Any]: + """Convert request to dictionary for operator-compliant SparkApplication CRD. 
+ + Returns: + Dictionary representation matching operator's v1beta2 SparkApplication schema + """ + # === Build metadata === + metadata = {"name": self.app_name} + + if self.labels: + metadata["labels"] = self.labels.copy() + if self.annotations: + metadata["annotations"] = self.annotations.copy() + + # === Build spec === + spec: dict[str, Any] = { + "type": self.app_type, + "mode": self.mode.value if isinstance(self.mode, DeployMode) else self.mode, + "mainApplicationFile": self.main_application_file, + "sparkVersion": self.spark_version, + } + + # Image configuration + if self.image: + spec["image"] = self.image + else: + # Default image based on app type + if self.app_type.lower() == "python": + spec["image"] = f"gcr.io/spark-operator/spark-py:{self.spark_version}" + else: + spec["image"] = f"gcr.io/spark-operator/spark:{self.spark_version}" + + spec["imagePullPolicy"] = self.image_pull_policy + if self.image_pull_secrets: + spec["imagePullSecrets"] = self.image_pull_secrets + + # Main class for Java/Scala + if self.main_class: + spec["mainClass"] = self.main_class + + # Python version + if self.app_type.lower() == "python" and self.python_version: + spec["pythonVersion"] = self.python_version + + # === Lifecycle & Resilience === + if self.suspend is not None: + spec["suspend"] = self.suspend + + if self.time_to_live_seconds is not None: + spec["timeToLiveSeconds"] = self.time_to_live_seconds + + # Restart policy + restart_policy_dict = {"type": self.restart_policy.type.value} + if self.restart_policy.on_failure_retries is not None: + restart_policy_dict["onFailureRetries"] = self.restart_policy.on_failure_retries + if self.restart_policy.on_failure_retry_interval: + restart_policy_dict["onFailureRetryInterval"] = ( + self.restart_policy.on_failure_retry_interval + ) + if self.restart_policy.on_submission_failure_retries is not None: + restart_policy_dict["onSubmissionFailureRetries"] = ( + self.restart_policy.on_submission_failure_retries + ) + if self.restart_policy.on_submission_failure_retry_interval: + restart_policy_dict["onSubmissionFailureRetryInterval"] = ( + self.restart_policy.on_submission_failure_retry_interval + ) + spec["restartPolicy"] = restart_policy_dict + + # === Configuration === + if self.arguments: + spec["arguments"] = self.arguments + + if self.spark_conf: + spec["sparkConf"] = self.spark_conf.copy() + + if self.hadoop_conf: + spec["hadoopConf"] = self.hadoop_conf.copy() + + if self.deps: + spec["deps"] = self.deps + + # === Batch Scheduling === + if self.batch_scheduler: + spec["batchScheduler"] = self.batch_scheduler + + if self.batch_scheduler_options: + batch_opts = {} + if self.batch_scheduler_options.queue: + batch_opts["queue"] = self.batch_scheduler_options.queue + if self.batch_scheduler_options.priority_class_name: + batch_opts["priorityClassName"] = self.batch_scheduler_options.priority_class_name + if batch_opts: + spec["batchSchedulerOptions"] = batch_opts + + # === Monitoring === + if self.monitoring: + mon_spec = { + "exposeDriverMetrics": self.monitoring.expose_driver_metrics, + "exposeExecutorMetrics": self.monitoring.expose_executor_metrics, + } + if self.monitoring.metrics_properties: + mon_spec["metricsProperties"] = self.monitoring.metrics_properties + if self.monitoring.metrics_properties_file: + mon_spec["metricsPropertiesFile"] = self.monitoring.metrics_properties_file + if self.monitoring.prometheus: + prom_spec = { + "jmxExporterJar": self.monitoring.prometheus.jmx_exporter_jar, + "port": self.monitoring.prometheus.port, + "portName": 
self.monitoring.prometheus.port_name, + } + if self.monitoring.prometheus.config_file: + prom_spec["configFile"] = self.monitoring.prometheus.config_file + if self.monitoring.prometheus.configuration: + prom_spec["configuration"] = self.monitoring.prometheus.configuration + mon_spec["prometheus"] = prom_spec + spec["monitoring"] = mon_spec + + # === Spark UI === + if self.spark_ui_options: + ui_opts = {} + if self.spark_ui_options.service_port: + ui_opts["servicePort"] = self.spark_ui_options.service_port + if self.spark_ui_options.service_port_name: + ui_opts["servicePortName"] = self.spark_ui_options.service_port_name + if self.spark_ui_options.service_type: + ui_opts["serviceType"] = self.spark_ui_options.service_type + if self.spark_ui_options.service_annotations: + ui_opts["serviceAnnotations"] = self.spark_ui_options.service_annotations + if self.spark_ui_options.service_labels: + ui_opts["serviceLabels"] = self.spark_ui_options.service_labels + if self.spark_ui_options.ingress_annotations: + ui_opts["ingressAnnotations"] = self.spark_ui_options.ingress_annotations + if self.spark_ui_options.ingress_tls: + ui_opts["ingressTLS"] = self.spark_ui_options.ingress_tls + if ui_opts: + spec["sparkUIOptions"] = ui_opts + + # === Dynamic Allocation === + if self.dynamic_allocation and self.dynamic_allocation.enabled: + dyn_alloc = {"enabled": True} + if self.dynamic_allocation.initial_executors is not None: + dyn_alloc["initialExecutors"] = self.dynamic_allocation.initial_executors + if self.dynamic_allocation.min_executors is not None: + dyn_alloc["minExecutors"] = self.dynamic_allocation.min_executors + if self.dynamic_allocation.max_executors is not None: + dyn_alloc["maxExecutors"] = self.dynamic_allocation.max_executors + if self.dynamic_allocation.shuffle_tracking_enabled is not None: + dyn_alloc["shuffleTrackingEnabled"] = ( + self.dynamic_allocation.shuffle_tracking_enabled + ) + if self.dynamic_allocation.shuffle_tracking_timeout is not None: + dyn_alloc["shuffleTrackingTimeout"] = ( + self.dynamic_allocation.shuffle_tracking_timeout + ) + spec["dynamicAllocation"] = dyn_alloc + + # === Volumes === + if self.volumes: + spec["volumes"] = self.volumes + + # === Node Selector (spec-level) === + if self.node_selector: + spec["nodeSelector"] = self.node_selector + + # === Driver Spec === + driver_spec = { + "cores": self.driver_cores, + "memory": self.driver_memory, + "serviceAccount": self.service_account, + } + + # Driver labels & annotations + driver_labels = {"version": self.spark_version} + if self.driver_labels: + driver_labels.update(self.driver_labels) + driver_spec["labels"] = driver_labels + + if self.driver_annotations: + driver_spec["annotations"] = self.driver_annotations + + # Driver pod template (Spark 3.0+) + if self.driver_pod_template: + driver_spec["template"] = self.driver_pod_template + + # Driver GPU + if self.driver_gpu: + driver_spec["gpu"] = { + "name": self.driver_gpu.name, + "quantity": self.driver_gpu.quantity, + } + + # Driver volumes + if self.driver_volume_mounts: + driver_spec["volumeMounts"] = self.driver_volume_mounts + + # Driver environment + if self.env_vars: + driver_spec["env"] = [{"name": k, "value": v} for k, v in self.env_vars.items()] + + # Driver sidecars & init containers + if self.driver_sidecars: + driver_spec["sidecars"] = self.driver_sidecars + if self.driver_init_containers: + driver_spec["initContainers"] = self.driver_init_containers + + # Driver tolerations, affinity, security + if self.tolerations: + driver_spec["tolerations"] = 
self.tolerations + if self.affinity: + driver_spec["affinity"] = self.affinity + if self.pod_security_context: + driver_spec["podSecurityContext"] = self.pod_security_context + if self.security_context: + driver_spec["securityContext"] = self.security_context + if self.host_network is not None: + driver_spec["hostNetwork"] = self.host_network + + spec["driver"] = driver_spec + + # === Executor Spec === + executor_spec = { + "cores": self.executor_cores, + "instances": self.num_executors, + "memory": self.executor_memory, + } + + # Executor labels & annotations + executor_labels = {"version": self.spark_version} + if self.executor_labels: + executor_labels.update(self.executor_labels) + executor_spec["labels"] = executor_labels + + if self.executor_annotations: + executor_spec["annotations"] = self.executor_annotations + + # Executor pod template (Spark 3.0+) + if self.executor_pod_template: + executor_spec["template"] = self.executor_pod_template + + # Executor GPU + if self.executor_gpu: + executor_spec["gpu"] = { + "name": self.executor_gpu.name, + "quantity": self.executor_gpu.quantity, + } + + # Executor volumes + if self.executor_volume_mounts: + executor_spec["volumeMounts"] = self.executor_volume_mounts + + # Executor environment + if self.env_vars: + executor_spec["env"] = [{"name": k, "value": v} for k, v in self.env_vars.items()] + + # Executor sidecars & init containers + if self.executor_sidecars: + executor_spec["sidecars"] = self.executor_sidecars + if self.executor_init_containers: + executor_spec["initContainers"] = self.executor_init_containers + + # Executor tolerations, affinity, security (reuse from driver if not overridden) + if self.tolerations: + executor_spec["tolerations"] = self.tolerations + if self.affinity: + executor_spec["affinity"] = self.affinity + if self.pod_security_context: + executor_spec["podSecurityContext"] = self.pod_security_context + if self.security_context: + executor_spec["securityContext"] = self.security_context + if self.host_network is not None: + executor_spec["hostNetwork"] = self.host_network + + spec["executor"] = executor_spec + + # === Build final CRD === + return { + "apiVersion": "sparkoperator.k8s.io/v1beta2", + "kind": "SparkApplication", + "metadata": metadata, + "spec": spec, + } + + +@dataclass +class SparkApplicationResponse: + """Response model for Spark application submission. + + Attributes: + submission_id: Unique submission ID generated by gateway + app_name: Name of the application + status: Current status of the application + message: Additional message + """ + + submission_id: str + app_name: str + status: str = "SUBMITTED" + message: str = "" + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SparkApplicationResponse": + """Create response from API response dictionary. + + Args: + data: Dictionary from API response + + Returns: + SparkApplicationResponse instance + """ + return cls( + submission_id=data.get("submissionId", data.get("submission_id", "")), + app_name=data.get("appName", data.get("app_name", "")), + status=data.get("status", "SUBMITTED"), + message=data.get("message", ""), + ) + + +@dataclass +class ConnectBackendConfig: + """Configuration for Spark Connect backend. + + This backend enables remote connectivity to existing Spark clusters via + Spark Connect protocol (gRPC-based). 
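+
+    Example (illustrative; the URL is a placeholder for a real Spark Connect endpoint):
+
+        config = ConnectBackendConfig(
+            connect_url="sc://spark-connect.default.svc:15002",
+            use_ssl=True,
+        )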
+ + Attributes: + connect_url: Spark Connect URL (format: sc://host:port/;param1=value;param2=value) + token: Bearer token for authentication (enables SSL automatically) + use_ssl: Enable TLS/SSL for secure communication + user_id: User identifier for session management + session_id: Pre-defined session UUID for session sharing + grpc_max_message_size: Maximum gRPC message size in bytes + + # Auto-provisioning (for Kubeflow-managed clusters) + enable_auto_provision: Automatically provision Spark Connect server if not exists + auto_provision_config: SparkApplication config for auto-provisioned server + namespace: Kubernetes namespace for auto-provisioned server + + # Kubeflow integration + enable_monitoring: Enable metrics collection + artifact_staging_path: Path for staging artifacts (JARs, files, etc.) + timeout: Default timeout for operations in seconds + """ + + connect_url: str + token: Optional[str] = None + use_ssl: bool = True + user_id: Optional[str] = None + session_id: Optional[str] = None + grpc_max_message_size: int = 128 * 1024 * 1024 # 128MB default + + # Auto-provisioning + enable_auto_provision: bool = False + auto_provision_config: Optional["SparkApplicationRequest"] = None + namespace: str = "default" + + # Kubeflow integration + enable_monitoring: bool = True + artifact_staging_path: Optional[str] = None + timeout: int = 300 + + +@dataclass +class SessionMetrics: + """Metrics for a Spark Connect session. + + Attributes: + session_id: Session UUID + queries_executed: Number of queries executed + active_queries: Number of currently active queries + artifacts_uploaded: Number of artifacts uploaded + data_read_bytes: Total bytes read + data_written_bytes: Total bytes written + execution_time_ms: Total execution time in milliseconds + """ + + session_id: str + queries_executed: int = 0 + active_queries: int = 0 + artifacts_uploaded: int = 0 + data_read_bytes: int = 0 + data_written_bytes: int = 0 + execution_time_ms: int = 0 + + +@dataclass +class SessionInfo: + """Information about a Spark Connect session. + + Attributes: + session_id: Session UUID + app_name: Application name + user_id: User identifier + created_at: Session creation time + last_activity: Last activity timestamp + state: Session state (active, idle, closed) + metrics: Session metrics + """ + + session_id: str + app_name: str + user_id: Optional[str] = None + created_at: Optional[str] = None + last_activity: Optional[str] = None + state: str = "active" + metrics: Optional[SessionMetrics] = None + + +@dataclass +class ApplicationStatus: + """Status information for a Spark application. + + Attributes: + submission_id: Submission ID + app_id: Spark application ID + app_name: Application name + state: Current state + submission_time: Time of submission + start_time: Start time + completion_time: Completion time + driver_info: Driver pod information + executor_state: Executor states + """ + + submission_id: str + app_id: Optional[str] = None + app_name: Optional[str] = None + state: ApplicationState = ApplicationState.UNKNOWN + submission_time: Optional[str] = None + start_time: Optional[str] = None + completion_time: Optional[str] = None + driver_info: Optional[dict[str, Any]] = None + executor_state: Optional[dict[str, Any]] = None + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ApplicationStatus": + """Create status from API response dictionary. 
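+
+        Example (sketch of an operator-style payload):
+
+            ApplicationStatus.from_dict(
+                {"metadata": {"name": "pi"}, "status": {"applicationState": {"state": "RUNNING"}}}
+            )
+            # -> ApplicationStatus(submission_id="pi", state=ApplicationState.RUNNING, ...)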
+ + Args: + data: Dictionary from API response + + Returns: + ApplicationStatus instance + """ + # Support both Operator and Gateway response formats + if "status" in data and "applicationState" in data.get("status", {}): + # Operator format + state_str = data["status"]["applicationState"].get("state", "UNKNOWN") + app_id = data["status"].get("sparkApplicationId") + submission_time = data["status"].get("submissionTime") + start_time = data["status"].get("lastSubmissionAttemptTime") + completion_time = data["status"].get("terminationTime") + driver_info = data["status"].get("driverInfo") + executor_state = data["status"].get("executorState") + elif "status" in data and "appState" in data.get("status", {}): + # Operator format (alternative field name) + state_str = data["status"].get("appState", {}).get("state", "UNKNOWN") + app_id = data["status"].get("sparkApplicationId") + submission_time = None + start_time = data["status"].get("lastSubmissionAttemptTime") + completion_time = data["status"].get("terminationTime") + driver_info = data["status"].get("driverInfo") + executor_state = data["status"].get("executorState") + else: + # Gateway format or simple format + state_str = data.get("status", "UNKNOWN") + app_id = data.get("app_id") + submission_time = data.get("submission_time") + start_time = data.get("start_time") + completion_time = data.get("completion_time") + driver_info = data.get("driver_info") + executor_state = data.get("executor_state") + + try: + state = ApplicationState(state_str) + except ValueError: + state = ApplicationState.UNKNOWN + + return cls( + submission_id=data.get( + "submissionId", data.get("submission_id", data.get("metadata", {}).get("name", "")) + ), + app_id=app_id, + app_name=data.get("metadata", {}).get("name", data.get("app_name")), + state=state, + submission_time=submission_time, + start_time=start_time, + completion_time=completion_time, + driver_info=driver_info, + executor_state=executor_state, + ) diff --git a/kubeflow/spark/session.py b/kubeflow/spark/session.py new file mode 100644 index 000000000..9a67edec6 --- /dev/null +++ b/kubeflow/spark/session.py @@ -0,0 +1,343 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Managed Spark Connect session wrapper.""" + +import logging +from typing import TYPE_CHECKING, Any, Optional + +from kubeflow.spark.models import SessionInfo, SessionMetrics + +if TYPE_CHECKING: + from kubeflow.spark.backends.connect import ConnectBackend + + # Only import pyspark types for type checking + try: + from pyspark.sql import DataFrame, DataFrameReader, SparkSession + from pyspark.sql.streaming import DataStreamReader + except ImportError: + DataFrame = Any # type: ignore + DataFrameReader = Any # type: ignore + SparkSession = Any # type: ignore + DataStreamReader = Any # type: ignore + +logger = logging.getLogger(__name__) + + +class ManagedSparkSession: + """Kubeflow-managed Spark Connect session. 
+ + This class wraps a native PySpark Connect session and provides additional + Kubeflow-specific functionality like metrics collection, artifact management, + and pipeline integration. + + The underlying PySpark DataFrame API is accessible directly, allowing users + to write standard PySpark code while benefiting from Kubeflow enhancements. + + Example: + ```python + from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + + config = ConnectBackendConfig(connect_url="sc://spark-cluster:15002") + client = SparkSessionClient(backend_config=config) + + # Create session + session = client.create_session(app_name="data-analysis") + + # Use standard PySpark API + df = session.sql("SELECT * FROM table") + result = df.filter(df.status == "active").collect() + + # Kubeflow extensions + metrics = session.get_metrics() + session.export_to_pipeline_artifact(df, "/outputs/data.parquet") + + # Cleanup + session.close() + ``` + """ + + def __init__( + self, + session: "SparkSession", + session_id: str, + app_name: str, + backend: "ConnectBackend", + ): + """Initialize managed session. + + Args: + session: Native PySpark Connect session + session_id: Session UUID + app_name: Application name + backend: ConnectBackend instance for lifecycle management + """ + self._session = session + self._session_id = session_id + self._app_name = app_name + self._backend = backend + self._closed = False + self._metrics = SessionMetrics(session_id=session_id) + + logger.info(f"Created ManagedSparkSession: {session_id} (app: {app_name})") + + # ========================================================================= + # Properties + # ========================================================================= + + @property + def session_id(self) -> str: + """Get session UUID.""" + return self._session_id + + @property + def app_name(self) -> str: + """Get application name.""" + return self._app_name + + @property + def is_closed(self) -> bool: + """Check if session is closed.""" + return self._closed + + @property + def spark(self) -> "SparkSession": + """Get underlying PySpark session. + + Use this to access the full PySpark API directly. + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + return self._session + + # ========================================================================= + # Delegate PySpark DataFrame API + # ========================================================================= + + def sql(self, query: str) -> "DataFrame": + """Execute SQL query and return DataFrame. + + Args: + query: SQL query string + + Returns: + DataFrame with query results + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + self._metrics.queries_executed += 1 + return self._session.sql(query) + + @property + def read(self) -> "DataFrameReader": + """Get DataFrameReader for reading data sources.""" + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + return self._session.read + + @property + def readStream(self) -> "DataStreamReader": + """Get DataStreamReader for reading streaming sources.""" + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + return self._session.readStream + + def createDataFrame(self, data: Any, schema: Any = None) -> "DataFrame": + """Create DataFrame from data. + + Args: + data: Input data (list, pandas DataFrame, RDD, etc.) 
+ schema: Optional schema + + Returns: + DataFrame + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + return self._session.createDataFrame(data, schema) + + def table(self, tableName: str) -> "DataFrame": + """Get DataFrame for a table. + + Args: + tableName: Name of the table + + Returns: + DataFrame + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + return self._session.table(tableName) + + def range( + self, + start: int, + end: Optional[int] = None, + step: int = 1, + numPartitions: Optional[int] = None, + ) -> "DataFrame": + """Create DataFrame with range of numbers. + + Args: + start: Start of range (or end if `end` not provided) + end: End of range (optional) + step: Step size + numPartitions: Number of partitions + + Returns: + DataFrame + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + if end is None: + return self._session.range(start) + return self._session.range(start, end, step, numPartitions) + + # ========================================================================= + # Kubeflow Extensions + # ========================================================================= + + def upload_artifacts(self, *paths: str, pyfile: bool = False) -> None: + """Upload artifacts to Spark Connect session. + + Args: + *paths: File paths to upload (JARs, Python files, data files) + pyfile: If True, treat as Python files (added to sys.path) + + Example: + ```python + # Upload JARs + session.upload_artifacts("/path/to/lib.jar") + + # Upload Python packages + session.upload_artifacts("/path/to/package.zip", pyfile=True) + ``` + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + + try: + # Use PySpark's addArtifacts method + if hasattr(self._session, "addArtifacts"): + self._session.addArtifacts(*paths, pyfile=pyfile) + self._metrics.artifacts_uploaded += len(paths) + logger.info(f"Uploaded {len(paths)} artifacts to session {self._session_id}") + else: + logger.warning("Session does not support artifact upload (requires PySpark 3.4+)") + except Exception as e: + logger.error(f"Failed to upload artifacts: {e}") + raise + + def get_metrics(self) -> SessionMetrics: + """Get session metrics. + + Returns: + SessionMetrics with current statistics + """ + return self._metrics + + def get_info(self) -> SessionInfo: + """Get session information. + + Returns: + SessionInfo with session metadata + """ + return SessionInfo( + session_id=self._session_id, + app_name=self._app_name, + state="closed" if self._closed else "active", + metrics=self._metrics, + ) + + def export_to_pipeline_artifact( + self, df: "DataFrame", path: str, format: str = "parquet", **options: Any + ) -> None: + """Export DataFrame to Kubeflow Pipeline artifact. + + Args: + df: DataFrame to export + path: Output path for artifact + format: Output format (parquet, csv, json, etc.) 
+ **options: Additional write options + + Example: + ```python + df = session.sql("SELECT * FROM sales") + session.export_to_pipeline_artifact(df, "/outputs/sales.parquet") + ``` + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + + try: + writer = df.write.format(format) + for key, value in options.items(): + writer = writer.option(key, value) + writer.save(path) + logger.info(f"Exported DataFrame to {path} (format: {format})") + except Exception as e: + logger.error(f"Failed to export DataFrame: {e}") + raise + + def clone(self) -> "ManagedSparkSession": + """Clone the session with all state. + + Creates a new session that shares the same state (temp views, UDFs, etc.) + but has its own session ID. + + Returns: + New ManagedSparkSession instance + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + + logger.info(f"Cloning session {self._session_id}") + return self._backend._clone_session(self) + + def close(self, release: bool = True) -> None: + """Close the session. + + Args: + release: If True, release session resources on server + """ + if self._closed: + logger.warning(f"Session {self._session_id} already closed") + return + + try: + if release: + # Stop the session + self._session.stop() + logger.info(f"Released session {self._session_id} on server") + self._closed = True + except Exception as e: + logger.error(f"Error closing session {self._session_id}: {e}") + raise + + # ========================================================================= + # Context Manager + # ========================================================================= + + def __enter__(self) -> "ManagedSparkSession": + """Context manager entry.""" + return self + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + """Context manager exit - ensures cleanup.""" + self.close() + + def __repr__(self) -> str: + """String representation.""" + status = "closed" if self._closed else "active" + return f"ManagedSparkSession(id={self._session_id}, app={self._app_name}, status={status})" diff --git a/kubeflow/spark/session_client.py b/kubeflow/spark/session_client.py new file mode 100644 index 000000000..e83b55c22 --- /dev/null +++ b/kubeflow/spark/session_client.py @@ -0,0 +1,251 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Session Spark client for interactive Spark sessions.""" + +from typing import Any + +from kubeflow.spark.base_client import BaseSparkClient +from kubeflow.spark.backends.connect import ( + ConnectBackend, + ConnectBackendConfig, +) +from kubeflow.spark.models import SessionInfo +from kubeflow.spark.session import ManagedSparkSession + + +class SparkSessionClient(BaseSparkClient): + """Client for managing interactive Spark sessions. + + This client provides a high-level API for creating and managing long-lived + Spark Connect sessions for interactive data analysis, exploratory workflows, + and notebook-style development. 
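+
+    Sessions are returned as ManagedSparkSession objects, which wrap the
+    underlying PySpark Connect session and track client-side metrics such as
+    the number of executed queries and uploaded artifacts.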
+ + Supported backends: + - **ConnectBackend**: Connects to Spark clusters via Spark Connect protocol (gRPC) + + Features: + - Interactive SQL queries + - DataFrame API access + - Artifact upload (JARs, Python files) + - Session metrics and monitoring + - Full PySpark API compatibility + + Example: + ```python + from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + + # Initialize client + config = ConnectBackendConfig( + connect_url="sc://spark-cluster.default.svc:15002", + use_ssl=True, + ) + client = SparkSessionClient(backend_config=config) + + # Create session + session = client.create_session(app_name="data-exploration") + + # Use PySpark DataFrame API + df = session.sql("SELECT * FROM sales WHERE date >= '2024-01-01'") + result = df.groupBy("product").sum("amount").collect() + + # Upload artifacts + session.upload_artifacts("/path/to/lib.jar") + + # Get metrics + metrics = session.get_metrics() + print(f"Queries executed: {metrics.queries_executed}") + + # Cleanup + session.close() + ``` + + Context Manager: + ```python + with SparkSessionClient(backend_config=config) as client: + session = client.create_session("my-analysis") + # Use session... + # Cleanup happens automatically + ``` + + Notebook Workflow: + ```python + # Cell 1: Setup + client = SparkSessionClient(backend_config=config) + session = client.create_session("notebook-session") + + # Cell 2: Load data + df = session.read.parquet("s3a://bucket/data/") + + # Cell 3: Analysis + summary = df.describe() + summary.show() + + # Cell 4: Cleanup + session.close() + ``` + """ + + def __init__(self, backend_config: ConnectBackendConfig): + """Initialize Spark Session client. + + Args: + backend_config: ConnectBackendConfig with connection details + + Raises: + ValueError: If invalid backend configuration provided + ImportError: If pyspark[connect] is not installed + """ + if not isinstance(backend_config, ConnectBackendConfig): + raise ValueError( + f"Invalid backend config type for SparkSessionClient: {type(backend_config)}. " + "Expected ConnectBackendConfig." + ) + + # Initialize ConnectBackend + backend = ConnectBackend(backend_config) + + # Initialize base class + super().__init__(backend) + + def create_session( + self, + app_name: str, + **kwargs: Any, + ) -> ManagedSparkSession: + """Create a new Spark Connect session. + + This establishes a connection to a Spark Connect server and returns + a managed session that provides the full PySpark DataFrame API. + + Args: + app_name: Name for the session/application + **kwargs: Additional Spark configuration options + (passed to SparkSession.builder.config) + + Returns: + ManagedSparkSession instance for interactive operations + + Raises: + RuntimeError: If session creation fails + ConnectionError: If cannot connect to Spark Connect server + TimeoutError: If connection times out + + Example: + ```python + # Basic session + session = client.create_session(app_name="data-analysis") + + # Session with custom configuration + session = client.create_session( + app_name="data-analysis", + **{ + "spark.sql.shuffle.partitions": "200", + "spark.sql.adaptive.enabled": "true", + } + ) + + # Use session + df = session.sql("SELECT * FROM table") + result = df.collect() + + # Cleanup + session.close() + ``` + """ + return self._backend.create_session(app_name=app_name, **kwargs) + + def get_session_status(self, session_id: str) -> SessionInfo: + """Get status and metadata of a Spark Connect session. 
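+
+        The returned SessionInfo includes the session state and the
+        client-side metrics collected for that session (see the Example below).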
+ + Args: + session_id: Session UUID (from session.session_id) + + Returns: + SessionInfo with session metadata, state, and metrics + + Raises: + RuntimeError: If request fails + ValueError: If session_id not found + + Example: + ```python + # Create session + session = client.create_session("my-app") + + # Get status + info = client.get_session_status(session.session_id) + print(f"Session ID: {info.session_id}") + print(f"App name: {info.app_name}") + print(f"State: {info.state}") + print(f"Queries executed: {info.metrics.queries_executed}") + print(f"Artifacts uploaded: {info.metrics.artifacts_uploaded}") + ``` + """ + return self._backend.get_session_status(session_id) + + def list_sessions(self) -> list[SessionInfo]: + """List all active Spark Connect sessions. + + Returns: + List of SessionInfo objects for active sessions + + Raises: + RuntimeError: If request fails + + Example: + ```python + # List all sessions + sessions = client.list_sessions() + + for session_info in sessions: + print(f"Session: {session_info.session_id}") + print(f" App: {session_info.app_name}") + print(f" State: {session_info.state}") + print(f" Queries: {session_info.metrics.queries_executed}") + ``` + """ + return self._backend.list_sessions() + + def close_session(self, session_id: str, release: bool = True) -> dict[str, Any]: + """Close a Spark Connect session. + + Args: + session_id: Session UUID to close + release: If True, release session resources on server (default: True) + + Returns: + Dictionary with closure response + + Raises: + RuntimeError: If closure fails + ValueError: If session_id not found + + Example: + ```python + # Create session + session = client.create_session("my-app") + session_id = session.session_id + + # Do work... + + # Close session and release resources + response = client.close_session(session_id, release=True) + print(f"Closed: {response}") + + # Alternative: use session.close() directly + session.close() + ``` + """ + return self._backend.close_session(session_id, release) diff --git a/kubeflow/spark/test/__init__.py b/kubeflow/spark/test/__init__.py new file mode 100644 index 000000000..773da3500 --- /dev/null +++ b/kubeflow/spark/test/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Kubeflow Spark client.""" diff --git a/kubeflow/spark/test/test_connect_backend.py b/kubeflow/spark/test/test_connect_backend.py new file mode 100644 index 000000000..fe8e288c8 --- /dev/null +++ b/kubeflow/spark/test/test_connect_backend.py @@ -0,0 +1,342 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for Spark Connect backend and configuration.""" + +import pytest + +from kubeflow.spark.models import ConnectBackendConfig, SessionInfo, SessionMetrics + + +def _is_pyspark_available() -> bool: + """Check if PySpark Connect is available.""" + try: + import pyspark # noqa: F401 + + return True + except ImportError: + return False + + +class TestConnectBackendConfig: + """Tests for ConnectBackendConfig validation and URL building.""" + + def test_valid_basic_config(self): + """Test creating a basic valid configuration.""" + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + + assert config.connect_url == "sc://localhost:15002" + assert config.use_ssl is True + assert config.token is None + assert config.timeout == 300 + + def test_valid_config_with_authentication(self): + """Test configuration with authentication token.""" + config = ConnectBackendConfig( + connect_url="sc://cluster:15002", + token="test-token", + use_ssl=True, + ) + + assert config.token == "test-token" + assert config.use_ssl is True + + def test_valid_config_with_all_options(self): + """Test configuration with all optional parameters.""" + config = ConnectBackendConfig( + connect_url="sc://cluster:15002", + token="test-token", + use_ssl=True, + user_id="testuser", + session_id="test-session-123", + grpc_max_message_size=256 * 1024 * 1024, + enable_auto_provision=False, + namespace="spark-jobs", + enable_monitoring=True, + timeout=600, + ) + + assert config.connect_url == "sc://cluster:15002" + assert config.token == "test-token" + assert config.user_id == "testuser" + assert config.session_id == "test-session-123" + assert config.grpc_max_message_size == 256 * 1024 * 1024 + assert config.namespace == "spark-jobs" + assert config.timeout == 600 + + def test_url_with_parameters(self): + """Test URL with embedded parameters.""" + config = ConnectBackendConfig(connect_url="sc://cluster:15002/;use_ssl=true;token=abc123") + + assert config.connect_url == "sc://cluster:15002/;use_ssl=true;token=abc123" + + def test_kubernetes_service_url(self): + """Test Kubernetes service DNS format.""" + config = ConnectBackendConfig( + connect_url="sc://spark-connect.spark-ns.svc.cluster.local:15002" + ) + + assert config.connect_url == "sc://spark-connect.spark-ns.svc.cluster.local:15002" + + +class TestConnectBackendValidation: + """Tests for ConnectBackend URL validation.""" + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_invalid_url_missing_scheme(self): + """Test that invalid URL (missing sc://) is rejected.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="localhost:15002") + + with pytest.raises(ValueError, match="Invalid Spark Connect URL"): + ConnectBackend(config) + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_invalid_url_missing_port(self): + """Test that URL without port is rejected.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost") + + with 
pytest.raises(ValueError, match="Invalid Spark Connect URL"): + ConnectBackend(config) + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_empty_url(self): + """Test that empty URL is rejected.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="") + + with pytest.raises(ValueError, match="connect_url is required"): + ConnectBackend(config) + + +class TestConnectBackendURLBuilding: + """Tests for connection URL building logic.""" + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_url_building_with_ssl(self): + """Test URL building with SSL enabled.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002", use_ssl=True) + backend = ConnectBackend(config) + + url = backend._build_connection_url() + assert "use_ssl=true" in url + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_url_building_with_token(self): + """Test URL building with authentication token.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002", token="test-token") + backend = ConnectBackend(config) + + url = backend._build_connection_url() + assert "token=test-token" in url + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_url_building_preserves_existing_params(self): + """Test that existing URL parameters are preserved.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002/;custom_param=value") + backend = ConnectBackend(config) + + url = backend._build_connection_url() + assert "custom_param=value" in url + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_url_building_config_overrides_url_params(self): + """Test that config parameters override URL parameters.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig( + connect_url="sc://localhost:15002/;token=url-token", token="config-token" + ) + backend = ConnectBackend(config) + + url = backend._build_connection_url() + assert "token=config-token" in url + assert "token=url-token" not in url + + +class TestSessionMetrics: + """Tests for SessionMetrics model.""" + + def test_default_metrics(self): + """Test default metric values.""" + metrics = SessionMetrics(session_id="test-123") + + assert metrics.session_id == "test-123" + assert metrics.queries_executed == 0 + assert metrics.active_queries == 0 + assert metrics.artifacts_uploaded == 0 + assert metrics.data_read_bytes == 0 + assert metrics.data_written_bytes == 0 + assert metrics.execution_time_ms == 0 + + def test_metrics_with_values(self): + """Test metrics with custom values.""" + metrics = SessionMetrics( + session_id="test-123", + queries_executed=10, + active_queries=2, + artifacts_uploaded=5, + data_read_bytes=1024 * 1024, + data_written_bytes=512 * 1024, + execution_time_ms=5000, + ) + + assert metrics.queries_executed == 10 + assert metrics.active_queries == 2 + assert metrics.artifacts_uploaded == 5 + assert metrics.data_read_bytes == 1024 * 1024 + assert metrics.data_written_bytes == 512 * 1024 + assert metrics.execution_time_ms == 5000 + + +class TestSessionInfo: + """Tests for SessionInfo model.""" + + def 
test_basic_session_info(self): + """Test basic session info creation.""" + info = SessionInfo(session_id="test-123", app_name="test-app") + + assert info.session_id == "test-123" + assert info.app_name == "test-app" + assert info.state == "active" + assert info.user_id is None + assert info.metrics is None + + def test_session_info_with_metrics(self): + """Test session info with metrics.""" + metrics = SessionMetrics(session_id="test-123", queries_executed=5) + info = SessionInfo( + session_id="test-123", app_name="test-app", state="active", metrics=metrics + ) + + assert info.metrics is not None + assert info.metrics.queries_executed == 5 + + def test_session_info_with_all_fields(self): + """Test session info with all fields populated.""" + metrics = SessionMetrics(session_id="test-123") + info = SessionInfo( + session_id="test-123", + app_name="test-app", + user_id="testuser", + created_at="2024-01-01T00:00:00", + last_activity="2024-01-01T01:00:00", + state="active", + metrics=metrics, + ) + + assert info.session_id == "test-123" + assert info.app_name == "test-app" + assert info.user_id == "testuser" + assert info.created_at == "2024-01-01T00:00:00" + assert info.last_activity == "2024-01-01T01:00:00" + assert info.state == "active" + assert info.metrics == metrics + + +class TestConnectBackendBatchMethodsRaiseErrors: + """Test that batch-oriented methods raise NotImplementedError.""" + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_submit_application_raises_error(self): + """Test that submit_application raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with pytest.raises(NotImplementedError, match="batch application submission"): + backend.submit_application( + app_name="test", + main_application_file="test.py", + spark_version="3.5.0", + app_type="Python", + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + num_executors=1, + queue=None, + arguments=None, + python_version="3", + spark_conf=None, + hadoop_conf=None, + env_vars=None, + deps=None, + ) + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_get_status_raises_error(self): + """Test that get_status raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with pytest.raises(NotImplementedError, match="batch application status"): + backend.get_job("test-id") + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_delete_application_raises_error(self): + """Test that delete_application raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with pytest.raises(NotImplementedError, match="batch application deletion"): + backend.delete_job("test-id") + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_get_logs_raises_error(self): + """Test that get_logs raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with 
pytest.raises(NotImplementedError, match="logs retrieval"): + list(backend.get_job_logs("test-id")) + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_list_applications_raises_error(self): + """Test that list_applications raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with pytest.raises(NotImplementedError, match="listing applications"): + backend.list_jobs() + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_wait_for_completion_raises_error(self): + """Test that wait_for_completion raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with pytest.raises(NotImplementedError, match="application completion"): + backend.wait_for_job_status("test-id") diff --git a/kubeflow/spark/test/test_connect_integration.py b/kubeflow/spark/test/test_connect_integration.py new file mode 100644 index 000000000..b512d923b --- /dev/null +++ b/kubeflow/spark/test/test_connect_integration.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python3 +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Integration tests for Spark Connect backend. + +These tests require a running Spark Connect server. + +Setup: +1. Install PySpark with Connect support: + pip install 'pyspark[connect]>=3.4.0' + +2. Start local Spark Connect server: + $SPARK_HOME/sbin/start-connect-server.sh \ + --packages org.apache.spark:spark-connect_2.12:3.5.0 + + Or using Docker: + docker run -p 15002:15002 apache/spark:3.5.0 \ + /opt/spark/sbin/start-connect-server.sh + +3. 
Run tests: + pytest kubeflow/spark/test/test_connect_integration.py -v + +Environment variables: +- SPARK_CONNECT_URL: Spark Connect URL (default: sc://localhost:15002) +- SKIP_INTEGRATION_TESTS: Set to skip these tests (useful in CI) +""" + +import os +import sys + +import pytest + + +def _is_pyspark_available() -> bool: + """Check if PySpark Connect is available.""" + try: + import pyspark # noqa: F401 + + return True + except ImportError: + return False + + +def _should_skip_integration_tests() -> bool: + """Check if integration tests should be skipped.""" + return os.getenv("SKIP_INTEGRATION_TESTS", "false").lower() == "true" + + +def _get_connect_url() -> str: + """Get Spark Connect URL from environment.""" + return os.getenv("SPARK_CONNECT_URL", "sc://localhost:15002") + + +pytestmark = pytest.mark.skipif( + not _is_pyspark_available() or _should_skip_integration_tests(), + reason="PySpark Connect not installed or integration tests disabled", +) + + +class TestConnectBackendIntegration: + """Integration tests for ConnectBackend with real Spark Connect server.""" + + def test_create_and_close_session(self): + """Test creating and closing a session.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False, timeout=30) + + client = SparkSessionClient(backend_config=config) + + try: + session = client.create_session(app_name="test-session") + assert session is not None + assert session.session_id is not None + assert session.app_name == "test-session" + assert not session.is_closed + + session.close() + assert session.is_closed + finally: + client.close() + + def test_simple_sql_query(self): + """Test executing a simple SQL query.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="sql-test") + + try: + df = session.sql("SELECT 1 AS id, 'test' AS name") + result = df.collect() + + assert len(result) == 1 + assert result[0].id == 1 + assert result[0].name == "test" + finally: + session.close() + + def test_create_dataframe_and_show(self): + """Test creating a DataFrame and showing data.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="dataframe-test") + + try: + data = [ + (1, "Alice", 28), + (2, "Bob", 35), + (3, "Carol", 42), + ] + df = session.createDataFrame(data, ["id", "name", "age"]) + + assert df.count() == 3 + + result = df.collect() + assert len(result) == 3 + assert result[0].name == "Alice" + assert result[1].age == 35 + + print("\nDataFrame content:") + df.show() + finally: + session.close() + + def test_dataframe_transformations(self): + """Test DataFrame transformations (filter, select, groupBy).""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="transform-test") + + try: + data = [ + (1, "Engineering", 100000), + (2, "Engineering", 120000), + (3, "Sales", 80000), + (4, "Sales", 90000), + (5, "Marketing", 85000), + ] + df = session.createDataFrame(data, ["id", 
"department", "salary"]) + + filtered = df.filter(df.salary > 85000) + assert filtered.count() == 4 + + selected = df.select("department", "salary") + assert len(selected.columns) == 2 + + grouped = df.groupBy("department").count() + result = grouped.collect() + assert len(result) == 3 + + print("\nGrouped by department:") + grouped.show() + finally: + session.close() + + def test_session_metrics(self): + """Test session metrics collection.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="metrics-test") + + try: + initial_metrics = session.get_metrics() + assert initial_metrics.queries_executed == 0 + + session.sql("SELECT 1") + session.sql("SELECT 2") + + updated_metrics = session.get_metrics() + assert updated_metrics.queries_executed == 2 + finally: + session.close() + + def test_multiple_sessions(self): + """Test creating multiple concurrent sessions.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session1 = client.create_session(app_name="session-1") + session2 = client.create_session(app_name="session-2") + + try: + assert session1.session_id != session2.session_id + + sessions = client.list_sessions() + assert len(sessions) == 2 + + df1 = session1.sql("SELECT 'session1' AS source") + df2 = session2.sql("SELECT 'session2' AS source") + + assert df1.collect()[0].source == "session1" + assert df2.collect()[0].source == "session2" + finally: + session1.close() + session2.close() + + def test_range_dataframe(self): + """Test creating range DataFrame.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="range-test") + + try: + df = session.range(0, 10, 2) + assert df.count() == 5 + + result = df.collect() + assert result[0].id == 0 + assert result[1].id == 2 + assert result[4].id == 8 + + print("\nRange DataFrame:") + df.show() + finally: + session.close() + + def test_context_manager(self): + """Test session context manager.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + client = SparkSessionClient(backend_config=config) + + with client.create_session(app_name="context-test") as session: + df = session.sql("SELECT 42 AS answer") + result = df.collect() + assert result[0].answer == 42 + + assert session.is_closed + + def test_get_session_info(self): + """Test getting session information.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="info-test") + + try: + info = session.get_info() + assert info.session_id == session.session_id + assert info.app_name == "info-test" + assert info.state == "active" + assert info.metrics is not None + + status = client.get_session_status(session.session_id) + assert status.session_id == session.session_id + assert status.app_name == "info-test" + finally: + 
session.close() + + +class TestConnectBackendErrorHandling: + """Test error handling in ConnectBackend.""" + + def test_connection_to_invalid_server(self): + """Test connection to non-existent server.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig( + connect_url="sc://nonexistent-host:99999", use_ssl=False, timeout=5 + ) + + with SparkSessionClient(backend_config=config) as client: + with pytest.raises(Exception): + client.create_session(app_name="fail-test") + + def test_query_on_closed_session(self): + """Test querying after session is closed.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="closed-test") + session.close() + + with pytest.raises(RuntimeError, match="closed"): + session.sql("SELECT 1") + + +def main(): + """Run integration tests manually.""" + print("=" * 80) + print("Spark Connect Integration Tests") + print("=" * 80) + print(f"\nConnect URL: {_get_connect_url()}") + print(f"PySpark available: {_is_pyspark_available()}") + print(f"Skip integration tests: {_should_skip_integration_tests()}") + + if not _is_pyspark_available(): + print("\nERROR: PySpark Connect not installed!") + print("Install with: pip install 'pyspark[connect]>=3.4.0'") + sys.exit(1) + + if _should_skip_integration_tests(): + print("\nINFO: Integration tests disabled (SKIP_INTEGRATION_TESTS=true)") + sys.exit(0) + + print("\n" + "=" * 80) + print("Running basic connectivity test...") + print("=" * 80) + + try: + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + print(f"\nConnecting to: {_get_connect_url()}") + + with SparkSessionClient(backend_config=config) as client: + print("āœ“ Client created successfully") + + session = client.create_session(app_name="manual-test") + print(f"āœ“ Session created: {session.session_id}") + + try: + print("\n" + "-" * 80) + print("Test 1: Simple SQL Query") + print("-" * 80) + df = session.sql("SELECT 1 AS id, 'Hello Spark Connect!' AS message") + result = df.collect() + print(f"āœ“ Query executed: {result[0].message}") + df.show() + + print("\n" + "-" * 80) + print("Test 2: Create DataFrame") + print("-" * 80) + data = [ + (1, "Alice", 28), + (2, "Bob", 35), + (3, "Carol", 42), + ] + df = session.createDataFrame(data, ["id", "name", "age"]) + print(f"āœ“ DataFrame created with {df.count()} rows") + df.show() + + print("\n" + "-" * 80) + print("Test 3: DataFrame Transformations") + print("-" * 80) + filtered = df.filter(df.age > 30) + print(f"āœ“ Filtered to {filtered.count()} rows (age > 30)") + filtered.show() + + print("\n" + "-" * 80) + print("Test 4: Session Metrics") + print("-" * 80) + metrics = session.get_metrics() + print(f"āœ“ Queries executed: {metrics.queries_executed}") + print(f"āœ“ Active queries: {metrics.active_queries}") + + print("\n" + "=" * 80) + print("All tests passed! 
āœ“") + print("=" * 80) + + finally: + session.close() + print("\nāœ“ Session closed") + + except Exception as e: + print(f"\nāœ— Test failed: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/kubeflow/spark/utils.py b/kubeflow/spark/utils.py new file mode 100644 index 000000000..631f6b571 --- /dev/null +++ b/kubeflow/spark/utils.py @@ -0,0 +1,154 @@ +"""Utility functions for Spark client.""" + +import logging +from typing import Any + +logger = logging.getLogger(__name__) + + +def format_memory(memory_mb: int) -> str: + """Format memory in MB to Kubernetes format. + + Args: + memory_mb: Memory in megabytes + + Returns: + Formatted memory string (e.g., "4096m", "4g") + + Example: + >>> format_memory(1024) + '1g' + >>> format_memory(512) + '512m' + """ + if memory_mb >= 1024 and memory_mb % 1024 == 0: + return f"{memory_mb // 1024}g" + return f"{memory_mb}m" + + +def parse_memory(memory_str: str) -> int: + """Parse Kubernetes memory format to MB. + + Args: + memory_str: Memory string (e.g., "4g", "512m") + + Returns: + Memory in megabytes + + Example: + >>> parse_memory("4g") + 4096 + >>> parse_memory("512m") + 512 + """ + memory_str = memory_str.lower().strip() + + if memory_str.endswith("g"): + return int(memory_str[:-1]) * 1024 + elif memory_str.endswith("m"): + return int(memory_str[:-1]) + elif memory_str.endswith("k"): + return int(memory_str[:-1]) // 1024 + else: + # Assume bytes + return int(memory_str) // (1024 * 1024) + + +def validate_spark_config(config: dict[str, Any]) -> bool: + """Validate Spark configuration. + + Args: + config: Spark configuration dictionary + + Returns: + True if valid + + Raises: + ValueError: If configuration is invalid + """ + required_fields = ["app_name", "main_application_file"] + + for field in required_fields: + if field not in config or not config[field]: + raise ValueError(f"Required field '{field}' is missing or empty") + + # Validate resource specifications + if "driver_memory" in config: + try: + parse_memory(config["driver_memory"]) + except Exception as e: + raise ValueError(f"Invalid driver_memory format: {e}") from e + + if "executor_memory" in config: + try: + parse_memory(config["executor_memory"]) + except Exception as e: + raise ValueError(f"Invalid executor_memory format: {e}") from e + + return True + + +def build_s3_path(bucket: str, prefix: str, filename: str) -> str: + """Build S3 path for artifacts. + + Args: + bucket: S3 bucket name + prefix: Prefix/folder path + filename: File name + + Returns: + Complete S3 path + + Example: + >>> build_s3_path("my-bucket", "artifacts/spark", "app.py") + 's3://my-bucket/artifacts/spark/app.py' + """ + prefix = prefix.strip("/") + if prefix: + return f"s3://{bucket}/{prefix}/{filename}" + return f"s3://{bucket}/{filename}" + + +def wait_for_completion( + client: "BatchSparkClient", + submission_id: str, + timeout: int = 3600, + poll_interval: int = 10, +) -> "ApplicationStatus": + """Wait for Spark application to complete. 
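+
+    Polls client.get_job(submission_id) every poll_interval seconds and
+    returns as soon as the application reaches COMPLETED or FAILED.
+
+    Illustrative usage (assumes an already-submitted application ID):
+
+        status = wait_for_completion(client, "my-app", timeout=600)
+        print(f"Final state: {status.state.value}")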
+ + Args: + client: BatchSparkClient instance + submission_id: Submission ID to monitor + timeout: Maximum time to wait in seconds + poll_interval: Polling interval in seconds + + Returns: + Final ApplicationStatus + + Raises: + TimeoutError: If application doesn't complete within timeout + """ + import time + + from kubeflow.spark.models import ApplicationState + + start_time = time.time() + + while True: + status = client.get_job(submission_id) + + if status.state in [ + ApplicationState.COMPLETED, + ApplicationState.FAILED, + ]: + return status + + elapsed = time.time() - start_time + if elapsed >= timeout: + raise TimeoutError(f"Application {submission_id} did not complete within {timeout}s") + + logger.info( + f"Application {submission_id} status: {status.state.value}. Waiting {poll_interval}s..." + ) + time.sleep(poll_interval) diff --git a/kubeflow/spark/validation.py b/kubeflow/spark/validation.py new file mode 100644 index 000000000..d8e74a2d3 --- /dev/null +++ b/kubeflow/spark/validation.py @@ -0,0 +1,461 @@ +"""Validation module for Spark applications (matches operator webhook logic). + +This module provides client-side validation that mirrors the Spark Operator's webhook +validation, allowing for fast failure before submission. + +Key validations: +- Spark version compatibility (e.g., pod templates require Spark 3.0+) +- Resource format validation (memory, CPU) +- Node selector conflicts +- Dynamic allocation configuration +- Port conflicts in driver ingress options +- Dependency paths +""" + +from dataclasses import dataclass, field +from enum import Enum +import logging +import re +from typing import Any, Optional + +from kubeflow.spark.models import SparkApplicationRequest + +logger = logging.getLogger(__name__) + + +class ValidationErrorType(Enum): + """Types of validation errors.""" + + SPARK_VERSION = "spark_version" + RESOURCE_FORMAT = "resource_format" + NODE_SELECTOR_CONFLICT = "node_selector_conflict" + DRIVER_INGRESS_PORTS = "driver_ingress_ports" + DYNAMIC_ALLOCATION = "dynamic_allocation" + DEPENDENCY_PATH = "dependency_path" + REQUIRED_FIELD = "required_field" + INVALID_VALUE = "invalid_value" + + +@dataclass +class ValidationError: + """A single validation error. + + Attributes: + type: Type of validation error + field: Field that failed validation + message: Human-readable error message + value: The invalid value (if applicable) + """ + + type: ValidationErrorType + field: str + message: str + value: Optional[Any] = None + + +@dataclass +class ValidationResult: + """Result of validation checks. + + Attributes: + valid: Whether validation passed + errors: List of validation errors + warnings: List of validation warnings (non-fatal) + """ + + valid: bool + errors: list[ValidationError] = field(default_factory=list) + warnings: list[str] = field(default_factory=list) + + def add_error(self, error: ValidationError): + """Add an error and mark result as invalid.""" + self.errors.append(error) + self.valid = False + + def add_warning(self, message: str): + """Add a non-fatal warning.""" + self.warnings.append(message) + + +class SparkVersionValidator: + """Validates Spark version compatibility (matches operator logic).""" + + @staticmethod + def compare_version(version1: str, version2: str) -> int: + """Compare two semantic versions. 
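+
+        For example, compare_version("3.5.0", "3.0.0") returns 1, while
+        compare_version("3.0", "3.0.0") returns 0 because trailing ".0"
+        components are stripped before comparison.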
+
+        Args:
+            version1: First version string (e.g., "3.5.0")
+            version2: Second version string (e.g., "3.0.0")
+
+        Returns:
+            -1 if version1 < version2, 0 if equal, 1 if version1 > version2
+        """
+
+        def normalize(v):
+            return [int(x) for x in re.sub(r"(\.0+)*$", "", v).split(".")]
+
+        try:
+            parts1 = normalize(version1)
+            parts2 = normalize(version2)
+
+            # Pad shorter version with zeros
+            max_len = max(len(parts1), len(parts2))
+            parts1.extend([0] * (max_len - len(parts1)))
+            parts2.extend([0] * (max_len - len(parts2)))
+
+            for p1, p2 in zip(parts1, parts2):
+                if p1 < p2:
+                    return -1
+                elif p1 > p2:
+                    return 1
+            return 0
+        except (ValueError, AttributeError) as e:
+            logger.warning(f"Failed to compare versions {version1} and {version2}: {e}")
+            return 0
+
+    def validate(self, request: SparkApplicationRequest) -> ValidationResult:
+        """Validate Spark version requirements.
+
+        Checks:
+        - Pod templates require Spark >= 3.0.0 (from operator webhook)
+        - Dynamic allocation features require Spark >= 3.0.0
+
+        Args:
+            request: Spark application request
+
+        Returns:
+            ValidationResult
+        """
+        result = ValidationResult(valid=True)
+
+        # Check pod template requirement (from operator)
+        if (request.driver_pod_template or request.executor_pod_template) and \
+                self.compare_version(request.spark_version, "3.0.0") < 0:
+            result.add_error(
+                ValidationError(
+                    type=ValidationErrorType.SPARK_VERSION,
+                    field="spark_version",
+                    message="Pod template feature requires Spark version 3.0.0 or higher",
+                    value=request.spark_version,
+                )
+            )
+
+        # Check dynamic allocation (Spark 3.0+)
+        if request.dynamic_allocation and request.dynamic_allocation.enabled and \
+                self.compare_version(request.spark_version, "3.0.0") < 0:
+            result.add_warning(
+                "Dynamic allocation on Kubernetes requires Spark 3.0.0+. "
+                f"Your version: {request.spark_version}"
+            )
+
+        return result
+
+
+class ResourceValidator:
+    """Validates resource specifications (memory, CPU)."""
+
+    # Regex patterns for resource formats
+    MEMORY_PATTERN = re.compile(r"^(\d+)(m|M|g|G|k|K|b|B)?$")
+    CORE_LIMIT_PATTERN = re.compile(r"^(\d+)(m)?$")
+
+    @classmethod
+    def validate_memory(cls, memory: str, field_name: str) -> Optional[ValidationError]:
+        """Validate memory format.
+
+        Args:
+            memory: Memory string (e.g., "4g", "512m")
+            field_name: Field name for error reporting
+
+        Returns:
+            ValidationError if invalid, None if valid
+        """
+        if not cls.MEMORY_PATTERN.match(memory):
+            return ValidationError(
+                type=ValidationErrorType.RESOURCE_FORMAT,
+                field=field_name,
+                message=(
+                    f"Invalid memory format: {memory}. Expected a number followed "
+                    "by a unit of m, g, k, or b (e.g., '4g', '512m')"
+                ),
+                value=memory,
+            )
+        return None
+
+    @classmethod
+    def validate_cores(cls, cores: int, field_name: str) -> Optional[ValidationError]:
+        """Validate CPU cores.
+
+        Args:
+            cores: Number of cores
+            field_name: Field name for error reporting
+
+        Returns:
+            ValidationError if invalid, None if valid
+        """
+        if cores < 1:
+            return ValidationError(
+                type=ValidationErrorType.RESOURCE_FORMAT,
+                field=field_name,
+                message=f"CPU cores must be >= 1, got: {cores}",
+                value=cores,
+            )
+        return None
+
+    def validate(self, request: SparkApplicationRequest) -> ValidationResult:
+        """Validate all resource specifications.
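+
+        Driver and executor memory strings are checked against MEMORY_PATTERN,
+        driver and executor cores must be >= 1, and num_executors must be >= 1
+        unless dynamic allocation is enabled.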
+ + Args: + request: Spark application request + + Returns: + ValidationResult + """ + result = ValidationResult(valid=True) + + # Validate driver resources + error = self.validate_memory(request.driver_memory, "driver_memory") + if error: + result.add_error(error) + + error = self.validate_cores(request.driver_cores, "driver_cores") + if error: + result.add_error(error) + + # Validate executor resources + error = self.validate_memory(request.executor_memory, "executor_memory") + if error: + result.add_error(error) + + error = self.validate_cores(request.executor_cores, "executor_cores") + if error: + result.add_error(error) + + # Validate number of executors + if request.num_executors < 1 and not ( + request.dynamic_allocation and request.dynamic_allocation.enabled + ): + result.add_error( + ValidationError( + type=ValidationErrorType.INVALID_VALUE, + field="num_executors", + message=("num_executors must be >= 1 (unless dynamic allocation is enabled)"), + value=request.num_executors, + ) + ) + + return result + + +class NodeSelectorValidator: + """Validates node selector configuration (matches operator webhook).""" + + def validate(self, request: SparkApplicationRequest) -> ValidationResult: + """Validate node selector conflicts. + + From operator webhook: + node selector cannot be defined at both SparkApplication and Driver/Executor + + Args: + request: Spark application request + + Returns: + ValidationResult + """ + result = ValidationResult(valid=True) + + # This check is handled differently in the SDK since we don't have separate + # driver.nodeSelector and executor.nodeSelector fields yet + # The node_selector field applies to both driver and executor + + if request.node_selector and len(request.node_selector) > 0: + result.add_warning( + "node_selector is applied to both driver and executor pods. " + "Use pod templates if you need different selectors per component." + ) + + return result + + +class DynamicAllocationValidator: + """Validates dynamic allocation configuration.""" + + def validate(self, request: SparkApplicationRequest) -> ValidationResult: + """Validate dynamic allocation settings. 
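+
+        The check passes immediately when dynamic allocation is not configured
+        or is disabled.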
+ + Checks: + - If enabled, min_executors <= initial_executors <= max_executors + - Shuffle tracking is enabled by default (operator behavior) + + Args: + request: Spark application request + + Returns: + ValidationResult + """ + result = ValidationResult(valid=True) + + if not request.dynamic_allocation or not request.dynamic_allocation.enabled: + return result + + dyn_alloc = request.dynamic_allocation + + # Validate executor bounds + if dyn_alloc.min_executors is not None and dyn_alloc.max_executors is not None and \ + dyn_alloc.min_executors > dyn_alloc.max_executors: + result.add_error( + ValidationError( + type=ValidationErrorType.DYNAMIC_ALLOCATION, + field="dynamic_allocation", + message=( + f"min_executors ({dyn_alloc.min_executors}) must be <= " + f"max_executors ({dyn_alloc.max_executors})" + ), + value=(f"min={dyn_alloc.min_executors}, max={dyn_alloc.max_executors}"), + ) + ) + + if dyn_alloc.initial_executors is not None: + if ( + dyn_alloc.min_executors is not None + and dyn_alloc.initial_executors < dyn_alloc.min_executors + ): + result.add_error( + ValidationError( + type=ValidationErrorType.DYNAMIC_ALLOCATION, + field="dynamic_allocation.initial_executors", + message=( + f"initial_executors ({dyn_alloc.initial_executors}) " + f"must be >= min_executors ({dyn_alloc.min_executors})" + ), + value=dyn_alloc.initial_executors, + ) + ) + + if ( + dyn_alloc.max_executors is not None + and dyn_alloc.initial_executors > dyn_alloc.max_executors + ): + result.add_error( + ValidationError( + type=ValidationErrorType.DYNAMIC_ALLOCATION, + field="dynamic_allocation.initial_executors", + message=( + f"initial_executors ({dyn_alloc.initial_executors}) " + f"must be <= max_executors ({dyn_alloc.max_executors})" + ), + value=dyn_alloc.initial_executors, + ) + ) + + # Warn if shuffle tracking is disabled (operator enables by default) + if dyn_alloc.shuffle_tracking_enabled is False: + result.add_warning( + "Shuffle tracking is disabled. You may need an external shuffle service. " + "See: https://spark.apache.org/docs/latest/running-on-kubernetes.html" + "#dynamic-resource-allocation" + ) + + return result + + +class SparkApplicationValidator: + """Main validator that orchestrates all validation checks.""" + + def __init__(self): + """Initialize validator with all sub-validators.""" + self.version_validator = SparkVersionValidator() + self.resource_validator = ResourceValidator() + self.node_selector_validator = NodeSelectorValidator() + self.dynamic_allocation_validator = DynamicAllocationValidator() + + def validate_all(self, request: SparkApplicationRequest) -> ValidationResult: + """Run all validation checks. 
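+
+        Runs the version, resource, node selector, and dynamic allocation
+        validators, merges their errors and warnings into a single result,
+        and logs a summary of any failures.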
+ + Args: + request: Spark application request + + Returns: + ValidationResult with all errors and warnings + """ + final_result = ValidationResult(valid=True) + + # Run all validators + validators = [ + self.version_validator, + self.resource_validator, + self.node_selector_validator, + self.dynamic_allocation_validator, + ] + + for validator in validators: + result = validator.validate(request) + final_result.errors.extend(result.errors) + final_result.warnings.extend(result.warnings) + + # Mark as invalid if any errors + if final_result.errors: + final_result.valid = False + + # Log results + if not final_result.valid: + logger.error(f"Validation failed with {len(final_result.errors)} errors:") + for error in final_result.errors: + logger.error(f" [{error.type.value}] {error.field}: {error.message}") + + if final_result.warnings: + logger.warning(f"Validation completed with {len(final_result.warnings)} warnings:") + for warning in final_result.warnings: + logger.warning(f" {warning}") + + return final_result + + def validate_and_raise(self, request: SparkApplicationRequest): + """Validate and raise exception if invalid. + + Args: + request: Spark application request + + Raises: + ValueError: If validation fails + """ + result = self.validate_all(request) + + if not result.valid: + error_messages = [f"{error.field}: {error.message}" for error in result.errors] + raise ValueError( + "Spark application validation failed:\n" + + "\n".join(f" - {msg}" for msg in error_messages) + ) + + +# Convenience function +def validate_spark_application(request: SparkApplicationRequest) -> ValidationResult: + """Validate a Spark application request. + + Args: + request: Spark application request to validate + + Returns: + ValidationResult + + Example: + ```python + from kubeflow.spark import SparkApplicationRequest + from kubeflow.spark.validation import validate_spark_application + + request = SparkApplicationRequest( + app_name="my-app", + main_application_file="local:///app/main.py", + spark_version="2.4.0", # Too old for pod templates! + driver_pod_template={...}, # Will fail validation + ) + + result = validate_spark_application(request) + if not result.valid: + for error in result.errors: + print(f"Error: {error.message}") + ``` + """ + validator = SparkApplicationValidator() + return validator.validate_all(request) diff --git a/pyproject.toml b/pyproject.toml index 570d16f90..37e661a47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,15 @@ docker = [ podman = [ "podman>=5.6.0" ] +spark-gateway = [ + "requests>=2.31.0", + "pyyaml>=6.0", +] +spark-connect = [ + "pyspark[connect]>=3.4.0", + "grpcio>=1.48.0", + "pyarrow>=10.0.0", +] [dependency-groups] dev = [