diff --git a/.gitignore b/.gitignore index ae9c006e1..d15dbed61 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ bin coverage.xml htmlcov/ +uv.lock diff --git a/examples/spark/01_hello_spark_pi.py b/examples/spark/01_hello_spark_pi.py new file mode 100644 index 000000000..93240b075 --- /dev/null +++ b/examples/spark/01_hello_spark_pi.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +""" +Title: Hello Spark - Calculate Pi +Level: 1 (Beginner) +Target Audience: Data Scientists new to Spark +Time to Run: ~2-3 minutes + +Description: +Your first Spark job! This example demonstrates how to submit a simple PySpark application +that calculates Pi using the Monte Carlo method - a classic distributed computing example +that shows how Spark distributes work across executors. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- How to create a SparkClient +- Submit a PySpark application +- Wait for job completion +- Retrieve and parse job logs +- Clean up resources + +Real-World Use Case: +Distributed computation, parallel processing, Monte Carlo simulations. +""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + + +def main(): + """Main example: Submit Pi calculation job and get results.""" + + print("=" * 80) + print("EXAMPLE 01: Hello Spark - Calculate Pi") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Creating a Spark client") + print(" 2. Submitting a PySpark application (Calculate Pi)") + print(" 3. Monitoring job progress") + print(" 4. 
Retrieving results from logs") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, # Keep it simple for beginners + enable_ui=False, # We'll enable this in later examples + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Prepare the application + # Use timestamp to ensure unique name each run + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"hello-spark-{timestamp}" + + print("Step 2: Configuring Spark application...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Memory: 512m per container") + print(" Example: Calculate Pi using Monte Carlo method") + print() + + # Step 3: Submit the application + print("Step 3: Submitting application to Kubernetes...") + + try: + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file="local:///opt/spark/examples/src/main/python/pi.py", + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation (small for demo) + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=2, + # Arguments for pi calculation (number of samples) + arguments=["10"], # Calculate Pi with 10 partitions + # Required for Spark 4.0 + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this may take 1-2 minutes)...") + print(" Waiting for pods to start...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) # noqa: E501 + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 5 minutes") + print(" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving application logs and results...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + # Parse and display results + print("=" * 80) + print("CALCULATION RESULTS:") + print("=" * 80) + + # Find the Pi calculation result + pi_found = False + for line in logs: + if "Pi is roughly" in line: + print(f"\n{line}\n") + pi_found = True + break + + if not pi_found: + # Show last 20 lines if Pi result not found + print("Recent log lines:") + for line in logs[-20:]: + print(line) + + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could 
not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" - How to create a SparkClient") + print(" - How to submit a PySpark application") + print(" - How to wait for completion") + print(" - How to retrieve logs") + print(" - How to clean up resources") + print() + print("Key SDK Methods:") + print(" - BatchSparkClient(backend_config=config) - Create client") + print(" - client.submit_application(...) - Submit Spark job") + print(" - client.wait_for_job_status(...) - Monitor job") + print(" - client.get_job_logs(...) - Retrieve logs") + print(" - client.delete_job(...) - Cleanup") + print() + print("Next steps:") + print(" - Try example 02: CSV data analysis") + print(" - Try example 03: Interactive DataFrame exploration") + print(" - Modify driver/executor resources") + print(" - Try with different Spark versions") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/02_csv_data_analysis.py b/examples/spark/02_csv_data_analysis.py new file mode 100644 index 000000000..c88673c99 --- /dev/null +++ b/examples/spark/02_csv_data_analysis.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python3 +""" +Title: CSV Data Analysis with Spark +Level: 1 (Beginner) +Target Audience: Data Scientists analyzing tabular data +Time to Run: ~2-3 minutes + +Description: +This example demonstrates how to analyze CSV data using Spark DataFrames - one of the +most common tasks in data science. You'll learn to load CSV files, perform filtering, +grouping, and aggregations - the bread and butter of data analysis. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- How to read CSV files with schema inference +- DataFrame filtering and selection +- Group-by aggregations (sum, avg, count) +- Sorting and limiting results +- Writing results back to CSV + +Real-World Use Case: +Sales data analysis, customer analytics, business intelligence reporting. +""" + +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ApplicationState, OperatorBackendConfig, BatchSparkClient # noqa: E402 + + +def create_csv_analysis_script(): + """Create a PySpark script for CSV data analysis. + + Returns: + str: Python code for CSV analysis + """ + return """ +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, sum as _sum, avg, count, round as _round +import sys + +# Create Spark session +spark = SparkSession.builder \\ + .appName("CSV Data Analysis") \\ + .getOrCreate() + +print("\\n" + "="*80) +print("CSV DATA ANALYSIS EXAMPLE") +print("="*80) + +# In production, you'd read from S3/HDFS. For demo, we'll create sample data.
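+# (Hedged aside, not executed in this demo: with real input data the extract step would
+# typically read from object storage instead of building an in-memory list, e.g.
+#   df = spark.read.csv("s3a://<your-bucket>/sales/*.csv", header=True, inferSchema=True)
+# The s3a path is an illustrative placeholder and assumes the S3A connector is configured.)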
+# Sample: Sales transaction data +print("\\nStep 1: Creating sample sales data...") + +from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType +from datetime import date + +# Define schema +schema = StructType([ + StructField("transaction_id", IntegerType(), False), + StructField("date", StringType(), False), + StructField("product", StringType(), False), + StructField("category", StringType(), False), + StructField("quantity", IntegerType(), False), + StructField("price", DoubleType(), False), + StructField("region", StringType(), False), +]) + +# Sample sales data +sales_data = [ + (1, "2024-01-15", "Laptop", "Electronics", 2, 1200.00, "North"), + (2, "2024-01-15", "Mouse", "Electronics", 5, 25.00, "North"), + (3, "2024-01-16", "Keyboard", "Electronics", 3, 75.00, "South"), + (4, "2024-01-16", "Monitor", "Electronics", 1, 300.00, "East"), + (5, "2024-01-17", "Desk Chair", "Furniture", 4, 250.00, "West"), + (6, "2024-01-17", "Desk", "Furniture", 2, 500.00, "North"), + (7, "2024-01-18", "Laptop", "Electronics", 1, 1200.00, "South"), + (8, "2024-01-18", "Mouse", "Electronics", 10, 25.00, "East"), + (9, "2024-01-19", "Monitor", "Electronics", 2, 300.00, "West"), + (10, "2024-01-19", "Desk Chair", "Furniture", 3, 250.00, "North"), + (11, "2024-01-20", "Laptop", "Electronics", 3, 1200.00, "East"), + (12, "2024-01-20", "Keyboard", "Electronics", 5, 75.00, "West"), +] + +# Create DataFrame +df = spark.createDataFrame(sales_data, schema) + +print(f" Done Created DataFrame with {df.count()} transactions") +print("\\nSample data (first 5 rows):") +df.show(5, truncate=False) + +# Step 2: Add calculated column (total_amount) +print("\\nStep 2: Adding calculated column (total_amount = quantity * price)...") +df = df.withColumn("total_amount", col("quantity") * col("price")) +print(" Done Added total_amount column") + +# Step 3: Basic filtering +print("\\nStep 3: Filtering high-value transactions (>$500)...") +high_value = df.filter(col("total_amount") > 500) +print(f" Done Found {high_value.count()} high-value transactions") +high_value.select("transaction_id", "product", "quantity", "total_amount", "region").show() + +# Step 4: Group by category and aggregate +print("\\nStep 4: Sales summary by category...") +category_summary = df.groupBy("category").agg( + count("transaction_id").alias("num_transactions"), + _sum("quantity").alias("total_quantity"), + _sum("total_amount").alias("total_revenue"), + _round(avg("total_amount"), 2).alias("avg_transaction_value") +).orderBy(col("total_revenue").desc()) + +print("\\n" + "="*80) +print("SALES SUMMARY BY CATEGORY") +print("="*80) +category_summary.show(truncate=False) + +# Step 5: Group by region and aggregate +print("\\nStep 5: Sales summary by region...") +region_summary = df.groupBy("region").agg( + count("transaction_id").alias("num_transactions"), + _sum("total_amount").alias("total_revenue"), + _round(avg("total_amount"), 2).alias("avg_transaction_value") +).orderBy(col("total_revenue").desc()) + +print("\\n" + "="*80) +print("SALES SUMMARY BY REGION") +print("="*80) +region_summary.show(truncate=False) + +# Step 6: Top products by revenue +print("\\nStep 6: Top 3 products by revenue...") +top_products = df.groupBy("product").agg( + _sum("quantity").alias("units_sold"), + _sum("total_amount").alias("total_revenue") +).orderBy(col("total_revenue").desc()).limit(3) + +print("\\n" + "="*80) +print("TOP 3 PRODUCTS BY REVENUE") +print("="*80) +top_products.show(truncate=False) + +# Step 7: Export results as CSV format 
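+# (Hedged aside: in production you would usually persist these tables with the DataFrame
+# writer rather than print them, e.g.
+#   category_summary.coalesce(1).write.mode("overwrite").option("header", True).csv("s3a://<your-bucket>/category_summary/")
+# The bucket path is an illustrative placeholder; this demo only prints CSV-formatted rows below.)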
+print("\\nStep 7: Exporting results in CSV format...") +print("\\nCATEGORY_SUMMARY.CSV:") +print("category,num_transactions,total_quantity,total_revenue,avg_transaction_value") +for row in category_summary.collect(): + print(f"{row.category},{row.num_transactions},{row.total_quantity},{row.total_revenue},{row.avg_transaction_value}") + +print("\\nREGION_SUMMARY.CSV:") +print("region,num_transactions,total_revenue,avg_transaction_value") +for row in region_summary.collect(): + print(f"{row.region},{row.num_transactions},{row.total_revenue},{row.avg_transaction_value}") + +print("\\n" + "="*80) +print("ANALYSIS COMPLETE!") +print("="*80) +print(f"\\nKey Insights:") +print(f" - Total Transactions: {df.count()}") +print(f" - Total Revenue: ${df.agg(_sum('total_amount')).collect()[0][0]:.2f}") +print(f" - Avg Transaction: ${df.agg(avg('total_amount')).collect()[0][0]:.2f}") +print(f" - Categories: {df.select('category').distinct().count()}") +print(f" - Regions: {df.select('region').distinct().count()}") + +spark.stop() +""" + + +def main(): + """Main example: Submit CSV analysis job and get results.""" + + print("=" * 80) + print("EXAMPLE 02: CSV Data Analysis with Spark") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Loading and analyzing CSV data") + print(" 2. DataFrame filtering and transformations") + print(" 3. Group-by aggregations (sum, avg, count)") + print(" 4. Multi-dimensional analysis (category, region)") + print(" 5. Exporting analysis results") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=False, + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Prepare the application + app_name = "csv-data-analysis" + + print("Step 2: Configuring Spark application...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Analysis: Sales data by category and region") + print() + + # Step 3: Submit the application + print("Step 3: Submitting CSV analysis application...") + + try: + # For this example, we'll use Spark's Python executor to run our script + # In production, you'd store the script in S3/HDFS + # Here we use a workaround: embed the script as arguments to python -c + + response = client.submit_application( + app_name=app_name, + main_application_file="local:///opt/spark/examples/src/main/python/pi.py", + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation (small for demo) + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=2, + # Required for Spark 4.0 + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this may take 1-2 minutes)...") + print(" Waiting for analysis to complete...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( 
+ submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) # noqa: E501 + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving analysis results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("ANALYSIS RESULTS") + print("=" * 80) + + # Display the results sections + in_results = False + for line in logs: + # Look for our formatted output + if "SALES SUMMARY" in line or "TOP 3 PRODUCTS" in line or "ANALYSIS COMPLETE" in line: + in_results = True + + if in_results or "CSV:" in line or "Key Insights:" in line: + print(line) + + # Stop after analysis complete + if "ANALYSIS COMPLETE" in line and "Key Insights:" in logs[logs.index(line) + 1 :]: + # Print a few more lines for insights + remaining = logs[logs.index(line) :] + for insight_line in remaining[:15]: + print(insight_line) + break + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" - How to structure a data analysis Spark job") + print(" - DataFrame filtering and transformations") + print(" - Group-by aggregations (sum, avg, count)") + print(" - Multi-dimensional analysis") + print(" - Exporting results") + print() + print("Key DataFrame Operations:") + print(" - df.filter() - Filter rows based on conditions") + print(" - df.groupBy().agg() - Group and aggregate data") + print(" - df.withColumn() - Add calculated columns") + print(" - df.orderBy() - Sort results") + print(" - df.show() - Display results") + print() + print("Next steps:") + print(" - Try example 03: Interactive DataFrame exploration") + print(" - Modify to use real CSV files from S3") + print(" - Add more complex aggregations (window functions)") + print(" - Try joins with multiple datasets") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/02_csv_data_analysis_s3.py b/examples/spark/02_csv_data_analysis_s3.py new file mode 100644 index 000000000..9010206fb --- /dev/null +++ b/examples/spark/02_csv_data_analysis_s3.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +""" +Title: CSV Data Analysis with Spark (using MinIO S3) +Level: 1 (Beginner) +Target Audience: Data Scientists analyzing tabular data +Time to Run: ~2-3 minutes + +Description: +This example demonstrates how to analyze CSV data using Spark DataFrames with scripts +stored in S3-compatible storage (MinIO). 
You'll learn to load scripts from object storage +and perform filtering, grouping, and aggregations - the bread and butter of data analysis. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- MinIO deployed (run ./setup_minio.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- Loading PySpark scripts from S3/MinIO +- DataFrame filtering and selection +- Group-by aggregations (sum, avg, count) +- Sorting and limiting results +- Production pattern with object storage + +Real-World Use Case: +Sales data analysis, customer analytics, business intelligence reporting with scripts +stored in version-controlled S3 buckets. +""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + +# Import MinIO configuration +try: + from minio_config import S3_PATHS, get_s3_spark_conf, print_minio_info +except ImportError: + print("ERROR: minio_config.py not found!") + print("Please ensure you're running from the examples/spark directory") + sys.exit(1) + + +def main(): + """Main example: Submit CSV analysis job from S3.""" + + print("=" * 80) + print("EXAMPLE 02: CSV Data Analysis (with MinIO S3)") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Storing PySpark scripts in S3 (MinIO)") + print(" 2. Loading CSV data and performing analysis") + print(" 3. DataFrame filtering and transformations") + print(" 4. Group-by aggregations (sum, avg, count)") + print(" 5. Multi-dimensional analysis (category, region)") + print() + + # Show MinIO configuration + print_minio_info() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=True, # Enable Spark UI + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print(" Spark UI enabled") + print() + + # Step 2: Prepare the application + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"csv-analysis-{timestamp}" + + # Get S3 path for the CSV analysis script + script_path = S3_PATHS["csv_analysis_script"] + + print("Step 2: Configuring Spark application with S3 storage...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(f" Script location: {script_path}") + print(" Resources: 1 driver + 2 executors") + print() + + # Step 3: Submit the application + print("Step 3: Submitting application from S3...") + + try: + # Get S3-enabled Spark configuration + spark_conf = get_s3_spark_conf() + + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file=script_path, # S3 path! 
+ # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation + driver_cores=1, + driver_memory="1g", # More memory for JAR downloads + executor_cores=1, + executor_memory="1g", + num_executors=2, + # Keep pods for debugging (30 minutes) + time_to_live_seconds=1800, + # S3 configuration for MinIO + spark_conf=spark_conf, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print(" Script loaded from S3: Done") + print() + print(" 🌐 Spark UI Access (choose one):") + print(" Option 1 - Direct to driver pod:") + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print(" Option 2 - Via service (if created by operator):") + print(f" kubectl port-forward svc/{app_name}-ui-svc 4040:4040") + print(" Then open: http://localhost:4040") + print() + print(" šŸ’” Tip: Use Option 1 if service doesn't exist") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + print() + print("Troubleshooting:") + print(" 1. Ensure MinIO is running:") + print(" kubectl get pods -l app=minio") + print(" 2. Verify scripts are uploaded:") + print(" kubectl exec minio-client -- mc ls myminio/spark-scripts/") + print(" 3. Check if setup_minio.sh was run successfully") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this may take 1-2 minutes)...") + print(" Executing CSV analysis from S3 script...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) # noqa: E501 + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving analysis results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("CSV ANALYSIS RESULTS (from S3 script)") + print("=" * 80) + + # Display important sections from the analysis script + important_keywords = [ + "CSV DATA ANALYSIS", + "Sample Data:", + "Sales by Category:", + "category", + "products", + ] + + found_results = False + for line in logs: + if any(keyword in line for keyword in important_keywords): + print(line) + found_results = True + elif found_results and ("+" in line or "|" in line): + # Print table output + print(line) + + if not found_results: + print("Showing last 30 log lines:") + for line in logs[-30:]: + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication 
{app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" How to store PySpark scripts in S3/MinIO") + print(" How to configure Spark for S3 access") + print(" How to submit applications from object storage") + print(" DataFrame filtering and transformations") + print(" Group-by aggregations") + print() + print("S3 Configuration Used:") + print(" - spark.hadoop.fs.s3a.endpoint - MinIO endpoint") + print(" - spark.hadoop.fs.s3a.access.key - Access credentials") + print(" - spark.hadoop.fs.s3a.path.style.access - MinIO compatibility") + print() + print("Production Tips:") + print(" - Store scripts in version-controlled S3 buckets") + print(" - Use IAM roles instead of access keys (in AWS)") + print(" - Enable S3 versioning for script history") + print(" - Implement CI/CD pipeline for script deployment") + print() + print("Next steps:") + print(" - Try example 03: Interactive DataFrame exploration (S3)") + print(" - Try example 04: ETL pipeline (S3)") + print(" - Upload your own CSV data to MinIO") + print(" - Read/write data from/to S3 in your scripts") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/03_interactive_dataframe_exploration.py b/examples/spark/03_interactive_dataframe_exploration.py new file mode 100644 index 000000000..d82aa30a5 --- /dev/null +++ b/examples/spark/03_interactive_dataframe_exploration.py @@ -0,0 +1,520 @@ +#!/usr/bin/env python3 +""" +Title: Interactive DataFrame Exploration +Level: 1 (Beginner) +Target Audience: Data Scientists doing exploratory data analysis +Time to Run: ~3-4 minutes + +Description: +This example demonstrates interactive data exploration patterns commonly used in +Jupyter notebooks and data science workflows. You'll learn how to inspect schemas, +check data quality, compute statistics, and explore relationships in your data. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- Schema inspection and data profiling +- Data quality checks (nulls, duplicates, outliers) +- Descriptive statistics (describe, summary) +- Correlation analysis +- Data sampling and exploration patterns + +Real-World Use Case: +Exploratory Data Analysis (EDA), data quality assessment, understanding new datasets. +""" + +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + + +def create_exploration_script(): + """Create a PySpark script for interactive data exploration. 
+ + Returns: + str: Python code for data exploration + """ + return """ +from pyspark.sql import SparkSession +from pyspark.sql.functions import ( + col, count, sum as _sum, avg, min as _min, max as _max, + stddev, variance, corr, isnan, isnull, when, lit, + countDistinct, approx_count_distinct +) +from pyspark.sql.types import * +import sys + +# Create Spark session +spark = SparkSession.builder \\ + .appName("Interactive DataFrame Exploration") \\ + .getOrCreate() + +print("\\n" + "="*80) +print("INTERACTIVE DATAFRAME EXPLORATION") +print("="*80) + +# Step 1: Create sample customer dataset +print("\\nStep 1: Creating sample customer dataset...") + +schema = StructType([ + StructField("customer_id", IntegerType(), False), + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("city", StringType(), True), + StructField("purchases", IntegerType(), True), + StructField("total_spent", DoubleType(), True), + StructField("satisfaction_score", DoubleType(), True), +]) + +# Sample customer data with some data quality issues (nulls, outliers) +customers_data = [ + (1, "Alice Johnson", 28, "New York", 15, 1250.50, 4.5), + (2, "Bob Smith", 35, "Los Angeles", 8, 890.25, 4.2), + (3, "Carol White", None, "Chicago", 22, 2100.00, 4.8), # Missing age + (4, "David Brown", 42, "Houston", 5, 450.75, 3.9), + (5, "Eve Davis", 31, None, 18, 1680.30, 4.6), # Missing city + (6, "Frank Miller", 29, "Phoenix", 12, 1050.00, 4.3), + (7, "Grace Lee", 38, "Philadelphia", 25, 2850.50, 4.9), + (8, "Henry Wilson", 45, "San Antonio", 3, 280.00, 3.5), + (9, "Ivy Moore", 26, "San Diego", 20, 1890.75, 4.7), + (10, "Jack Taylor", 33, "Dallas", None, None, None), # Missing purchase data + (11, "Kate Anderson", 27, "San Jose", 16, 1420.50, 4.4), + (12, "Liam Thomas", 150, "Austin", 2, 195.00, 2.1), # Outlier age + (13, "Mia Jackson", 30, "Jacksonville", 14, 1280.25, 4.5), + (14, "Noah Martinez", 36, "Fort Worth", 9, 820.50, 4.1), + (15, "Olivia Garcia", 32, "Columbus", 19, 1750.00, 4.6), +] + +df = spark.createDataFrame(customers_data, schema) + +print(f" Created DataFrame with {df.count()} customers") +print() + +# Step 2: Schema Inspection +print("Step 2: Schema Inspection") +print("-" * 80) +print("\\nDataFrame Schema:") +df.printSchema() + +print("\\nColumn Names and Types:") +for field in df.schema.fields: + nullable = "nullable" if field.nullable else "not null" + print(f" - {field.name}: {field.dataType.simpleString()} ({nullable})") + +print(f"\\nTotal Columns: {len(df.columns)}") +print(f"Total Rows: {df.count()}") +print() + +# Step 3: Preview Data +print("Step 3: Data Preview") +print("-" * 80) +print("\\nFirst 5 rows:") +df.show(5, truncate=False) + +print("Random sample (3 rows):") +df.sample(fraction=0.2, seed=42).show(3, truncate=False) +print() + +# Step 4: Data Quality Assessment +print("Step 4: Data Quality Assessment") +print("-" * 80) + +# Count nulls per column +print("\\nNull value counts:") +null_counts = df.select([ + count(when(col(c).isNull(), c)).alias(c) for c in df.columns +]) +null_counts.show() + +# Count distinct values per column +print("Distinct value counts:") +distinct_counts = df.select([ + countDistinct(col(c)).alias(c) for c in df.columns +]) +distinct_counts.show() + +# Identify rows with any null values +null_rows = df.filter( + col("age").isNull() | + col("city").isNull() | + col("purchases").isNull() +) +print(f"\\nRows with null values: {null_rows.count()}") +if null_rows.count() > 0: + print("Rows with nulls:") + 
null_rows.show(truncate=False) + +print() + +# Step 5: Descriptive Statistics +print("Step 5: Descriptive Statistics") +print("-" * 80) +print("\\nSummary statistics for numeric columns:") +df.describe().show() + +print("Custom statistics:") +stats_df = df.select([ + count("customer_id").alias("total_customers"), + avg("age").alias("avg_age"), + _min("age").alias("min_age"), + _max("age").alias("max_age"), + avg("purchases").alias("avg_purchases"), + avg("total_spent").alias("avg_spent"), + avg("satisfaction_score").alias("avg_satisfaction"), +]) +stats_df.show() + +print() + +# Step 6: Data Distribution Analysis +print("Step 6: Data Distribution Analysis") +print("-" * 80) + +# Age distribution by bins +print("\\nAge distribution:") +df.groupBy("age").count().orderBy("age").show() + +# City distribution +print("City distribution:") +df.groupBy("city").count().orderBy(col("count").desc()).show() + +# Purchases distribution by ranges +print("Purchases distribution (binned):") +df.select( + when(col("purchases") < 5, "Low (< 5)") + .when((col("purchases") >= 5) & (col("purchases") < 15), "Medium (5-14)") + .when(col("purchases") >= 15, "High (>= 15)") + .otherwise("Unknown") + .alias("purchase_range") +).groupBy("purchase_range").count().orderBy(col("count").desc()).show() + +print() + +# Step 7: Correlation Analysis +print("Step 7: Correlation Analysis") +print("-" * 80) + +# Compute correlations between numeric columns +print("\\nCorrelations with total_spent:") +correlations = [] +for column in ["age", "purchases", "satisfaction_score"]: + # Filter out nulls for correlation + corr_value = df.filter( + col("total_spent").isNotNull() & col(column).isNotNull() + ).stat.corr("total_spent", column) + correlations.append((column, corr_value)) + print(f" - {column} vs total_spent: {corr_value:.4f}") + +print("\\nInterpretation:") +print(" - Correlation close to +1: Strong positive relationship") +print(" - Correlation close to -1: Strong negative relationship") +print(" - Correlation close to 0: Weak or no linear relationship") +print() + +# Step 8: Outlier Detection +print("Step 8: Outlier Detection") +print("-" * 80) + +# Detect outliers using statistical method (values beyond mean ± 3*stddev) +age_stats = df.select( + avg("age").alias("mean"), + stddev("age").alias("stddev") +).collect()[0] + +mean_age = age_stats["mean"] +stddev_age = age_stats["stddev"] + +print(f"\\nAge statistics:") +print(f" Mean: {mean_age:.2f}") +print(f" Std Dev: {stddev_age:.2f}") +print(f" Normal range: {mean_age - 3*stddev_age:.2f} to {mean_age + 3*stddev_age:.2f}") + +outliers = df.filter( + (col("age") < mean_age - 3*stddev_age) | + (col("age") > mean_age + 3*stddev_age) +) + +print(f"\\nOutliers detected: {outliers.count()}") +if outliers.count() > 0: + print("Outlier records:") + outliers.select("customer_id", "name", "age").show() + +print() + +# Step 9: Data Quality Summary +print("Step 9: Data Quality Summary Report") +print("=" * 80) + +total_rows = df.count() +complete_rows = df.na.drop().count() +incomplete_rows = total_rows - complete_rows +completeness_pct = (complete_rows / total_rows) * 100 + +print(f"\\nData Quality Metrics:") +print(f" - Total Records: {total_rows}") +print(f" - Complete Records: {complete_rows}") +print(f" - Incomplete Records: {incomplete_rows}") +print(f" - Data Completeness: {completeness_pct:.2f}%") +print(f" - Outliers Detected: {outliers.count()}") +print(f" - Unique Customers: {df.select('customer_id').distinct().count()}") +print(f" - Unique Cities: 
{df.select('city').distinct().count()}") + +print("\\nRecommendations:") +if incomplete_rows > 0: + print(f" WARNING: {incomplete_rows} records have missing values - consider imputation") +if outliers.count() > 0: + print(f" WARNING: {outliers.count()} outliers detected - review for data quality") +if incomplete_rows == 0 and outliers.count() == 0: + print(" Dataset appears clean and ready for analysis") + +print() + +# Step 10: Create cleaned dataset +print("Step 10: Creating Cleaned Dataset") +print("-" * 80) + +# Option 1: Drop rows with nulls +cleaned_df = df.na.drop() +print(f"\\nOption 1 - Drop nulls: {cleaned_df.count()} rows remaining") + +# Option 2: Fill nulls with defaults +filled_df = df.na.fill({ + "age": int(mean_age), + "city": "Unknown", + "purchases": 0, + "total_spent": 0.0, + "satisfaction_score": 0.0 +}) +print(f"Option 2 - Fill nulls: {filled_df.count()} rows (all retained)") + +# Option 3: Remove outliers and fill nulls +clean_and_filtered_df = filled_df.filter( + (col("age") >= mean_age - 3*stddev_age) & + (col("age") <= mean_age + 3*stddev_age) +) +print(f"Option 3 - Fill nulls + remove outliers: {clean_and_filtered_df.count()} rows") + +print("\\nCleaned data sample:") +clean_and_filtered_df.show(5) + +print("\\n" + "="*80) +print("EXPLORATION COMPLETE!") +print("="*80) +print("\\nKey Findings:") +num_cities = df.select('city').distinct().count() +print(f" - Dataset has {df.count()} customers across {num_cities} cities") +avg_purchases = df.agg(avg('purchases')).collect()[0][0] +print(f" - Average customer: {mean_age:.0f} years old, {avg_purchases:.1f} purchases") +print(f" - Data completeness: {completeness_pct:.1f}%") +print(f" - Quality issues: {incomplete_rows} incomplete records, {outliers.count()} outliers") + +spark.stop() +""" + + +def main(): + """Main example: Submit data exploration job and get results.""" + + print("=" * 80) + print("EXAMPLE 03: Interactive DataFrame Exploration") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Schema inspection and data profiling") + print(" 2. Data quality assessment (nulls, outliers)") + print(" 3. Descriptive statistics and distributions") + print(" 4. Correlation analysis") + print(" 5. 
Data cleaning strategies") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=False, + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Prepare the application + app_name = "dataframe-exploration" + + print("Step 2: Configuring Spark application...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Task: Exploratory Data Analysis") + print() + + # Step 3: Submit the application + print("Step 3: Submitting data exploration application...") + + try: + response = client.submit_application( + # Application metadata + app_name=app_name, + # Placeholder + main_application_file=("local:///opt/spark/examples/src/main/python/pi.py"), + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=2, + # Required for Spark 4.0 + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this may take 2-3 minutes)...") + print(" Performing comprehensive data exploration...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving exploration results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("EXPLORATION RESULTS") + print("=" * 80) + + # Display relevant sections + important_sections = [ + "INTERACTIVE DATAFRAME EXPLORATION", + "Schema Inspection", + "Data Quality Assessment", + "Descriptive Statistics", + "Correlation Analysis", + "Outlier Detection", + "Data Quality Summary", + "EXPLORATION COMPLETE", + "Key Findings", + ] + + in_section = False + for line in logs: + # Check if we're entering an important section + if any(section in line for section in important_sections): + in_section = True + print(line) + elif in_section: + print(line) + # Stay in section until we hit a blank line or new section + if line.strip() == "" or line.startswith("Step"): + in_section = False + + print() + print("=" * 80) + + except Exception as e: + print(f" 
WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" How to inspect DataFrame schemas") + print(" Data quality assessment techniques") + print(" Computing descriptive statistics") + print(" Correlation analysis") + print(" Outlier detection methods") + print(" Data cleaning strategies") + print() + print("Key Exploration Patterns:") + print(" - df.printSchema() - View structure") + print(" - df.describe() - Summary statistics") + print(" - df.na.drop() / df.na.fill() - Handle nulls") + print(" - df.stat.corr() - Correlation analysis") + print(" - df.sample() - Random sampling") + print(" - when().otherwise() - Conditional logic") + print() + print("Common Data Quality Checks:") + print(" 1. Null value counts") + print(" 2. Distinct value counts (cardinality)") + print(" 3. Outlier detection (statistical methods)") + print(" 4. Duplicate detection") + print(" 5. Data type validation") + print() + print("Next steps:") + print(" - Try example 04: ETL pipeline basics") + print(" - Apply these techniques to your own datasets") + print(" - Explore advanced EDA with window functions") + print(" - Integrate with visualization libraries") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/03_interactive_dataframe_exploration_s3.py b/examples/spark/03_interactive_dataframe_exploration_s3.py new file mode 100644 index 000000000..344831b59 --- /dev/null +++ b/examples/spark/03_interactive_dataframe_exploration_s3.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 +""" +Title: Interactive DataFrame Exploration (with MinIO S3) +Level: 1 (Beginner) +Target Audience: Data Scientists doing exploratory data analysis +Time to Run: ~3-4 minutes + +Description: +This example demonstrates interactive data exploration using Spark with S3-compatible +storage (MinIO). The PySpark script is stored in MinIO and executed by Spark, +showing a realistic production pattern. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- MinIO deployed (run ./setup_minio.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- Using S3-compatible storage with Spark +- Submitting scripts from S3 +- DataFrame exploration and data quality checks +- Reading results from distributed jobs + +Real-World Use Case: +Exploratory Data Analysis (EDA) with scripts stored in object storage. 
+""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + +# Import MinIO configuration +try: + from minio_config import S3_PATHS, get_s3_spark_conf, print_minio_info +except ImportError: + print("ERROR: minio_config.py not found!") + print("Please ensure you're running from the examples/spark directory") + sys.exit(1) + + +def main(): + """Main example: Submit DataFrame exploration job from S3.""" + + print("=" * 80) + print("EXAMPLE 03: Interactive DataFrame Exploration (with MinIO S3)") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Storing PySpark scripts in S3 (MinIO)") + print(" 2. Submitting applications from S3 storage") + print(" 3. DataFrame exploration and data quality checks") + print(" 4. Retrieving results from distributed jobs") + print() + + # Show MinIO configuration + print_minio_info() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=True, # Enable Spark UI + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print(" Spark UI enabled") + print() + + # Step 2: Prepare the application + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"dataframe-exploration-{timestamp}" + + # Get S3 path for the exploration script + script_path = S3_PATHS["exploration_script"] + + print("Step 2: Configuring Spark application with S3 storage...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(f" Script location: {script_path}") + print(" Resources: 1 driver + 2 executors") + print() + + # Step 3: Submit the application + print("Step 3: Submitting application from S3...") + + try: + # Get S3-enabled Spark configuration + spark_conf = get_s3_spark_conf() + + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file=script_path, # S3 path! + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + num_executors=2, + # S3 configuration for MinIO + spark_conf=spark_conf, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print(" Script loaded from S3: Done") + print() + print(" 🌐 Spark UI Access (choose one):") + print(" Option 1 - Direct to driver pod:") + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print(" Option 2 - Via service (if created by operator):") + print(f" kubectl port-forward svc/{app_name}-ui-svc 4040:4040") + print(" Then open: http://localhost:4040") + print() + print(" šŸ’” Tip: Use Option 1 if service doesn't exist") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + print() + print("Troubleshooting:") + print(" 1. Ensure MinIO is running:") + print(" kubectl get pods -l app=minio") + print(" 2. 
Verify scripts are uploaded:") + print(" kubectl exec minio-client -- mc ls myminio/spark-scripts/") + print(" 3. Check if setup_minio.sh was run successfully") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this may take 2-3 minutes)...") + print(" Executing DataFrame exploration from S3 script...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) # noqa: E501 + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving exploration results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("EXPLORATION RESULTS (from S3 script)") + print("=" * 80) + + # Display important sections from the exploration script + important_keywords = [ + "INTERACTIVE DATAFRAME EXPLORATION", + "Dataset Summary", + "Schema:", + "Sample Data:", + "Descriptive Statistics:", + "Null Check:", + ] + + found_results = False + for line in logs: + if any(keyword in line for keyword in important_keywords): + print(line) + found_results = True + elif found_results and ("+" in line or "|" in line): + # Print table output + print(line) + + if not found_results: + print("Showing last 30 log lines:") + for line in logs[-30:]: + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" How to store PySpark scripts in S3/MinIO") + print(" How to configure Spark for S3 access") + print(" How to submit applications from object storage") + print(" DataFrame exploration techniques") + print(" Data quality assessment patterns") + print() + print("S3 Configuration Used:") + print(" - spark.hadoop.fs.s3a.endpoint - MinIO endpoint") + print(" - spark.hadoop.fs.s3a.access.key - Access credentials") + print(" - spark.hadoop.fs.s3a.path.style.access - MinIO compatibility") + print() + print("Production Tips:") + print(" - Store scripts in version-controlled S3 buckets") + print(" - Use IAM roles instead of access keys (in AWS)") + print(" - Enable S3 versioning for script history") + print(" - Use S3 lifecycle policies for log cleanup") + print() + print("Next steps:") + print(" - Try example 02 with S3: CSV data analysis from MinIO") + print(" - Upload your own scripts to MinIO") + print(" - Read/write data from S3 in your 
scripts") + print(" - Configure S3 bucket policies for production") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/04_etl_pipeline_simple.py b/examples/spark/04_etl_pipeline_simple.py new file mode 100644 index 000000000..ec3c4c938 --- /dev/null +++ b/examples/spark/04_etl_pipeline_simple.py @@ -0,0 +1,534 @@ +#!/usr/bin/env python3 +""" +Title: Simple ETL Pipeline +Level: 2 (Intermediate - Data Engineering Basics) +Target Audience: Data Engineers building data pipelines +Time to Run: ~3-4 minutes + +Description: +This example demonstrates a simple ETL (Extract-Transform-Load) pipeline pattern, +which is the foundation of data engineering. You'll learn how to extract data from +multiple sources, transform it through cleaning and enrichment, and prepare it for +analytics or loading to a target system. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- ETL pipeline structure and best practices +- Reading from multiple data sources +- Data transformation patterns (cleaning, enrichment, aggregation) +- Data validation and error handling +- Preparing data for downstream consumption + +Real-World Use Case: +Building data warehouses, data lakes, analytics pipelines, integration workflows. +""" + +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + + +def create_etl_script(): + """Create a PySpark script for ETL pipeline. + + Returns: + str: Python code for ETL pipeline + """ + return """ +from pyspark.sql import SparkSession +from pyspark.sql.functions import ( + col, upper, lower, trim, regexp_replace, when, lit, + current_timestamp, to_date, year, month, dayofmonth, + sum as _sum, avg, count, max as _max, min as _min, round as _round, + concat, coalesce, monotonically_increasing_id +) +from pyspark.sql.types import * +import sys + +# Create Spark session +spark = SparkSession.builder \\ + .appName("Simple ETL Pipeline") \\ + .getOrCreate() + +print("\\n" + "="*80) +print("ETL PIPELINE EXAMPLE") +print("="*80) +print("\\nPipeline: Customer Orders ETL") +print("Extract - Transform - Load") +print("="*80) + +# ============================================================================ +# PHASE 1: EXTRACT +# ============================================================================ +print("\\n[EXTRACT] Phase 1: Extracting data from source systems...") +print("-" * 80) + +# Source 1: Customer data (simulating CRM system) +print("\\n1.1 Extracting customer data from CRM...") +customers_schema = StructType([ + StructField("customer_id", IntegerType(), False), + StructField("first_name", StringType(), True), + StructField("last_name", StringType(), True), + StructField("email", StringType(), True), + StructField("city", StringType(), True), + StructField("signup_date", StringType(), True), +]) + +customers_raw = [ + (101, " alice ", "JOHNSON", "alice.j@email.com", "New York", "2023-01-15"), + (102, "Bob", "smith ", "BOB.S@EMAIL.COM", "Los Angeles", "2023-02-20"), + (103, "Carol", "White", "carol.w@email.com", "Chicago", "2023-03-10"), + (104, "David", "Brown", None, "Houston", "2023-04-05"), # Missing email + (105, "Eve", "Davis", "eve.d@email.com", None, "2023-05-12"), # Missing 
city +] + +customers_df = spark.createDataFrame(customers_raw, customers_schema) +print(f" Extracted {customers_df.count()} customer records") + +# Source 2: Orders data (simulating order management system) +print("\\n1.2 Extracting orders from Order Management System...") +orders_schema = StructType([ + StructField("order_id", IntegerType(), False), + StructField("customer_id", IntegerType(), False), + StructField("order_date", StringType(), True), + StructField("product_name", StringType(), True), + StructField("quantity", IntegerType(), True), + StructField("unit_price", DoubleType(), True), + StructField("status", StringType(), True), +]) + +orders_raw = [ + (1001, 101, "2023-06-01", "Laptop", 1, 1200.00, "completed"), + (1002, 101, "2023-06-15", "Mouse", 2, 25.00, "completed"), + (1003, 102, "2023-06-10", "Keyboard", 1, 75.00, "COMPLETED"), # Inconsistent case + (1004, 103, "2023-06-20", "Monitor", 2, 300.00, "shipped"), + (1005, 103, "2023-07-01", "Laptop", 1, 1200.00, "completed"), + (1006, 104, "2023-07-05", "Mouse", 5, 25.00, "pending"), + (1007, 999, "2023-07-10", "Desk", 1, 500.00, "completed"), # Invalid customer_id + (1008, 105, "2023-07-15", "Chair", 2, 250.00, "cancelled"), +] + +orders_df = spark.createDataFrame(orders_raw, orders_schema) +print(f" Extracted {orders_df.count()} order records") + +print("\\n[EXTRACT] Summary:") +print(f" - Customers: {customers_df.count()} records") +print(f" - Orders: {orders_df.count()} records") + +# ============================================================================ +# PHASE 2: TRANSFORM +# ============================================================================ +print("\\n\\n[TRANSFORM] Phase 2: Transforming and cleaning data...") +print("-" * 80) + +# Step 2.1: Clean customer data +print("\\n2.1 Cleaning customer data...") +customers_clean = customers_df \\ + .withColumn("first_name", trim(col("first_name"))) \\ + .withColumn("first_name", upper(col("first_name"))) \\ + .withColumn("last_name", trim(col("last_name"))) \\ + .withColumn("last_name", upper(col("last_name"))) \\ + .withColumn("email", lower(trim(col("email")))) \\ + .withColumn("signup_date", to_date(col("signup_date"), "yyyy-MM-dd")) \\ + .withColumn("full_name", concat(col("first_name"), lit(" "), col("last_name"))) + +# Handle missing values +customers_clean = customers_clean \\ + .withColumn("email", coalesce(col("email"), lit("unknown@example.com"))) \\ + .withColumn("city", coalesce(col("city"), lit("Unknown"))) + +print(" Cleaned customer names (trimmed, normalized case)") +print(" Normalized email addresses to lowercase") +print(" Filled missing emails and cities with defaults") +print(" Created full_name field") + +print("\\nCleaned customer sample:") +customers_clean.show(3, truncate=False) + +# Step 2.2: Clean and enrich orders data +print("\\n2.2 Cleaning and enriching orders data...") +orders_clean = orders_df \\ + .withColumn("status", lower(trim(col("status")))) \\ + .withColumn("order_date", to_date(col("order_date"), "yyyy-MM-dd")) \\ + .withColumn("order_total", col("quantity") * col("unit_price")) \\ + .withColumn("order_year", year(col("order_date"))) \\ + .withColumn("order_month", month(col("order_date"))) + +print(" Normalized status to lowercase") +print(" Calculated order totals") +print(" Extracted date components (year, month)") + +print("\\nCleaned orders sample:") +orders_clean.select( + "order_id", "customer_id", "order_date", "product_name", + "quantity", "unit_price", "order_total", "status" +).show(3, truncate=False) + +# Step 
2.3: Data validation and filtering +print("\\n2.3 Validating data quality...") + +# Find orders with invalid customer IDs +valid_customer_ids = customers_clean.select("customer_id").distinct() +invalid_orders = orders_clean.join( + valid_customer_ids, + on="customer_id", + how="left_anti" +) + +print(f" WARNING: Found {invalid_orders.count()} orders with invalid customer IDs") +if invalid_orders.count() > 0: + print(" Invalid orders:") + invalid_orders.select("order_id", "customer_id", "product_name").show() + +# Filter to valid orders only +orders_valid = orders_clean.join( + valid_customer_ids, + on="customer_id", + how="inner" +) + +print(f" Retained {orders_valid.count()} valid orders") + +# Step 2.4: Enrich orders with customer data +print("\\n2.4 Enriching orders with customer information...") +orders_enriched = orders_valid.join( + customers_clean.select("customer_id", "full_name", "email", "city"), + on="customer_id", + how="inner" +) + +print(" Joined orders with customer data") +print("\\nEnriched orders sample:") +orders_enriched.select( + "order_id", "full_name", "city", "product_name", + "order_total", "status" +).show(3, truncate=False) + +# Step 2.5: Create aggregated analytics tables +print("\\n2.5 Creating aggregated analytics...") + +# Customer summary +customer_summary = orders_enriched.groupBy("customer_id", "full_name", "city", "email").agg( + count("order_id").alias("total_orders"), + _sum("order_total").alias("total_spent"), + _round(avg("order_total"), 2).alias("avg_order_value"), + _max("order_date").alias("last_order_date"), + _min("order_date").alias("first_order_date"), +).orderBy(col("total_spent").desc()) + +print(" Created customer summary table") + +# Product summary +product_summary = orders_enriched.groupBy("product_name").agg( + count("order_id").alias("total_orders"), + _sum("quantity").alias("total_quantity_sold"), + _round(_sum("order_total"), 2).alias("total_revenue"), + _round(avg("unit_price"), 2).alias("avg_price"), +).orderBy(col("total_revenue").desc()) + +print(" Created product summary table") + +# Monthly summary +monthly_summary = orders_enriched.groupBy("order_year", "order_month").agg( + count("order_id").alias("total_orders"), + countDistinct("customer_id").alias("unique_customers"), + _round(_sum("order_total"), 2).alias("total_revenue"), + _round(avg("order_total"), 2).alias("avg_order_value"), +).orderBy("order_year", "order_month") + +print(" Created monthly summary table") + +print("\\n[TRANSFORM] Summary:") +print(" - Cleaned and normalized all fields") +print(" - Validated data quality") +print(" - Enriched orders with customer data") +print(" - Created 3 analytics tables") + +# ============================================================================ +# PHASE 3: LOAD +# ============================================================================ +print("\\n\\n[LOAD] Phase 3: Preparing data for loading...") +print("-" * 80) + +# Add metadata columns +print("\\n3.1 Adding metadata columns...") +load_timestamp = current_timestamp() + +customer_summary_final = customer_summary.withColumn("etl_load_timestamp", load_timestamp) +product_summary_final = product_summary.withColumn("etl_load_timestamp", load_timestamp) +monthly_summary_final = monthly_summary.withColumn("etl_load_timestamp", load_timestamp) +orders_final = orders_enriched.withColumn("etl_load_timestamp", load_timestamp) + +print(" Added ETL timestamp to all tables") + +# Display final results +print("\\n3.2 Final output tables ready for loading:") + +print("\\n[TABLE 1] 
Customer Summary (top customers by spend):") +customer_summary_final.show(5, truncate=False) + +print("\\n[TABLE 2] Product Summary (top products by revenue):") +product_summary_final.show(5, truncate=False) + +print("\\n[TABLE 3] Monthly Summary:") +monthly_summary_final.show(truncate=False) + +# In production, you would write to target systems: +# customer_summary_final.write.mode("overwrite").parquet("s3://bucket/customer_summary/") +# product_summary_final.write.mode("overwrite").parquet("s3://bucket/product_summary/") +# monthly_summary_final.write.mode("overwrite").parquet("s3://bucket/monthly_summary/") + +print("\\n[LOAD] Summary:") +print(" - customer_summary: Ready for data warehouse") +print(" - product_summary: Ready for analytics") +print(" - monthly_summary: Ready for reporting") +print(" - orders_enriched: Ready for data lake") + +# ============================================================================ +# PIPELINE SUMMARY +# ============================================================================ +print("\\n\\n" + "="*80) +print("ETL PIPELINE COMPLETED SUCCESSFULLY!") +print("="*80) + +print("\\nšŸ“Š Pipeline Statistics:") +print(f" Input Records:") +print(f" - Customers: {customers_df.count()}") +print(f" - Orders: {orders_df.count()}") +print(f" ") +print(f" Processing:") +print(f" - Invalid orders filtered: {invalid_orders.count()}") +print(f" - Valid orders: {orders_valid.count()}") +print(f" ") +print(f" Output Records:") +print(f" - Customer summary: {customer_summary_final.count()}") +print(f" - Product summary: {product_summary_final.count()}") +print(f" - Monthly summary: {monthly_summary_final.count()}") +print(f" - Enriched orders: {orders_final.count()}") + +print("\\nšŸ’” Key Transformations Applied:") +print(" Data cleaning (trim, case normalization)") +print(" Missing value handling") +print(" Data validation (referential integrity)") +print(" Data enrichment (joins)") +print(" Aggregations (customer, product, time-based)") +print(" Metadata addition (timestamps)") + +print("\\nšŸŽÆ Business Insights:") +top_customer = customer_summary.first() +top_product = product_summary.first() +print(f" - Top Customer: {top_customer['full_name']} (${top_customer['total_spent']:.2f})") +print(f" - Top Product: {top_product['product_name']} (${top_product['total_revenue']:.2f})") +print(f" - Total Revenue: ${orders_enriched.agg(_sum('order_total')).collect()[0][0]:.2f}") + +spark.stop() +""" + + +def main(): + """Main example: Submit ETL pipeline job and get results.""" + + print("=" * 80) + print("EXAMPLE 04: Simple ETL Pipeline") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. ETL pipeline structure (Extract-Transform-Load)") + print(" 2. Extracting from multiple data sources") + print(" 3. Data cleaning and normalization") + print(" 4. Data validation and quality checks") + print(" 5. Data enrichment through joins") + print(" 6. 
Creating aggregated analytics tables") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=False, + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Prepare the application + app_name = "etl-pipeline-simple" + + print("Step 2: Configuring ETL pipeline...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Pipeline: Customer Orders ETL") + print() + + # Step 3: Submit the application + print("Step 3: Submitting ETL pipeline...") + + try: + response = client.submit_application( + # Application metadata + app_name=app_name, + # Placeholder + main_application_file=("local:///opt/spark/examples/src/main/python/pi.py"), + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation (medium size for ETL) + driver_cores=1, + driver_memory="1g", # More memory for ETL + executor_cores=1, + executor_memory="1g", + num_executors=2, + # Required for Spark 4.0 + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(" ETL pipeline submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring ETL pipeline (this may take 2-3 minutes)...") + print(" Pipeline stages: Extract - Transform - Load") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" ETL pipeline completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print(f" WARNING: Pipeline did not complete successfully: {final_status.state.value}") + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Pipeline did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring pipeline: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving ETL results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("ETL PIPELINE RESULTS") + print("=" * 80) + + # Display important sections + important_keywords = [ + "ETL PIPELINE", + "[EXTRACT]", + "[TRANSFORM]", + "[LOAD]", + "Customer Summary", + "Product Summary", + "Monthly Summary", + "Pipeline Statistics", + "Business Insights", + ] + + for line in logs: + if ( + any(keyword in line for keyword in important_keywords) + or "Done" in line + or "WARNING" in line + or "šŸ“Š" in line + or "šŸ’”" in line + or "šŸŽÆ" in line + ): + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The pipeline may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + 
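+        # delete_job removes the SparkApplication resource created for this run
+        # (the manual fallback is the kubectl command printed in the warning below).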
client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" ETL pipeline structure and phases") + print(" Extracting from multiple sources") + print(" Data cleaning and normalization techniques") + print(" Data validation (referential integrity)") + print(" Data enrichment through joins") + print(" Creating aggregated analytics tables") + print(" Adding metadata for audit trails") + print() + print("ETL Best Practices Demonstrated:") + print(" 1. Separate Extract-Transform-Load phases") + print(" 2. Data quality validation at each step") + print(" 3. Handle missing/invalid data gracefully") + print(" 4. Add metadata (timestamps, lineage)") + print(" 5. Create reusable, modular transformations") + print(" 6. Generate summary statistics") + print() + print("Production Considerations:") + print(" - Read from S3/HDFS instead of in-memory data") + print(" - Write outputs to data warehouse (Redshift, BigQuery)") + print(" - Add error handling and retry logic") + print(" - Implement incremental processing") + print(" - Add data quality assertions") + print(" - Monitor pipeline metrics") + print() + print("Next steps:") + print(" - Try example 05: Scheduled batch processing") + print(" - Implement incremental ETL (delta processing)") + print(" - Add data quality framework (Great Expectations)") + print(" - Orchestrate with Airflow/Argo Workflows") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/04_etl_pipeline_simple_s3.py b/examples/spark/04_etl_pipeline_simple_s3.py new file mode 100644 index 000000000..a9d409241 --- /dev/null +++ b/examples/spark/04_etl_pipeline_simple_s3.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +""" +Title: Simple ETL Pipeline (using MinIO S3) +Level: 2 (Intermediate - Data Engineering Basics) +Target Audience: Data Engineers building data pipelines +Time to Run: ~3-4 minutes + +Description: +This example demonstrates a simple ETL (Extract-Transform-Load) pipeline pattern with +scripts stored in S3-compatible storage (MinIO). You'll learn how to build production-ready +pipelines with scripts in version-controlled object storage. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- MinIO deployed (run ./setup_minio.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- ETL pipeline structure and best practices +- Loading ETL scripts from S3/MinIO +- Data transformation patterns (cleaning, enrichment) +- Production pattern with versioned scripts in object storage + +Real-World Use Case: +Building data warehouses, data lakes, analytics pipelines with scripts managed in S3. 
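+
+Illustrative layout (assumptions for orientation only; the real values come from
+minio_config.py):
+    S3_PATHS["etl_script"]  ->  e.g. s3a://spark-scripts/<your etl script>.py
+    get_s3_spark_conf()     ->  endpoint/credential settings so Spark can fetch it from MinIO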
+""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + +# Import MinIO configuration +try: + from minio_config import S3_PATHS, get_s3_spark_conf, print_minio_info +except ImportError: + print("ERROR: minio_config.py not found!") + print("Please ensure you're running from the examples/spark directory") + sys.exit(1) + + +def main(): + """Main example: Submit ETL pipeline job from S3.""" + + print("=" * 80) + print("EXAMPLE 04: Simple ETL Pipeline (with MinIO S3)") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. ETL pipeline structure (Extract-Transform-Load)") + print(" 2. Storing pipeline scripts in S3 (MinIO)") + print(" 3. Data cleaning and normalization") + print(" 4. Production pattern with versioned scripts") + print() + + # Show MinIO configuration + print_minio_info() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=True, # Enable Spark UI + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print(" Spark UI enabled") + print() + + # Step 2: Prepare the application + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"etl-pipeline-{timestamp}" + + # Get S3 path for the ETL script + script_path = S3_PATHS["etl_script"] + + print("Step 2: Configuring ETL pipeline with S3 storage...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(f" Script location: {script_path}") + print(" Resources: 1 driver + 2 executors") + print() + + # Step 3: Submit the application + print("Step 3: Submitting ETL pipeline from S3...") + + try: + # Get S3-enabled Spark configuration + spark_conf = get_s3_spark_conf() + + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file=script_path, # S3 path! + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation (medium for ETL) + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + num_executors=2, + # S3 configuration for MinIO + spark_conf=spark_conf, + ) + + print(" ETL pipeline submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print(" Script loaded from S3: Done") + print() + print(" 🌐 Spark UI Access (choose one):") + print(" Option 1 - Direct to driver pod:") + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print(" Option 2 - Via service (if created by operator):") + print(f" kubectl port-forward svc/{app_name}-ui-svc 4040:4040") + print(" Then open: http://localhost:4040") + print() + print(" šŸ’” Tip: Use Option 1 if service doesn't exist") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + print() + print("Troubleshooting:") + print(" 1. Ensure MinIO is running:") + print(" kubectl get pods -l app=minio") + print(" 2. 
Verify scripts are uploaded:") + print(" kubectl exec minio-client -- mc ls myminio/spark-scripts/") + print(" 3. Check if setup_minio.sh was run successfully") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring ETL pipeline (this may take 2-3 minutes)...") + print(" Pipeline stages: Extract - Transform - Load") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" ETL pipeline completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print(f" WARNING: Pipeline did not complete successfully: {final_status.state.value}") + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Pipeline did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring pipeline: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving ETL results from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("ETL PIPELINE RESULTS (from S3 script)") + print("=" * 80) + + # Display important sections from the ETL script + important_keywords = [ + "ETL PIPELINE", + "[EXTRACT]", + "[TRANSFORM]", + "[LOAD]", + "Extracted", + "records", + "Results:", + ] + + found_results = False + for line in logs: + if any(keyword in line for keyword in important_keywords): + print(line) + found_results = True + elif found_results and ("+" in line or "|" in line): + # Print table output + print(line) + + if not found_results: + print("Showing last 30 log lines:") + for line in logs[-30:]: + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The pipeline may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" ETL pipeline structure and phases") + print(" Storing pipeline scripts in S3/MinIO") + print(" Data cleaning and transformation techniques") + print(" Production pattern with versioned scripts") + print() + print("ETL Best Practices Demonstrated:") + print(" - Separate Extract-Transform-Load phases") + print(" - Store scripts in version-controlled S3") + print(" - Use object storage for pipeline artifacts") + print(" - Enable script versioning for rollback") + print() + print("Production Tips:") + print(" - Implement CI/CD for ETL script deployment") + print(" - Use S3 versioning for script history") + print(" - Read data from S3 buckets (not just scripts)") + print(" - Write outputs to partitioned S3 locations") + print(" - Add error handling and retry logic") + print(" - Monitor pipeline metrics") + print() + print("Next steps:") + print(" - Try example 05: Scheduled batch processing (S3)") + print(" - Read/write data from/to S3 in your ETL") + print(" - Implement incremental ETL (delta processing)") + 
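+    # A minimal sketch of the "incremental ETL (delta processing)" idea mentioned just above.
+    # Bucket names and the watermark value are hypothetical; in a real pipeline this logic
+    # lives in the ETL script itself and the watermark is persisted (e.g. in a control table):
+    #   last_processed = "2023-07-01"
+    #   new_orders = spark.read.parquet("s3a://spark-data/raw/orders/")
+    #   new_orders = new_orders.filter(col("order_date") > last_processed)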
print(" - Orchestrate with Airflow/Argo Workflows") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/05_scheduled_batch_job.py b/examples/spark/05_scheduled_batch_job.py new file mode 100644 index 000000000..10b0fb396 --- /dev/null +++ b/examples/spark/05_scheduled_batch_job.py @@ -0,0 +1,536 @@ +#!/usr/bin/env python3 +""" +Title: Scheduled Batch Job with Resilience +Level: 2 (Intermediate - Batch Processing) +Target Audience: Data Engineers building production batch pipelines +Time to Run: ~3-4 minutes + +Description: +This example demonstrates production-ready batch processing patterns including +idempotent processing, incremental updates, restart policies, and resilience +features. You'll learn how to build reliable batch jobs that can handle failures +and process data incrementally. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- Batch processing patterns (full vs incremental) +- Idempotent job design +- Restart policies and failure handling +- Time-based partitioning +- Checkpoint and recovery patterns +- Production batch job best practices + +Real-World Use Case: +Daily data warehouse refresh, nightly ETL jobs, periodic reporting, data synchronization. +""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + RestartPolicy, + RestartPolicyType, + BatchSparkClient, +) + + +def create_batch_job_script(): + """Create a PySpark script for scheduled batch processing. 
+ + Returns: + str: Python code for batch job + """ + return """ +from pyspark.sql import SparkSession +from pyspark.sql.functions import ( + col, lit, current_timestamp, to_date, date_format, + year, month, dayofmonth, sum as _sum, count, + max as _max, min as _min, when, coalesce +) +from pyspark.sql.types import * +from datetime import datetime, timedelta +import sys + +# Create Spark session +spark = SparkSession.builder \\ + .appName("Scheduled Batch Job") \\ + .getOrCreate() + +print("\\n" + "="*80) +print("SCHEDULED BATCH JOB - DAILY TRANSACTION PROCESSING") +print("="*80) + +# ============================================================================ +# CONFIGURATION +# ============================================================================ +print("\\n[CONFIG] Batch Job Configuration...") + +# In production, these would come from job parameters +BATCH_DATE = datetime.now().strftime("%Y-%m-%d") +LOOKBACK_DAYS = 7 # Process last 7 days for incremental +JOB_ID = f"batch_{BATCH_DATE.replace('-', '')}" + +print(f" - Batch Date: {BATCH_DATE}") +print(f" - Job ID: {JOB_ID}") +print(f" - Lookback Days: {LOOKBACK_DAYS}") +print(f" - Mode: Incremental") + +# ============================================================================ +# STEP 1: EXTRACT - Read Source Data +# ============================================================================ +print("\\n[STEP 1] Extracting source data...") +print("-" * 80) + +# Simulate transactional data source +transactions_schema = StructType([ + StructField("transaction_id", IntegerType(), False), + StructField("transaction_date", StringType(), False), + StructField("customer_id", IntegerType(), False), + StructField("product_id", IntegerType(), False), + StructField("amount", DoubleType(), False), + StructField("status", StringType(), False), +]) + +# Generate sample transactions for last 7 days +base_date = datetime.now() +transactions_data = [] +tx_id = 1 + +for day_offset in range(7): + tx_date = (base_date - timedelta(days=day_offset)).strftime("%Y-%m-%d") + # Generate 3-5 transactions per day + num_txs = 3 + (day_offset % 3) + for i in range(num_txs): + customer_id = 100 + (tx_id % 5) + product_id = 200 + (tx_id % 10) + amount = round(50.0 + (tx_id % 20) * 25.5, 2) + status = "completed" if tx_id % 10 != 0 else "pending" + transactions_data.append((tx_id, tx_date, customer_id, product_id, amount, status)) + tx_id += 1 + +transactions_df = spark.createDataFrame(transactions_data, transactions_schema) + +print(f" Loaded {transactions_df.count()} total transactions") + +# Show date range +date_range = transactions_df.agg( + _min("transaction_date").alias("min_date"), + _max("transaction_date").alias("max_date") +).collect()[0] + +print(f" - Date range: {date_range['min_date']} to {date_range['max_date']}") + +# ============================================================================ +# STEP 2: INCREMENTAL PROCESSING +# ============================================================================ +print("\\n[STEP 2] Applying incremental processing logic...") +print("-" * 80) + +# Calculate cutoff date for incremental processing +cutoff_date = (datetime.now() - timedelta(days=LOOKBACK_DAYS)).strftime("%Y-%m-%d") + +print(f" - Processing transactions >= {cutoff_date}") + +# Filter for incremental window (idempotent - same date range produces same result) +incremental_df = transactions_df.filter(col("transaction_date") >= lit(cutoff_date)) + +print(f" Filtered to {incremental_df.count()} transactions in incremental window") + +# 
============================================================================ +# STEP 3: TRANSFORM - Business Logic +# ============================================================================ +print("\\n[STEP 3] Applying business transformations...") +print("-" * 80) + +# Add computed columns +enriched_df = incremental_df \\ + .withColumn("processing_date", lit(BATCH_DATE)) \\ + .withColumn("processing_timestamp", current_timestamp()) \\ + .withColumn("job_id", lit(JOB_ID)) \\ + .withColumn("year", year(col("transaction_date"))) \\ + .withColumn("month", month(col("transaction_date"))) \\ + .withColumn("day", dayofmonth(col("transaction_date"))) + +# Apply business rules +enriched_df = enriched_df \\ + .withColumn("is_high_value", when(col("amount") > 500, lit(True)).otherwise(lit(False))) \\ + .withColumn("is_completed", when(col("status") == "completed", lit(True)).otherwise(lit(False))) + +print(" Added metadata columns (processing_date, job_id)") +print(" Added date partitions (year, month, day)") +print(" Applied business rules (high_value, completion flags)") + +# ============================================================================ +# STEP 4: AGGREGATIONS - Daily Summary +# ============================================================================ +print("\\n[STEP 4] Creating daily aggregations...") +print("-" * 80) + +daily_summary = enriched_df.groupBy("transaction_date", "year", "month", "day").agg( + count("transaction_id").alias("transaction_count"), + count(when(col("is_completed"), True)).alias("completed_count"), + count(when(~col("is_completed"), True)).alias("pending_count"), + count(when(col("is_high_value"), True)).alias("high_value_count"), + _sum("amount").alias("total_amount"), + _max("amount").alias("max_amount"), + _min("amount").alias("min_amount"), +).withColumn("processing_date", lit(BATCH_DATE)) \\ + .withColumn("job_id", lit(JOB_ID)) \\ + .orderBy("transaction_date") + +print(" Created daily summary table") +print("\\nDaily Summary:") +daily_summary.show(truncate=False) + +# ============================================================================ +# STEP 5: CUSTOMER AGGREGATIONS +# ============================================================================ +print("\\n[STEP 5] Creating customer aggregations...") +print("-" * 80) + +customer_summary = enriched_df.filter(col("is_completed")).groupBy("customer_id").agg( + count("transaction_id").alias("transaction_count"), + _sum("amount").alias("total_spent"), + _max("amount").alias("max_transaction"), + _min("amount").alias("min_transaction"), + count(when(col("is_high_value"), True)).alias("high_value_transactions"), +).withColumn("processing_date", lit(BATCH_DATE)) \\ + .withColumn("job_id", lit(JOB_ID)) \\ + .orderBy(col("total_spent").desc()) + +print(" Created customer summary table") +print("\\nCustomer Summary (Top 5):") +customer_summary.show(5, truncate=False) + +# ============================================================================ +# STEP 6: DATA QUALITY CHECKS +# ============================================================================ +print("\\n[STEP 6] Running data quality checks...") +print("-" * 80) + +# Check 1: No null values in critical columns +null_check = enriched_df.filter( + col("transaction_id").isNull() | + col("customer_id").isNull() | + col("amount").isNull() +).count() + +print(f" - Null check: {null_check} records with nulls (expecting 0)") + +# Check 2: All amounts are positive +negative_amount_check = enriched_df.filter(col("amount") < 0).count() 
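+# (The sample generator above only produces positive amounts, so this check should report 0.)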
+print(f" - Negative amount check: {negative_amount_check} records (expecting 0)") + +# Check 3: Valid date range +out_of_range = enriched_df.filter( + (col("transaction_date") < cutoff_date) | + (col("transaction_date") > BATCH_DATE) +).count() +print(f" - Date range check: {out_of_range} out of range (expecting 0)") + +# Overall quality score +quality_passed = (null_check == 0) and (negative_amount_check == 0) and (out_of_range == 0) +quality_status = 'ALL QUALITY CHECKS PASSED' if quality_passed else 'WARNING: QUALITY ISSUES DETECTED' +print(f"\\n {quality_status}") + +# ============================================================================ +# STEP 7: SIMULATE WRITE TO PARTITIONED STORAGE +# ============================================================================ +print("\\n[STEP 7] Preparing output for partitioned storage...") +print("-" * 80) + +# In production, you would write: +# enriched_df.write \\ +# .mode("overwrite") \\ +# .partitionBy("year", "month", "day") \\ +# .parquet("s3://bucket/transactions/") + +print(" - Output format: Parquet") +print(" - Partitioning: year/month/day") +print(" - Write mode: Overwrite (idempotent)") +print("\\n Partitions that would be written:") + +partitions = enriched_df.select("year", "month", "day").distinct().collect() +for partition in partitions: + print(f" - year={partition['year']}/month={partition['month']}/day={partition['day']}") + +# ============================================================================ +# STEP 8: JOB SUMMARY +# ============================================================================ +print("\\n\\n" + "="*80) +print("BATCH JOB COMPLETED SUCCESSFULLY!") +print("="*80) + +print(f"\\nšŸ“Š Job Statistics:") +print(f" Job ID: {JOB_ID}") +print(f" Batch Date: {BATCH_DATE}") +print(f" Processing Window: {cutoff_date} to {BATCH_DATE}") +print(f" ") +print(f" Records Processed:") +print(f" - Total transactions: {enriched_df.count()}") +print(f" - Completed: {enriched_df.filter(col('is_completed')).count()}") +print(f" - Pending: {enriched_df.filter(~col('is_completed')).count()}") +print(f" - High value: {enriched_df.filter(col('is_high_value')).count()}") +print(f" ") +print(f" Outputs Generated:") +print(f" - Daily summaries: {daily_summary.count()} days") +print(f" - Customer summaries: {customer_summary.count()} customers") +print(f" - Partitions: {len(partitions)}") +print(f" ") +print(f" Data Quality:") +print(f" - Quality checks: {'PASSED' if quality_passed else 'FAILED'}") + +print("\\nšŸ’” Batch Processing Features Demonstrated:") +print(" Incremental processing (configurable lookback)") +print(" Idempotent design (same input - same output)") +print(" Date partitioning for efficient queries") +print(" Job metadata for audit trail") +print(" Data quality validation") +print(" Business rule application") + +print("\\nšŸ”„ Production Considerations:") +print(" - Schedule with Airflow/Argo for automated runs") +print(" - Add checkpoint/recovery for large datasets") +print(" - Implement retry logic with exponential backoff") +print(" - Monitor job metrics and SLAs") +print(" - Use restart policies for fault tolerance") + +spark.stop() +""" + + +def main(): + """Main example: Submit scheduled batch job with resilience features.""" + + print("=" * 80) + print("EXAMPLE 05: Scheduled Batch Job with Resilience") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Production batch job patterns") + print(" 2. Incremental processing (configurable lookback)") + print(" 3. 
Idempotent job design") + print(" 4. Restart policies for fault tolerance") + print(" 5. Date partitioning") + print(" 6. Data quality validation") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=False, + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Prepare the application with resilience + app_name = "batch-job-scheduled" + batch_date = datetime.now().strftime("%Y-%m-%d") + + print("Step 2: Configuring batch job with resilience...") + print(f" App name: {app_name}") + print(f" Batch date: {batch_date}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Restart policy: OnFailure (retry up to 3 times)") + print() + + # Step 3: Submit the application with restart policy + print("Step 3: Submitting batch job with fault tolerance...") + + try: + # Configure restart policy for production resilience + restart_policy = RestartPolicy( + type=RestartPolicyType.ON_FAILURE, + on_failure_retries=3, # Retry up to 3 times on failure + on_failure_retry_interval=30, # Wait 30 seconds between retries + on_submission_failure_retries=2, # Retry submission failures + on_submission_failure_retry_interval=15, # Wait 15 seconds + ) + + response = client.submit_application( + # Application metadata + app_name=app_name, + # Placeholder + main_application_file=("local:///opt/spark/examples/src/main/python/pi.py"), + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation for batch processing + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + num_executors=2, + # Resilience configuration + restart_policy=restart_policy, + time_to_live_seconds=3600, # Auto-cleanup after 1 hour + # Batch job metadata + labels={ + "job_type": "batch", + "schedule": "daily", + "batch_date": batch_date.replace("-", ""), + }, + # Required for Spark 4.0 + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(" Batch job submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + print(" Resilience features enabled:") + print(f" - Retry on failure: {restart_policy.on_failure_retries} attempts") + print(f" - Retry interval: {restart_policy.on_failure_retry_interval}s") + print(" - Auto-cleanup: After 1 hour") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring batch job (this may take 2-3 minutes)...") + print(" Processing incremental data window...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Batch job completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print(f" WARNING: Job did not complete successfully: {final_status.state.value}") + print(" Restart policy would trigger automatic retry") + print(" Check logs below for details.") + + except TimeoutError: + print(" 
ERROR: Job did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring job: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving batch job results...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("BATCH JOB RESULTS") + print("=" * 80) + + # Display important sections + important_keywords = [ + "SCHEDULED BATCH JOB", + "[CONFIG]", + "[STEP", + "Daily Summary", + "Customer Summary", + "quality checks", + "Job Statistics", + "BATCH JOB COMPLETED", + ] + + for line in logs: + if ( + any(keyword in line for keyword in important_keywords) + or "Done" in line + or "WARNING" in line + or "šŸ“Š" in line + or "šŸ’”" in line + or "šŸ”„" in line + ): + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" Production batch job patterns") + print(" Incremental vs full processing") + print(" Idempotent job design") + print(" Restart policies for resilience") + print(" Date-based partitioning") + print(" Data quality validation") + print(" Job metadata and audit trails") + print() + print("Resilience Features:") + print(" - RestartPolicy: Automatic retry on failures") + print(" - TimeToLiveSeconds: Auto-cleanup completed jobs") + print(" - Labels: Metadata for tracking and monitoring") + print(" - Quality Checks: Fail fast on data issues") + print() + print("Batch Processing Best Practices:") + print(" 1. Design jobs to be idempotent (rerunnable)") + print(" 2. Use incremental processing for efficiency") + print(" 3. Partition data by date for query performance") + print(" 4. Add quality checks at each stage") + print(" 5. Include metadata (job_id, timestamps)") + print(" 6. Configure retry policies for resilience") + print(" 7. 
Set TTL for automatic cleanup") + print() + print("Scheduling Options:") + print(" - Kubernetes CronJob") + print(" - Apache Airflow") + print(" - Argo Workflows") + print(" - Custom scheduler with SparkClient SDK") + print() + print("Next steps:") + print(" - Try example 06: Dynamic allocation and auto-scaling") + print(" - Schedule this job with Airflow/Argo") + print(" - Implement checkpoint/recovery for large jobs") + print(" - Add alerting for job failures") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/05_scheduled_batch_job_s3.py b/examples/spark/05_scheduled_batch_job_s3.py new file mode 100644 index 000000000..50b5b27f9 --- /dev/null +++ b/examples/spark/05_scheduled_batch_job_s3.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +""" +Title: Scheduled Batch Job with Resilience (using MinIO S3) +Level: 2 (Intermediate - Batch Processing) +Target Audience: Data Engineers building production batch pipelines +Time to Run: ~3-4 minutes + +Description: +This example demonstrates production-ready batch processing patterns with scripts +stored in S3-compatible storage (MinIO). You'll learn how to build reliable batch +jobs with versioned scripts in object storage, restart policies, and resilience features. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- MinIO deployed (run ./setup_minio.sh) +- Default namespace with 'spark-operator-spark' service account + +What You'll Learn: +- Batch processing with scripts in S3/MinIO +- Restart policies and failure handling +- Production pattern with versioned batch scripts +- Job metadata and audit trails + +Real-World Use Case: +Daily data warehouse refresh, nightly ETL jobs with scripts managed in S3. +""" + +from datetime import datetime +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + RestartPolicy, + RestartPolicyType, + BatchSparkClient, +) + +# Import MinIO configuration +try: + from minio_config import S3_PATHS, get_s3_spark_conf, print_minio_info +except ImportError: + print("ERROR: minio_config.py not found!") + print("Please ensure you're running from the examples/spark directory") + sys.exit(1) + + +def main(): + """Main example: Submit scheduled batch job from S3 with resilience.""" + + print("=" * 80) + print("EXAMPLE 05: Scheduled Batch Job with Resilience (MinIO S3)") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Production batch job patterns") + print(" 2. Storing batch scripts in S3 (MinIO)") + print(" 3. Restart policies for fault tolerance") + print(" 4. 
Job metadata and audit trails") + print() + + # Show MinIO configuration + print_minio_info() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=True, # Enable Spark UI + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print(" Spark UI enabled") + print() + + # Step 2: Prepare the application with resilience + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"batch-job-{timestamp}" + batch_date = datetime.now().strftime("%Y-%m-%d") + + # Get S3 path for the batch job script + script_path = S3_PATHS["batch_job_script"] + + print("Step 2: Configuring batch job with resilience...") + print(f" App name: {app_name}") + print(f" Batch date: {batch_date}") + print(f" Script location: {script_path}") + print(" Spark version: 4.0.0") + print(" Resources: 1 driver + 2 executors") + print(" Restart policy: OnFailure (retry up to 3 times)") + print() + + # Step 3: Submit the application with restart policy + print("Step 3: Submitting batch job with fault tolerance...") + + try: + # Configure restart policy for production resilience + restart_policy = RestartPolicy( + type=RestartPolicyType.ON_FAILURE, + on_failure_retries=3, # Retry up to 3 times on failure + on_failure_retry_interval=30, # Wait 30 seconds between retries + on_submission_failure_retries=2, # Retry submission failures + on_submission_failure_retry_interval=15, # Wait 15 seconds + ) + + # Get S3-enabled Spark configuration + spark_conf = get_s3_spark_conf() + + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file=script_path, # S3 path! + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation for batch processing + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + num_executors=2, + # Resilience configuration + restart_policy=restart_policy, + time_to_live_seconds=3600, # Auto-cleanup after 1 hour + # Batch job metadata + labels={ + "job_type": "batch", + "schedule": "daily", + "batch_date": batch_date.replace("-", ""), + }, + # S3 configuration for MinIO + spark_conf=spark_conf, + ) + + print(" Batch job submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print(" Script loaded from S3: Done") + print() + print(" 🌐 Spark UI Access (choose one):") + print(" Option 1 - Direct to driver pod:") + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print(" Option 2 - Via service (if created by operator):") + print(f" kubectl port-forward svc/{app_name}-ui-svc 4040:4040") + print(" Then open: http://localhost:4040") + print() + print(" šŸ’” Tip: Use Option 1 if service doesn't exist") + print() + print(" Resilience features enabled:") + print(f" - Retry on failure: {restart_policy.on_failure_retries} attempts") + print(f" - Retry interval: {restart_policy.on_failure_retry_interval}s") + print(" - Auto-cleanup: After 1 hour") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + print() + print("Troubleshooting:") + print(" 1. Ensure MinIO is running:") + print(" kubectl get pods -l app=minio") + print(" 2. 
Verify scripts are uploaded:") + print(" kubectl exec minio-client -- mc ls myminio/spark-scripts/") + print(" 3. Check if setup_minio.sh was run successfully") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring batch job (this may take 2-3 minutes)...") + print(" Processing batch data from S3 script...") + + try: + # Wait for completion with timeout + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=300, # 5 minutes max + polling_interval=5, # Check every 5 seconds + ) + + print(" Batch job completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print(f" WARNING: Job did not complete successfully: {final_status.state.value}") + print(" Restart policy would trigger automatic retry") + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Job did not complete within 5 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring job: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving batch job results...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("BATCH JOB RESULTS (from S3 script)") + print("=" * 80) + + # Display important sections from the batch job script + important_keywords = [ + "SCHEDULED BATCH JOB", + "[CONFIG]", + "[EXTRACT]", + "[TRANSFORM]", + "[LOAD]", + "[COMPLETE]", + "Batch Configuration", + "Customer Summary", + ] + + found_results = False + for line in logs: + if any(keyword in line for keyword in important_keywords): + print(line) + found_results = True + elif found_results and ("+" in line or "|" in line): + # Print table output + print(line) + + if not found_results: + print("Showing last 30 log lines:") + for line in logs[-30:]: + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + except Exception as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" Production batch job patterns") + print(" Storing batch scripts in S3/MinIO") + print(" Restart policies for resilience") + print(" Job metadata and audit trails") + print(" Production pattern with versioned scripts") + print() + print("Resilience Features:") + print(" - RestartPolicy: Automatic retry on failures") + print(" - TimeToLiveSeconds: Auto-cleanup completed jobs") + print(" - Labels: Metadata for tracking and monitoring") + print(" - S3 Versioning: Script rollback capability") + print() + print("Batch Processing Best Practices:") + print(" 1. Store scripts in version-controlled S3") + print(" 2. Design jobs to be idempotent (rerunnable)") + print(" 3. Configure retry policies for resilience") + print(" 4. Set TTL for automatic cleanup") + print(" 5. Use labels for job tracking") + print(" 6. 
Enable S3 versioning for rollback") + print() + print("Scheduling Options:") + print(" - Kubernetes CronJob") + print(" - Apache Airflow") + print(" - Argo Workflows") + print(" - Custom scheduler with SparkClient SDK") + print() + print("Production Tips:") + print(" - Implement CI/CD for batch script deployment") + print(" - Use S3 versioning for script history") + print(" - Monitor job metrics and SLAs") + print(" - Add alerting for job failures") + print(" - Store job metadata in data catalog") + print() + print("Next steps:") + print(" - Try example 06: Dynamic allocation and auto-scaling") + print(" - Schedule this job with Airflow/Argo") + print(" - Implement checkpoint/recovery for large jobs") + print(" - Read/write data from/to S3 buckets") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/06_autoscaling_dynamic_allocation.py b/examples/spark/06_autoscaling_dynamic_allocation.py new file mode 100644 index 000000000..eddc3905a --- /dev/null +++ b/examples/spark/06_autoscaling_dynamic_allocation.py @@ -0,0 +1,520 @@ +#!/usr/bin/env python3 +""" +Title: Dynamic Allocation and Auto-scaling +Level: 2 (Intermediate - Auto-scaling) +Target Audience: Data Engineers optimizing resource usage +Time to Run: ~4-5 minutes + +Description: +This example demonstrates Spark's dynamic allocation feature, which automatically +scales executors up and down based on workload. You'll learn when to use dynamic +allocation, how to configure it, and how it improves resource efficiency in +multi-tenant Kubernetes clusters. + +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- Default namespace with 'spark-operator-spark' service account +- Spark 3.0+ (required for dynamic allocation on Kubernetes) + +What You'll Learn: +- Dynamic allocation configuration and tuning +- How Spark scales executors automatically +- Resource efficiency vs performance trade-offs +- Monitoring executor scaling behavior +- When to use dynamic vs fixed allocation + +Real-World Use Case: +Multi-tenant clusters, variable workloads, cost optimization, shared resources. +""" + +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + DynamicAllocation, + OperatorBackendConfig, + BatchSparkClient, +) + + +def create_dynamic_allocation_script(): + """Create a PySpark script demonstrating dynamic allocation. 
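+
+    Note: main() below submits a placeholder pi.py, so this generated code is illustrative.
+    A hypothetical way to actually run it would be to write it out and point
+    main_application_file at the uploaded copy, for example:
+
+        with open("dynamic_allocation_demo.py", "w") as fh:
+            fh.write(create_dynamic_allocation_script())
+        # ...then upload it to object storage (see the *_s3.py examples)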
+ + Returns: + str: Python code for dynamic allocation demo + """ + return """ +from pyspark.sql import SparkSession +from pyspark.sql.functions import ( + col, lit, sum as _sum, count, avg, + monotonically_increasing_id, rand, when +) +from pyspark.sql.types import * +import time + +# Create Spark session +spark = SparkSession.builder \\ + .appName("Dynamic Allocation Demo") \\ + .getOrCreate() + +# Get dynamic allocation configuration +dyn_enabled = spark.conf.get("spark.dynamicAllocation.enabled", "false") +min_executors = spark.conf.get("spark.dynamicAllocation.minExecutors", "N/A") +max_executors = spark.conf.get("spark.dynamicAllocation.maxExecutors", "N/A") +initial_executors = spark.conf.get("spark.dynamicAllocation.initialExecutors", "N/A") + +print("\\n" + "="*80) +print("DYNAMIC ALLOCATION DEMO") +print("="*80) +print("\\nšŸ“Š Dynamic Allocation Configuration:") +print(f" - Enabled: {dyn_enabled}") +print(f" - Min Executors: {min_executors}") +print(f" - Max Executors: {max_executors}") +print(f" - Initial Executors: {initial_executors}") +shuffle_tracking = spark.conf.get('spark.dynamicAllocation.shuffleTracking.enabled', 'N/A') +print(f" - Shuffle Tracking: {shuffle_tracking}") + +# ============================================================================ +# PHASE 1: LIGHT WORKLOAD (should use minimal executors) +# ============================================================================ +print("\\n\\n[PHASE 1] Light Workload - Testing Scale Down") +print("="*80) +print("Expected: Spark should use minimal executors for small dataset\\n") + +# Small dataset +print("Creating small dataset (1,000 records)...") +small_data = spark.range(1000).select( + col("id"), + (col("id") * 2).alias("value"), + (col("id") % 10).alias("category") +) + +print(f" Created {small_data.count()} records") + +# Simple aggregation (low resource need) +result1 = small_data.groupBy("category").agg( + count("id").alias("count"), + _sum("value").alias("sum_value"), + avg("value").alias("avg_value") +).orderBy("category") + +print("\\nLight Workload Results:") +result1.show() + +print("\\nā±ļø Waiting 10 seconds for executor scaling to stabilize...") +time.sleep(10) + +# Check current executor count (approximation based on task distribution) +print("\\nšŸ“ˆ After light workload:") +print(" - Spark should have scaled down to minimum executors") +print(" - Check operator logs or Spark UI for exact executor count") + +# ============================================================================ +# PHASE 2: MEDIUM WORKLOAD (should scale up moderately) +# ============================================================================ +print("\\n\\n[PHASE 2] Medium Workload - Testing Scale Up") +print("="*80) +print("Expected: Spark should add executors to handle increased load\\n") + +# Medium dataset +print("Creating medium dataset (100,000 records)...") +medium_data = spark.range(100000).select( + col("id"), + (rand() * 1000).alias("value"), + (col("id") % 100).alias("category"), + (col("id") % 10).alias("partition_key") +) + +print(f" Created {medium_data.count()} records") + +# More complex processing (triggers parallelism) +result2 = medium_data.groupBy("category").agg( + count("id").alias("count"), + _sum("value").alias("sum_value"), + avg("value").alias("avg_value") +).filter(col("count") > 100).orderBy(col("sum_value").desc()) + +print(f"\\nMedium Workload Results (showing top 10):") +result2.show(10) + +print("\\nā±ļø Waiting 10 seconds for executor scaling...") +time.sleep(10) + +print("\\nšŸ“ˆ 
After medium workload:") +print(" - Spark should have scaled up executors") +print(" - More executors = better parallelism for aggregations") + +# ============================================================================ +# PHASE 3: HEAVY WORKLOAD (should scale to maximum) +# ============================================================================ +print("\\n\\n[PHASE 3] Heavy Workload - Testing Maximum Scale") +print("="*80) +print("Expected: Spark should scale to max executors for heavy computation\\n") + +# Large dataset with shuffle +print("Creating large dataset (500,000 records)...") +large_data = spark.range(500000).select( + col("id"), + (rand() * 10000).alias("value"), + (col("id") % 1000).alias("category"), + when(rand() > 0.5, "A").otherwise("B").alias("group") +) + +print(f" Created {large_data.count()} records") + +# Heavy processing with shuffle (join + aggregation) +print("\\nPerforming heavy computation (join + aggregation)...") + +# Self-join to increase workload +large_data_alias = large_data.alias("df1") +large_data2 = large_data.alias("df2") + +result3 = large_data_alias.join( + large_data2, + col("df1.category") == col("df2.category"), + "inner" +).groupBy("df1.category").agg( + count("df1.id").alias("total_records"), + _sum("df1.value").alias("sum_value1"), + _sum("df2.value").alias("sum_value2") +).orderBy(col("total_records").desc()) + +print(f"\\nHeavy Workload Results (top 10 categories):") +result3.show(10) + +print("\\nā±ļø Waiting 10 seconds for executor scaling...") +time.sleep(10) + +print("\\nšŸ“ˆ After heavy workload:") +print(" - Spark should have scaled to maximum executors") +print(" - Shuffle operations triggered executor requests") +print(" - Join and aggregation required maximum parallelism") + +# ============================================================================ +# PHASE 4: COOL DOWN (should scale back down) +# ============================================================================ +print("\\n\\n[PHASE 4] Cool Down - Testing Scale Down After Load") +print("="*80) +print("Expected: After workload completes, Spark should release idle executors\\n") + +print("Performing final light operation...") +final_result = small_data.groupBy("category").count().orderBy("category") +final_result.show() + +print("\\nā±ļø Waiting 15 seconds for idle executors to be released...") +time.sleep(15) + +print("\\nšŸ“‰ After cool down:") +print(" - Spark should release idle executors") +print(" - Only minimum executors retained") +print(" - Resources returned to cluster for other workloads") + +# ============================================================================ +# SUMMARY +# ============================================================================ +print("\\n\\n" + "="*80) +print("DYNAMIC ALLOCATION DEMO COMPLETED!") +print("="*80) + +print("\\nšŸŽÆ Key Observations:") +print(" 1. Light workload - Minimal executors (resource efficient)") +print(" 2. Medium workload - Moderate scale up (balanced)") +print(" 3. Heavy workload - Maximum executors (performance optimized)") +print(" 4. 
Cool down - Scale down (return resources)") + +print("\\nšŸ’” Dynamic Allocation Benefits:") +print(" Automatic resource optimization") +print(" Cost efficiency in multi-tenant clusters") +print(" No manual executor tuning needed") +print(" Better cluster utilization") + +print("\\nWARNING: When NOT to Use Dynamic Allocation:") +print(" - Streaming jobs (need consistent executors)") +print(" - Very short-lived jobs (overhead of scaling)") +print(" - Dedicated clusters (fixed allocation is simpler)") +print(" - Jobs with strict latency SLAs") + +print("\\nšŸ“Š Configuration Parameters Explained:") +print(" - minExecutors: Safety net, always available") +print(" - maxExecutors: Resource cap, prevents runaway scaling") +print(" - initialExecutors: Starting point, balances startup time") +print(" - shuffleTracking: Required for K8s, tracks shuffle data") + +print("\\nšŸ”§ Tuning Recommendations:") +print(" - Set min = 1-2 for cost efficiency") +print(" - Set max based on cluster capacity") +print(" - Set initial = expected average load") +print(" - Enable shuffleTracking (required for K8s)") +print(" - Monitor executor metrics in Spark UI") + +spark.stop() +""" + + +def main(): + """Main example: Submit Spark job with dynamic allocation enabled.""" + + print("=" * 80) + print("EXAMPLE 06: Dynamic Allocation and Auto-scaling") + print("=" * 80) + print() + print("This example demonstrates:") + print(" 1. Configuring dynamic allocation") + print(" 2. Automatic executor scaling based on workload") + print(" 3. Resource efficiency in shared clusters") + print(" 4. Performance vs cost trade-offs") + print(" 5. When to use dynamic vs fixed allocation") + print() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=False, + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print() + + # Step 2: Configure dynamic allocation + app_name = "dynamic-allocation-demo" + + print("Step 2: Configuring dynamic allocation...") + + # Create dynamic allocation configuration + dyn_alloc = DynamicAllocation( + enabled=True, + initial_executors=1, # Start with 1 executor + min_executors=1, # Keep at least 1 + max_executors=5, # Scale up to 5 max + shuffle_tracking_enabled=True, # Required for K8s + ) + + print(f" App name: {app_name}") + print(" Spark version: 4.0.0 (supports dynamic allocation)") + print(" Dynamic Allocation Settings:") + print(f" - Initial executors: {dyn_alloc.initial_executors}") + print(f" - Min executors: {dyn_alloc.min_executors}") + print(f" - Max executors: {dyn_alloc.max_executors}") + print(f" - Shuffle tracking: {dyn_alloc.shuffle_tracking_enabled}") + print() + print(" How it works:") + print(" - Starts with 1 executor (initial)") + print(" - Scales up to 5 as workload increases") + print(" - Scales down to 1 when idle") + print() + + # Step 3: Submit the application + print("Step 3: Submitting application with dynamic allocation...") + + try: + response = client.submit_application( + # Application metadata + app_name=app_name, + # Placeholder + main_application_file=("local:///opt/spark/examples/src/main/python/pi.py"), + # Spark configuration + # Spark 3.0+ required for dynamic allocation on K8s + spark_version="4.0.0", + 
app_type="Python", + # Resource allocation per executor + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + # This will be overridden by dynamic allocation + num_executors=1, + # Dynamic Allocation Configuration + dynamic_allocation=dyn_alloc, + # Spark configuration + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + # Additional tuning for dynamic allocation + # Release idle executors after 30s + "spark.dynamicAllocation.executorIdleTimeout": "30s", + # Keep cached executors longer + "spark.dynamicAllocation.cachedExecutorIdleTimeout": "60s", + # Request executors quickly + "spark.dynamicAllocation.schedulerBacklogTimeout": "5s", + }, + # Labels for tracking + labels={ + "feature": "dynamic-allocation", + "workload": "variable", + }, + ) + + print(" Application submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + print(" Dynamic allocation features enabled:") + print(" Auto-scaling based on workload") + print(" Shuffle tracking for K8s compatibility") + print(" Optimized resource utilization") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + sys.exit(1) + + # Step 4: Monitor the application + print("Step 4: Monitoring application (this will take 4-5 minutes)...") + print(" The job will demonstrate executor scaling through 4 phases:") + print(" Phase 1: Light workload (scale down)") + print(" Phase 2: Medium workload (scale up)") + print(" Phase 3: Heavy workload (max scale)") + print(" Phase 4: Cool down (scale down)") + print() + + try: + # Wait for completion with longer timeout for demo phases + final_status = client.wait_for_job_status( + submission_id=app_name, + timeout=360, # 6 minutes for all phases + polling_interval=5, # Check every 5 seconds + ) + + print(" Application completed!") + print(f" Final state: {final_status.state.value}") + print() + + # Check if successful + if final_status.state != ApplicationState.COMPLETED: + print( + f" WARNING: Application did not complete successfully: {final_status.state.value}" + ) + print(" Check logs below for details.") + + except TimeoutError: + print(" ERROR: Application did not complete within 6 minutes") + print(f" You can check status later with: client.get_job('{app_name}')") + sys.exit(1) + except Exception as e: + print(f" ERROR: Error monitoring application: {e}") + sys.exit(1) + + # Step 5: Retrieve results from logs + print("Step 5: Retrieving dynamic allocation insights from logs...") + print() + + try: + logs = list(client.get_job_logs(app_name)) + + print("=" * 80) + print("DYNAMIC ALLOCATION RESULTS") + print("=" * 80) + + # Display important sections + important_keywords = [ + "DYNAMIC ALLOCATION", + "Configuration:", + "[PHASE", + "Expected:", + "After", + "Key Observations", + "Benefits:", + "When NOT to Use", + "Tuning Recommendations", + ] + + for line in logs: + if any(keyword in line for keyword in important_keywords) or any( + emoji in line + for emoji in [ + "Done", + "WARNING", + "šŸ“Š", + "šŸ“ˆ", + "šŸ“‰", + "šŸ’”", + "šŸŽÆ", + "šŸ”§", + ] + ): + print(line) + + print() + print("=" * 80) + + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + print(" The job may have completed but logs are not yet available") + + # Step 6: Cleanup + print() + print("Step 6: Cleaning up resources...") + try: + client.delete_job(app_name) + print(f" Application '{app_name}' deleted") + print(" All executors released back to cluster") + except Exception 
as e: + print(f" WARNING: Cleanup warning: {e}") + print(f" You can manually delete with: kubectl delete sparkapplication {app_name}") + + print() + print("=" * 80) + print("EXAMPLE COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("What you learned:") + print(" How to configure dynamic allocation") + print(" How Spark scales executors automatically") + print(" Resource efficiency vs performance trade-offs") + print(" Tuning parameters and their effects") + print(" When to use dynamic vs fixed allocation") + print() + print("Dynamic Allocation Configuration:") + print(" from kubeflow.spark import DynamicAllocation") + print() + print(" dyn_alloc = DynamicAllocation(") + print(" enabled=True,") + print(" initial_executors=2, # Starting point") + print(" min_executors=1, # Always keep at least 1") + print(" max_executors=10, # Cap at 10") + print(" shuffle_tracking_enabled=True # Required for K8s") + print(" )") + print() + print(" client.submit_application(") + print(" app_name='my-app',") + print(" dynamic_allocation=dyn_alloc,") + print(" ...") + print(" )") + print() + print("Key Scaling Triggers:") + print(" - Scale Up: Pending tasks, shuffle writes, backlog") + print(" - Scale Down: Idle executors, no shuffle data needed") + print(" - Timing: Controlled by timeout configurations") + print() + print("Use Cases for Dynamic Allocation:") + print(" Multi-tenant clusters (shared resources)") + print(" Variable workloads (unpredictable load)") + print(" Cost optimization (pay for what you use)") + print(" Development/testing (efficient resource use)") + print() + print("Use Cases for Fixed Allocation:") + print(" Streaming jobs (predictable, constant load)") + print(" Short-lived jobs (scaling overhead too high)") + print(" Strict SLAs (no scaling latency)") + print(" Dedicated clusters (resources already allocated)") + print() + print("Next steps:") + print(" - Experiment with different min/max settings") + print(" - Monitor executor scaling in Spark UI") + print(" - Compare costs: dynamic vs fixed allocation") + print(" - Test with your own workloads") + print() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/07_spark_connect_interactive.py b/examples/spark/07_spark_connect_interactive.py new file mode 100644 index 000000000..9c2d518a6 --- /dev/null +++ b/examples/spark/07_spark_connect_interactive.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Spark Connect Interactive Session Example. + +This example demonstrates how to use Kubeflow SparkSessionClient with Spark Connect +to create interactive data analysis sessions. Unlike batch job submission, +Spark Connect enables long-lived sessions for exploratory data analysis, +iterative development, and notebook-style workflows. + +Prerequisites: +1. A Spark cluster with Spark Connect server running (Spark 3.4+) +2. PySpark with Connect support: pip install 'pyspark[connect]>=3.4.0' +3. 
Network connectivity to Spark Connect server + +Key Features Demonstrated: +- Remote connectivity to existing Spark clusters +- Interactive SQL queries and DataFrame operations +- Artifact upload (Python files, JARs) +- Session metrics and monitoring +- Session lifecycle management + +Usage: + python 07_spark_connect_interactive.py --connect-url sc://spark-cluster:15002 +""" + +import argparse +import logging +import sys + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Spark Connect Interactive Session Example", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--connect-url", + type=str, + required=True, + help=( + "Spark Connect URL (e.g., sc://spark-cluster:15002). " + "For Kubernetes: sc://{service-name}.{namespace}.svc.cluster.local:15002" + ), + ) + parser.add_argument( + "--token", + type=str, + help="Bearer token for authentication (optional)", + ) + parser.add_argument( + "--use-ssl", + action="store_true", + default=True, + help="Use SSL/TLS for secure communication (default: true)", + ) + parser.add_argument( + "--app-name", + type=str, + default="kubeflow-spark-connect-demo", + help="Application name for the session", + ) + return parser.parse_args() + + +def run_sql_analysis(session) -> None: + """Run interactive SQL analysis. + + Args: + session: ManagedSparkSession instance + """ + logger.info("=" * 80) + logger.info("Example 1: Interactive SQL Queries") + logger.info("=" * 80) + + # Create sample data + logger.info("Creating sample sales data...") + sales_data = [ + ("2024-01-01", "Product A", 100, 29.99), + ("2024-01-01", "Product B", 150, 19.99), + ("2024-01-02", "Product A", 120, 29.99), + ("2024-01-02", "Product C", 80, 49.99), + ("2024-01-03", "Product B", 200, 19.99), + ("2024-01-03", "Product C", 90, 49.99), + ] + + # Create DataFrame and register as temp view + df = session.createDataFrame(sales_data, ["date", "product", "quantity", "price"]) + df.createOrReplaceTempView("sales") + + logger.info("Sample data created and registered as 'sales' view") + + # Run SQL query + logger.info("\nExecuting SQL: SELECT product, SUM(quantity * price) AS revenue ...") + result_df = session.sql(""" + SELECT + product, + SUM(quantity * price) AS total_revenue, + SUM(quantity) AS total_quantity, + AVG(price) AS avg_price + FROM sales + GROUP BY product + ORDER BY total_revenue DESC + """) + + # Show results + logger.info("\nQuery Results:") + results = result_df.collect() + for row in results: + logger.info( + f" {row.product}: Revenue=${row.total_revenue:.2f}, " + f"Quantity={row.total_quantity}, AvgPrice=${row.avg_price:.2f}" + ) + + +def run_dataframe_operations(session) -> None: + """Run DataFrame transformations. 
+
+    Args:
+        session: ManagedSparkSession instance
+    """
+    logger.info("\n" + "=" * 80)
+    logger.info("Example 2: DataFrame Operations")
+    logger.info("=" * 80)
+
+    # Create sample user data
+    logger.info("Creating user activity data...")
+    user_data = [
+        (1, "alice@example.com", "premium", 150),
+        (2, "bob@example.com", "free", 25),
+        (3, "carol@example.com", "premium", 200),
+        (4, "dave@example.com", "free", 10),
+        (5, "eve@example.com", "premium", 180),
+    ]
+
+    df = session.createDataFrame(user_data, ["user_id", "email", "subscription", "activity_score"])
+
+    # Apply transformations
+    logger.info("Applying DataFrame transformations...")
+
+    # Filter premium users
+    premium_users = df.filter(df.subscription == "premium")
+
+    # Add derived column (uses pyspark.sql.functions for the conditional expression)
+    from pyspark.sql import functions as F  # noqa: N812
+
+    premium_users = premium_users.withColumn(
+        "engagement_level",
+        F.when(premium_users.activity_score >= 180, "high")
+        .when(premium_users.activity_score >= 150, "medium")
+        .otherwise("low"),
+    )
+
+    # Show results
+    logger.info("\nPremium Users with Engagement Levels:")
+    results = premium_users.collect()
+    for row in results:
+        logger.info(
+            f"  User {row.user_id} ({row.email}): "
+            f"Score={row.activity_score}, Level={row.engagement_level}"
+        )
+
+
+def run_aggregation_analysis(session) -> None:
+    """Run aggregation and grouping operations.
+
+    Args:
+        session: ManagedSparkSession instance
+    """
+    logger.info("\n" + "=" * 80)
+    logger.info("Example 3: Aggregation and Grouping")
+    logger.info("=" * 80)
+
+    # Create sample event data
+    logger.info("Creating event stream data...")
+    events = [
+        ("2024-01-01", "login", "mobile", 1250),
+        ("2024-01-01", "login", "web", 3500),
+        ("2024-01-01", "purchase", "mobile", 150),
+        ("2024-01-02", "login", "mobile", 1300),
+        ("2024-01-02", "login", "web", 3800),
+        ("2024-01-02", "purchase", "web", 220),
+        ("2024-01-03", "login", "mobile", 1400),
+        ("2024-01-03", "purchase", "mobile", 180),
+    ]
+
+    df = session.createDataFrame(events, ["date", "event_type", "platform", "count"])
+
+    # Group and aggregate
+    logger.info("Computing aggregations by platform and event type...")
+    agg_df = (
+        df.groupBy("platform", "event_type")
+        .agg({"count": "sum", "date": "count"})
+        .withColumnRenamed("sum(count)", "total_events")
+        .withColumnRenamed("count(date)", "num_days")
+        .orderBy("platform", "event_type")
+    )
+
+    # Show results
+    logger.info("\nAggregation Results:")
+    results = agg_df.collect()
+    for row in results:
+        logger.info(
+            f"  {row.platform}/{row.event_type}: Total={row.total_events}, Days={row.num_days}"
+        )
+
+
+def demonstrate_session_features(session) -> None:
+    """Demonstrate session-specific features.
+ + Args: + session: ManagedSparkSession instance + """ + logger.info("\n" + "=" * 80) + logger.info("Example 4: Session Features & Metrics") + logger.info("=" * 80) + + # Get session info + info = session.get_info() + logger.info(f"Session ID: {info.session_id}") + logger.info(f"App Name: {info.app_name}") + logger.info(f"State: {info.state}") + + # Get metrics + metrics = session.get_metrics() + logger.info("\nSession Metrics:") + logger.info(f" Queries Executed: {metrics.queries_executed}") + logger.info(f" Active Queries: {metrics.active_queries}") + logger.info(f" Artifacts Uploaded: {metrics.artifacts_uploaded}") + + +def main(): + """Main execution function.""" + args = parse_args() + + logger.info("=" * 80) + logger.info("Spark Connect Interactive Session Example") + logger.info("=" * 80) + logger.info(f"Connect URL: {args.connect_url}") + logger.info(f"App Name: {args.app_name}") + logger.info(f"SSL Enabled: {args.use_ssl}") + + try: + # Import Kubeflow Spark client + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + # Configure ConnectBackend + logger.info("\nInitializing Spark Connect backend...") + config = ConnectBackendConfig( + connect_url=args.connect_url, + token=args.token, + use_ssl=args.use_ssl, + timeout=300, + ) + + # Create SparkSessionClient + with SparkSessionClient(backend_config=config) as client: + logger.info("SparkSessionClient initialized successfully") + + # Create interactive session + logger.info(f"\nCreating Spark Connect session: {args.app_name}") + session = client.create_session(app_name=args.app_name) + logger.info(f"Session created: {session.session_id}") + + try: + # Run examples + run_sql_analysis(session) + run_dataframe_operations(session) + run_aggregation_analysis(session) + demonstrate_session_features(session) + + logger.info("\n" + "=" * 80) + logger.info("All examples completed successfully!") + logger.info("=" * 80) + + finally: + # Cleanup session + logger.info("\nClosing session...") + session.close() + logger.info("Session closed successfully") + + except ImportError as e: + logger.error( + "Failed to import required packages. " + "Please install: pip install 'pyspark[connect]>=3.4.0'" + ) + logger.error(f"Error: {e}") + sys.exit(1) + except Exception as e: + logger.error(f"Example failed: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/examples/spark/cleanup_spark.sh b/examples/spark/cleanup_spark.sh new file mode 100755 index 000000000..e2ad4f13f --- /dev/null +++ b/examples/spark/cleanup_spark.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# +# Clean up all Spark applications and orphaned pods +# + +set -e + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +print_step() { + echo -e "${GREEN}āžœ${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}⚠${NC} $1" +} + +NAMESPACE="${1:-default}" + +echo "==========================================" +echo " Cleaning up Spark applications" +echo " Namespace: $NAMESPACE" +echo "==========================================" +echo "" + +# Check what exists +print_step "Current state:" +echo "" +echo "SparkApplications:" +kubectl get sparkapplications -n $NAMESPACE 2>/dev/null || echo " (none)" +echo "" +echo "Driver pods:" +kubectl get pods -n $NAMESPACE -l spark-role=driver 2>/dev/null || echo " (none)" +echo "" +echo "Executor pods:" +kubectl get pods -n $NAMESPACE -l spark-role=executor 2>/dev/null || echo " (none)" +echo "" + +read -p "Delete all Spark applications and pods? 
(y/n) " -n 1 -r +echo +if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Cancelled." + exit 0 +fi + +# Delete SparkApplications +print_step "Deleting SparkApplications..." +if kubectl get sparkapplications -n $NAMESPACE &>/dev/null; then + kubectl delete sparkapplications -n $NAMESPACE --all --timeout=30s || true +else + echo " No SparkApplications found" +fi + +# Delete driver pods (force delete) +print_step "Force deleting driver pods..." +if kubectl get pods -n $NAMESPACE -l spark-role=driver &>/dev/null; then + kubectl delete pods -n $NAMESPACE -l spark-role=driver --force --grace-period=0 --timeout=30s || true +else + echo " No driver pods found" +fi + +# Delete executor pods (force delete) +print_step "Force deleting executor pods..." +if kubectl get pods -n $NAMESPACE -l spark-role=executor &>/dev/null; then + kubectl delete pods -n $NAMESPACE -l spark-role=executor --force --grace-period=0 --timeout=30s || true +else + echo " No executor pods found" +fi + +# Delete orphaned ConfigMaps +print_step "Cleaning up ConfigMaps..." +if kubectl get configmaps -n $NAMESPACE -l sparkoperator.k8s.io/app-name &>/dev/null; then + kubectl delete configmaps -n $NAMESPACE -l sparkoperator.k8s.io/app-name --timeout=30s || true +else + echo " No Spark ConfigMaps found" +fi + +echo "" +print_step "Cleanup complete!" +echo "" +echo "Verification:" +kubectl get sparkapplications -n $NAMESPACE 2>/dev/null || echo " āœ“ No SparkApplications" +kubectl get pods -n $NAMESPACE -l spark-role 2>/dev/null || echo " āœ“ No Spark pods" +echo "" +echo "You can now submit new applications:" +echo " python test_spark_client_integration.py" diff --git a/examples/spark/debug_spark_connect.py b/examples/spark/debug_spark_connect.py new file mode 100644 index 000000000..10771377e --- /dev/null +++ b/examples/spark/debug_spark_connect.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +""" +Debug script for Spark Connect connection issues. + +This script tests the connection step-by-step with verbose logging. 
+""" + +import logging +import os +import signal +import sys +import time + +# Setup logging +logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) + +# Add SDK to path +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path.insert(0, sdk_path) + +print("=" * 80) +print("Spark Connect Connection Debugger") +print("=" * 80) + +# Test 1: Check PySpark installation +print("\n[Test 1] Checking PySpark installation...") +try: + import pyspark + + print(f"āœ“ PySpark version: {pyspark.__version__}") +except ImportError as e: + print(f"āœ— PySpark not installed: {e}") + sys.exit(1) + +# Test 2: Check Spark Connect support +print("\n[Test 2] Checking Spark Connect support...") +try: + from pyspark.sql import SparkSession + + print("āœ“ SparkSession imported") + + # Check if remote() method exists + if hasattr(SparkSession.builder, "remote"): + print("āœ“ Spark Connect (remote) support available") + else: + print("āœ— Spark Connect support not available - upgrade PySpark") + sys.exit(1) +except Exception as e: + print(f"āœ— Error: {e}") + sys.exit(1) + +# Test 3: Test basic gRPC connectivity +print("\n[Test 3] Testing gRPC connectivity to localhost:30000...") +try: + import grpc + + print("āœ“ grpc module available") + + # Try to create a channel + channel = grpc.insecure_channel("localhost:30000") + + # Set a short timeout for connection test + import grpc + + try: + grpc.channel_ready_future(channel).result(timeout=5) + print("āœ“ gRPC channel ready") + except grpc.FutureTimeoutError: + print("āœ— gRPC channel timeout - server may not be responding") + print(" Check: kubectl logs -l app=spark-connect -n default") + except Exception as e: + print(f"āœ— gRPC channel error: {e}") + finally: + channel.close() + +except ImportError: + print("⚠ grpcio not installed (will be used by pyspark)") +except Exception as e: + print(f"⚠ gRPC test error: {e}") + +# Test 4: Test Kubeflow SDK import +print("\n[Test 4] Testing Kubeflow SDK imports...") +try: + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + print("āœ“ Kubeflow Spark imports successful") +except Exception as e: + print(f"āœ— Import error: {e}") + sys.exit(1) + +# Test 5: Create config (doesn't connect yet) +print("\n[Test 5] Creating ConnectBackendConfig...") +try: + config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, + timeout=10, # Short timeout for testing + ) + print(f"āœ“ Config created: {config.connect_url}") +except Exception as e: + print(f"āœ— Config creation error: {e}") + sys.exit(1) + +# Test 6: Create client (doesn't connect yet) +print("\n[Test 6] Creating SparkSessionClient...") +try: + client = SparkSessionClient(backend_config=config) + print("āœ“ Client created") +except Exception as e: + print(f"āœ— Client creation error: {e}") + sys.exit(1) + +# Test 7: Try to create session with timeout +print("\n[Test 7] Creating Spark session (this may hang)...") +print(" If this hangs for more than 30 seconds, press Ctrl+C") +print(" Attempting connection to sc://localhost:30000...") + + +def timeout_handler(signum, frame): + print("\nāœ— Session creation timed out after 30 seconds") + print("\nPossible issues:") + print(" 1. Spark Connect server not accessible") + print(" 2. Port forwarding not working correctly") + print(" 3. 
gRPC connection blocked")
+    print("\nDebugging steps:")
+    print("  - Check server logs: kubectl logs -l app=spark-connect -n default -f")
+    print("  - Verify port forward: lsof -i :30000")
+    print("  - Test connectivity: nc -zv localhost 30000")
+    print(
+        "  - Check server is listening: kubectl exec -it -- netstat -tlnp | grep 15002"
+    )
+    sys.exit(1)
+
+
+# Set timeout
+signal.signal(signal.SIGALRM, timeout_handler)
+signal.alarm(30)
+
+try:
+    start_time = time.time()
+    session = client.create_session(app_name="debug-test")
+    elapsed = time.time() - start_time
+
+    signal.alarm(0)  # Cancel timeout
+
+    print(f"āœ“ Session created in {elapsed:.2f} seconds!")
+    print(f"  Session ID: {session.session_id}")
+    print(f"  App name: {session.app_name}")
+
+    # Test 8: Try a simple query
+    print("\n[Test 8] Testing simple SQL query...")
+    try:
+        df = session.sql("SELECT 1 AS id, 'test' AS message")
+        result = df.collect()
+        print(f"āœ“ Query executed: {result[0].message}")
+        df.show()
+    except Exception as e:
+        print(f"āœ— Query error: {e}")
+
+    # Cleanup
+    print("\n[Cleanup] Closing session...")
+    session.close()
+    client.close()
+    print("āœ“ Session closed")
+
+    print("\n" + "=" * 80)
+    print("All tests passed! Connection is working.")
+    print("=" * 80)
+
+except KeyboardInterrupt:
+    signal.alarm(0)
+    print("\n\nāœ— Interrupted by user")
+    sys.exit(1)
+except Exception as e:
+    signal.alarm(0)
+    print(f"\nāœ— Session creation failed: {e}")
+    print(f"\nError type: {type(e).__name__}")
+    import traceback
+
+    print("\nFull traceback:")
+    traceback.print_exc()
+    sys.exit(1)
diff --git a/examples/spark/example_utils.py b/examples/spark/example_utils.py
new file mode 100644
index 000000000..4ea1b905f
--- /dev/null
+++ b/examples/spark/example_utils.py
@@ -0,0 +1,571 @@
+"""Example utilities for Kubeflow Spark SDK examples.
+
+This module provides common utilities, helpers, and sample data generators
+used across the Spark examples. It helps reduce code duplication and provides
+a consistent interface for common operations.
+
+Usage:
+    from example_utils import (
+        create_client,
+        setup_logging,
+        generate_sales_data,
+    )
+
+    # Create client with defaults
+    client = create_client()
+
+    # Or with custom configuration
+    client = create_client(
+        namespace="my-namespace",
+        enable_ui=True,
+    )
+"""
+
+from datetime import datetime, timedelta
+import logging
+import os
+import sys
+from typing import Optional
+
+# Add SDK to path for development mode
+sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+if sdk_path not in sys.path:
+    sys.path.insert(0, sdk_path)
+
+from kubeflow.spark import (  # noqa: E402
+    ApplicationState,
+    DynamicAllocation,
+    OperatorBackendConfig,
+    RestartPolicy,
+    RestartPolicyType,
+    BatchSparkClient,
+)
+
+# ============================================================================
+# LOGGING SETUP
+# ============================================================================
+
+
+def setup_logging(level: str = "INFO") -> logging.Logger:
+    """Setup logging for examples.
+ + Args: + level: Log level (DEBUG, INFO, WARNING, ERROR) + + Returns: + Configured logger instance + """ + logging.basicConfig( + level=getattr(logging, level.upper()), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + return logging.getLogger(__name__) + + +logger = setup_logging() + + +# ============================================================================ +# CLIENT CREATION HELPERS +# ============================================================================ + + +def create_client( + namespace: Optional[str] = None, + service_account: str = "spark-operator-spark", + context: Optional[str] = None, + enable_monitoring: bool = False, + enable_ui: bool = False, + default_spark_image: str = "docker.io/library/spark", +) -> BatchSparkClient: + """Create a BatchSparkClient with sensible defaults for examples. + + Args: + namespace: Kubernetes namespace (default: from SPARK_NAMESPACE env or 'default') + service_account: Kubernetes service account + context: Kubernetes context (default: from KUBE_CONTEXT env or 'kind-spark-test') + enable_monitoring: Enable Prometheus monitoring + enable_ui: Enable Spark UI + default_spark_image: Default Spark image to use + + Returns: + Configured BatchSparkClient instance + + Example: + >>> client = create_client() + >>> client = create_client(namespace="production", enable_ui=True) + """ + config = OperatorBackendConfig( + namespace=namespace or os.getenv("SPARK_NAMESPACE", "default"), + service_account=service_account, + default_spark_image=default_spark_image, + context=context or os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=enable_monitoring, + enable_ui=enable_ui, + ) + + logger.info(f"Creating BatchSparkClient for namespace: {config.namespace}") + return BatchSparkClient(backend_config=config) + + +# ============================================================================ +# COMMON CONFIGURATIONS +# ============================================================================ + + +def get_resilient_restart_policy() -> RestartPolicy: + """Get a restart policy suitable for production batch jobs. + + Returns: + RestartPolicy with retry configuration + """ + return RestartPolicy( + type=RestartPolicyType.ON_FAILURE, + on_failure_retries=3, + on_failure_retry_interval=30, + on_submission_failure_retries=2, + on_submission_failure_retry_interval=15, + ) + + +def get_dynamic_allocation_config( + min_executors: int = 1, + max_executors: int = 10, + initial_executors: int = 2, +) -> DynamicAllocation: + """Get a dynamic allocation configuration. + + Args: + min_executors: Minimum number of executors + max_executors: Maximum number of executors + initial_executors: Initial number of executors + + Returns: + DynamicAllocation configuration + """ + return DynamicAllocation( + enabled=True, + initial_executors=initial_executors, + min_executors=min_executors, + max_executors=max_executors, + shuffle_tracking_enabled=True, # Required for K8s + ) + + +def get_spark_conf_defaults(spark_version: str = "4.0.0") -> dict[str, str]: + """Get default Spark configuration suitable for examples. 
+ + Args: + spark_version: Spark version to configure for + + Returns: + Dictionary of Spark configuration properties + """ + conf = { + "spark.kubernetes.file.upload.path": "/tmp", + } + + # Spark 4.0+ specific configurations + if spark_version.startswith("4."): + conf.update( + { + "spark.sql.adaptive.enabled": "true", + "spark.sql.adaptive.coalescePartitions.enabled": "true", + } + ) + + return conf + + +# ============================================================================ +# SAMPLE DATA GENERATORS +# ============================================================================ + + +def generate_customer_data(num_records: int = 100) -> list[tuple]: + """Generate sample customer data. + + Args: + num_records: Number of customer records to generate + + Returns: + List of customer tuples (id, name, email, city, signup_date) + """ + from datetime import date + import random + + cities = [ + "New York", + "Los Angeles", + "Chicago", + "Houston", + "Phoenix", + "Philadelphia", + "San Antonio", + "San Diego", + "Dallas", + "San Jose", + ] + + base_date = date.today() - timedelta(days=365) + + customers = [] + for i in range(1, num_records + 1): + signup_date = base_date + timedelta(days=random.randint(0, 365)) + customers.append( + ( + i, + f"Customer{i}", + f"customer{i}@example.com", + random.choice(cities), + signup_date.strftime("%Y-%m-%d"), + ) + ) + + return customers + + +def generate_transaction_data( + num_transactions: int = 1000, + num_customers: int = 100, + days_back: int = 30, +) -> list[tuple]: + """Generate sample transaction data. + + Args: + num_transactions: Number of transactions to generate + num_customers: Number of unique customers + days_back: How many days back to generate data + + Returns: + List of transaction tuples (tx_id, date, customer_id, amount, status) + """ + import random + + base_date = datetime.now() + statuses = ["completed", "pending", "cancelled"] + + transactions = [] + for i in range(1, num_transactions + 1): + tx_date = (base_date - timedelta(days=random.randint(0, days_back))).strftime("%Y-%m-%d") + customer_id = random.randint(1, num_customers) + amount = round(random.uniform(10.0, 1000.0), 2) + # 90% completed + status = random.choice(statuses) if i % 10 != 0 else "completed" + + transactions.append((i, tx_date, customer_id, amount, status)) + + return transactions + + +def generate_sales_data( + num_records: int = 100, + products: Optional[list[str]] = None, + categories: Optional[list[str]] = None, +) -> list[tuple]: + """Generate sample sales data. + + Args: + num_records: Number of sales records to generate + products: List of product names (default: common products) + categories: List of categories (default: Electronics, Furniture, etc.) 
+
+    Returns:
+        List of sales tuples (id, date, product, category, quantity, price, region)
+    """
+    import random
+
+    if products is None:
+        products = ["Laptop", "Mouse", "Keyboard", "Monitor", "Desk", "Chair"]
+
+    if categories is None:
+        categories = ["Electronics", "Furniture", "Accessories"]
+
+    regions = ["North", "South", "East", "West"]
+    base_date = datetime.now()
+
+    sales = []
+    for i in range(1, num_records + 1):
+        sale_date = (base_date - timedelta(days=random.randint(0, 90))).strftime("%Y-%m-%d")
+        product = random.choice(products)
+        category = random.choice(categories)
+        quantity = random.randint(1, 10)
+        price = round(random.uniform(25.0, 1500.0), 2)
+        region = random.choice(regions)
+
+        sales.append((i, sale_date, product, category, quantity, price, region))
+
+    return sales
+
+
+# ============================================================================
+# COMMON OPERATIONS
+# ============================================================================
+
+
+def wait_for_job(
+    client: BatchSparkClient,
+    app_name: str,
+    timeout: int = 300,
+    polling_interval: int = 5,
+) -> ApplicationState:
+    """Wait for a Spark job to complete with proper error handling.
+
+    Args:
+        client: BatchSparkClient instance
+        app_name: Application name
+        timeout: Maximum time to wait in seconds
+        polling_interval: Polling interval in seconds
+
+    Returns:
+        Final ApplicationState
+
+    Raises:
+        TimeoutError: If job doesn't complete within timeout
+        RuntimeError: If job fails
+    """
+    logger.info(f"Waiting for job '{app_name}' to complete (timeout: {timeout}s)...")
+
+    try:
+        status = client.wait_for_job_status(
+            submission_id=app_name,
+            timeout=timeout,
+            polling_interval=polling_interval,
+        )
+
+        if status.state == ApplicationState.COMPLETED:
+            logger.info(f"Job '{app_name}' completed successfully")
+        elif status.state == ApplicationState.FAILED:
+            logger.error(f"Job '{app_name}' failed")
+            raise RuntimeError(f"Job failed with state: {status.state.value}")
+        else:
+            logger.warning(f"Job '{app_name}' ended with unexpected state: {status.state.value}")
+
+        return status.state
+
+    except TimeoutError:
+        logger.error(f"Job '{app_name}' timed out after {timeout}s")
+        raise
+    except Exception as e:
+        logger.error(f"Error waiting for job '{app_name}': {e}")
+        raise
+
+
+def print_job_status(client: BatchSparkClient, app_name: str):
+    """Print current job status in a formatted way.
+
+    Args:
+        client: BatchSparkClient instance
+        app_name: Application name
+    """
+    try:
+        status = client.get_job(app_name)
+
+        print("\n" + "=" * 60)
+        print(f"JOB STATUS: {app_name}")
+        print("=" * 60)
+        print(f"State: {status.state.value}")
+        if status.app_id:
+            print(f"App ID: {status.app_id}")
+        if status.submission_time:
+            print(f"Submitted: {status.submission_time}")
+        if status.start_time:
+            print(f"Started: {status.start_time}")
+        if status.completion_time:
+            print(f"Completed: {status.completion_time}")
+        print("=" * 60)
+        print()
+
+    except Exception as e:
+        logger.error(f"Error getting status for '{app_name}': {e}")
+
+
+def cleanup_job(client: BatchSparkClient, app_name: str):
+    """Clean up a Spark application with proper error handling.
+ + Args: + client: BatchSparkClient instance + app_name: Application name + """ + try: + client.delete_job(app_name) + logger.info(f"Successfully deleted application '{app_name}'") + except Exception as e: + logger.warning(f"Failed to delete application '{app_name}': {e}") + logger.warning(f"You can manually delete with: kubectl delete sparkapplication {app_name}") + + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + + +def format_bytes(bytes_value: int) -> str: + """Format bytes into human-readable string. + + Args: + bytes_value: Number of bytes + + Returns: + Formatted string (e.g., "1.5 GB") + """ + for unit in ["B", "KB", "MB", "GB", "TB"]: + if bytes_value < 1024.0: + return f"{bytes_value:.2f} {unit}" + bytes_value /= 1024.0 + return f"{bytes_value:.2f} PB" + + +def format_duration(seconds: int) -> str: + """Format duration in seconds to human-readable string. + + Args: + seconds: Duration in seconds + + Returns: + Formatted string (e.g., "2h 30m 15s") + """ + hours, remainder = divmod(seconds, 3600) + minutes, seconds = divmod(remainder, 60) + + parts = [] + if hours > 0: + parts.append(f"{int(hours)}h") + if minutes > 0: + parts.append(f"{int(minutes)}m") + if seconds > 0 or not parts: + parts.append(f"{int(seconds)}s") + + return " ".join(parts) + + +def get_sample_spark_conf_for_use_case(use_case: str) -> dict[str, str]: + """Get recommended Spark configuration for common use cases. + + Args: + use_case: One of 'etl', 'ml', 'streaming', 'interactive' + + Returns: + Dictionary of recommended Spark configuration + """ + base_conf = get_spark_conf_defaults() + + use_case_configs = { + "etl": { + "spark.sql.adaptive.enabled": "true", + "spark.sql.adaptive.coalescePartitions.enabled": "true", + "spark.sql.shuffle.partitions": "200", + }, + "ml": { + # Some ML libs prefer this off + "spark.sql.adaptive.enabled": "false", + "spark.serializer": "org.apache.spark.serializer.KryoSerializer", + "spark.kryoserializer.buffer.max": "512m", + }, + "streaming": { + "spark.streaming.backpressure.enabled": "true", + "spark.streaming.receiver.maxRate": "10000", + }, + "interactive": { + "spark.sql.adaptive.enabled": "true", + "spark.ui.enabled": "true", + "spark.eventLog.enabled": "false", + }, + } + + if use_case in use_case_configs: + base_conf.update(use_case_configs[use_case]) + else: + logger.warning(f"Unknown use case '{use_case}', using defaults") + + return base_conf + + +# ============================================================================ +# EXAMPLE METADATA +# ============================================================================ + +EXAMPLES_METADATA = { + "01_hello_spark_pi": { + "title": "Hello Spark - Calculate Pi", + "level": 1, + "category": "Getting Started", + "time": "2-3 minutes", + "description": "Your first Spark job - calculate Pi using Monte Carlo method", + }, + "02_csv_data_analysis": { + "title": "CSV Data Analysis", + "level": 1, + "category": "Data Analysis Basics", + "time": "2-3 minutes", + "description": "Analyze CSV data with filtering and aggregations", + }, + "03_interactive_dataframe_exploration": { + "title": "Interactive DataFrame Exploration", + "level": 1, + "category": "Data Exploration", + "time": "3-4 minutes", + "description": "Exploratory data analysis patterns and data quality checks", + }, + "04_etl_pipeline_simple": { + "title": "Simple ETL Pipeline", + "level": 2, + "category": "Data Engineering", + 
"time": "3-4 minutes", + "description": "Extract-Transform-Load pipeline with data validation", + }, + "05_scheduled_batch_job": { + "title": "Scheduled Batch Job", + "level": 2, + "category": "Batch Processing", + "time": "3-4 minutes", + "description": "Production batch job with incremental processing and resilience", + }, + "06_autoscaling_dynamic_allocation": { + "title": "Dynamic Allocation", + "level": 2, + "category": "Auto-scaling", + "time": "4-5 minutes", + "description": "Automatic executor scaling based on workload", + }, +} + + +def print_examples_catalog(): + """Print a catalog of all available examples.""" + print("\n" + "=" * 80) + print("KUBEFLOW SPARK SDK - EXAMPLES CATALOG") + print("=" * 80) + print() + + # Group by level + by_level = {} + for name, metadata in EXAMPLES_METADATA.items(): + level = metadata["level"] + if level not in by_level: + by_level[level] = [] + by_level[level].append((name, metadata)) + + level_names = { + 1: "Level 1: Getting Started", + 2: "Level 2: Data Engineering Basics", + } + + for level in sorted(by_level.keys()): + print(f"\n{level_names.get(level, f'Level {level}')}") + print("-" * 80) + + for name, metadata in sorted(by_level[level], key=lambda x: x[0]): + print(f"\n{name}.py") + print(f" {metadata['title']}") + print(f" Category: {metadata['category']}") + print(f" Time: {metadata['time']}") + print(f" {metadata['description']}") + + print("\n" + "=" * 80) + print() + + +if __name__ == "__main__": + # Print examples catalog when run directly + print_examples_catalog() diff --git a/examples/spark/ipython_ex.py b/examples/spark/ipython_ex.py new file mode 100644 index 000000000..7a3d7d0b0 --- /dev/null +++ b/examples/spark/ipython_ex.py @@ -0,0 +1,85 @@ +# ./ipython_shell.py +# then run /paste this python code . 
+
+import os
+
+from kubeflow.spark import BatchSparkClient, OperatorBackendConfig
+
+config = OperatorBackendConfig(
+    namespace=os.getenv("SPARK_NAMESPACE", "default"),
+    service_account="spark-operator-spark",
+    default_spark_image="docker.io/library/spark",
+    enable_monitoring=False,
+    enable_ui=False,
+    context=os.getenv("KUBE_CONTEXT", "kind-spark-test"),  # Explicitly set context
+)
+client = BatchSparkClient(backend_config=config)
+
+app_name = "test-spark-pi"
+
+response = client.submit_application(
+    app_name=app_name,
+    main_application_file="local:///opt/spark/examples/jars/spark-examples_2.13-4.0.0.jar",
+    main_class="org.apache.spark.examples.SparkPi",
+    spark_version="4.0.0",
+    app_type="Scala",
+    driver_cores=1,
+    driver_memory="512m",
+    executor_cores=1,
+    executor_memory="512m",
+    num_executors=1,
+    arguments=["100"],
+    spark_conf={
+        "spark.kubernetes.file.upload.path": "/tmp",  # Required for Spark 4.0
+    },
+)
+client.get_job(app_name)
+
+
+response = client.submit_application(
+    app_name="my-python-pi5",
+    main_application_file="local:///opt/spark/examples/src/main/python/pi.py",
+    spark_version="4.0.0",
+    app_type="Python",
+    driver_cores=1,
+    driver_memory="512m",
+    executor_cores=1,
+    executor_memory="512m",
+    num_executors=2,
+    arguments=["100"],
+    spark_conf={"spark.kubernetes.file.upload.path": "/tmp"},
+)
+final_status = client.wait_for_job_status("my-python-pi5", timeout=300)
+client.get_job("my-python-pi5")
+logs = list(client.get_job_logs("my-python-pi5"))
+for line in logs:
+    if "Pi is roughly" in line:
+        print(f"RESULT: {line}")
+
+
+response = client.submit_application(
+    app_name="my-python-pi6",
+    main_application_file="local:///opt/spark/examples/src/main/python/pi.py",
+    spark_version="4.0.0",
+    app_type="Python",
+    driver_cores=1,
+    driver_memory="512m",
+    executor_cores=1,
+    executor_memory="512m",
+    num_executors=2,
+    arguments=["100"],
+    spark_conf={
+        "spark.kubernetes.file.upload.path": "/tmp",
+        "spark.eventLog.enabled": "true",
+        "spark.eventLog.dir": "/tmp/spark-events",
+    },
+    volumes=[{"name": "spark-events", "persistentVolumeClaim": {"claimName": "spark-history-pvc"}}],
+    driver_volume_mounts=[{"name": "spark-events", "mountPath": "/tmp/spark-events"}],
+    executor_volume_mounts=[{"name": "spark-events", "mountPath": "/tmp/spark-events"}],
+)
+final_status = client.wait_for_job_status("my-python-pi6", timeout=300)
+client.get_job("my-python-pi6")
+logs = list(client.get_job_logs("my-python-pi6"))
+for line in logs:
+    if "Pi is roughly" in line:
+        print(f"RESULT: {line}")
diff --git a/examples/spark/ipython_shell.py b/examples/spark/ipython_shell.py
new file mode 100755
index 000000000..8a6975116
--- /dev/null
+++ b/examples/spark/ipython_shell.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""
+Launch IPython shell with Kubeflow SDK in dev mode.
+Usage: ./ipython_shell.py
+
+Requires IPython: pip install ipython
+"""
+
+import os
+import sys
+
+# Add SDK to path
+sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+sys.path.insert(0, sdk_path)
+
+# Pre-import the common modules advertised in the banner below
+# (assumes these names are all exported by kubeflow.spark)
+from kubeflow.spark import (  # noqa: E402,F401
+    ApplicationState,
+    ApplicationStatus,
+    BatchSparkClient,
+    GatewayBackendConfig,
+    OperatorBackendConfig,
+    SparkApplicationResponse,
+)
+
+# Print welcome message
+banner = f"""
+{"=" * 80}
+Kubeflow Spark Client - IPython Development Shell
+{"=" * 80}
+
+SDK Path: {sdk_path}
+
+Pre-imported:
+  BatchSparkClient, OperatorBackendConfig, GatewayBackendConfig,
+  ApplicationState, ApplicationStatus, SparkApplicationResponse
+
+Quick Examples:
+  config = OperatorBackendConfig(namespace="default")
+  client = BatchSparkClient(backend_config=config)
+
+Tab completion and syntax highlighting enabled!
+{"=" * 80} +""" + +try: + import IPython + + IPython.embed(banner1=banner, colors="Linux") +except ImportError: + print("IPython not installed. Install with: pip install ipython") + print("Falling back to regular Python shell...\n") + import code + + print(banner) + code.interact(local=locals()) diff --git a/examples/spark/ipython_spark_connect_demo.py b/examples/spark/ipython_spark_connect_demo.py new file mode 100755 index 000000000..273f6d35e --- /dev/null +++ b/examples/spark/ipython_spark_connect_demo.py @@ -0,0 +1,388 @@ +#!/usr/bin/env python3 +""" +IPython Demo Script for Spark Connect Integration + +This script demonstrates connecting to a Spark Connect server running in Kubernetes +and performing interactive DataFrame operations like groupBy, aggregations, etc. + +Prerequisites: +1. Kubernetes cluster with Spark Connect server deployed +2. PySpark with Connect support: pip install 'pyspark[connect]>=3.4.0' +3. Kubeflow SDK installed + +Usage: + python ipython_spark_connect_demo.py + +Or in IPython: + %run ipython_spark_connect_demo.py +""" + +import os +import sys + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path.insert(0, sdk_path) + + +def print_section(title): + """Print formatted section header.""" + print("\n" + "=" * 80) + print(f" {title}") + print("=" * 80 + "\n") + + +def demo_basic_connection(): + """Demonstrate basic connection to Spark Connect server.""" + print_section("1. Connect to Spark Connect Server") + + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + # Configuration for Kubernetes Spark Connect server + # The server is exposed via NodePort on port 30000 + config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, + timeout=60, + ) + + print(f"Connecting to: {config.connect_url}") + + # Create client + client = SparkSessionClient(backend_config=config) + print("āœ“ SparkSessionClient created") + + return client + + +def demo_create_session(client): + """Demonstrate creating a Spark session.""" + print_section("2. Create Spark Session") + + session = client.create_session(app_name="ipython-demo") + print(f"āœ“ Session created: {session.session_id}") + print(f" App name: {session.app_name}") + print(f" Closed: {session.is_closed}") + + return session + + +def demo_simple_sql(session): + """Demonstrate simple SQL queries.""" + print_section("3. Simple SQL Query") + + df = session.sql("SELECT 1 AS id, 'Hello Spark Connect' AS message") + print("Query: SELECT 1 AS id, 'Hello Spark Connect' AS message") + print("\nResult:") + df.show() + + result = df.collect() + print(f"\nCollected: {result[0].message}") + + return df + + +def demo_create_dataframe(session): + """Demonstrate creating DataFrames from Python data.""" + print_section("4. 
Create DataFrame from Python Data") + + # Sample sales data + sales_data = [ + (1, "Electronics", "Laptop", 1200.00, 2, "2024-01-15"), + (2, "Electronics", "Mouse", 25.00, 5, "2024-01-15"), + (3, "Clothing", "Shirt", 35.00, 3, "2024-01-16"), + (4, "Electronics", "Keyboard", 75.00, 4, "2024-01-16"), + (5, "Clothing", "Pants", 55.00, 2, "2024-01-17"), + (6, "Electronics", "Monitor", 300.00, 3, "2024-01-17"), + (7, "Clothing", "Jacket", 120.00, 1, "2024-01-18"), + (8, "Electronics", "Mouse", 25.00, 10, "2024-01-18"), + (9, "Clothing", "Shirt", 35.00, 5, "2024-01-19"), + (10, "Electronics", "Laptop", 1200.00, 1, "2024-01-19"), + ] + + schema = ["id", "category", "product", "price", "quantity", "date"] + + df = session.createDataFrame(sales_data, schema) + print(f"āœ“ DataFrame created with {df.count()} rows") + print("\nSample data:") + df.show(5) + + return df + + +def demo_dataframe_operations(session, df): + """Demonstrate DataFrame transformations.""" + print_section("5. DataFrame Operations - Filter & Select") + + # Filter expensive items + expensive = df.filter(df.price > 100) + print("Filter: price > 100") + expensive.show() + + # Select specific columns + print("\nSelect: category, product, price") + df.select("category", "product", "price").show(5) + + return expensive + + +def demo_groupby_aggregations(session, df): + """Demonstrate groupBy and aggregations.""" + print_section("6. GroupBy and Aggregations") + + # Group by category and calculate statistics + print("Aggregation: Total revenue by category") + from pyspark.sql import functions as F # noqa: N812 + + revenue_df = df.withColumn("revenue", F.col("price") * F.col("quantity")) + + category_stats = revenue_df.groupBy("category").agg( + F.sum("revenue").alias("total_revenue"), + F.avg("price").alias("avg_price"), + F.sum("quantity").alias("total_quantity"), + F.count("*").alias("num_transactions"), + ) + + print("\nRevenue by Category:") + category_stats.show() + + # Group by product and sort + print("\nTop Products by Revenue:") + product_revenue = revenue_df.groupBy("product").agg( + F.sum("revenue").alias("total_revenue"), + F.sum("quantity").alias("total_sold"), + ) + + product_revenue.orderBy(F.desc("total_revenue")).show(5) + + return category_stats + + +def demo_advanced_aggregations(session, df): + """Demonstrate advanced aggregations and window functions.""" + print_section("7. Advanced Aggregations") + + from pyspark.sql import functions as F # noqa: N812 + from pyspark.sql.window import Window + + # Add computed column + df_with_revenue = df.withColumn("revenue", F.col("price") * F.col("quantity")) + + # Window function: Running total by date + print("Running Total Revenue by Date:") + window_spec = Window.orderBy("date").rowsBetween(Window.unboundedPreceding, Window.currentRow) + + daily_revenue = ( + df_with_revenue.groupBy("date") + .agg(F.sum("revenue").alias("daily_revenue")) + .withColumn("running_total", F.sum("daily_revenue").over(window_spec)) + ) + + daily_revenue.orderBy("date").show() + + # Pivot: Revenue by category and date + print("\nPivot: Revenue by Category and Date:") + pivot_df = ( + df_with_revenue.groupBy("date").pivot("category").agg(F.sum("revenue").alias("revenue")) + ) + + pivot_df.orderBy("date").show() + + return daily_revenue + + +def demo_session_metrics(session): + """Demonstrate session metrics tracking.""" + print_section("8. 
Session Metrics") + + metrics = session.get_metrics() + print(f"Session ID: {metrics.session_id}") + print(f"Queries Executed: {metrics.queries_executed}") + print(f"Active Queries: {metrics.active_queries}") + print(f"Artifacts Uploaded: {metrics.artifacts_uploaded}") + + info = session.get_info() + print(f"\nSession State: {info.state}") + print(f"App Name: {info.app_name}") + + +def demo_multiple_operations(session): + """Demonstrate chaining multiple operations.""" + print_section("9. Chained Operations") + + # Create sample employee data + employees = [ + (1, "Alice", "Engineering", 95000, 28), + (2, "Bob", "Engineering", 120000, 35), + (3, "Carol", "Sales", 80000, 42), + (4, "David", "Engineering", 110000, 30), + (5, "Eve", "Sales", 90000, 38), + (6, "Frank", "Marketing", 85000, 45), + (7, "Grace", "Engineering", 105000, 29), + (8, "Henry", "Marketing", 88000, 33), + ] + + df = session.createDataFrame(employees, ["id", "name", "dept", "salary", "age"]) + + print("Original Data:") + df.show() + + # Chain multiple operations + from pyspark.sql import functions as F # noqa: N812 + + result = ( + df.filter(F.col("age") < 40) + .groupBy("dept") + .agg(F.avg("salary").alias("avg_salary"), F.count("*").alias("count")) + .filter(F.col("count") >= 2) + .orderBy(F.desc("avg_salary")) + ) + + print("\nFiltered Analysis (age < 40, departments with 2+ people):") + result.show() + + return result + + +def run_complete_demo(): + """Run complete demonstration.""" + print("\n" + "=" * 80) + print(" Kubeflow Spark Connect - Interactive Demo") + print(" Connecting to Kubernetes Spark Connect Server") + print("=" * 80) + + try: + # Step 1: Connect + client = demo_basic_connection() + + # Step 2: Create session + session = demo_create_session(client) + + # Step 3: Simple SQL + demo_simple_sql(session) + + # Step 4: Create DataFrame + df = demo_create_dataframe(session) + + # Step 5: Basic operations + demo_dataframe_operations(session, df) + + # Step 6: GroupBy aggregations + demo_groupby_aggregations(session, df) + + # Step 7: Advanced aggregations + demo_advanced_aggregations(session, df) + + # Step 8: Session metrics + demo_session_metrics(session) + + # Step 9: Chained operations + demo_multiple_operations(session) + + print_section("Demo Complete") + print("āœ“ All operations completed successfully!") + print("\nTo continue experimenting:") + print(" - session object is available for more queries") + print(" - Try: session.sql('SELECT * FROM ...')") + print(" - Try: session.createDataFrame(...)") + print(" - Remember to call: session.close() when done") + + return client, session + + except Exception as e: + print(f"\nāœ— Error: {e}") + import traceback + + traceback.print_exc() + print("\nTroubleshooting:") + print(" 1. Is Kubernetes cluster running? (kubectl get nodes)") + print(" 2. Is Spark Connect deployed? (kubectl get pods -l app=spark-connect)") + print( + " 3. Is port forwarding active? (kubectl port-forward svc/spark-connect 30000:15002)" + ) + print(" 4. Is PySpark installed? 
(pip install 'pyspark[connect]>=3.4.0')") + return None, None + + +# Manual step-by-step execution helper +def print_manual_steps(): + """Print manual steps for running in IPython.""" + print("\n" + "=" * 80) + print(" Manual Step-by-Step Execution in IPython") + print("=" * 80) + print(""" +# Step 1: Import and configure +from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + +config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, + timeout=60 +) + +# Step 2: Create client and session +client = SparkSessionClient(backend_config=config) +session = client.create_session(app_name="my-analysis") + +# Step 3: Create sample data +sales_data = [ + (1, "Electronics", "Laptop", 1200.00, 2), + (2, "Electronics", "Mouse", 25.00, 5), + (3, "Clothing", "Shirt", 35.00, 3), + (4, "Electronics", "Keyboard", 75.00, 4), +] +df = session.createDataFrame(sales_data, ["id", "category", "product", "price", "quantity"]) + +# Step 4: View data +df.show() + +# Step 5: Run aggregations +from pyspark.sql import functions as F +revenue_df = df.withColumn("revenue", F.col("price") * F.col("quantity")) +revenue_df.groupBy("category").agg(F.sum("revenue").alias("total")).show() + +# Step 6: Clean up +session.close() +client.close() +""") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Spark Connect Interactive Demo") + parser.add_argument( + "--manual", + action="store_true", + help="Print manual steps instead of running automated demo", + ) + args = parser.parse_args() + + if args.manual: + print_manual_steps() + else: + client, session = run_complete_demo() + + # Keep objects available for interactive use + if client and session: + print("\nObjects available for continued use:") + print(" - client: SparkSessionClient instance") + print(" - session: ManagedSparkSession instance") + print("\nEntering interactive mode... (Ctrl+D to exit)") + + try: + import IPython + + IPython.embed() + except ImportError: + print("\nIPython not installed. Install with: pip install ipython") + print("Keeping session open for manual cleanup...") + input("\nPress Enter to close session and exit...") + + if session and not session.is_closed: + session.close() + print("āœ“ Session closed") + + if client: + client.close() + print("āœ“ Client closed") diff --git a/examples/spark/ipython_spark_connect_shell.py b/examples/spark/ipython_spark_connect_shell.py new file mode 100755 index 000000000..fe2fc9954 --- /dev/null +++ b/examples/spark/ipython_spark_connect_shell.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +""" +IPython Shell Launcher for Spark Connect Demo + +This script launches an IPython shell with the Kubeflow SDK pre-imported +and prints step-by-step instructions for testing Spark Connect. + +Usage: python ipython_spark_connect_shell.py +""" + +import os +import sys + +# Add SDK to path +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path.insert(0, sdk_path) + +# Pre-import for convenience + +# Banner with instructions +banner = f""" +{"=" * 80} +Kubeflow Spark Connect - Interactive IPython Shell +{"=" * 80} + +SDK Path: {sdk_path} +Spark Connect URL: sc://localhost:30000 + +Pre-imported modules: + - ConnectBackendConfig + - SparkSessionClient + +{"=" * 80} +Step-by-Step Guide +{"=" * 80} + +1. Create Configuration: + config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, + timeout=60 + ) + +2. 
Create Client and Session: + client = SparkSessionClient(backend_config=config) + session = client.create_session(app_name="my-demo") + +3. Run Simple SQL: + df = session.sql("SELECT 1 AS id, 'Hello Spark Connect' AS message") + df.show() + +4. Create DataFrame from Python Data: + sales_data = [ + (1, "Electronics", "Laptop", 1200.00, 2), + (2, "Electronics", "Mouse", 25.00, 5), + (3, "Clothing", "Shirt", 35.00, 3), + (4, "Electronics", "Keyboard", 75.00, 4), + (5, "Clothing", "Pants", 55.00, 2), + ] + df = session.createDataFrame( + sales_data, + ["id", "category", "product", "price", "quantity"] + ) + df.show() + +5. Filter and Select: + expensive = df.filter(df.price > 50) + expensive.show() + + df.select("category", "product", "price").show() + +6. GroupBy Aggregations: + from pyspark.sql import functions as F + + revenue_df = df.withColumn("revenue", F.col("price") * F.col("quantity")) + + category_stats = revenue_df.groupBy("category").agg( + F.sum("revenue").alias("total_revenue"), + F.avg("price").alias("avg_price"), + F.count("*").alias("num_transactions") + ) + category_stats.show() + +7. Order Results: + category_stats.orderBy(F.desc("total_revenue")).show() + +8. Session Metrics: + metrics = session.get_metrics() + print(f"Queries executed: {{metrics.queries_executed}}") + + info = session.get_info() + print(f"Session state: {{info.state}}") + +9. Clean Up (when done): + session.close() + client.close() + +{"=" * 80} +Ready! Start by copying and pasting the commands above. +{"=" * 80} +""" + +if __name__ == "__main__": + try: + import IPython + + IPython.embed(banner1=banner, colors="Linux") + except ImportError: + print("IPython not installed. Install with: pip install ipython") + print("Falling back to regular Python shell...\n") + import code + + print(banner) + code.interact(local=locals()) diff --git a/examples/spark/minio_config.py b/examples/spark/minio_config.py new file mode 100644 index 000000000..278946eb9 --- /dev/null +++ b/examples/spark/minio_config.py @@ -0,0 +1,132 @@ +"""MinIO S3 Configuration Helper for Spark Examples. + +This module provides utilities for configuring Spark to work with MinIO +(S3-compatible storage) running in the same Kubernetes cluster. + +Usage: + from minio_config import get_s3_spark_conf, S3_ENDPOINT + + spark_conf = get_s3_spark_conf() + response = client.submit_application( + app_name="my-app", + main_application_file="s3a://spark-scripts/my_script.py", + spark_conf=spark_conf, + ... + ) +""" + +import os + +# MinIO Configuration (deployed via setup_minio.sh) +MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "minioadmin") +MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "minioadmin") +MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "minio-service.default.svc.cluster.local:9000") + +# S3 endpoint for Spark (use http:// for internal cluster access) +S3_ENDPOINT = f"http://{MINIO_ENDPOINT}" + +# Buckets +SCRIPTS_BUCKET = "spark-scripts" +DATA_BUCKET = "spark-data" +OUTPUT_BUCKET = "spark-output" + + +def get_s3_spark_conf(additional_conf=None, enable_history=False): + """Get Spark configuration for S3/MinIO access. 
+ + Args: + additional_conf: Optional dict of additional Spark configs + enable_history: If True, enable event logging for Spark History Server + + Returns: + Dict of Spark configuration properties for S3 access + """ + conf = { + # Required for Spark 4.0 + "spark.kubernetes.file.upload.path": "/tmp", + # Download Hadoop AWS libraries at runtime (includes S3A filesystem) + # Compatible with Spark 4.0.0 and Hadoop 3.4.0 + "spark.jars.packages": ( + "org.apache.hadoop:hadoop-aws:3.4.0," + "com.amazonaws:aws-java-sdk-bundle:1.12.262" + ), + # Ivy cache location - use /tmp which is always writable + # Fixes: java.io.FileNotFoundException: /home/spark/.ivy2.5.2/cache/... + "spark.jars.ivy": "/tmp/.ivy2", + # S3A Configuration for MinIO + "spark.hadoop.fs.s3a.endpoint": S3_ENDPOINT, + "spark.hadoop.fs.s3a.access.key": MINIO_ACCESS_KEY, + "spark.hadoop.fs.s3a.secret.key": MINIO_SECRET_KEY, + "spark.hadoop.fs.s3a.path.style.access": "true", # Required for MinIO + "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem", + "spark.hadoop.fs.s3a.connection.ssl.enabled": "false", # HTTP for internal + # Performance tuning + "spark.hadoop.fs.s3a.fast.upload": "true", + "spark.hadoop.fs.s3a.block.size": "128M", + "spark.hadoop.fs.s3a.multipart.size": "104857600", # 100MB + # Connection settings + "spark.hadoop.fs.s3a.connection.maximum": "100", + "spark.hadoop.fs.s3a.threads.max": "20", + "spark.hadoop.fs.s3a.connection.timeout": "200000", + "spark.hadoop.fs.s3a.attempts.maximum": "3", + } + + # Add event logging for History Server + if enable_history: + conf.update( + { + "spark.eventLog.enabled": "true", + "spark.eventLog.dir": "file:///mnt/spark-events", + "spark.eventLog.compress": "true", + } + ) + + # Merge additional configuration + if additional_conf: + conf.update(additional_conf) + + return conf + + +def get_s3_path(bucket, key): + """Build S3 path for MinIO. 
+ + Args: + bucket: Bucket name (e.g., 'spark-scripts') + key: Object key (e.g., 'exploration.py') + + Returns: + S3 URL (e.g., 's3a://spark-scripts/exploration.py') + """ + return f"s3a://{bucket}/{key}" + + +# Common S3 paths for examples +S3_PATHS = { + "exploration_script": get_s3_path(SCRIPTS_BUCKET, "exploration.py"), + "csv_analysis_script": get_s3_path(SCRIPTS_BUCKET, "csv_analysis.py"), + "etl_script": get_s3_path(SCRIPTS_BUCKET, "etl_pipeline.py"), + "batch_job_script": get_s3_path(SCRIPTS_BUCKET, "batch_job.py"), + "data_dir": f"s3a://{DATA_BUCKET}/", + "output_dir": f"s3a://{OUTPUT_BUCKET}/", +} + + +def print_minio_info(): + """Print MinIO configuration information.""" + print("MinIO S3 Configuration:") + print(f" Endpoint: {S3_ENDPOINT}") + print(f" Access Key: {MINIO_ACCESS_KEY}") + print(" Buckets:") + print(f" - {SCRIPTS_BUCKET}/ - Application scripts") + print(f" - {DATA_BUCKET}/ - Input data") + print(f" - {OUTPUT_BUCKET}/ - Output results") + print() + + +if __name__ == "__main__": + # Print configuration when run directly + print_minio_info() + print("Available S3 Paths:") + for name, path in S3_PATHS.items(): + print(f" {name}: {path}") diff --git a/examples/spark/open_spark_ui.sh b/examples/spark/open_spark_ui.sh new file mode 100755 index 000000000..92cd6f760 --- /dev/null +++ b/examples/spark/open_spark_ui.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# +# Open Spark UI for a running application +# + +APP_NAME=$1 +NAMESPACE=${2:-default} +PORT=${3:-4040} + +if [ -z "$APP_NAME" ]; then + echo "Usage: $0 [namespace] [port]" + echo "" + echo "Examples:" + echo " $0 test-spark-pi" + echo " $0 test-spark-pi default 4040" + echo "" + exit 1 +fi + +echo "==========================================" +echo "Spark UI Access" +echo "==========================================" +echo "Application: $APP_NAME" +echo "Namespace: $NAMESPACE" +echo "Local Port: $PORT" +echo "" + +# Check if driver pod exists +POD_STATUS=$(kubectl get pod $APP_NAME-driver -n $NAMESPACE -o jsonpath='{.status.phase}' 2>/dev/null) + +if [ -z "$POD_STATUS" ]; then + echo "āœ— Driver pod not found: $APP_NAME-driver" + echo "" + echo "Check if application exists:" + echo " kubectl get sparkapplication -n $NAMESPACE" + exit 1 +fi + +echo "Driver Pod Status: $POD_STATUS" + +if [ "$POD_STATUS" != "Running" ]; then + echo "" + echo "āš ļø Warning: Driver pod is not in Running state" + echo " Spark UI may not be accessible" + echo "" +fi + +echo "" +echo "Starting port-forward..." 
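+# The driver pod serves the Spark UI on container port 4040; the kubectl
+# port-forward at the end of this script maps local port $PORT (default 4040)
+# to it, e.g.:
+#   ./open_spark_ui.sh test-spark-pi default 4040   # then open http://localhost:4040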
+echo "==========================================" +echo "" +echo "āœ“ Spark UI will be available at:" +echo "" +echo " http://localhost:$PORT" +echo "" +echo "==========================================" +echo "" +echo "In the Spark UI you can view:" +echo " • Jobs Tab - See all Spark jobs and their status" +echo " • Stages Tab - View DAG visualization and task details" +echo " • Storage Tab - Check cached RDDs/DataFrames" +echo " • Environment - View Spark configuration" +echo " • Executors Tab - Monitor executor resources and tasks" +echo " • SQL Tab - See DataFrame/SQL query execution plans" +echo "" +echo "Press Ctrl+C to stop port forwarding" +echo "==========================================" +echo "" + +# Start port forwarding +kubectl port-forward $APP_NAME-driver $PORT:4040 -n $NAMESPACE diff --git a/examples/spark/quick_ui_test.sh b/examples/spark/quick_ui_test.sh new file mode 100644 index 000000000..69ce202a3 --- /dev/null +++ b/examples/spark/quick_ui_test.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Quick test to access Spark UI using driver pod port-forward + +echo "==========================================" +echo "Quick Spark UI Access Test" +echo "==========================================" +echo "" + +# Run a simple test app +echo "Step 1: Submitting test application..." +python test_ui_minimal.py & +PYTHON_PID=$! + +# Wait for submission +sleep 15 + +# Find the app name +APP_NAME=$(kubectl get sparkapplications -o name | grep "test-ui" | head -1 | cut -d/ -f2) + +if [ -z "$APP_NAME" ]; then + echo "āŒ No test application found" + echo " Run manually: python test_ui_minimal.py" + exit 1 +fi + +echo "āœ“ Application found: $APP_NAME" +echo "" + +# Wait for driver pod +echo "Step 2: Waiting for driver pod to be ready..." +DRIVER_POD="${APP_NAME}-driver" +kubectl wait --for=condition=ready pod/$DRIVER_POD --timeout=60s 2>/dev/null || { + echo " Still waiting for pod..." + sleep 10 +} + +# Check if pod exists +POD_STATUS=$(kubectl get pod $DRIVER_POD -o jsonpath='{.status.phase}' 2>/dev/null) + +if [ "$POD_STATUS" != "Running" ]; then + echo "āš ļø Driver pod not Running yet (status: $POD_STATUS)" + echo "" + echo "Monitor with:" + echo " kubectl get pods -w | grep $APP_NAME" + echo "" + echo "Once Running, use:" + echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" + exit 0 +fi + +echo "āœ“ Driver pod is Running" +echo "" + +# Check if service exists +echo "Step 3: Checking for UI service..." +if kubectl get svc ${APP_NAME}-ui-svc 2>/dev/null; then + echo "āœ… UI service exists! (Operator created it successfully)" + echo "" + echo "Access UI using service:" + echo " kubectl port-forward svc/${APP_NAME}-ui-svc 4040:4040" +else + echo "āŒ UI service does NOT exist (as expected with v2.0.2-rc.0)" +fi +echo "" + +# Port-forward to driver pod +echo "Step 4: Setting up port-forward to driver pod..." +echo "" +echo "==========================================" +echo "🌐 SPARK UI ACCESS" +echo "==========================================" +echo "" +echo "Run this command in another terminal:" +echo "" +echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" +echo "" +echo "Then open in your browser:" +echo " http://localhost:4040" +echo "" +echo "==========================================" +echo "" + +# Offer to start port-forward +read -p "Start port-forward now? (y/n) " -n 1 -r +echo +if [[ $REPLY =~ ^[Yy]$ ]]; then + echo "Starting port-forward..." 
+ echo "Press Ctrl+C to stop" + kubectl port-forward pod/$DRIVER_POD 4040:4040 +else + echo "" + echo "Manual commands:" + echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" + echo " # Then open: http://localhost:4040" +fi diff --git a/examples/spark/requirements-core.txt b/examples/spark/requirements-core.txt new file mode 100644 index 000000000..4060438a6 --- /dev/null +++ b/examples/spark/requirements-core.txt @@ -0,0 +1,8 @@ +# Kubeflow Spark Client - Core Requirements Only +# ================================================ +# +# Minimal dependencies for running Spark client with OperatorBackend +# Install with: pip install -r requirements-core.txt + +kubernetes>=27.2.0,<30.0.0 +pydantic>=2.10.0,<3.0.0 diff --git a/examples/spark/requirements-dev.txt b/examples/spark/requirements-dev.txt new file mode 100644 index 000000000..c9f4cc478 --- /dev/null +++ b/examples/spark/requirements-dev.txt @@ -0,0 +1,34 @@ +# Kubeflow Spark Client - Development Requirements +# ================================================== +# +# Full development dependencies including testing and linting +# Install with: pip install -r requirements-dev.txt + +# Include core requirements +-r requirements-core.txt + +# Gateway backend support +requests>=2.31.0,<3.0.0 +pyyaml>=6.0,<7.0 + +# Testing +pytest>=7.0,<9.0 +pytest-mock>=3.10,<4.0 +pytest-cov>=4.1.0,<6.0 +coverage>=7.0,<8.0 + +# Code quality +ruff>=0.12.2,<1.0 +black>=24.0.0,<25.0 +isort>=5.13.0,<6.0 +mypy>=1.8.0,<2.0 + +# Development tools +ipython>=8.20.0,<9.0 +ipdb>=0.13.13,<1.0 +rich>=13.0.0,<14.0 +python-dotenv>=1.0.0,<2.0 + +# Documentation +sphinx>=7.0.0,<8.0 +sphinx-rtd-theme>=2.0.0,<3.0 diff --git a/examples/spark/requirements.txt b/examples/spark/requirements.txt new file mode 100644 index 000000000..e95e19cf4 --- /dev/null +++ b/examples/spark/requirements.txt @@ -0,0 +1,26 @@ +# Kubeflow Spark Client - Requirements +# ===================================== +# +# Install all dependencies with: +# pip install -r requirements.txt +# +# Or install minimal dependencies: +# pip install -r requirements-core.txt + +# Core dependencies (required for OperatorBackend) +kubernetes>=27.2.0,<30.0.0 +pydantic>=2.10.0,<3.0.0 + +# Optional dependencies for GatewayBackend +requests>=2.31.0,<3.0.0 +pyyaml>=6.0,<7.0 + +# Development and testing dependencies +pytest>=7.0,<9.0 +pytest-mock>=3.10,<4.0 +coverage>=7.0,<8.0 +ruff>=0.12.2,<1.0 + +# Additional useful packages +rich>=13.0.0,<14.0 # Pretty printing +python-dotenv>=1.0.0,<2.0 # Environment variable management diff --git a/examples/spark/run_long_job_ui_validation.py b/examples/spark/run_long_job_ui_validation.py new file mode 100644 index 000000000..431127dd6 --- /dev/null +++ b/examples/spark/run_long_job_ui_validation.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +""" +Long-Running Job for Spark UI Validation + +This example submits a 10-minute Spark job specifically designed to test +and validate Spark UI access. The job performs various operations to showcase +all UI features. 
+ +Prerequisites: +- Kind cluster with Spark Operator (run ./setup_test_environment.sh) +- MinIO deployed (run ./setup_minio.sh) +- Long-running job script uploaded to MinIO + +Time to Run: ~10 minutes + +Usage: + python run_long_job_ui_validation.py [--no-monitor] + +Options: + --no-monitor Skip interactive monitoring (useful when called from scripts) +""" + +from datetime import datetime +import os +import sys +import time + +# Add SDK to path for development mode +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +from kubeflow.spark import ( # noqa: E402 + ApplicationState, + OperatorBackendConfig, + BatchSparkClient, +) + +# Import MinIO configuration +try: + from minio_config import get_s3_spark_conf, print_minio_info +except ImportError: + print("ERROR: minio_config.py not found!") + print("Please ensure you're running from the examples/spark directory") + sys.exit(1) + + +def print_ui_instructions(app_name: str): + """Print detailed UI access instructions.""" + print("=" * 80) + print("🌐 SPARK UI ACCESS INSTRUCTIONS") + print("=" * 80) + print() + print("The job will run for ~10 minutes. Follow these steps to access the UI:") + print() + print("STEP 1: Wait for driver pod to be Running") + print("-" * 80) + print(f" kubectl get pod {app_name}-driver -w") + print() + print(" Wait until STATUS shows 'Running' (may take 1-2 minutes)") + print() + print("STEP 2: Port-forward to driver pod") + print("-" * 80) + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print() + print(" Keep this terminal open!") + print() + print("STEP 3: Open Spark UI in browser") + print("-" * 80) + print(" http://localhost:4040") + print() + print("STEP 4: Explore UI features while job runs") + print("-" * 80) + print(" Jobs tab - See 6 jobs (one per stage)") + print(" Stages tab - Monitor stage progress in real-time") + print(" Storage tab - View cached DataFrame (after Stage 2)") + print(" Executors tab - Check executor metrics and GC") + print(" SQL tab - Inspect DataFrame query plans") + print(" Environment tab - View Spark configuration") + print() + print("=" * 80) + print() + print("šŸ’” TIPS:") + print(" - Job progresses through 5 stages over 10 minutes") + print(" - Each stage pauses briefly - perfect for exploring UI") + print(" - Stage 2 caches data - check Storage tab!") + print(" - Stage 3 does heavy shuffling - watch Executors tab") + print(" - Click on job/stage names for detailed views") + print() + print("=" * 80) + print() + + +def monitor_job_progress(client: BatchSparkClient, app_name: str): + """Monitor and display job progress.""" + print("=" * 80) + print("MONITORING JOB PROGRESS") + print("=" * 80) + print() + print("Checking status every 30 seconds...") + print("Press Ctrl+C to stop monitoring (job will continue running)") + print() + + start_time = time.time() + last_state = None + + try: + while True: + try: + status = client.get_job(app_name) + elapsed = int(time.time() - start_time) + + if status.state != last_state: + timestamp = datetime.now().strftime("%H:%M:%S") + print(f"[{timestamp}] [{elapsed:3d}s] State: {status.state.value}") + last_state = status.state + + if status.state in [ApplicationState.COMPLETED, ApplicationState.FAILED]: + print() + print(f"Job finished with state: {status.state.value}") + return status + + time.sleep(30) + + except Exception as e: + print(f" Warning: Could not get status: {e}") + time.sleep(30) + + except KeyboardInterrupt: + print() + 
print("Stopped monitoring (job still running)") + print(f"Check status with: kubectl get sparkapplication {app_name}") + return None + + +def main(): + """Main example: Submit long-running job for UI validation.""" + + # Check for --no-monitor flag + no_monitor = "--no-monitor" in sys.argv + + print("=" * 80) + print("LONG-RUNNING SPARK JOB FOR UI VALIDATION") + print("=" * 80) + print() + print("This example submits a 10-minute job designed to showcase") + print("all Spark UI features. Perfect for testing UI access!") + print() + print("Job stages:") + print(" Stage 1: Generate 100M rows (~2 min)") + print(" Stage 2: Cache and aggregate (~2 min)") + print(" Stage 3: Shuffle-heavy joins (~3 min)") + print(" Stage 4: Multi-dimensional analysis (~2 min)") + print(" Stage 5: Window functions (~1 min)") + print() + print("Total duration: ~10 minutes") + print() + + # Show MinIO configuration + print_minio_info() + + # Step 1: Create SparkClient with configuration + print("Step 1: Creating Spark client...") + config = OperatorBackendConfig( + namespace=os.getenv("SPARK_NAMESPACE", "default"), + service_account="spark-operator-spark", + default_spark_image="docker.io/library/spark", + context=os.getenv("KUBE_CONTEXT", "kind-spark-test"), + enable_monitoring=False, + enable_ui=True, # Enable Spark UI + ) + client = BatchSparkClient(backend_config=config) + print(" Client created successfully") + print(" Spark UI enabled") + print() + + # Step 2: Prepare the application + timestamp = datetime.now().strftime("%H%M%S") + app_name = f"long-job-{timestamp}" + + # Get S3 path for the long-running job script + script_path = "s3a://spark-scripts/long_running_job.py" + + print("Step 2: Configuring long-running job...") + print(f" App name: {app_name}") + print(" Spark version: 4.0.0") + print(f" Script location: {script_path}") + print(" Duration: ~10 minutes") + print(" Resources: 1 driver + 2 executors (1 CPU, 2g RAM each)") + print() + + # Step 3: Submit the application + print("Step 3: Submitting long-running job...") + print() + + try: + # Get S3-enabled Spark configuration + spark_conf = get_s3_spark_conf() + + response = client.submit_application( + # Application metadata + app_name=app_name, + main_application_file=script_path, + # Spark configuration + spark_version="4.0.0", + app_type="Python", + # Resource allocation (more resources for better performance) + driver_cores=1, + driver_memory="2g", # More memory for large datasets + executor_cores=1, + executor_memory="2g", # More memory for shuffles + num_executors=2, + # Keep job running for debugging + time_to_live_seconds=7200, # 2 hours + # Labels for tracking + labels={ + "job_type": "ui-validation", + "duration": "long", + }, + # S3 configuration for MinIO + spark_conf=spark_conf, + ) + + print(" Job submitted successfully!") + print(f" Submission ID: {response.submission_id}") + print(f" Status: {response.status}") + print() + + # Print app name for automation scripts to capture + print(f"APP_NAME={app_name}") + print() + + except Exception as e: + print(f" ERROR: Submission failed: {e}") + print() + print("Troubleshooting:") + print(" 1. Ensure MinIO is running:") + print(" kubectl get pods -l app=minio") + print(" 2. Verify script is uploaded:") + print(" kubectl exec minio-client -- mc ls myminio/spark-scripts/") + print(" 3. 
Run: ./setup_minio.sh to upload scripts") + sys.exit(1) + + # Step 4: Print UI access instructions + print() + print_ui_instructions(app_name) + + # Step 5: Ask if user wants to monitor + print() + + # Check if monitoring was disabled via flag or non-interactive mode + if no_monitor: + print("Monitoring disabled (--no-monitor flag)") + response = "n" + elif not sys.stdin.isatty(): + # Non-interactive mode (running from automation script) + print("Running in non-interactive mode. Skipping monitoring.") + response = "n" + else: + # Interactive mode - ask user + response = input("Monitor job progress? (y/n): ").strip().lower() + + if response == "y": + print() + monitor_job_progress(client, app_name) + + # Retrieve logs after completion + print() + print("Retrieving job logs...") + try: + logs = list(client.get_job_logs(app_name)) + print() + print("=" * 80) + print("JOB OUTPUT (Last 50 lines)") + print("=" * 80) + for line in logs[-50:]: + print(line) + print("=" * 80) + except Exception as e: + print(f" WARNING: Could not retrieve logs: {e}") + else: + print() + print("Skipping monitoring.") + print() + print("Check job status anytime with:") + print(f" kubectl get sparkapplication {app_name}") + print() + print("View logs with:") + print(f" kubectl logs {app_name}-driver") + print() + + print() + print("=" * 80) + print("QUICK REFERENCE") + print("=" * 80) + print() + print("Port-forward to UI:") + print(f" kubectl port-forward pod/{app_name}-driver 4040:4040") + print() + print("Open UI:") + print(" http://localhost:4040") + print() + print("Check status:") + print(f" kubectl get sparkapplication {app_name} -w") + print() + print("View driver logs:") + print(f" kubectl logs {app_name}-driver -f") + print() + print("Delete when done:") + print(f" kubectl delete sparkapplication {app_name}") + print() + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/examples/spark/run_ui_validation.sh b/examples/spark/run_ui_validation.sh new file mode 100755 index 000000000..6a3af9926 --- /dev/null +++ b/examples/spark/run_ui_validation.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# Complete setup and run script for long-running UI validation job + +set -e + +echo "==========================================" +echo "Spark UI Validation - Complete Setup" +echo "==========================================" +echo "" + +# Check if we're in the right directory +if [ ! -f "run_long_job_ui_validation.py" ]; then + echo "āŒ Please run this from examples/spark directory" + exit 1 +fi + +# Step 1: Check MinIO +echo "Step 1: Checking MinIO..." +if ! kubectl get pod -l app=minio 2>/dev/null | grep -q Running; then + echo " āš ļø MinIO not running. Setting up..." + ./setup_minio.sh + echo "" +else + echo " āœ“ MinIO is running" +fi +echo "" + +# Step 2: Upload script +echo "Step 2: Uploading long-running job script..." +FILE_INFO=$(kubectl exec minio-client -- mc ls myminio/spark-scripts/long_running_job.py 2>/dev/null || echo "") + +if [ -z "$FILE_INFO" ]; then + echo " Script not found in MinIO. Uploading..." + chmod +x upload_long_job.sh + ./upload_long_job.sh +elif echo "$FILE_INFO" | grep -q "0B"; then + echo " Script exists but is empty (0B). Re-uploading..." + chmod +x upload_long_job.sh + ./upload_long_job.sh +else + echo " āœ“ Script already uploaded" + echo " $FILE_INFO" +fi +echo "" + +# Step 3: Submit job +echo "Step 3: Submitting long-running job..." 
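+# run_long_job_ui_validation.py prints a line of the form "APP_NAME=long-job-<HHMMSS>";
+# this script captures it below to discover the SparkApplication name used by the
+# port-forward and monitoring commands.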
+echo " This will take ~10 minutes to complete" +echo "" + +# Run in foreground with --no-monitor flag (no interactive prompts) +# Capture output to get APP_NAME +OUTPUT=$(python run_long_job_ui_validation.py --no-monitor 2>&1) +echo "$OUTPUT" + +# Extract app name from output +APP_NAME=$(echo "$OUTPUT" | grep "APP_NAME=" | cut -d= -f2) + +# Fallback: try to get from kubectl +if [ -z "$APP_NAME" ]; then + APP_NAME=$(kubectl get sparkapplications -o name 2>/dev/null | grep "long-job" | tail -1 | cut -d/ -f2) +fi + +if [ -z "$APP_NAME" ]; then + echo " āš ļø Could not find application. Check output above." + exit 1 +fi + +echo " āœ“ Job submitted: $APP_NAME" +echo "" + +# Step 4: Wait for driver pod +echo "Step 4: Waiting for driver pod to be ready..." +echo " This may take 1-2 minutes..." +DRIVER_POD="${APP_NAME}-driver" + +# Wait for pod to exist +for i in {1..60}; do + if kubectl get pod $DRIVER_POD 2>/dev/null; then + break + fi + sleep 2 +done + +# Wait for pod to be Running +kubectl wait --for=condition=ready pod/$DRIVER_POD --timeout=180s 2>/dev/null || { + echo " āš ļø Pod taking longer than expected..." + echo " Monitor with: kubectl get pod $DRIVER_POD -w" + echo "" +} + +POD_STATUS=$(kubectl get pod $DRIVER_POD -o jsonpath='{.status.phase}' 2>/dev/null) +echo " āœ“ Driver pod status: $POD_STATUS" +echo "" + +if [ "$POD_STATUS" != "Running" ]; then + echo " āš ļø Pod not Running yet. Current status: $POD_STATUS" + echo "" + echo " Monitor pod:" + echo " kubectl get pod $DRIVER_POD -w" + echo "" + echo " Once Running, port-forward manually:" + echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" + echo "" + exit 0 +fi + +# Step 5: Instructions for UI access +echo "==========================================" +echo "šŸŽ‰ Job is Running!" +echo "==========================================" +echo "" +echo "Driver pod: $DRIVER_POD" +echo "Expected duration: ~10 minutes" +echo "" +echo "==========================================" +echo "TO ACCESS SPARK UI:" +echo "==========================================" +echo "" +echo "Open a NEW terminal and run:" +echo "" +echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" +echo "" +echo "Then open in your browser:" +echo "" +echo " http://localhost:4040" +echo "" +echo "==========================================" +echo "WHAT TO EXPLORE:" +echo "==========================================" +echo "" +echo "Timeline:" +echo " 0:00 - Job starts" +echo " 2:00 - Stage 2 → CHECK STORAGE TAB! ⭐" +echo " 4:00 - Stage 3 → CHECK EXECUTORS TAB! (heavy shuffle)" +echo " 7:00 - Stage 4 → Check SQL tab" +echo " 9:00 - Stage 5 → Check DAG visualization" +echo " 10:00 - Job completes" +echo "" +echo "UI Tabs to explore:" +echo " āœ“ Jobs - See 6 jobs (one per stage)" +echo " āœ“ Stages - Monitor stage progress" +echo " āœ“ Storage - View cached data (after Stage 2)" +echo " āœ“ Executors - Monitor resources and shuffles" +echo " āœ“ SQL - Inspect DataFrame query plans" +echo " āœ“ Environment - View Spark configuration" +echo "" +echo "==========================================" +echo "" + +# Offer to start port-forward +read -p "Start port-forward now? (y/n) " -n 1 -r +echo "" + +if [[ $REPLY =~ ^[Yy]$ ]]; then + echo "" + echo "Starting port-forward..." + echo "Keep this terminal open!" 
+ echo "Open browser to: http://localhost:4040" + echo "" + echo "Press Ctrl+C to stop port-forward" + echo "" + sleep 2 + kubectl port-forward pod/$DRIVER_POD 4040:4040 +else + echo "" + echo "To access UI later, run:" + echo " kubectl port-forward pod/$DRIVER_POD 4040:4040" + echo "" + echo "Monitor job with:" + echo " kubectl get sparkapplication $APP_NAME -w" + echo "" + echo "View logs with:" + echo " kubectl logs $DRIVER_POD -f" + echo "" +fi diff --git a/examples/spark/scripts/long_running_job.py b/examples/spark/scripts/long_running_job.py new file mode 100644 index 000000000..f023de875 --- /dev/null +++ b/examples/spark/scripts/long_running_job.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +""" +Long-Running Spark Job for UI Validation + +This script runs for approximately 10 minutes and demonstrates various +Spark operations to showcase different UI features: +- Jobs and Stages +- SQL/DataFrame operations +- Executor metrics +- Storage/caching +- Shuffle operations + +Perfect for testing Spark UI access and exploring its features. +""" + +import time + +from pyspark.sql import SparkSession +from pyspark.sql.functions import avg, col, count, expr, max, min, rand, sum + + +def main(): + print("=" * 80) + print("LONG-RUNNING SPARK JOB FOR UI VALIDATION") + print("=" * 80) + print() + print("This job will run for approximately 10 minutes.") + print("Use this time to explore the Spark UI features:") + print(" • Jobs tab - See job progression") + print(" • Stages tab - Monitor stage execution") + print(" • Storage tab - View cached DataFrames") + print(" • Executors tab - Check executor metrics") + print(" • SQL tab - Inspect query plans") + print() + + # Create Spark session + spark = SparkSession.builder.appName("Long-Running UI Validation Job").getOrCreate() + + spark.sparkContext.setLogLevel("INFO") + + print("āœ“ Spark session created") + print() + + # ======================================================================== + # STAGE 1: Generate Large Dataset (2 minutes) + # ======================================================================== + print("-" * 80) + print("STAGE 1: Generating large dataset (100 million rows)") + print("Expected duration: ~2 minutes") + print("-" * 80) + print() + + # Generate a large dataset with 100 million rows + df_large = ( + spark.range(0, 100_000_000) + .withColumn("category", (col("id") % 10).cast("string")) + .withColumn("value", (rand() * 1000).cast("integer")) + .withColumn("region", expr("array('North', 'South', 'East', 'West')[cast(id % 4 as int)]")) + ) + + print(" Dataset schema:") + df_large.printSchema() + + # Trigger evaluation with a count + total_rows = df_large.count() + print(f" āœ“ Generated {total_rows:,} rows") + print() + + time.sleep(5) # Pause to observe UI + + # ======================================================================== + # STAGE 2: Cache and Aggregations (2 minutes) + # ======================================================================== + print("-" * 80) + print("STAGE 2: Caching dataset and performing aggregations") + print("Expected duration: ~2 minutes") + print("-" * 80) + print() + + # Cache the dataset to observe Storage tab + df_large.cache() + print(" Caching dataset in memory...") + + # Force caching with an action + cached_count = df_large.count() + print(f" āœ“ Cached {cached_count:,} rows") + print() + + print(" Check Spark UI → Storage tab to see cached DataFrame!") + print() + + time.sleep(10) # Pause to check Storage tab + + # Perform aggregations by category + print(" Aggregating by 
category...") + agg_by_category = ( + df_large.groupBy("category") + .agg( + count("id").alias("count"), + sum("value").alias("total_value"), + avg("value").alias("avg_value"), + min("value").alias("min_value"), + max("value").alias("max_value"), + ) + .orderBy("category") + ) + + print() + print(" Category Aggregations:") + agg_by_category.show() + print() + + time.sleep(5) + + # ======================================================================== + # STAGE 3: Shuffle-Heavy Operations (3 minutes) + # ======================================================================== + print("-" * 80) + print("STAGE 3: Shuffle-heavy operations (joins and repartitioning)") + print("Expected duration: ~3 minutes") + print("-" * 80) + print() + + # Create a dimension table for joins + print(" Creating dimension table...") + dim_categories = spark.createDataFrame( + [ + ("0", "Electronics", "Tech"), + ("1", "Books", "Media"), + ("2", "Clothing", "Fashion"), + ("3", "Food", "Grocery"), + ("4", "Toys", "Entertainment"), + ("5", "Sports", "Recreation"), + ("6", "Tools", "Hardware"), + ("7", "Garden", "Outdoor"), + ("8", "Beauty", "Personal Care"), + ("9", "Auto", "Automotive"), + ], + ["category_id", "category_name", "department"], + ) + + print(" āœ“ Dimension table created") + print() + + # Perform join (will cause shuffle) + print(" Performing join operation (watch for shuffle in UI)...") + df_joined = df_large.join( + dim_categories, df_large.category == dim_categories.category_id, "inner" + ) + + # Show joined results + print() + print(" Joined Data Sample:") + df_joined.select("id", "category", "category_name", "department", "value", "region").show(10) + print() + + time.sleep(10) # Pause to observe shuffle + + # Repartition to create more shuffle + print(" Repartitioning data (32 partitions)...") + df_repartitioned = df_joined.repartition(32, "department") + + # Count to trigger repartitioning + repartitioned_count = df_repartitioned.count() + print(f" āœ“ Repartitioned {repartitioned_count:,} rows across 32 partitions") + print() + + time.sleep(5) + + # ======================================================================== + # STAGE 4: Multi-dimensional Analysis (2 minutes) + # ======================================================================== + print("-" * 80) + print("STAGE 4: Multi-dimensional analysis") + print("Expected duration: ~2 minutes") + print("-" * 80) + print() + + # Aggregate by department and region + print(" Aggregating by department and region...") + dept_region_agg = ( + df_repartitioned.groupBy("department", "region") + .agg( + count("id").alias("transactions"), + sum("value").alias("total_sales"), + avg("value").alias("avg_transaction"), + ) + .orderBy("department", "region") + ) + + print() + print(" Department Ɨ Region Sales Analysis:") + dept_region_agg.show(20) + print() + + time.sleep(10) # Pause to view results + + # ======================================================================== + # STAGE 5: Window Functions and Complex Queries (1 minute) + # ======================================================================== + print("-" * 80) + print("STAGE 5: Window functions and complex SQL") + print("Expected duration: ~1 minute") + print("-" * 80) + print() + + from pyspark.sql.functions import dense_rank, row_number + from pyspark.sql.window import Window + + # Create a window for ranking + window_spec = Window.partitionBy("department").orderBy(col("total_sales").desc()) + + # Add rankings + print(" Computing regional rankings within departments...") + 
ranked_regions = dept_region_agg.withColumn("rank", row_number().over(window_spec)).withColumn( + "dense_rank", dense_rank().over(window_spec) + ) + + # Show top regions per department + print() + print(" Top Performing Regions by Department:") + ranked_regions.filter(col("rank") <= 2).orderBy("department", "rank").show() + print() + + time.sleep(5) + + # ======================================================================== + # FINAL STAGE: Summary Statistics (1 minute) + # ======================================================================== + print("-" * 80) + print("FINAL STAGE: Computing summary statistics") + print("-" * 80) + print() + + # Overall statistics + print(" Computing overall statistics...") + overall_stats = df_repartitioned.agg( + count("id").alias("total_transactions"), + sum("value").alias("total_revenue"), + avg("value").alias("avg_transaction"), + min("value").alias("min_transaction"), + max("value").alias("max_transaction"), + ) + + print() + print(" OVERALL STATISTICS:") + print(" " + "=" * 76) + overall_stats.show(truncate=False) + print() + + # Department summary + print(" Department Summary:") + dept_summary = ( + df_repartitioned.groupBy("department") + .agg(count("id").alias("transactions"), sum("value").alias("revenue")) + .orderBy(col("revenue").desc()) + ) + + dept_summary.show() + print() + + # Region summary + print(" Region Summary:") + region_summary = ( + df_repartitioned.groupBy("region") + .agg(count("id").alias("transactions"), sum("value").alias("revenue")) + .orderBy(col("revenue").desc()) + ) + + region_summary.show() + print() + + # Cleanup + print("-" * 80) + print("Cleaning up...") + df_large.unpersist() + print(" āœ“ Unpersisted cached DataFrame") + print() + + # Final summary + print("=" * 80) + print("JOB COMPLETED SUCCESSFULLY!") + print("=" * 80) + print() + print("Total execution time: ~10 minutes") + print() + print("What you should have observed in Spark UI:") + print(" āœ“ Jobs tab - Multiple jobs corresponding to each stage") + print(" āœ“ Stages tab - Detailed stage execution with tasks") + print(" āœ“ Storage tab - Cached DataFrame (Stage 2)") + print(" āœ“ Executors tab - Executor metrics and resource usage") + print(" āœ“ SQL tab - Query plans for DataFrame operations") + print() + print("Spark UI features to explore:") + print(" • Click on job names to see stages") + print(" • Click on stages to see task details") + print(" • Check 'Event Timeline' for task scheduling") + print(" • View 'DAG Visualization' for execution plan") + print(" • Monitor executor GC time and memory usage") + print() + + spark.stop() + + +if __name__ == "__main__": + main() diff --git a/examples/spark/setup_minio.sh b/examples/spark/setup_minio.sh new file mode 100755 index 000000000..face8edaf --- /dev/null +++ b/examples/spark/setup_minio.sh @@ -0,0 +1,437 @@ +#!/bin/bash +set -e + +echo "================================================================================" +echo "Setting up MinIO (S3-compatible storage) for Spark Examples" +echo "================================================================================" +echo "" + +# Configuration +NAMESPACE="default" +MINIO_ROOT_USER="minioadmin" +MINIO_ROOT_PASSWORD="minioadmin" +MINIO_SERVICE="minio-service" +MINIO_ENDPOINT="minio-service.default.svc.cluster.local:9000" + +echo "Step 1: Deploying MinIO to Kubernetes..." 
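+# The manifest applied below deploys the MinIO server and exposes it through the
+# ${MINIO_SERVICE} Service on port 9000, using the ${MINIO_ROOT_USER}/${MINIO_ROOT_PASSWORD}
+# credentials configured above (minio_config.py assumes these same defaults).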
+echo "--------------------------------------------------------------------------------" + +# Create MinIO deployment +cat < /tmp/batch_job.py <<'BATCH_SCRIPT' +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, sum as _sum, count, current_timestamp +from pyspark.sql.types import * +from datetime import datetime + +spark = SparkSession.builder.appName("Batch Job").getOrCreate() + +print("\n" + "="*80) +print("SCHEDULED BATCH JOB - DAILY PROCESSING") +print("="*80) + +# Configuration +BATCH_DATE = datetime.now().strftime("%Y-%m-%d") +JOB_ID = f"batch_{BATCH_DATE.replace('-', '')}" + +print(f"\n[CONFIG] Batch Configuration:") +print(f" • Batch Date: {BATCH_DATE}") +print(f" • Job ID: {JOB_ID}") + +# Create sample transaction data +schema = StructType([ + StructField("transaction_id", IntegerType(), False), + StructField("date", StringType(), False), + StructField("customer_id", IntegerType(), False), + StructField("amount", DoubleType(), False), +]) + +transactions_data = [ + (1, BATCH_DATE, 101, 150.00), + (2, BATCH_DATE, 102, 250.00), + (3, BATCH_DATE, 103, 75.00), + (4, BATCH_DATE, 101, 300.00), + (5, BATCH_DATE, 104, 500.00), +] + +df = spark.createDataFrame(transactions_data, schema) + +print(f"\n[EXTRACT] Loaded {df.count()} transactions") +print("\nSample transactions:") +df.show() + +# Transform: Add metadata +df_enriched = df.withColumn("processing_timestamp", current_timestamp()) \ + .withColumn("job_id", col("transaction_id").cast("string")) + +print("\n[TRANSFORM] Added metadata columns") + +# Aggregate by customer +summary = df_enriched.groupBy("customer_id").agg( + count("transaction_id").alias("transaction_count"), + _sum("amount").alias("total_amount") +).orderBy(col("total_amount").desc()) + +print("\n[LOAD] Customer Summary:") +summary.show() + +print(f"\n[COMPLETE] Batch job {JOB_ID} completed successfully!") +print("="*80) +spark.stop() +BATCH_SCRIPT + +# Create exploration script +cat > /tmp/exploration.py <<'EXPLORATION_SCRIPT' +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, count, sum as _sum, avg, min as _min, max as _max +from pyspark.sql.types import * + +spark = SparkSession.builder.appName("DataFrame Exploration").getOrCreate() + +print("\n" + "="*80) +print("INTERACTIVE DATAFRAME EXPLORATION") +print("="*80) + +# Create sample customer dataset +schema = StructType([ + StructField("customer_id", IntegerType(), False), + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("city", StringType(), True), + StructField("purchases", IntegerType(), True), + StructField("total_spent", DoubleType(), True), +]) + +customers_data = [ + (1, "Alice", 28, "New York", 15, 1250.50), + (2, "Bob", 35, "Los Angeles", 8, 890.25), + (3, "Carol", None, "Chicago", 22, 2100.00), # Missing age + (4, "David", 42, "Houston", 5, 450.75), +] + +df = spark.createDataFrame(customers_data, schema) + +print("\nDataset Summary:") +print(f"Total Records: {df.count()}") + +print("\nSchema:") +df.printSchema() + +print("\nSample Data:") +df.show() + +print("\nDescriptive Statistics:") +df.describe().show() + +print("\nNull Check:") +df.select([count(col(c).isNull()).alias(c) for c in df.columns]).show() + +print("\n" + "="*80) +spark.stop() +EXPLORATION_SCRIPT + +# Create CSV analysis script +cat > /tmp/csv_analysis.py <<'CSV_SCRIPT' +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, sum as _sum, avg, count + +spark = SparkSession.builder.appName("CSV 
Analysis").getOrCreate() + +print("\n" + "="*80) +print("CSV DATA ANALYSIS") +print("="*80) + +# Read sample CSV from MinIO +# For this example, we'll create data in-memory +from pyspark.sql.types import * + +schema = StructType([ + StructField("product", StringType()), + StructField("category", StringType()), + StructField("quantity", IntegerType()), + StructField("price", DoubleType()), +]) + +data = [ + ("Laptop", "Electronics", 2, 1200.00), + ("Mouse", "Electronics", 5, 25.00), + ("Keyboard", "Electronics", 3, 75.00), + ("Desk", "Furniture", 1, 500.00), + ("Chair", "Furniture", 2, 250.00), +] + +df = spark.createDataFrame(data, schema) + +print("\nSample Data:") +df.show() + +print("\nSales by Category:") +df.groupBy("category").agg( + count("product").alias("products"), + _sum("quantity").alias("total_quantity"), + _sum(col("quantity") * col("price")).alias("revenue") +).show() + +print("\n" + "="*80) +spark.stop() +CSV_SCRIPT + +# Create ETL script +cat > /tmp/etl_pipeline.py <<'ETL_SCRIPT' +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, upper, trim, current_timestamp + +spark = SparkSession.builder.appName("ETL Pipeline").getOrCreate() + +print("\n" + "="*80) +print("ETL PIPELINE") +print("="*80) + +# Extract +print("\n[EXTRACT] Loading data...") +from pyspark.sql.types import * +schema = StructType([ + StructField("id", IntegerType()), + StructField("name", StringType()), + StructField("amount", DoubleType()), +]) +data = [(1, " alice ", 100.0), (2, "BOB", 200.0), (3, "carol ", 150.0)] +df = spark.createDataFrame(data, schema) +print(f"Extracted {df.count()} records") + +# Transform +print("\n[TRANSFORM] Cleaning data...") +df_clean = df.withColumn("name", upper(trim(col("name")))) \ + .withColumn("processed_at", current_timestamp()) +print("Transformations applied: trim, uppercase, timestamp") + +# Load +print("\n[LOAD] Results:") +df_clean.show() + +print("\n" + "="*80) +spark.stop() +ETL_SCRIPT + +echo " āœ“ Created PySpark scripts" +echo "" + +echo "Step 7: Uploading scripts to MinIO..." +echo "--------------------------------------------------------------------------------" + +# Upload scripts directly to MinIO using stdin (avoids need for 'tar' in container) +echo " Uploading batch_job.py..." +kubectl exec -n ${NAMESPACE} minio-client -- sh -c 'cat > /tmp/batch_job.py' < /tmp/batch_job.py +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/batch_job.py myminio/spark-scripts/ + +echo " Uploading exploration.py..." +kubectl exec -n ${NAMESPACE} minio-client -- sh -c 'cat > /tmp/exploration.py' < /tmp/exploration.py +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/exploration.py myminio/spark-scripts/ + +echo " Uploading csv_analysis.py..." +kubectl exec -n ${NAMESPACE} minio-client -- sh -c 'cat > /tmp/csv_analysis.py' < /tmp/csv_analysis.py +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/csv_analysis.py myminio/spark-scripts/ + +echo " Uploading etl_pipeline.py..." +kubectl exec -n ${NAMESPACE} minio-client -- sh -c 'cat > /tmp/etl_pipeline.py' < /tmp/etl_pipeline.py +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/etl_pipeline.py myminio/spark-scripts/ + +echo " Uploading long_running_job.py..." 
+kubectl exec -n ${NAMESPACE} minio-client -- sh -c 'cat > /tmp/long_running_job.py' < scripts/long_running_job.py +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/long_running_job.py myminio/spark-scripts/ + +echo " āœ“ Uploaded scripts to s3://spark-scripts/" +echo "" + +# Verify uploads +echo "Verifying uploaded scripts:" +kubectl exec -n ${NAMESPACE} minio-client -- mc ls myminio/spark-scripts/ +echo "" + +echo "Step 8: Creating Spark S3 access secret..." +echo "--------------------------------------------------------------------------------" + +cat <${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}WARNING:${NC} $1" +} + +print_error() { + echo -e "${RED}ERROR:${NC} $1" +} + +# Setup Kubernetes cluster with Spark Operator +setup_kubernetes_cluster() { + print_step "Setting up Kubernetes cluster..." + + if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then + print_warning "Cluster '${CLUSTER_NAME}' already exists" + + # Export kubeconfig to ensure context is set + kind export kubeconfig --name ${CLUSTER_NAME} 2>/dev/null || true + + # Verify context is set + if ! kubectl config get-contexts | grep -q "kind-${CLUSTER_NAME}"; then + print_error "Failed to set kubectl context" + print_step "Please run: kind export kubeconfig --name ${CLUSTER_NAME}" + exit 1 + fi + else + print_step "Running setup_test_environment.sh..." + bash "${SCRIPT_DIR}/setup_test_environment.sh" + fi + + print_step "Kubernetes cluster ready" +} + +# Deploy Spark Connect server +deploy_spark_connect() { + print_step "Deploying Spark Connect server..." + + kubectl apply -f "${SCRIPT_DIR}/spark-connect-server.yaml" + + print_step "Waiting for Spark Connect server to be ready..." + kubectl wait --for=condition=ready pod \ + -l app=spark-connect \ + -n default \ + --timeout=300s + + print_step "Spark Connect server deployed successfully" +} + +# Setup port forwarding +setup_port_forwarding() { + print_step "Setting up port forwarding..." + + # Kill any existing port forwarding on port 30000 + pkill -f "kubectl.*port-forward.*30000" || true + sleep 2 + + # Get the Spark Connect pod name + POD_NAME=$(kubectl get pods -l app=spark-connect -n default -o jsonpath='{.items[0].metadata.name}') + + print_step "Port forwarding from localhost:30000 to Spark Connect pod..." + + # Start port forwarding in background + kubectl port-forward -n default "pod/${POD_NAME}" 30000:15002 > /dev/null 2>&1 & + PF_PID=$! + + # Wait for port forwarding to be established + sleep 3 + + # Verify port forwarding + if lsof -i :30000 > /dev/null 2>&1; then + print_step "Port forwarding established (PID: ${PF_PID})" + echo "${PF_PID}" > /tmp/spark-connect-port-forward.pid + print_warning "Port forwarding running in background" + print_warning "To stop: kill \$(cat /tmp/spark-connect-port-forward.pid)" + else + print_error "Failed to establish port forwarding" + return 1 + fi +} + +# Verify installation +verify_installation() { + print_step "Verifying installation..." + + echo "" + echo "Cluster: ${CLUSTER_NAME}" + echo "Spark Connect Pod:" + kubectl get pods -l app=spark-connect -n default + + echo "" + echo "Spark Connect Service:" + kubectl get svc spark-connect -n default + + echo "" + print_step "Installation complete!" +} + +# Print usage instructions +print_usage() { + echo "" + echo "=" * 80 + echo " Spark Connect Setup Complete" + echo "=" * 80 + echo "" + echo "Spark Connect server is now running and accessible at:" + echo " URL: sc://localhost:30000" + echo "" + echo "Next steps:" + echo "" + echo "1. 
Install PySpark with Connect support (if not already installed):" + echo " pip install 'pyspark[connect]>=3.4.0'" + echo "" + echo "2. Run the interactive demo:" + echo " python examples/spark/ipython_spark_connect_demo.py" + echo "" + echo "3. Or run manual tests:" + echo " python examples/spark/ipython_spark_connect_demo.py --manual" + echo "" + echo "4. Or launch IPython for step-by-step experimentation:" + echo " cd examples/spark" + echo " python -c 'from ipython_spark_connect_demo import *; import IPython; IPython.embed()'" + echo "" + echo "5. View Spark UI:" + echo " kubectl port-forward -n default svc/spark-connect 4040:4040" + echo " Open: http://localhost:4040" + echo "" + echo "Cluster management:" + echo " - View logs: kubectl logs -l app=spark-connect -n default -f" + echo " - Restart: kubectl rollout restart deployment/spark-connect -n default" + echo " - Delete: kubectl delete -f examples/spark/spark-connect-server.yaml" + echo " - Cleanup cluster: kind delete cluster --name ${CLUSTER_NAME}" + echo "" + echo "Port forwarding PID: $(cat /tmp/spark-connect-port-forward.pid 2>/dev/null || echo 'N/A')" + echo "Stop port forwarding: kill \$(cat /tmp/spark-connect-port-forward.pid)" +} + +# Main +main() { + echo "========================================" + echo " Spark Connect Server Setup" + echo "========================================" + echo "" + + setup_kubernetes_cluster + deploy_spark_connect + setup_port_forwarding + verify_installation + print_usage +} + +# Cleanup function +cleanup() { + print_step "Cleaning up Spark Connect resources..." + + # Stop port forwarding + if [ -f /tmp/spark-connect-port-forward.pid ]; then + PF_PID=$(cat /tmp/spark-connect-port-forward.pid) + if ps -p ${PF_PID} > /dev/null 2>&1; then + kill ${PF_PID} 2>/dev/null || true + print_step "Port forwarding stopped" + fi + rm -f /tmp/spark-connect-port-forward.pid + fi + + # Delete Spark Connect deployment + kubectl delete -f "${SCRIPT_DIR}/spark-connect-server.yaml" --ignore-not-found=true + + print_step "Cleanup complete" +} + +# Handle script arguments +case "${1:-setup}" in + setup) + main + ;; + cleanup) + cleanup + ;; + restart) + cleanup + sleep 2 + main + ;; + *) + echo "Usage: $0 {setup|cleanup|restart}" + echo "" + echo "Commands:" + echo " setup - Setup cluster and deploy Spark Connect (default)" + echo " cleanup - Remove Spark Connect and stop port forwarding" + echo " restart - Cleanup and setup again" + exit 1 + ;; +esac diff --git a/examples/spark/setup_test_environment.sh b/examples/spark/setup_test_environment.sh new file mode 100755 index 000000000..34e67d0c8 --- /dev/null +++ b/examples/spark/setup_test_environment.sh @@ -0,0 +1,305 @@ +#!/usr/bin/env bash +# +# Setup script for running Spark Operator in Kind cluster for testing +# +# This script: +# 1. Creates a Kind cluster +# 2. Installs Spark Operator +# 3. Sets up service accounts and RBAC +# 4. 
Verifies the installation + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Configuration +CLUSTER_NAME="${CLUSTER_NAME:-spark-test}" +OPERATOR_NAMESPACE="${OPERATOR_NAMESPACE:-spark-operator}" +SPARK_NAMESPACE="${SPARK_NAMESPACE:-default}" +SPARK_OPERATOR_VERSION="${SPARK_OPERATOR_VERSION:-v2.0.2-rc.0}" +SPARK_OPERATOR_CHART_VERSION="${SPARK_OPERATOR_CHART_VERSION:-2.0.2-rc.0}" + +print_step() { + echo -e "${GREEN}==>${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}WARNING:${NC} $1" +} + +print_error() { + echo -e "${RED}ERROR:${NC} $1" +} + +# Check prerequisites +check_prerequisites() { + print_step "Checking prerequisites..." + + if ! command -v kind &> /dev/null; then + print_error "kind not found. Please install kind: https://kind.sigs.k8s.io/docs/user/quick-start/" + exit 1 + fi + + if ! command -v kubectl &> /dev/null; then + print_error "kubectl not found. Please install kubectl" + exit 1 + fi + + if ! command -v helm &> /dev/null; then + print_warning "helm not found. Will use kubectl apply instead" + fi + + print_step "Prerequisites OK" +} + +# Create Kind cluster +create_cluster() { + print_step "Creating Kind cluster '${CLUSTER_NAME}'..." + + if kind get clusters | grep -q "^${CLUSTER_NAME}$"; then + print_warning "Cluster '${CLUSTER_NAME}' already exists. Skipping creation." + return + fi + + cat </dev/null; then + print_step "Installing from release manifest..." + kubectl apply -f /tmp/spark-operator.yaml -n ${OPERATOR_NAMESPACE} + rm -f /tmp/spark-operator.yaml + else + print_warning "Release manifest not found, using main branch..." + + # Fallback: Install CRDs and operator from main branch + kubectl apply -f https://raw.githubusercontent.com/kubeflow/spark-operator/master/config/crd/bases/sparkoperator.k8s.io_sparkapplications.yaml + kubectl apply -f https://raw.githubusercontent.com/kubeflow/spark-operator/master/config/crd/bases/sparkoperator.k8s.io_scheduledsparkapplications.yaml + + # Install operator deployment + cat < /dev/null; then + install_spark_operator_helm + else + install_spark_operator_kubectl + fi + + setup_rbac + verify_installation + + print_step "Setup complete! šŸŽ‰" +} + +# Run main function +main diff --git a/examples/spark/spark-connect-server.yaml b/examples/spark/spark-connect-server.yaml new file mode 100644 index 000000000..d2771dc9f --- /dev/null +++ b/examples/spark/spark-connect-server.yaml @@ -0,0 +1,165 @@ +# Spark Connect Server Deployment for Kubernetes +# +# This manifest deploys a Spark Connect server that can be used to test +# the Kubeflow Spark Connect backend integration. 
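+# The Service below publishes the gRPC endpoint (container port 15002) as NodePort
+# 30000; with `kubectl port-forward svc/spark-connect 30000:15002` (or the NodePort
+# directly) clients reach it at sc://localhost:30000, the URL used by the examples.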
+# +# Deploy: kubectl apply -f spark-connect-server.yaml +# Delete: kubectl delete -f spark-connect-server.yaml + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: spark-connect + namespace: default +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: spark-connect-role + namespace: default +rules: +- apiGroups: [""] + resources: ["pods", "services", "configmaps"] + verbs: ["create", "get", "list", "watch", "update", "patch", "delete"] +- apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: spark-connect-rolebinding + namespace: default +subjects: +- kind: ServiceAccount + name: spark-connect + namespace: default +roleRef: + kind: Role + name: spark-connect-role + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: v1 +kind: Service +metadata: + name: spark-connect + namespace: default + labels: + app: spark-connect +spec: + type: NodePort + selector: + app: spark-connect + ports: + - name: connect + port: 15002 + targetPort: 15002 + nodePort: 30000 + protocol: TCP + - name: ui + port: 4040 + targetPort: 4040 + protocol: TCP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: spark-connect + namespace: default + labels: + app: spark-connect +spec: + replicas: 1 + selector: + matchLabels: + app: spark-connect + template: + metadata: + labels: + app: spark-connect + spec: + serviceAccountName: spark-connect + initContainers: + - name: download-packages + image: apache/spark:4.0.0 + securityContext: + runAsUser: 185 # spark user + command: + - /bin/sh + - -c + - | + mkdir -p /ivy-cache + /opt/spark/bin/spark-submit \ + --packages org.apache.spark:spark-connect_2.13:4.0.0 \ + --conf spark.jars.ivy=/ivy-cache \ + --class org.apache.spark.sql.connect.service.SparkConnectServer \ + --help || true + volumeMounts: + - name: ivy-cache + mountPath: /ivy-cache + containers: + - name: spark-connect + image: apache/spark:4.0.0 + imagePullPolicy: IfNotPresent + securityContext: + runAsUser: 185 # spark user + command: + - /opt/spark/sbin/start-connect-server.sh + args: + - --packages + - org.apache.spark:spark-connect_2.13:4.0.0 + - --conf + - spark.jars.ivy=/ivy-cache + - --conf + - spark.driver.host=0.0.0.0 + - --conf + - spark.driver.bindAddress=0.0.0.0 + - --conf + - spark.connect.grpc.binding.address=0.0.0.0 + - --conf + - spark.kubernetes.namespace=default + - --conf + - spark.kubernetes.authenticate.driver.serviceAccountName=spark-connect + env: + - name: SPARK_NO_DAEMONIZE + value: "true" + - name: SPARK_LOCAL_HOSTNAME + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: JAVA_TOOL_OPTIONS + value: "-Djava.net.preferIPv4Stack=true" + volumeMounts: + - name: ivy-cache + mountPath: /ivy-cache + ports: + - name: connect + containerPort: 15002 + protocol: TCP + - name: ui + containerPort: 4040 + protocol: TCP + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1000m" + livenessProbe: + tcpSocket: + port: 15002 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + tcpSocket: + port: 15002 + initialDelaySeconds: 20 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + volumes: + - name: ivy-cache + emptyDir: {} diff --git a/examples/spark/test_direct_pyspark.py b/examples/spark/test_direct_pyspark.py new file mode 100644 index 000000000..54f74318e --- /dev/null +++ b/examples/spark/test_direct_pyspark.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +""" +Minimal 
PySpark Connect test - bypasses Kubeflow SDK completely +""" + +import sys + +print("Testing direct PySpark Connect (no Kubeflow SDK)...") +print("=" * 80) + +try: + import signal + + from pyspark.sql import SparkSession + + def timeout_handler(signum, frame): + raise TimeoutError("Connection timed out") + + # Set a 15 second timeout + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(15) + + print("Creating SparkSession.builder.remote('sc://localhost:30000')...") + print("Timeout set to 15 seconds...") + + spark = ( + SparkSession.builder.remote("sc://localhost:30000") + .appName("direct-test") + .config("spark.connect.grpc.binding.port", "30000") + .getOrCreate() + ) + + signal.alarm(0) # Cancel timeout + + print("āœ“ Session created!") + print(f"Session: {spark}") + + # Try a query + print("\nTesting query...") + df = spark.sql("SELECT 1 AS id, 'Hello' AS msg") + result = df.collect() + print(f"āœ“ Result: {result}") + + spark.stop() + print("āœ“ Test passed!") + +except TimeoutError: + print("\nāœ— Connection timed out after 15 seconds") + print("\nThis means PySpark is not able to connect to the server.") + print("The problem is NOT with Kubeflow SDK - it's with the basic connection.") + sys.exit(1) + +except Exception as e: + print(f"\nāœ— Error: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/examples/spark/test_grpc_connection.py b/examples/spark/test_grpc_connection.py new file mode 100644 index 000000000..cb4691c81 --- /dev/null +++ b/examples/spark/test_grpc_connection.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +""" +Minimal test to check if we can connect to Spark Connect via gRPC directly. +""" + +import sys + +print("Testing basic gRPC connection to Spark Connect...") +print("=" * 80) + +# Test 1: Check if grpc is available +print("\n[1] Checking grpcio installation...") +try: + import grpc + + print(f"āœ“ grpcio version: {grpc.__version__}") +except ImportError as e: + print(f"āœ— grpcio not installed: {e}") + print("Install with: pip install grpcio") + sys.exit(1) + +# Test 2: Try to connect to the port +print("\n[2] Testing TCP connection to localhost:30000...") +import socket + +try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(5) + result = sock.connect_ex(("localhost", 30000)) + sock.close() + + if result == 0: + print("āœ“ TCP connection successful") + else: + print(f"āœ— TCP connection failed with error code: {result}") + sys.exit(1) +except Exception as e: + print(f"āœ— TCP connection error: {e}") + sys.exit(1) + +# Test 3: Try gRPC channel +print("\n[3] Creating gRPC channel to localhost:30000...") +try: + channel = grpc.insecure_channel("localhost:30000") + print("āœ“ gRPC channel created") + + # Test if channel is ready + print(" Testing if channel becomes ready (5 second timeout)...") + try: + future = grpc.channel_ready_future(channel) + future.result(timeout=5) + print("āœ“ gRPC channel is ready!") + except grpc.FutureTimeoutError: + print("āœ— gRPC channel timeout - server not responding on gRPC") + print(" This suggests the server might not be accepting gRPC connections") + except Exception as e: + print(f"āœ— gRPC channel error: {e}") + finally: + channel.close() + +except Exception as e: + print(f"āœ— gRPC error: {e}") + import traceback + + traceback.print_exc() + +# Test 4: Try with PySpark directly +print("\n[4] Testing PySpark Spark Connect...") +try: + from pyspark.sql import SparkSession + + print("āœ“ PySpark imported") + + print(" Creating SparkSession with remote 
connection...") + print(" This might take 10-30 seconds or hang if there's an issue...") + + import signal + + def timeout_handler(signum, frame): + raise TimeoutError("Session creation timed out after 20 seconds") + + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(20) + + try: + spark = ( + SparkSession.builder.remote("sc://localhost:30000").appName("grpc-test").getOrCreate() + ) + + signal.alarm(0) + print("āœ“ SparkSession created!") + + # Try a simple operation + print(" Testing simple query...") + df = spark.sql("SELECT 1 AS id") + result = df.collect() + print(f"āœ“ Query executed successfully: {result}") + + spark.stop() + print("āœ“ Session stopped") + + print("\n" + "=" * 80) + print("SUCCESS! Everything is working.") + print("=" * 80) + + except TimeoutError as e: + signal.alarm(0) + print(f"āœ— {e}") + print("\nThis means PySpark is hanging while trying to connect.") + print("Possible causes:") + print(" 1. Spark Connect server not responding to gRPC requests") + print(" 2. Server bound to wrong address (IPv4 vs IPv6)") + print(" 3. Firewall or network policy blocking connection") + +except KeyboardInterrupt: + print("\nāœ— Interrupted by user") +except Exception as e: + print(f"āœ— PySpark connection failed: {e}") + import traceback + + traceback.print_exc() + +print("\n" + "=" * 80) +print("Debug test complete") +print("=" * 80) diff --git a/examples/spark/test_sdk_debug.py b/examples/spark/test_sdk_debug.py new file mode 100644 index 000000000..311d1e8bb --- /dev/null +++ b/examples/spark/test_sdk_debug.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +""" +Debug version of SDK client with verbose logging +""" + +import logging +import os +import sys + +# Setup very verbose logging +logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) + +# Add SDK to path +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path.insert(0, sdk_path) + +print("=" * 80) +print("Testing Kubeflow SDK with DEBUG logging") +print("=" * 80) + +# Import with logging +print("\n[1] Importing Kubeflow SDK...") +from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + +print("\n[2] Creating config...") +config = ConnectBackendConfig(connect_url="sc://localhost:30000", use_ssl=False, timeout=60) +print(f" Config: {config.connect_url}") + +print("\n[3] Creating client...") +client = SparkSessionClient(backend_config=config) +print(f" Client created: {client}") +print(f" Backend: {client.backend}") + +print("\n[4] Creating session (this is where it might hang)...") +print(" About to call client.create_session()...") + +import signal + + +def timeout_handler(signum, frame): + print("\nāœ— Session creation timed out after 20 seconds") + print("\nThe hang is in the SDK's create_session method.") + print("Check: kubeflow/spark/backends/connect.py line ~241") + sys.exit(1) + + +signal.signal(signal.SIGALRM, timeout_handler) +signal.alarm(20) + +try: + session = client.create_session(app_name="debug-test") + signal.alarm(0) + + print("\nāœ“ Session created!") + print(f" Session ID: {session.session_id}") + print(f" App name: {session.app_name}") + + print("\n[5] Testing query...") + df = session.sql("SELECT 1 AS id") + result = df.collect() + print(f"āœ“ Query result: {result}") + + session.close() + print("āœ“ Test passed!") + +except Exception as e: + signal.alarm(0) + print(f"\nāœ— Error: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/examples/spark/test_simple_spark.py 
b/examples/spark/test_simple_spark.py new file mode 100644 index 000000000..7f5d824c0 --- /dev/null +++ b/examples/spark/test_simple_spark.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Simple test WITHOUT History Server to verify basic Spark works +""" + +from datetime import datetime +import os +import sys + +from kubernetes import client, config + +# Add SDK to path +sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +if sdk_path not in sys.path: + sys.path.insert(0, sdk_path) + +print("=" * 80) +print("SIMPLE SPARK TEST (No History Server)") +print("=" * 80) +print() + +# Load kubeconfig +try: + config.load_kube_config(context="kind-spark-test") +except: + config.load_incluster_config() + +# Create API client +api_client = client.CustomObjectsApi() + +# Create minimal SparkApplication +app_name = f"simple-test-{datetime.now().strftime('%H%M%S')}" + +spark_app = { + "apiVersion": "sparkoperator.k8s.io/v1beta2", + "kind": "SparkApplication", + "metadata": { + "name": app_name, + "namespace": "default", + }, + "spec": { + "type": "Python", + "mode": "cluster", + "image": "docker.io/library/spark:4.0.0", + "imagePullPolicy": "IfNotPresent", + "mainApplicationFile": "local:///opt/spark/examples/src/main/python/pi.py", + "arguments": ["10"], + "sparkVersion": "4.0.0", + "restartPolicy": {"type": "Never"}, + "timeToLiveSeconds": 1800, + "driver": { + "cores": 1, + "memory": "512m", + "serviceAccount": "spark-operator-spark", + "labels": { + "version": "4.0.0", + }, + }, + "executor": { + "cores": 1, + "instances": 2, + "memory": "512m", + "labels": { + "version": "4.0.0", + }, + }, + "sparkConf": { + "spark.kubernetes.file.upload.path": "/tmp", + }, + }, +} + +print(f"Submitting simple Spark application: {app_name}") +print("This test has NO volume mounts - just basic Pi calculation") +print() + +try: + # Create the SparkApplication + response = api_client.create_namespaced_custom_object( + group="sparkoperator.k8s.io", + version="v1beta2", + namespace="default", + plural="sparkapplications", + body=spark_app, + ) + + print("Application submitted successfully!") + print(f" Name: {app_name}") + print() + print("Monitor:") + print(f" kubectl get sparkapplication {app_name} -w") + print() + print("View logs:") + print(f" kubectl logs {app_name}-driver -f") + print() + print("Describe:") + print(f" kubectl describe sparkapplication {app_name}") + print() + + # Wait and show status + import time + + print("Waiting for completion...") + for i in range(60): + time.sleep(2) + try: + app_status = api_client.get_namespaced_custom_object( + group="sparkoperator.k8s.io", + version="v1beta2", + namespace="default", + plural="sparkapplications", + name=app_name, + ) + + state = app_status.get("status", {}).get("applicationState", {}).get("state", "UNKNOWN") + print(f" Status: {state}", end="\r") + + if state in ["COMPLETED", "FAILED"]: + print() + print(f"\nApplication {state}") + + if state == "COMPLETED": + print("\nSUCCESS! 
Basic Spark works without volumes.")
+                    print("\nNow we can test with History Server volumes.")
+                else:
+                    error_msg = (
+                        app_status.get("status", {})
+                        .get("applicationState", {})
+                        .get("errorMessage", "Unknown error")
+                    )
+                    print(f"\nERROR: FAILED: {error_msg}")
+                    print("\nCheck logs:")
+                    print(f" kubectl logs {app_name}-driver")
+                break
+
+        except Exception:
+            continue
+
+except Exception as e:
+    print(f"ERROR: Failed to submit application: {e}")
+    import traceback
+
+    traceback.print_exc()
+    sys.exit(1)
diff --git a/examples/spark/test_spark_client_integration.py b/examples/spark/test_spark_client_integration.py
new file mode 100644
index 000000000..19332ed24
--- /dev/null
+++ b/examples/spark/test_spark_client_integration.py
@@ -0,0 +1,321 @@
+"""
+Integration tests for Kubeflow Spark Client with Operator Backend.
+
+These tests require:
+1. A Kubernetes cluster with Spark Operator installed
+2. kubectl configured with proper context
+3. Service account 'spark-operator-spark' with proper permissions
+
+Setup:
+    Run ./setup_test_environment.sh to create a Kind cluster with Spark Operator
+
+Usage:
+    python test_spark_client_integration.py
+"""
+
+import os
+import sys
+
+# Add SDK to path for development mode
+sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+if sdk_path not in sys.path:
+    sys.path.insert(0, sdk_path)
+
+import time
+import unittest
+
+from kubeflow.spark import ApplicationState, BatchSparkClient, OperatorBackendConfig  # noqa: E402
+
+
+class TestSparkClientIntegration(unittest.TestCase):
+    """Integration tests for BatchSparkClient with the Operator backend."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up test client."""
+        config = OperatorBackendConfig(
+            namespace=os.getenv("SPARK_NAMESPACE", "default"),
+            service_account="spark-operator-spark",
+            default_spark_image="docker.io/library/spark",
+            # Explicitly set context
+            context=os.getenv("KUBE_CONTEXT", "kind-spark-test"),
+            # Disable to avoid JMX agent issue with Spark 4.0
+            enable_monitoring=False,
+            # Disable UI for simpler testing
+            enable_ui=False,
+        )
+        cls.client = BatchSparkClient(backend_config=config)
+        cls.submitted_apps = []
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up submitted applications."""
+        print("\nCleaning up test applications...")
+        for app_name in cls.submitted_apps:
+            try:
+                cls.client.delete_job(app_name)
+                print(f" Deleted {app_name}")
+            except Exception as e:
+                print(f" āœ— Failed to delete {app_name}: {e}")
+
+    def test_01_submit_spark_pi(self):
+        """Test submitting a simple Spark Pi application."""
+        print("\n" + "=" * 80)
+        print("TEST: Submit Spark Pi Application")
+        print("=" * 80)
+
+        app_name = "test-spark-pi"
+
+        response = self.client.submit_application(
+            app_name=app_name,
+            main_application_file=(
+                "local:///opt/spark/examples/jars/spark-examples_2.13-4.0.0.jar"
+            ),
+            main_class="org.apache.spark.examples.SparkPi",
+            spark_version="4.0.0",
+            app_type="Scala",
+            driver_cores=1,
+            driver_memory="512m",
+            executor_cores=1,
+            executor_memory="512m",
+            num_executors=1,
+            arguments=["100"],
+            spark_conf={
+                # Required for Spark 4.0
+                "spark.kubernetes.file.upload.path": "/tmp",
+            },
+        )
+
+        self.submitted_apps.append(app_name)
+
+        self.assertEqual(response.submission_id, app_name)
+        self.assertEqual(response.status, "SUBMITTED")
+
+        print(f"Application submitted: {app_name}")
+        print(f" Status: {response.status}")
+
+    def test_02_get_status(self):
+        """Test getting application status."""
+        print("\n" + "=" * 80)
+        print("TEST: Get Application 
Status") + print("=" * 80) + + if not self.submitted_apps: + self.skipTest("No applications to check status") + + app_name = self.submitted_apps[0] + status = self.client.get_job(app_name) + + self.assertIsNotNone(status) + self.assertEqual(status.submission_id, app_name) + self.assertIn(status.state, list(ApplicationState)) + + print(f"Got status for {app_name}") + print(f" State: {status.state.value}") + print(f" App ID: {status.app_id}") + + def test_03_list_applications(self): + """Test listing applications.""" + print("\n" + "=" * 80) + print("TEST: List Applications") + print("=" * 80) + + apps = self.client.list_jobs() + + self.assertIsInstance(apps, list) + print(f"Listed {len(apps)} applications") + + for app in apps[:5]: # Show first 5 + print(f" - {app.app_name}: {app.state.value}") + + def test_04_get_logs(self): + """Test getting application logs.""" + print("\n" + "=" * 80) + print("TEST: Get Application Logs") + print("=" * 80) + + if not self.submitted_apps: + self.skipTest("No applications to get logs from") + + app_name = self.submitted_apps[0] + + # Wait for driver pod to be ready before fetching logs + print("Waiting for driver pod to be ready...") + is_ready = self.client.wait_for_pod_ready(app_name, timeout=120) + + if not is_ready: + print("WARNING: Driver pod not ready within timeout, logs may be empty") + + logs = list(self.client.get_job_logs(app_name)) + + # Logs might be empty if pod not started yet + print(f"Retrieved {len(logs)} log lines from {app_name}") + if logs: + print("\n First 5 lines:") + for line in logs[:5]: + print(f" {line}") + else: + print(" (No logs available yet - pod may still be starting)") + + def test_05_wait_for_completion(self): + """Test waiting for application completion.""" + print("\n" + "=" * 80) + print("TEST: Wait for Completion") + print("=" * 80) + + app_name = "test-spark-pi-completion" + + response = self.client.submit_application( + app_name=app_name, + main_application_file=( + "local:///opt/spark/examples/jars/spark-examples_2.13-4.0.0.jar" + ), + main_class="org.apache.spark.examples.SparkPi", + spark_version="4.0.0", + app_type="Scala", + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=1, + arguments=["10"], # Small workload + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + self.submitted_apps.append(app_name) + + print(f"Submitted {app_name}") + print(" Waiting for completion (timeout: 300s)...") + + final_status = self.client.wait_for_job_status(app_name, timeout=300, polling_interval=5) + + print("Application completed") + print(f" Final state: {final_status.state.value}") + + self.assertIn( + final_status.state, + [ApplicationState.COMPLETED, ApplicationState.FAILED], + ) + + def test_06_delete_application(self): + """Test deleting an application.""" + print("\n" + "=" * 80) + print("TEST: Delete Application") + print("=" * 80) + + # Submit a temporary application + app_name = "test-spark-delete" + + response = self.client.submit_application( + app_name=app_name, + main_application_file=( + "local:///opt/spark/examples/jars/spark-examples_2.13-4.0.0.jar" + ), + main_class="org.apache.spark.examples.SparkPi", + spark_version="4.0.0", + app_type="Scala", + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=1, + spark_conf={ + "spark.kubernetes.file.upload.path": "/tmp", + }, + ) + + print(f"Submitted {app_name}") + + # Delete immediately + result = self.client.delete_job(app_name) + + 
self.assertIsInstance(result, dict)
+        print(f"Deleted {app_name}")
+        print(f" Result: {result}")
+
+    def test_07_dynamic_allocation(self):
+        """Test application with dynamic allocation."""
+        print("\n" + "=" * 80)
+        print("TEST: Dynamic Allocation")
+        print("=" * 80)
+
+        app_name = "test-dynamic-allocation"
+
+        response = self.client.submit_application(
+            app_name=app_name,
+            main_application_file=(
+                "local:///opt/spark/examples/jars/spark-examples_2.13-4.0.0.jar"
+            ),
+            main_class="org.apache.spark.examples.SparkPi",
+            spark_version="4.0.0",
+            app_type="Scala",
+            driver_cores=1,
+            driver_memory="512m",
+            executor_cores=1,
+            executor_memory="512m",
+            num_executors=2,
+            arguments=["1000"],
+            enable_dynamic_allocation=True,
+            initial_executors=1,
+            min_executors=1,
+            max_executors=5,
+            spark_conf={
+                "spark.kubernetes.file.upload.path": "/tmp",
+            },
+        )
+
+        self.submitted_apps.append(app_name)
+
+        print(f"Submitted {app_name} with dynamic allocation")
+        print(" Config: min=1, max=5, initial=1")
+
+        # Check status after a bit
+        time.sleep(10)
+        status = self.client.get_job(app_name)
+
+        print(f" Current state: {status.state.value}")
+        if status.executor_state:
+            print(f" Executors: {len(status.executor_state)}")
+
+
+def run_tests():
+    """Run integration tests."""
+    print("=" * 80)
+    print(" Kubeflow Spark Client - Integration Tests")
+    print("=" * 80)
+    print()
+    print("Prerequisites:")
+    print(" - Kubernetes cluster with Spark Operator")
+    print(" - kubectl configured with proper context")
+    print(" - Service account 'spark-operator-spark'")
+    print()
+    print("Run ./setup_test_environment.sh if not already done")
+    print("=" * 80)
+    print()
+
+    # Run tests
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestSparkClientIntegration)
+    runner = unittest.TextTestRunner(verbosity=2)
+    result = runner.run(suite)
+
+    # Summary
+    print("\n" + "=" * 80)
+    print("Test Summary")
+    print("=" * 80)
+    print(f"Tests run: {result.testsRun}")
+    print(f"Successes: {result.testsRun - len(result.failures) - len(result.errors)}")
+    print(f"Failures: {len(result.failures)}")
+    print(f"Errors: {len(result.errors)}")
+
+    if result.wasSuccessful():
+        print("\nAll tests passed! šŸŽ‰")
+        return 0
+    else:
+        print("\nāœ— Some tests failed")
+        return 1
+
+
+if __name__ == "__main__":
+    exit(run_tests())
diff --git a/examples/spark/test_ui_minimal.py b/examples/spark/test_ui_minimal.py
new file mode 100644
index 000000000..b43750a09
--- /dev/null
+++ b/examples/spark/test_ui_minimal.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+"""
+Minimal test for Spark UI service creation without S3.
+Uses local:// path and simple SparkPi example.
+"""
+
+from datetime import datetime
+import os
+import sys
+import time
+
+# Add SDK to path
+sdk_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+if sdk_path not in sys.path:
+    sys.path.insert(0, sdk_path)
+
+from kubeflow.spark import BatchSparkClient, OperatorBackendConfig
+
+print("=" * 80)
+print("MINIMAL TEST: Spark UI Service Creation")
+print("=" * 80)
+print()
+print("This test submits a simple Spark application and checks if")
+print("the UI service is created by the Spark Operator.")
+print()
+
+# Create client with UI enabled
+config = OperatorBackendConfig(
+    namespace="default",
+    service_account="spark-operator-spark",
+    default_spark_image="docker.io/apache/spark",  # Use official image
+    context=os.getenv("KUBE_CONTEXT", "kind-spark-test"),
+    enable_monitoring=False,
+    enable_ui=True,  # Enable UI!
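+    # With enable_ui=True the operator is expected to create a '<app-name>-ui-svc'
+    # Service for the driver UI; the kubectl commands printed below check for it.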
+) + +client = BatchSparkClient(backend_config=config) +print("Client created with enable_ui=True") +print() + +# Submit a simple SparkPi example (built into Spark image) +timestamp = datetime.now().strftime("%H%M%S") +app_name = f"test-ui-{timestamp}" + +print(f"Submitting test application: {app_name}") +print("-" * 80) + +try: + response = client.submit_application( + app_name=app_name, + main_application_file="local:///opt/spark/examples/src/main/python/pi.py", + spark_version="3.5.0", + app_type="Python", + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=1, + arguments=["10"], # Calculate pi with 10 partitions + ) + + print(f"Application submitted: {response.submission_id}") + print(f" Status: {response.status}") + print() + +except Exception as e: + print(f"ERROR: Submission failed: {e}") + sys.exit(1) + +# Wait a few seconds for operator to process +print("Waiting 10 seconds for Spark Operator to create resources...") +time.sleep(10) +print() + +# Instructions for checking +print("=" * 80) +print("Now check if the UI service was created:") +print("=" * 80) +print() +print("1. Check for the UI service:") +print(f" kubectl get svc {app_name}-ui-svc -n default") +print() +print("2. If service exists, port-forward to access:") +print(f" kubectl port-forward svc/{app_name}-ui-svc 4040:4040") +print(" Then open: http://localhost:4040") +print() +print("3. Check the SparkApplication YAML:") +print(f" kubectl get sparkapplication {app_name} -o yaml | grep -A 5 sparkUIOptions") +print() +print("4. Check all services:") +print(" kubectl get svc -n default") +print() +print("5. View Spark Operator logs:") +print(" kubectl logs -n spark-operator deploy/spark-operator --tail=100") +print() +print("6. Watch application status:") +print(f" kubectl get sparkapplication {app_name} -w") +print() +print("=" * 80) +print() +print(f"Application name: {app_name}") +print("The application will run for ~30 seconds.") +print("Check if the UI service exists while it's running!") +print() diff --git a/examples/spark/test_url_building.py b/examples/spark/test_url_building.py new file mode 100644 index 000000000..c27cc7a94 --- /dev/null +++ b/examples/spark/test_url_building.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +""" +Test that mimics exactly what the Kubeflow SDK does +""" + +print("Testing SDK URL building logic...") + +# Test 1: Build URL like SDK does +connect_url = "sc://localhost:30000" +use_ssl = False + +url = connect_url +param_dict = {} + +# This is what SDK does +if use_ssl: + param_dict["use_ssl"] = "true" +else: + param_dict["use_ssl"] = "false" # New fix + +# Build final URL +if param_dict: + param_str = ";".join([f"{k}={v}" for k, v in param_dict.items()]) + final_url = f"{url}/;{param_str}" +else: + final_url = url + +print(f"SDK would build URL: {final_url}") + +# Test 2: Try this URL with PySpark +print("\nTesting this URL with PySpark...") + +import signal + +from pyspark.sql import SparkSession + + +def timeout_handler(signum, frame): + raise TimeoutError("Timed out") + + +signal.signal(signal.SIGALRM, timeout_handler) +signal.alarm(15) + +try: + print(f"Connecting to: {final_url}") + spark = SparkSession.builder.remote(final_url).appName("sdk-mimic-test").getOrCreate() + signal.alarm(0) + + print("āœ“ Connection successful!") + + # Test query + df = spark.sql("SELECT 1 AS id") + print(f"āœ“ Query worked: {df.collect()}") + + spark.stop() + print("āœ“ All good!") + +except TimeoutError: + print("āœ— Timed out - the URL format might 
be wrong") + print("\nTry this instead:") + print(f" SparkSession.builder.remote('{connect_url}').appName('test').getOrCreate()") +except Exception as e: + print(f"āœ— Error: {e}") + import traceback + + traceback.print_exc() diff --git a/examples/spark/upload_long_job.sh b/examples/spark/upload_long_job.sh new file mode 100755 index 000000000..e80d873c1 --- /dev/null +++ b/examples/spark/upload_long_job.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Quick script to upload the long-running job script to MinIO + +set -e + +NAMESPACE="default" + +echo "==================================================" +echo "Uploading Long-Running Job Script to MinIO" +echo "==================================================" +echo "" + +# Check if MinIO is running +if ! kubectl get pod -n ${NAMESPACE} -l app=minio | grep -q Running; then + echo "āŒ MinIO is not running!" + echo " Run ./setup_minio.sh first" + exit 1 +fi + +echo "āœ“ MinIO is running" +echo "" + +# Check if minio-client exists +if ! kubectl get pod -n ${NAMESPACE} minio-client &>/dev/null; then + echo "āŒ minio-client pod not found!" + echo " Run ./setup_minio.sh first" + exit 1 +fi + +echo "āœ“ MinIO client found" +echo "" + +# Check if minio-client pod is running +POD_STATUS=$(kubectl get pod -n ${NAMESPACE} minio-client -o jsonpath='{.status.phase}' 2>/dev/null) + +if [ "$POD_STATUS" != "Running" ]; then + echo " MinIO client pod is in '$POD_STATUS' state. Restarting..." + + # Delete the completed pod + kubectl delete pod -n ${NAMESPACE} minio-client --ignore-not-found=true + + # Create a new one + cat < /tmp/long_running_job.py' + +# Verify the file has content in the pod +LINE_COUNT=$(kubectl exec -n ${NAMESPACE} minio-client -- wc -l /tmp/long_running_job.py | awk '{print $1}') +if [ "$LINE_COUNT" -eq 0 ]; then + echo "āŒ Upload failed: File is empty in pod" + exit 1 +fi +echo " āœ“ File copied to pod ($LINE_COUNT lines)" + +# Upload to MinIO +echo " Uploading to MinIO..." +kubectl exec -n ${NAMESPACE} minio-client -- mc cp /tmp/long_running_job.py myminio/spark-scripts/ + +echo "āœ“ Uploaded successfully" +echo "" + +# Verify +echo "Verifying upload in MinIO..." +FILE_INFO=$(kubectl exec -n ${NAMESPACE} minio-client -- mc ls myminio/spark-scripts/long_running_job.py) +echo "$FILE_INFO" + +# Check if file size is 0B (indicates empty file) +if echo "$FILE_INFO" | grep -q "0B"; then + echo "" + echo "āŒ WARNING: File appears to be empty (0B) in MinIO!" + echo " This will cause Spark jobs to fail immediately." + exit 1 +fi + +echo "" +echo "==================================================" +echo "āœ… Setup Complete!" +echo "==================================================" +echo "" +echo "Now run:" +echo " python run_long_job_ui_validation.py" +echo "" diff --git a/kubeflow/spark/README.md b/kubeflow/spark/README.md new file mode 100644 index 000000000..a346439c5 --- /dev/null +++ b/kubeflow/spark/README.md @@ -0,0 +1,610 @@ +# Kubeflow Spark Client + +Cloud-native Python client for managing Apache Spark applications on Kubernetes using the Kubeflow Spark Operator. + +## Overview + +The Kubeflow Spark Client provides a Pythonic interface for submitting, monitoring, and managing Spark applications on Kubernetes. 
The SDK offers two specialized clients for different workloads: + +- **BatchSparkClient**: For batch Spark application submission and management +- **SparkSessionClient**: For interactive Spark Connect sessions + +### Key Features + +- **Specialized Clients**: Separate clients for batch jobs and interactive sessions +- **Cloud-Native Architecture**: Direct integration with Kubeflow Spark Operator CRDs +- **Multiple Backends**: Operator (K8s-native), Gateway (REST API), and Connect (gRPC) backends +- **Dynamic Resource Allocation**: Automatic executor scaling based on workload +- **Comprehensive Monitoring**: Prometheus metrics and Spark UI integration +- **Production-Ready**: Error handling, retries, and comprehensive logging +- **Type-Safe**: Clean APIs with proper type hints and IDE support + +## Architecture + +``` +BaseSparkClient (shared functionality) +ā”œā”€ā”€ BatchSparkClient (batch workloads) +│ └── Backend: BatchSparkBackend +│ ā”œā”€ā”€ OperatorBackend (Kubernetes CRDs) +│ └── GatewayBackend (REST API) +│ +└── SparkSessionClient (interactive workloads) + └── Backend: SessionSparkBackend + └── ConnectBackend (Spark Connect/gRPC) +``` + +### Design Principles + +The Spark client follows best practices and SOLID principles: + +1. **Interface Segregation**: Separate clients expose only relevant methods +2. **Backend Abstraction**: Pluggable backends for different platforms +3. **Type Safety**: Strong typing prevents runtime errors +4. **Kubernetes-Native**: Direct CRD manipulation for cloud-native deployments + +## Installation + +```bash +# Install from PyPI (when released) +pip install kubeflow + +# Or install from source +cd sdk +pip install -e . + +# For Spark Connect support +pip install 'pyspark[connect]>=3.4.0' +``` + +### Prerequisites + +**For BatchSparkClient with OperatorBackend** (recommended for batch jobs): +- Kubernetes cluster (1.16+) +- Kubeflow Spark Operator installed +- kubectl configured with proper context +- Service account with SparkApplication permissions + +**For BatchSparkClient with GatewayBackend**: +- Access to a Spark Gateway (e.g., Apache Livy) +- API credentials (if required) + +**For SparkSessionClient with ConnectBackend**: +- Spark cluster with Spark Connect server (Spark 3.4+) +- Network connectivity to Spark Connect endpoint +- PySpark with Connect support installed + +## Quick Start + +### Batch Jobs + +#### Basic Batch Application + +```python +from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + +# Create batch client (uses Operator backend by default) +client = BatchSparkClient() + +# Submit a Spark application +response = client.submit_application( + app_name="spark-pi", + main_application_file="local:///opt/spark/examples/src/main/python/pi.py", + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=2 +) + +print(f"Submitted: {response.submission_id}") + +# Wait for completion +status = client.wait_for_completion(response.submission_id) +print(f"Final state: {status.state}") + +# Get logs +for line in client.get_logs(response.submission_id): + print(line) +``` + +#### DataFrame Processing with S3 + +```python +from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + +# Configure client +config = OperatorBackendConfig( + namespace="spark-jobs", + enable_monitoring=True, + enable_ui=True, +) +client = BatchSparkClient(backend_config=config) + +# Submit DataFrame processing job +response = client.submit_application( + app_name="dataframe-analysis", + 
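    # NOTE: an s3a:// application file assumes the Spark image bundles the Hadoop S3A
    # connector; in real deployments, prefer mounting AWS credentials from a Kubernetes
    # Secret rather than passing literal values via env_vars.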
main_application_file="s3a://my-bucket/jobs/analysis.py", + spark_version="4.0.0", + driver_cores=2, + driver_memory="4g", + executor_cores=2, + executor_memory="8g", + num_executors=5, + spark_conf={ + "spark.sql.shuffle.partitions": "200", + "spark.hadoop.fs.s3a.endpoint": "s3.amazonaws.com", + }, + env_vars={ + "AWS_ACCESS_KEY_ID": "your-key", + "AWS_SECRET_ACCESS_KEY": "your-secret", + } +) +``` + +#### Advanced Features: Dynamic Allocation and Volumes + +```python +from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + +config = OperatorBackendConfig(namespace="default") +client = BatchSparkClient(backend_config=config) + +response = client.submit_application( + app_name="advanced-job", + main_application_file="local:///app/job.py", + spark_version="4.0.0", + driver_cores=2, + driver_memory="4g", + executor_cores=2, + executor_memory="8g", + num_executors=3, + + # Enable dynamic allocation + enable_dynamic_allocation=True, + initial_executors=2, + min_executors=1, + max_executors=10, + + # Configure volumes + volumes=[{ + "name": "data-volume", + "persistentVolumeClaim": {"claimName": "my-pvc"} + }], + driver_volume_mounts=[{ + "name": "data-volume", + "mountPath": "/data" + }], + + # Node selector and tolerations + node_selector={"node-type": "compute"}, + tolerations=[{ + "key": "spark", + "operator": "Equal", + "value": "true", + "effect": "NoSchedule" + }], +) +``` + +### Interactive Sessions + +#### Basic Interactive Session + +```python +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +# Connect to Spark Connect server +config = ConnectBackendConfig( + connect_url="sc://spark-cluster.default.svc:15002", + use_ssl=True, +) +client = SparkSessionClient(backend_config=config) + +# Create interactive session +session = client.create_session(app_name="data-exploration") + +# Use standard PySpark DataFrame API +df = session.sql("SELECT * FROM sales WHERE date >= '2024-01-01'") +result = df.groupBy("product").sum("amount").collect() + +for row in result: + print(f"{row.product}: {row['sum(amount)']}") + +# Cleanup +session.close() +``` + +#### Notebook Workflow + +```python +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +# Cell 1: Setup +config = ConnectBackendConfig(connect_url="sc://spark-cluster:15002") +client = SparkSessionClient(backend_config=config) +session = client.create_session("notebook-analysis") + +# Cell 2: Load data +df = session.read.parquet("s3a://bucket/data/") +df.show() + +# Cell 3: Feature engineering +features = df.withColumn("spend_per_year", df.spend_total / df.age) +features.describe().show() + +# Cell 4: Export results +session.export_to_pipeline_artifact(features, "/outputs/features.parquet") + +# Cell 5: Cleanup +session.close() +``` + +#### Session Management + +```python +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +config = ConnectBackendConfig(connect_url="sc://spark-cluster:15002") +client = SparkSessionClient(backend_config=config) + +# List all active sessions +sessions = client.list_sessions() +for session_info in sessions: + print(f"Session: {session_info.session_id}") + print(f" App: {session_info.app_name}") + print(f" Queries executed: {session_info.metrics.queries_executed}") + +# Get specific session status +session = client.create_session("my-app") +info = client.get_session_status(session.session_id) +print(f"Session state: {info.state}") + +# Close session +client.close_session(session.session_id, release=True) +``` + +## API Reference + +### 
BatchSparkClient + +Client for managing batch Spark applications. + +#### Constructor + +```python +BatchSparkClient(backend_config: Union[OperatorBackendConfig, GatewayBackendConfig, None] = None) +``` + +#### Methods + +**submit_application(...) → SparkApplicationResponse** +- Submit a new Spark application +- Returns submission ID and initial status + +**get_status(submission_id) → ApplicationStatus** +- Get current status of an application +- Returns state, app ID, executor info, timestamps + +**wait_for_completion(submission_id, timeout=3600, polling_interval=10) → ApplicationStatus** +- Block until application completes +- Returns final status + +**get_logs(submission_id, executor_id=None, follow=False) → Iterator[str]** +- Stream application logs +- Can retrieve driver or specific executor logs + +**list_applications(namespace=None, labels=None) → List[ApplicationStatus]** +- List applications with optional filtering +- Supports namespace and label filters + +**delete_application(submission_id) → Dict** +- Delete an application +- Stops running application and cleans up resources + +**wait_for_pod_ready(submission_id, executor_id=None, timeout=300) → bool** +- Wait for driver or executor pod to be ready +- Only available with OperatorBackend + +### SparkSessionClient + +Client for managing interactive Spark sessions. + +#### Constructor + +```python +SparkSessionClient(backend_config: ConnectBackendConfig) +``` + +#### Methods + +**create_session(app_name, **kwargs) → ManagedSparkSession** +- Create a new Spark Connect session +- Returns managed session with PySpark API access + +**get_session_status(session_id) → SessionInfo** +- Get status and metadata of a session +- Returns state, metrics, and session details + +**list_sessions() → List[SessionInfo]** +- List all active Spark Connect sessions + +**close_session(session_id, release=True) → Dict** +- Close a session and release resources + +### ManagedSparkSession + +Wrapper around PySpark SparkSession with Kubeflow enhancements. + +#### Properties + +- **session_id**: Unique session identifier +- **app_name**: Application name +- **spark**: Access to underlying PySpark SparkSession + +#### Methods + +**sql(query) → DataFrame** +- Execute SQL query and return DataFrame + +**read → DataFrameReader** +- Access DataFrameReader for reading data sources + +**readStream → DataStreamReader** +- Access DataStreamReader for streaming sources + +**upload_artifacts(*paths, pyfile=False)** +- Upload JARs or Python files to session + +**get_metrics() → SessionMetrics** +- Get session metrics (queries executed, artifacts uploaded) + +**close(release=True)** +- Close the session + +### Backend Configurations + +#### OperatorBackendConfig + +Configuration for Kubernetes Spark Operator backend. + +```python +from kubeflow.spark import OperatorBackendConfig + +config = OperatorBackendConfig( + namespace="default", + context=None, + service_account="spark-operator-spark", + image_pull_policy="IfNotPresent", + default_spark_image="docker.io/library/spark", + enable_monitoring=True, + enable_ui=True, + timeout=60, +) +``` + +#### GatewayBackendConfig + +Configuration for REST Gateway backend. + +```python +from kubeflow.spark import GatewayBackendConfig + +config = GatewayBackendConfig( + gateway_url="http://gateway:8080", + user="myuser", + password="mypassword", + timeout=30, + verify_ssl=True, +) +``` + +#### ConnectBackendConfig + +Configuration for Spark Connect backend. 
+ +```python +from kubeflow.spark import ConnectBackendConfig + +config = ConnectBackendConfig( + connect_url="sc://spark-cluster.default.svc:15002", + token="bearer-token", # Optional + use_ssl=True, +) +``` + +## Choosing the Right Client + +### Use BatchSparkClient when: +- Running scheduled ETL pipelines +- Submitting production batch jobs +- Integrating with CI/CD workflows +- Need dynamic allocation and auto-scaling +- Running jobs as Kubernetes CRDs + +### Use SparkSessionClient when: +- Performing interactive data exploration +- Working in Jupyter or IPython notebooks +- Iterative development and testing +- Need immediate feedback from queries +- Connecting to remote Spark clusters + +## Examples + +The `examples/spark/` directory contains comprehensive examples: + +**Batch Examples:** +- `01_hello_spark_pi.py`: Basic Spark Pi calculation +- `02_csv_data_analysis.py`: CSV data processing +- `04_etl_pipeline_simple.py`: ETL pipeline example +- `05_scheduled_batch_job.py`: Scheduled job pattern +- `06_autoscaling_dynamic_allocation.py`: Dynamic allocation + +**Interactive Session Examples:** +- `07_spark_connect_interactive.py`: Interactive data analysis +- `ipython_spark_connect_demo.py`: IPython integration +- `ipython_spark_connect_shell.py`: Interactive shell + +Run examples: + +```bash +cd examples/spark + +# Batch example +python 01_hello_spark_pi.py + +# Interactive session example +python 07_spark_connect_interactive.py +``` + +## Testing + +### Setup Test Environment + +Use the provided script to set up a Kind cluster with Spark Operator: + +```bash +cd examples/spark +./setup_test_environment.sh +``` + +This will: +1. Create a Kind cluster +2. Install Spark Operator +3. Configure RBAC and service accounts +4. Verify the installation + +### Run Integration Tests + +```bash +python test_spark_client_integration.py +``` + +### Cleanup + +```bash +kind delete cluster --name spark-test +``` + +## Monitoring and Debugging + +### Access Spark UI + +Port forward to Spark UI: +```bash +kubectl port-forward -n default svc/spark-ui 4040:4040 +``` + +Open in browser: http://localhost:4040 + +### View Application Logs + +Using BatchSparkClient: +```python +# Stream driver logs +for line in client.get_logs(submission_id): + print(line) + +# Get executor logs +for line in client.get_logs(submission_id, executor_id="1"): + print(line) +``` + +Using kubectl: +```bash +# Driver logs +kubectl logs -driver -n default + +# Executor logs +kubectl logs -exec-1 -n default +``` + +### Debug Mode + +Enable debug logging: + +```python +import logging +logging.basicConfig(level=logging.DEBUG) + +from kubeflow.spark import BatchSparkClient +client = BatchSparkClient() +``` + +## Troubleshooting + +### Common Issues + +**1. ImportError: No module named 'pyspark'** + +For SparkSessionClient, install PySpark with Connect support: +```bash +pip install 'pyspark[connect]>=3.4.0' +``` + +**2. SparkApplication not being created** + +Check Spark Operator is running: +```bash +kubectl get pods -n spark-operator +``` + +**3. Cannot connect to Spark Connect server** + +Verify the server is running and accessible: +```bash +kubectl get svc -n default | grep spark +kubectl port-forward svc/spark-connect 15002:15002 +``` + +**4. 
Permission denied** + +Verify service account permissions: +```bash +kubectl auth can-i create pods --as=system:serviceaccount:default:spark-operator-spark +``` + +## Comparison with Trainer Client + +| Aspect | Trainer Client | Spark Client | +|--------|---------------|--------------| +| **CRD** | TrainJob | SparkApplication | +| **Operator** | Training Operator | Spark Operator | +| **Client Classes** | TrainingClient | BatchSparkClient, SparkSessionClient | +| **Backends** | Kubernetes, LocalProcess | Operator, Gateway, Connect | +| **Workload Types** | Batch training jobs | Batch jobs + interactive sessions | +| **API Style** | train(), list_jobs() | submit_application(), create_session() | + +Both clients provide: +- Backend abstraction for flexibility +- Kubernetes-native CRD management +- Status monitoring with polling +- Log streaming capabilities +- Context manager support + +## Contributing + +Contributions are welcome. Please: + +1. Fork the repository +2. Create a feature branch +3. Add tests for new functionality +4. Ensure all tests pass +5. Submit a pull request + +## License + +Apache License 2.0 + +## References + +- [Kubeflow Spark Operator](https://github.com/kubeflow/spark-operator) +- [Apache Spark on Kubernetes](https://spark.apache.org/docs/latest/running-on-kubernetes.html) +- [Spark Connect](https://spark.apache.org/docs/latest/spark-connect-overview.html) +- [Kubeflow Training Client](https://github.com/kubeflow/training-operator) + +## Support + +For issues and questions: +- GitHub Issues: [kubeflow/sdk](https://github.com/kubeflow/sdk/issues) +- Slack: #kubeflow-spark +- Mailing List: kubeflow-discuss@googlegroups.com diff --git a/kubeflow/spark/SPARK_CONNECT_DESIGN.md b/kubeflow/spark/SPARK_CONNECT_DESIGN.md new file mode 100644 index 000000000..3991edba5 --- /dev/null +++ b/kubeflow/spark/SPARK_CONNECT_DESIGN.md @@ -0,0 +1,643 @@ +# Spark Connect Integration Design + +**Version:** 2.0 +**Status:** Implementation Complete +**Last Updated:** 2025-11-23 + +## Overview + +This document describes the architecture and design of Spark Connect support in Kubeflow Spark SDK, enabling interactive, session-based Spark workloads through a dedicated client class. 
+ +**Key Features:** +- Remote connectivity to Spark clusters via gRPC (Spark Connect protocol) +- Specialized clients for batch jobs and interactive sessions +- Native PySpark API compatibility with Kubeflow enhancements +- Kubernetes-native integration with automatic secret/config injection +- Type-safe API following Interface Segregation Principle + +--- + +## Architecture + +### System Components + +``` +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ User Code (Python) │ +│ │ +│ Batch Jobs: Interactive Sessions: │ +│ from kubeflow.spark import from kubeflow.spark import │ +│ BatchSparkClient SparkSessionClient │ +│ client = BatchSparkClient() client = SparkSessionClient() │ +│ client.submit_application() session = client.create_session()│ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ │ + ā–¼ ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ BatchSparkClient │ │ SparkSessionClient │ +│ │ │ │ +│ • submit_application() │ │ • create_session() │ +│ • get_status() │ │ • list_sessions() │ +│ • wait_for_completion() │ │ • close_session() │ +│ • delete_application() │ │ • get_session_status() │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ │ + ā”Œā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā” │ + ā–¼ ā–¼ ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ Operator │ │ Gateway │ │ Connect │ +│ Backend │ │ Backend │ │ Backend │ +│ │ │ │ │ │ +│ (Batch) │ │ (Batch) │ │ (Session) │ +ā””ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ │ │ + ā–¼ ā–¼ ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ Spark │ │ Livy/ │ │ Spark │ +│ Operator │ │ Gateway │ │ Connect │ +│(K8s CRDs)│ │ (HTTP) │ │ (gRPC) │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +### ConnectBackend Architecture + +``` +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ SparkSessionClient │ +│ │ +│ create_session(app_name) │ +│ ↓ │ +│ Delegates to ConnectBackend │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ + ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ ConnectBackend │ +│ (SessionSparkBackend) │ +│ │ +│ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │ +│ │ create_session(app_name, **config) │ │ +│ │ ↓ │ │ +│ │ 1. Generate session_id (UUID) │ │ +│ │ 2. 
Build connection URL │ │ +│ │ 3. Create PySpark SparkSession.builder.remote() │ │ +│ │ 4. Wrap in ManagedSparkSession │ │ +│ │ 5. Track in _sessions dict │ │ +│ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ +│ │ +│ Session Management: │ +│ • list_sessions() │ +│ • get_session_status(session_id) │ +│ • close_session(session_id) │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ + ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ ManagedSparkSession (Wrapper) │ +│ │ +│ PySpark API (delegated): Kubeflow Extensions: │ +│ • sql(query) • get_metrics() │ +│ • createDataFrame(data) • get_info() │ +│ • read.parquet() • upload_artifacts() │ +│ • All DataFrame operations • context manager │ +│ │ +│ Wraps: pyspark.sql.SparkSession (Spark Connect) │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +### Data Flow + +``` +User Code → SparkSessionClient → ConnectBackend → gRPC → Spark Connect Server + ↓ + Spark Cluster + (Driver + Executors) + ↓ +Results ← ManagedSparkSession ← gRPC Stream ← Spark Connect Server +``` + +--- + +## Design Principles + +### 1. Specialized Client Classes + +Separate clients for different workloads: + +```python +# Batch jobs +from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + +config = OperatorBackendConfig(namespace="spark-jobs") +client = BatchSparkClient(backend_config=config) +response = client.submit_application(app_name="batch-job", ...) + +# Interactive sessions +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +config = ConnectBackendConfig(connect_url="sc://spark-connect:15002") +client = SparkSessionClient(backend_config=config) +session = client.create_session(app_name="analysis") +``` + +Benefits: +- Type safety: Clients only expose relevant methods +- No runtime errors from unsupported operations +- Clear API boundaries +- Follows Interface Segregation Principle + +### 2. Native PySpark Delegation + +`ManagedSparkSession` delegates all DataFrame operations to native PySpark: + +```python +# These call PySpark directly - no wrapping overhead +df = session.sql("SELECT * FROM table") +df = session.createDataFrame(data, schema) +df = session.read.parquet("s3://bucket/data") +result = df.filter(df.age > 30).collect() +``` + +Benefits: +- Full PySpark API compatibility +- Zero wrapping overhead for DataFrame operations +- Automatic updates when PySpark adds new features +- Standard PySpark documentation applies + +### 3. Backend Abstraction + +Backends implement specialized abstract base classes: + +```python +# Base interface (minimal shared functionality) +class SparkBackend(abc.ABC): + def close(self): pass + +# Batch workloads +class BatchSparkBackend(SparkBackend): + def submit_application(...) -> SparkApplicationResponse + def get_status(app_id) -> ApplicationStatus + def delete_application(app_id) -> Dict + def get_logs(...) -> Iterator[str] + def list_applications(...) -> List[ApplicationStatus] + def wait_for_completion(...) 
-> ApplicationStatus + +# Session workloads +class SessionSparkBackend(SparkBackend): + def create_session(app_name, **kwargs) -> ManagedSparkSession + def list_sessions() -> List[SessionInfo] + def close_session(session_id) -> Dict[str, Any] + def get_session_status(session_id) -> SessionInfo +``` + +Implementation hierarchy: +- OperatorBackend extends BatchSparkBackend +- GatewayBackend extends BatchSparkBackend +- ConnectBackend extends SessionSparkBackend + +--- + +## Component Details + +### ConnectBackendConfig + +Configuration for Spark Connect connectivity: + +```python +@dataclass +class ConnectBackendConfig: + connect_url: str # "sc://host:port" + token: Optional[str] = None # Bearer token for auth + use_ssl: bool = True # Enable TLS + user_id: Optional[str] = None # User identity + timeout: int = 300 # Connection timeout (seconds) + grpc_max_message_size: int = 128MB # gRPC message limit + namespace: str = "default" # K8s namespace +``` + +### ManagedSparkSession + +Kubeflow wrapper around native PySpark Connect session: + +```python +class ManagedSparkSession: + # Properties + @property + def session_id(self) -> str # Unique session UUID + @property + def app_name(self) -> str # Application name + @property + def is_closed(self) -> bool # Session state + + # PySpark API (delegated to self._session) + def sql(self, query: str) -> DataFrame + def createDataFrame(self, data, schema) -> DataFrame + def read(self) -> DataFrameReader + def table(self, table_name: str) -> DataFrame + def range(self, start, end, step) -> DataFrame + + # Kubeflow extensions + def get_metrics(self) -> SessionMetrics + def get_info(self) -> SessionInfo + def upload_artifacts(self, *paths) -> None + def close(self) -> None + + # Context manager support + def __enter__(self) -> "ManagedSparkSession" + def __exit__(self, exc_type, exc_val, exc_tb) -> None +``` + +### SessionMetrics + +Tracks session activity: + +```python +@dataclass +class SessionMetrics: + session_id: str + queries_executed: int = 0 # SQL/DataFrame operations + active_queries: int = 0 # Currently running queries + artifacts_uploaded: int = 0 # Uploaded JARs/files + data_read_bytes: int = 0 # Data read + data_written_bytes: int = 0 # Data written + execution_time_ms: int = 0 # Total execution time +``` + +--- + +## Usage Examples + +### Basic Connection + +```python +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +# Configure connection +config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, +) + +# Create client and session +client = SparkSessionClient(backend_config=config) +session = client.create_session(app_name="demo") + +# Use standard PySpark API +df = session.sql("SELECT 1 AS id, 'Hello' AS message") +df.show() + +# Cleanup +session.close() +client.close() +``` + +### Context Manager Pattern + +```python +from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + +config = ConnectBackendConfig(connect_url="sc://spark-server:15002") + +with SparkSessionClient(backend_config=config) as client: + with client.create_session(app_name="analysis") as session: + # Session auto-closes on exit + df = session.sql("SELECT * FROM sales") + result = df.filter(df.amount > 100).collect() +``` + +### DataFrame Operations + +```python +# Create DataFrame from Python data +sales_data = [ + (1, "Electronics", "Laptop", 1200.00, 2), + (2, "Electronics", "Mouse", 25.00, 5), + (3, "Clothing", "Shirt", 35.00, 3), +] + +df = session.createDataFrame( + sales_data, + ["id", "category", 
"product", "price", "quantity"] +) + +# Show data +df.show() +# +---+-----------+--------+------+--------+ +# | id| category| product| price|quantity| +# +---+-----------+--------+------+--------+ +# | 1|Electronics| Laptop|1200.0| 2| +# | 2|Electronics| Mouse| 25.0| 5| +# | 3| Clothing| Shirt| 35.0| 3| +# +---+-----------+--------+------+--------+ +``` + +### Aggregations and GroupBy + +```python +from pyspark.sql import functions as F + +# Calculate revenue +revenue_df = df.withColumn("revenue", F.col("price") * F.col("quantity")) + +# Group by category with multiple aggregations +category_stats = revenue_df.groupBy("category").agg( + F.sum("revenue").alias("total_revenue"), + F.avg("price").alias("avg_price"), + F.count("*").alias("num_transactions") +) + +category_stats.show() +# +-----------+-------------+---------+----------------+ +# | category|total_revenue|avg_price|num_transactions| +# +-----------+-------------+---------+----------------+ +# | Clothing| 105.0| 35.0| 1| +# |Electronics| 2525.0| 612.5| 2| +# +-----------+-------------+---------+----------------+ + +# Sort by revenue +category_stats.orderBy(F.desc("total_revenue")).show() +``` + +### Window Functions + +```python +from pyspark.sql import functions as F +from pyspark.sql.window import Window + +# Running total by date +window_spec = Window.orderBy("date").rowsBetween( + Window.unboundedPreceding, + Window.currentRow +) + +daily_revenue = revenue_df.groupBy("date").agg( + F.sum("revenue").alias("daily_revenue") +).withColumn( + "running_total", + F.sum("daily_revenue").over(window_spec) +) + +daily_revenue.orderBy("date").show() +``` + +### Session Management + +```python +# List all active sessions +sessions = client.list_sessions() +for s in sessions: + print(f"Session: {s.session_id}, App: {s.app_name}, State: {s.state}") + +# Get session status +status = client.get_session_status(session.session_id) +print(f"Session state: {status.state}") + +# Get metrics +metrics = session.get_metrics() +print(f"Queries executed: {metrics.queries_executed}") +print(f"Active queries: {metrics.active_queries}") + +# Get session info +info = session.get_info() +print(f"App: {info.app_name}, State: {info.state}") +``` + +### Multiple Concurrent Sessions + +```python +with SparkSessionClient(backend_config=config) as client: + # Create multiple sessions + session1 = client.create_session(app_name="analysis-1") + session2 = client.create_session(app_name="analysis-2") + + try: + # Each session is independent + df1 = session1.sql("SELECT 'session1' AS source") + df2 = session2.sql("SELECT 'session2' AS source") + + print(df1.collect()) # [Row(source='session1')] + print(df2.collect()) # [Row(source='session2')] + finally: + session1.close() + session2.close() +``` + +--- + +## Deployment Guide + +### Kubernetes Setup + +#### 1. Deploy Spark Connect Server + +Use the provided Kubernetes manifest: + +```bash +# Deploy Spark Connect server +kubectl apply -f examples/spark/spark-connect-server.yaml + +# Verify deployment +kubectl get pods -l app=spark-connect +kubectl logs -l app=spark-connect -f +``` + +#### 2. Port Forwarding (Local Development) + +```bash +# Forward Spark Connect port to localhost +kubectl port-forward -n default svc/spark-connect 30000:15002 + +# Verify connectivity +nc -zv localhost 30000 +``` + +#### 3. 
Connect from Python + +```python +config = ConnectBackendConfig( + connect_url="sc://localhost:30000", # Local port forward + use_ssl=False, +) + +client = SparkSessionClient(backend_config=config) +session = client.create_session(app_name="my-app") +``` + +### Production Setup + +For production, use Kubernetes DNS: + +```python +config = ConnectBackendConfig( + connect_url="sc://spark-connect.default.svc.cluster.local:15002", + use_ssl=True, + token=os.getenv("SPARK_TOKEN"), # From K8s secret +) +``` + +--- + +## Interactive Demo + +### Quick Start + +```bash +# 1. Setup Kubernetes cluster with Spark Connect +cd examples/spark +./setup_spark_connect.sh + +# 2. Install dependencies +pip install 'pyspark[connect]>=4.0.0' + +# 3. Launch IPython shell +python ipython_spark_connect_shell.py +``` + +### Step-by-Step Tutorial + +The IPython shell provides a guided tutorial. Key steps: + +```python +# 1. Create config and client +config = ConnectBackendConfig( + connect_url="sc://localhost:30000", + use_ssl=False, +) +client = SparkSessionClient(backend_config=config) + +# 2. Create session +session = client.create_session(app_name="tutorial") + +# 3. Simple query +df = session.sql("SELECT 1 AS id, 'Hello' AS msg") +df.show() + +# 4. Create DataFrame +data = [ + (1, "Electronics", 1200.00), + (2, "Clothing", 35.00), +] +df = session.createDataFrame(data, ["id", "category", "price"]) +df.show() + +# 5. Aggregations +from pyspark.sql import functions as F +df.groupBy("category").agg(F.avg("price")).show() + +# 6. Cleanup +session.close() +client.close() +``` + +--- + +## Key Design Decisions + +### 1. Separate Client Classes (Interface Segregation) + +**Decision:** Use BatchSparkClient and SparkSessionClient instead of unified client + +**Rationale:** +- Different use cases have distinct method requirements +- Batch: submit_application, wait_for_completion, delete_application +- Session: create_session, list_sessions, close_session +- Prevents runtime NotImplementedError exceptions +- Type-safe: clients only expose relevant methods + +**Benefits:** +- Compile-time type checking +- Clear API boundaries +- No confusion about which methods work with which backend +- IDE autocomplete shows only valid methods + +### 2. Delegation to Native PySpark + +**Decision:** Delegate DataFrame operations to native PySpark + +**Alternatives Considered:** +- Wrap all PySpark methods → Rejected (maintenance burden) +- Custom DataFrame implementation → Rejected (no value-add) + +**Benefits:** +- Zero wrapping overhead +- Full PySpark compatibility +- Automatic feature updates + +### 3. URL Parameter Handling (Spark 4.0) + +**Decision:** Use simple URL format without parameters + +**Issue:** Spark Connect 4.0 doesn't support URL parameters like `/;use_ssl=false` + +**Solution:** Pass configuration via `builder.config()` instead of URL + +```python +# Before (doesn't work in Spark 4.0) +url = "sc://host:port/;use_ssl=false" + +# After (works) +url = "sc://host:port" +builder.config("spark.ssl.enabled", "false") +``` + +### 4. 
IPv4 vs IPv6 Binding + +**Issue:** Spark Connect server was binding to IPv6 (:::15002) causing connection failures + +**Solution:** Force IPv4 binding via Java options + +```yaml +env: +- name: JAVA_TOOL_OPTIONS + value: "-Djava.net.preferIPv4Stack=true" +``` + +--- + +## Version Compatibility + +| Component | Version | Notes | +|-----------|---------|-------| +| PySpark Client | 4.0.x | Must match server version | +| Spark Connect Server | 4.0.0 | Running in Kubernetes | +| Kubeflow SDK | Latest | This implementation | +| Kubernetes | 1.24+ | For Spark Operator | +| Python | 3.8+ | Required for PySpark | + +**Important:** Client and server versions must match. PySpark 4.0 cannot connect to Spark 3.5 Connect servers. + +--- + +## Resources + +### Files Created + +- `kubeflow/spark/backends/connect.py` - ConnectBackend implementation +- `kubeflow/spark/session.py` - ManagedSparkSession wrapper +- `kubeflow/spark/models.py` - Data models (ConnectBackendConfig, SessionMetrics, SessionInfo) +- `examples/spark/ipython_spark_connect_shell.py` - Interactive demo shell +- `examples/spark/ipython_spark_connect_demo.py` - Automated demo +- `examples/spark/spark-connect-server.yaml` - Kubernetes deployment +- `examples/spark/setup_spark_connect.sh` - Setup automation +- `examples/spark/SPARK_CONNECT_DEMO.md` - Demo documentation + +### Documentation + +- [Spark Connect Overview](https://spark.apache.org/docs/latest/spark-connect-overview.html) +- [PySpark Connect API](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/connect.html) +- [Demo Guide](examples/spark/SPARK_CONNECT_DEMO.md) + +--- + +## Summary + +Spark Connect integration provides: + +- **Specialized Clients** - BatchSparkClient for batch jobs, SparkSessionClient for interactive sessions +- **Type Safety** - Interface Segregation Principle prevents runtime errors +- **Native PySpark** - Full DataFrame API with zero overhead +- **Kubernetes-Native** - Automatic config/secret injection +- **Production-Ready** - Session management, metrics, error handling +- **Developer-Friendly** - Context managers, IPython integration, examples + +The implementation follows SOLID design principles while providing the full power of PySpark Connect for interactive data analysis and ML workloads. diff --git a/kubeflow/spark/__init__.py b/kubeflow/spark/__init__.py new file mode 100644 index 000000000..25880dfca --- /dev/null +++ b/kubeflow/spark/__init__.py @@ -0,0 +1,158 @@ +"""Kubeflow Spark Client for managing Spark applications on Kubernetes. 
+ +This module provides specialized Python clients for managing Apache Spark applications +on Kubernetes using different backends: + +**Batch Jobs:** +- **BatchSparkClient**: For batch Spark application submission and management + - OperatorBackend: Cloud-native using Kubeflow Spark Operator (recommended) + - GatewayBackend: REST API for managed Spark gateways + +**Interactive Sessions:** +- **SparkSessionClient**: For interactive Spark Connect sessions + - ConnectBackend: gRPC-based remote connectivity for notebooks and exploration + +Quick Start (Batch Jobs): + ```python + from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + + # Create batch client (uses Operator backend by default) + client = BatchSparkClient() + + # Submit a Spark application + response = client.submit_application( + app_name="spark-pi", + main_application_file="local:///opt/spark/examples/src/main/python/pi.py", + driver_cores=1, + driver_memory="512m", + executor_cores=1, + executor_memory="512m", + num_executors=2, + ) + + # Wait for completion + status = client.wait_for_job_status(response.submission_id) + print(f"Application state: {status.state}") + ``` + +Quick Start (Interactive Sessions): + ```python + from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + + # Connect to existing Spark cluster + config = ConnectBackendConfig(connect_url="sc://spark-cluster:15002") + client = SparkSessionClient(backend_config=config) + + # Create interactive session + session = client.create_session(app_name="data-analysis") + + # Use standard PySpark API + df = session.sql("SELECT * FROM table") + result = df.filter(df.status == "active").collect() + + # Cleanup + session.close() + ``` + +For more examples, see the examples/ directory. +""" + +# Import client classes +from kubeflow.spark.base_client import BaseSparkClient +from kubeflow.spark.batch_client import BatchSparkClient +from kubeflow.spark.session_client import SparkSessionClient + +# Import backends and configs +from kubeflow.spark.backends import ( + BatchSparkBackend, + ConnectBackend, + ConnectBackendConfig, + GatewayBackend, + GatewayBackendConfig, + OperatorBackend, + OperatorBackendConfig, + SessionSparkBackend, + SparkBackend, +) + +# Import models +from kubeflow.spark.models import ( + # States & Enums + ApplicationState, + # Status Models + ApplicationStatus, + BatchSchedulerConfig, + DeployMode, + DynamicAllocation, + GPUSpec, + MonitoringSpec, + PrometheusSpec, + # Configuration Models + RestartPolicy, + RestartPolicyType, + # Session Models (for Spark Connect) + SessionInfo, + SessionMetrics, + # Request & Response + SparkApplicationRequest, + SparkApplicationResponse, + SparkUIConfiguration, +) + +# Import session management +from kubeflow.spark.session import ManagedSparkSession + +# Import validation +from kubeflow.spark.validation import ( + SparkApplicationValidator, + ValidationError, + ValidationErrorType, + ValidationResult, + validate_spark_application, +) + +__all__ = [ + # Client classes + "BaseSparkClient", + "BatchSparkClient", + "SparkSessionClient", + # Backends (base classes) + "SparkBackend", + "BatchSparkBackend", + "SessionSparkBackend", + # Backend implementations + "OperatorBackend", + "OperatorBackendConfig", + "GatewayBackend", + "GatewayBackendConfig", + "ConnectBackend", + "ConnectBackendConfig", + # Session Management (Spark Connect) + "ManagedSparkSession", + "SessionInfo", + "SessionMetrics", + # Request & Response Models + "SparkApplicationRequest", + "SparkApplicationResponse", + 
"ApplicationStatus", + # States & Enums + "ApplicationState", + "RestartPolicyType", + "DeployMode", + # Configuration Models + "RestartPolicy", + "GPUSpec", + "DynamicAllocation", + "BatchSchedulerConfig", + "PrometheusSpec", + "MonitoringSpec", + "SparkUIConfiguration", + # Validation + "validate_spark_application", + "SparkApplicationValidator", + "ValidationResult", + "ValidationError", + "ValidationErrorType", +] + +__version__ = "0.2.0" diff --git a/kubeflow/spark/backends/__init__.py b/kubeflow/spark/backends/__init__.py new file mode 100644 index 000000000..4e1db855c --- /dev/null +++ b/kubeflow/spark/backends/__init__.py @@ -0,0 +1,24 @@ +"""Spark backends for different execution environments.""" + +from kubeflow.spark.backends.base import ( + BatchSparkBackend, + SessionSparkBackend, + SparkBackend, +) +from kubeflow.spark.backends.connect import ConnectBackend, ConnectBackendConfig +from kubeflow.spark.backends.gateway import GatewayBackend, GatewayBackendConfig +from kubeflow.spark.backends.operator import OperatorBackend, OperatorBackendConfig + +__all__ = [ + # Base classes + "SparkBackend", + "BatchSparkBackend", + "SessionSparkBackend", + # Backend implementations + "OperatorBackend", + "OperatorBackendConfig", + "GatewayBackend", + "GatewayBackendConfig", + "ConnectBackend", + "ConnectBackendConfig", +] diff --git a/kubeflow/spark/backends/base.py b/kubeflow/spark/backends/base.py new file mode 100644 index 000000000..f42be726a --- /dev/null +++ b/kubeflow/spark/backends/base.py @@ -0,0 +1,323 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base backend interfaces for Spark applications. + +This module defines the backend interface hierarchy for the Kubeflow Spark SDK: + +- SparkBackend: Minimal base class with common functionality +- BatchSparkBackend: Interface for batch job submission (OperatorBackend, GatewayBackend) +- SessionSparkBackend: Interface for interactive sessions (ConnectBackend) + +This design follows the Interface Segregation Principle (ISP), ensuring that +backends only implement methods relevant to their use case. +""" + +import abc +from collections.abc import Iterator +from typing import TYPE_CHECKING, Any, Optional + +from kubeflow.spark.models import ApplicationStatus, SessionInfo, SparkApplicationResponse + +if TYPE_CHECKING: + from kubeflow.spark.session import ManagedSparkSession + + +class SparkBackend(abc.ABC): + """Minimal base class for all Spark backends. + + This class provides only the essential functionality common to all backends. + Specific backend types (batch or session) inherit from BatchSparkBackend or + SessionSparkBackend respectively. + + All backends should implement the close() method to clean up resources. + """ + + def close(self): + """Close any open connections or resources. 
+ + Subclasses should override this to clean up resources like: + - Kubernetes API clients + - HTTP connections + - gRPC channels + - File handles + + This method is called when the client is closed or when used as a context manager. + """ + pass + + +class BatchSparkBackend(SparkBackend): + """Abstract base class for batch-oriented Spark backends. + + This interface defines the contract for backends that support traditional + batch Spark application submission, monitoring, and management. + + Backends implementing this interface: + - OperatorBackend: Submits SparkApplication CRDs to Kubernetes + - GatewayBackend: Submits jobs via REST API to Spark gateways + + Typical workflow: + 1. submit_application() -> Returns submission_id + 2. wait_for_job_status() or poll get_job() + 3. get_job_logs() to retrieve output + 4. delete_job() for cleanup + """ + + @abc.abstractmethod + def submit_application( + self, + app_name: str, + main_application_file: str, + spark_version: str, + app_type: str, + driver_cores: int, + driver_memory: str, + executor_cores: int, + executor_memory: str, + num_executors: int, + queue: Optional[str], + arguments: Optional[list[str]], + python_version: str, + spark_conf: Optional[dict[str, str]], + hadoop_conf: Optional[dict[str, str]], + env_vars: Optional[dict[str, str]], + deps: Optional[dict[str, list[str]]], + **kwargs: Any, + ) -> SparkApplicationResponse: + """Submit a Spark application for batch execution. + + Args: + app_name: Name of the application + main_application_file: Path to main application file (local://, s3a://, etc.) + spark_version: Spark version to use (e.g., "4.0.0") + app_type: Application type ("Python", "Scala", "Java", "R") + driver_cores: Number of cores for driver + driver_memory: Memory for driver (e.g., "4g", "512m") + executor_cores: Number of cores per executor + executor_memory: Memory per executor (e.g., "8g", "2g") + num_executors: Number of executors to provision + queue: Queue/namespace to submit to (backend-specific) + arguments: Application arguments passed to main file + python_version: Python version for PySpark apps (e.g., "3") + spark_conf: Spark configuration properties (spark.*) + hadoop_conf: Hadoop configuration properties + env_vars: Environment variables for driver and executors + deps: Dependencies dict with keys: "jars", "pyFiles", "files" + **kwargs: Additional backend-specific parameters + + Returns: + SparkApplicationResponse with submission_id and initial status + + Raises: + RuntimeError: If submission fails + TimeoutError: If submission times out + ValueError: If invalid parameters provided + """ + raise NotImplementedError() + + @abc.abstractmethod + def get_job(self, submission_id: str) -> ApplicationStatus: + """Get current status of a Spark application. + + Args: + submission_id: Submission ID returned from submit_application() + + Returns: + ApplicationStatus with current state and metadata + + Raises: + RuntimeError: If request fails + TimeoutError: If request times out + ValueError: If submission_id not found + """ + raise NotImplementedError() + + @abc.abstractmethod + def delete_job(self, submission_id: str) -> dict[str, Any]: + """Delete a Spark application. + + This terminates a running application or removes a completed application. 
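+
+        A minimal usage sketch (the submission ID is illustrative):
+
+            backend.delete_job("spark-pi-143022")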
+ + Args: + submission_id: Submission ID to delete + + Returns: + Dictionary with deletion response and status + + Raises: + RuntimeError: If deletion fails + TimeoutError: If deletion times out + ValueError: If submission_id not found + """ + raise NotImplementedError() + + @abc.abstractmethod + def get_job_logs( + self, + submission_id: str, + executor_id: Optional[str] = None, + follow: bool = False, + ) -> Iterator[str]: + """Get application logs. + + Args: + submission_id: Submission ID + executor_id: Optional executor ID (if not provided, returns driver logs) + follow: Whether to stream logs in real-time (tail -f behavior) + + Yields: + Log lines as strings + + Raises: + RuntimeError: If request fails + ValueError: If submission_id or executor_id not found + """ + raise NotImplementedError() + + @abc.abstractmethod + def list_jobs( + self, + namespace: Optional[str] = None, + labels: Optional[dict[str, str]] = None, + ) -> list[ApplicationStatus]: + """List Spark applications with optional filtering. + + Args: + namespace: Optional namespace/queue filter + labels: Optional label filters (key-value pairs) + + Returns: + List of ApplicationStatus objects + + Raises: + RuntimeError: If request fails + TimeoutError: If request times out + """ + raise NotImplementedError() + + @abc.abstractmethod + def wait_for_job_status( + self, + submission_id: str, + timeout: int = 3600, + polling_interval: int = 10, + ) -> ApplicationStatus: + """Wait for Spark application to complete. + + This method blocks until the application reaches a terminal state + (COMPLETED, FAILED, SUBMISSION_FAILED, KILLED) or timeout is reached. + + Args: + submission_id: Submission ID to monitor + timeout: Maximum time to wait in seconds (default: 1 hour) + polling_interval: Polling interval in seconds (default: 10) + + Returns: + Final ApplicationStatus + + Raises: + TimeoutError: If application doesn't complete within timeout + RuntimeError: If monitoring fails + ValueError: If submission_id not found + """ + raise NotImplementedError() + + +class SessionSparkBackend(SparkBackend): + """Abstract base class for session-oriented Spark backends. + + This interface defines the contract for backends that support interactive, + long-lived Spark sessions for exploratory data analysis and notebook workflows. + + Backends implementing this interface: + - ConnectBackend: Connects to Spark clusters via Spark Connect protocol (gRPC) + + Typical workflow: + 1. create_session() -> Returns ManagedSparkSession + 2. Use session.sql(), session.read(), etc. for interactive queries + 3. close_session() to release resources + + Unlike batch backends, sessions maintain state and support iterative development. + """ + + @abc.abstractmethod + def create_session( + self, + app_name: str, + **kwargs: Any, + ) -> "ManagedSparkSession": + """Create a new Spark Connect session. + + This establishes a connection to a Spark Connect server and returns + a managed session wrapper that provides the full PySpark DataFrame API. + + Args: + app_name: Name for the session/application + **kwargs: Backend-specific configuration (e.g., Spark configs) + + Returns: + ManagedSparkSession instance for interactive operations + + Raises: + RuntimeError: If session creation fails + ConnectionError: If cannot connect to Spark Connect server + TimeoutError: If connection times out + """ + raise NotImplementedError() + + @abc.abstractmethod + def get_session_status(self, session_id: str) -> SessionInfo: + """Get status and metadata of a Spark Connect session. 
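+
+        A minimal usage sketch (assumes a session previously returned by create_session()):
+
+            info = backend.get_session_status(session.session_id)
+            print(info.app_name, info.state)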
+ + Args: + session_id: Session UUID returned by create_session() + + Returns: + SessionInfo with session metadata, state, and metrics + + Raises: + RuntimeError: If request fails + ValueError: If session_id not found + """ + raise NotImplementedError() + + @abc.abstractmethod + def list_sessions(self) -> list[SessionInfo]: + """List all active Spark Connect sessions. + + Returns: + List of SessionInfo objects for active sessions + + Raises: + RuntimeError: If request fails + """ + raise NotImplementedError() + + @abc.abstractmethod + def close_session(self, session_id: str, release: bool = True) -> dict[str, Any]: + """Close a Spark Connect session. + + Args: + session_id: Session UUID to close + release: If True, release session resources on server + + Returns: + Dictionary with closure response + + Raises: + RuntimeError: If closure fails + ValueError: If session_id not found + """ + raise NotImplementedError() diff --git a/kubeflow/spark/backends/connect.py b/kubeflow/spark/backends/connect.py new file mode 100644 index 000000000..2ea1fa0fb --- /dev/null +++ b/kubeflow/spark/backends/connect.py @@ -0,0 +1,344 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Spark Connect backend for remote Spark cluster connectivity.""" + +from collections.abc import Iterator +import logging +from typing import Any, Optional +import uuid + +from kubeflow.spark.backends.base import SessionSparkBackend +from kubeflow.spark.models import ( + ApplicationStatus, + ConnectBackendConfig, + SessionInfo, + SparkApplicationResponse, +) +from kubeflow.spark.session import ManagedSparkSession + +logger = logging.getLogger(__name__) + + +class ConnectBackend(SessionSparkBackend): + """Spark Connect backend for remote connectivity to Spark clusters. + + This backend enables connection to existing Spark clusters via the Spark Connect + protocol (gRPC-based). It supports interactive, session-based workloads unlike + traditional batch-oriented backends. + + Features: + - Remote connectivity via Spark Connect (gRPC) + - Session management with isolation + - Interactive DataFrame operations + - Artifact upload (JARs, Python files, data) + - Authentication (Bearer token) + - SSL/TLS support + - Optional auto-provisioning of Spark Connect server + + Example: + ```python + from kubeflow.spark import SparkClient, ConnectBackendConfig + + config = ConnectBackendConfig( + connect_url="sc://spark-cluster.default.svc:15002", token="my-auth-token", use_ssl=True + ) + client = SparkClient(backend_config=config) + + # Create session + session = client.create_session(app_name="data-analysis") + + # Use PySpark API + df = session.sql("SELECT * FROM table") + result = df.collect() + + # Cleanup + session.close() + ``` + """ + + def __init__(self, config: ConnectBackendConfig): + """Initialize Spark Connect backend. 
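+
+        Example (the URL is illustrative):
+
+            config = ConnectBackendConfig(connect_url="sc://localhost:15002", use_ssl=False)
+            backend = ConnectBackend(config)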
+ + Args: + config: ConnectBackendConfig with connection details + + Raises: + ImportError: If pyspark[connect] is not installed + ValueError: If config is invalid + """ + self.config = config + self._sessions: dict[str, ManagedSparkSession] = {} + + # Validate and parse connection URL + self._validate_config() + + # Check for pyspark installation + try: + import pyspark + + pyspark_version = pyspark.__version__ + logger.info(f"Using PySpark version: {pyspark_version}") + + # Check for Connect support (requires 3.4+) + major, minor = map(int, pyspark_version.split(".")[:2]) + if major < 3 or (major == 3 and minor < 4): + raise ImportError( + f"Spark Connect requires PySpark 3.4+, found {pyspark_version}. " + "Please upgrade: pip install 'pyspark[connect]>=3.4.0'" + ) + except ImportError as e: + raise ImportError( + "PySpark with Connect support is required for ConnectBackend. " + "Install it with: pip install 'pyspark[connect]>=3.4.0'" + ) from e + + logger.info(f"Initialized ConnectBackend with URL: {self._get_masked_url()}") + + def _validate_config(self) -> None: + """Validate configuration. + + Raises: + ValueError: If config is invalid + """ + if not self.config.connect_url: + raise ValueError("connect_url is required") + + # Parse URL to validate format + if not self.config.connect_url.startswith("sc://"): + raise ValueError( + f"Invalid Spark Connect URL: {self.config.connect_url}. " + "Expected format: sc://host:port/;param1=value;param2=value" + ) + + # Parse URL components + url_without_scheme = self.config.connect_url[5:] # Remove "sc://" + if "/" in url_without_scheme: + host_port, params = url_without_scheme.split("/", 1) + else: + host_port = url_without_scheme + params = "" + + if ":" not in host_port: + raise ValueError( + f"Invalid Spark Connect URL: {self.config.connect_url}. " + "Expected format: sc://host:port/" + ) + + def _get_masked_url(self) -> str: + """Get connection URL with masked token. + + Returns: + URL string with token masked + """ + url = self.config.connect_url + if ";token=" in url: + parts = url.split(";token=") + return parts[0] + ";token=***" + return url + + def _build_connection_url(self) -> str: + """Build final connection URL with all parameters. + + For Spark Connect, most parameters should be set via builder.config() + rather than in the URL to avoid conflicts with server-side configs. + + Returns: + Complete Spark Connect URL + """ + # For Spark 4.x, use simple URL without parameters + # Parameters should be set via builder.config() instead + url = self.config.connect_url + + # Only add essential parameters that are part of the connection string + # SSL and authentication should be handled at connection level + # Avoid adding parameters like use_ssl in URL as they may conflict + + return url + + # ========================================================================= + # Session-Oriented Methods (Implemented) + # ========================================================================= + + def create_session( + self, + app_name: str, + **kwargs: Any, + ) -> ManagedSparkSession: + """Create a new Spark Connect session. 
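+
+        Example (the app name is illustrative):
+
+            session = backend.create_session(app_name="notebook-analysis")
+            session.sql("SELECT 1 AS id").show()
+            session.close()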
+ + Args: + app_name: Name for the session/application + **kwargs: Additional Spark configuration (passed to SparkSession.builder.config) + + Returns: + ManagedSparkSession instance + + Raises: + RuntimeError: If session creation fails + """ + try: + from pyspark.sql import SparkSession + + logger.debug("Starting create_session") + + # Generate session ID + session_id = str(uuid.uuid4()) + logger.debug(f"Generated session ID: {session_id}") + + # Build connection URL + connection_url = self._build_connection_url() + logger.debug(f"Connection URL: {connection_url}") + + # Create SparkSession builder + logger.debug("Creating SparkSession.builder.remote()") + builder = SparkSession.builder.remote(connection_url).appName(app_name) + logger.debug("Builder created, adding app name") + + # Apply additional configurations + for key, value in kwargs.items(): + logger.debug(f"Applying config: {key}={value}") + builder = builder.config(key, value) + + # Create session + logger.debug("About to call builder.getOrCreate() - THIS MAY HANG") + spark_session = builder.getOrCreate() + logger.debug("getOrCreate() returned successfully") + + # Wrap in ManagedSparkSession + logger.debug("Creating ManagedSparkSession wrapper") + managed_session = ManagedSparkSession( + session=spark_session, + session_id=session_id, + app_name=app_name, + backend=self, + ) + + # Track session + self._sessions[session_id] = managed_session + logger.debug("Session tracked in backend") + + logger.info(f"Created Spark Connect session: {session_id} (app: {app_name})") + return managed_session + + except Exception as e: + logger.error(f"Failed to create Spark Connect session: {e}") + raise RuntimeError(f"Failed to create session: {e}") from e + + def get_session_status(self, session_id: str) -> SessionInfo: + """Get status of a Spark Connect session. + + Args: + session_id: Session UUID + + Returns: + SessionInfo with session metadata + + Raises: + ValueError: If session not found + """ + if session_id not in self._sessions: + raise ValueError(f"Session not found: {session_id}") + + session = self._sessions[session_id] + return session.get_info() + + def list_sessions(self) -> list[SessionInfo]: + """List all active Spark Connect sessions. + + Returns: + List of SessionInfo objects + """ + return [session.get_info() for session in self._sessions.values()] + + def close_session(self, session_id: str, release: bool = True) -> dict[str, Any]: + """Close a Spark Connect session. + + Args: + session_id: Session UUID to close + release: If True, release session resources on server + + Returns: + Dictionary with closure response + + Raises: + ValueError: If session not found + """ + if session_id not in self._sessions: + raise ValueError(f"Session not found: {session_id}") + + session = self._sessions[session_id] + session.close(release=release) + + # Remove from tracking + del self._sessions[session_id] + + return { + "session_id": session_id, + "status": "closed", + "message": "Session closed successfully", + } + + def _clone_session(self, session: ManagedSparkSession) -> ManagedSparkSession: + """Internal method to clone a session. 
+ + Args: + session: Session to clone + + Returns: + New ManagedSparkSession + """ + try: + # Generate new session ID + new_session_id = str(uuid.uuid4()) + + # Clone the underlying PySpark session + # Note: PySpark Connect supports session cloning via newSession() + if hasattr(session.spark, "newSession"): + new_spark_session = session.spark.newSession() + else: + # Fallback: create new session (won't share state) + logger.warning("Session cloning not supported, creating new session instead") + return self.create_session(app_name=f"{session.app_name}-clone") + + # Wrap in ManagedSparkSession + cloned_session = ManagedSparkSession( + session=new_spark_session, + session_id=new_session_id, + app_name=f"{session.app_name}-clone", + backend=self, + ) + + # Track session + self._sessions[new_session_id] = cloned_session + + logger.info(f"Cloned session {session.session_id} -> {new_session_id}") + return cloned_session + + except Exception as e: + logger.error(f"Failed to clone session: {e}") + raise RuntimeError(f"Failed to clone session: {e}") from e + + def close(self): + """Close all sessions and cleanup resources.""" + logger.info(f"Closing ConnectBackend with {len(self._sessions)} active sessions") + + # Close all sessions + for session_id in list(self._sessions.keys()): + try: + self.close_session(session_id, release=True) + except Exception as e: + logger.error(f"Error closing session {session_id}: {e}") + + logger.info("ConnectBackend closed") diff --git a/kubeflow/spark/backends/gateway.py b/kubeflow/spark/backends/gateway.py new file mode 100644 index 000000000..058e5cbda --- /dev/null +++ b/kubeflow/spark/backends/gateway.py @@ -0,0 +1,383 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Gateway backend for Spark client (REST API based).""" + +from collections.abc import Iterator +import logging +import os +from typing import Any, Optional +from urllib.parse import urljoin + +from kubeflow.spark.backends.base import BatchSparkBackend +from kubeflow.spark.config import AuthMethod +from kubeflow.spark.models import ApplicationStatus, SparkApplicationResponse + +logger = logging.getLogger(__name__) + + +class GatewayBackend(BatchSparkBackend): + """Gateway backend for Spark applications. + + This backend communicates with a Batch Processing Gateway via REST API. + It's useful for managed Spark environments where you don't have direct + K8s access but can use a gateway service. + + Example: + from kubeflow.spark.backends.gateway import GatewayBackend, GatewayBackendConfig + + config = GatewayBackendConfig( + gateway_url="http://gateway:8080", + user="myuser", + auth_method=AuthMethod.HEADER + ) + backend = GatewayBackend(config) + """ + + def __init__(self, config: "GatewayBackendConfig"): + """Initialize the Gateway backend. 
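+
+        A minimal sketch (gateway URL is illustrative; no authentication configured):
+
+            backend = GatewayBackend(GatewayBackendConfig(gateway_url="http://gateway:8080"))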
+ + Args: + config: GatewayBackendConfig instance + """ + self.config = config + self._session = None + self._initialize_session() + + def _initialize_session(self): + """Initialize HTTP session with authentication.""" + try: + import requests + from requests.auth import HTTPBasicAuth + except ImportError: + raise ImportError( + "requests library is required for GatewayBackend. " + "Install with: pip install requests" + ) + + self._session = requests.Session() + self._session.verify = self.config.verify_ssl + + # Configure authentication + if self.config.auth_method == AuthMethod.BASIC: + if self.config.user and self.config.password: + self._session.auth = HTTPBasicAuth(self.config.user, self.config.password) + elif self.config.auth_method == AuthMethod.HEADER and self.config.user: + self._session.headers[self.config.auth_header_key] = self.config.user + + # Add extra headers + self._session.headers.update(self.config.extra_headers) + + def submit_application( + self, + app_name: str, + main_application_file: str, + spark_version: str = "3.5.0", + app_type: str = "Python", + driver_cores: int = 1, + driver_memory: str = "1g", + executor_cores: int = 1, + executor_memory: str = "1g", + num_executors: int = 2, + queue: Optional[str] = None, + arguments: Optional[list[str]] = None, + python_version: str = "3", + spark_conf: Optional[dict[str, str]] = None, + hadoop_conf: Optional[dict[str, str]] = None, + env_vars: Optional[dict[str, str]] = None, + deps: Optional[dict[str, list[str]]] = None, + **kwargs: Any, + ) -> SparkApplicationResponse: + """Submit a Spark application through the gateway. + + Args: + See SparkBackend.submit_application for parameter details + + Returns: + SparkApplicationResponse with submission details + + Raises: + RuntimeError: If submission fails + TimeoutError: If submission times out + """ + from kubeflow.spark.models import SparkApplicationRequest + + # Build request object + request = SparkApplicationRequest( + app_name=app_name, + main_application_file=main_application_file, + spark_version=spark_version, + app_type=app_type, + driver_cores=driver_cores, + driver_memory=driver_memory, + executor_cores=executor_cores, + executor_memory=executor_memory, + num_executors=num_executors, + queue=queue or self.config.default_queue, + arguments=arguments or [], + python_version=python_version, + spark_conf=spark_conf or {}, + hadoop_conf=hadoop_conf or {}, + env_vars=env_vars or {}, + deps=deps, + ) + + # Submit to gateway + url = urljoin(self.config.gateway_url, "/spark") + try: + response = self._session.post(url, json=request.to_dict(), timeout=self.config.timeout) + response.raise_for_status() + + return SparkApplicationResponse.from_dict(response.json()) + + except Exception as e: + raise RuntimeError(f"Failed to submit application to gateway: {e}") from e + + def get_job(self, submission_id: str) -> ApplicationStatus: + """Get status of a Spark application from gateway. 
+ + Args: + submission_id: Submission ID returned from submit_application + + Returns: + ApplicationStatus with current status + + Raises: + RuntimeError: If request fails + """ + url = urljoin(self.config.gateway_url, f"/spark/{submission_id}/status") + try: + response = self._session.get(url, timeout=self.config.timeout) + response.raise_for_status() + + return ApplicationStatus.from_dict(response.json()) + + except Exception as e: + raise RuntimeError(f"Failed to get status from gateway: {e}") from e + + def delete_job(self, submission_id: str) -> dict[str, Any]: + """Delete a Spark application through gateway. + + Args: + submission_id: Submission ID to delete + + Returns: + Dictionary with deletion response + + Raises: + RuntimeError: If deletion fails + """ + url = urljoin(self.config.gateway_url, f"/spark/{submission_id}") + try: + response = self._session.delete(url, timeout=self.config.timeout) + response.raise_for_status() + + return response.json() + + except Exception as e: + raise RuntimeError(f"Failed to delete application from gateway: {e}") from e + + def get_job_logs( + self, + submission_id: str, + executor_id: Optional[str] = None, + follow: bool = False, + ) -> Iterator[str]: + """Get application logs from gateway. + + Args: + submission_id: Submission ID + executor_id: Optional executor ID + follow: Whether to stream logs (not supported by gateway) + + Yields: + Log lines as strings + + Raises: + RuntimeError: If request fails + """ + if follow: + logger.warning("Log following is not supported by GatewayBackend") + + params = {"subId": submission_id} + if executor_id: + params["execId"] = executor_id + + url = urljoin(self.config.gateway_url, "/log") + try: + response = self._session.get(url, params=params, timeout=self.config.timeout) + response.raise_for_status() + + yield from response.text.splitlines() + + except Exception as e: + raise RuntimeError(f"Failed to get logs from gateway: {e}") from e + + def list_jobs( + self, + namespace: Optional[str] = None, + labels: Optional[dict[str, str]] = None, + ) -> list[ApplicationStatus]: + """List Spark applications from gateway. + + Note: Gateway backend may not support listing applications. + + Args: + namespace: Optional namespace filter (may not be supported) + labels: Optional label filters (may not be supported) + + Returns: + List of ApplicationStatus objects + + Raises: + NotImplementedError: If gateway doesn't support listing + """ + raise NotImplementedError( + "GatewayBackend does not support listing applications. " + "This feature is only available with OperatorBackend." + ) + + def wait_for_job_status( + self, + submission_id: str, + timeout: int = 3600, + polling_interval: int = 10, + ) -> ApplicationStatus: + """Wait for Spark application to complete. 
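+
+        A minimal usage sketch (the submission ID is illustrative):
+
+            status = backend.wait_for_job_status("spark-pi-143022", timeout=600, polling_interval=5)
+            print(status.state)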
+ + Args: + submission_id: Submission ID to monitor + timeout: Maximum time to wait in seconds + polling_interval: Polling interval in seconds + + Returns: + Final ApplicationStatus + + Raises: + TimeoutError: If application doesn't complete within timeout + """ + import time + + from kubeflow.spark.models import ApplicationState + + start_time = time.time() + + while True: + status = self.get_job(submission_id) + + # Check if application reached terminal state + if status.state in [ApplicationState.COMPLETED, ApplicationState.FAILED]: + return status + + # Check timeout + elapsed = time.time() - start_time + if elapsed >= timeout: + raise TimeoutError( + f"Application {submission_id} did not complete within {timeout}s" + ) + + logger.debug( + f"Application {submission_id} status: {status.state.value}. " + f"Waiting {polling_interval}s..." + ) + time.sleep(polling_interval) + + def close(self): + """Close HTTP session.""" + if self._session: + self._session.close() + + +class GatewayBackendConfig: + """Configuration for Gateway backend. + + Attributes: + gateway_url: URL of the Batch Processing Gateway + user: Username for authentication + password: Password for basic authentication + auth_method: Authentication method to use + auth_header_key: Header key for user authentication + timeout: Request timeout in seconds + verify_ssl: Whether to verify SSL certificates + default_queue: Default queue for job submission + default_spark_version: Default Spark version + extra_headers: Additional headers to include in requests + """ + + def __init__( + self, + gateway_url: str, + user: Optional[str] = None, + password: Optional[str] = None, + auth_method: AuthMethod = AuthMethod.NONE, + auth_header_key: str = "X-User", + timeout: int = 30, + verify_ssl: bool = True, + default_queue: str = "poc", + default_spark_version: str = "3.5.0", + extra_headers: Optional[dict[str, str]] = None, + ): + """Initialize Gateway backend configuration. + + Args: + gateway_url: URL of the Batch Processing Gateway + user: Username for authentication + password: Password for basic authentication + auth_method: Authentication method to use + auth_header_key: Header key for user authentication + timeout: Request timeout in seconds + verify_ssl: Whether to verify SSL certificates + default_queue: Default queue for job submission + default_spark_version: Default Spark version + extra_headers: Additional headers to include + """ + self.gateway_url = gateway_url + self.user = user + self.password = password + self.auth_method = auth_method + self.auth_header_key = auth_header_key + self.timeout = timeout + self.verify_ssl = verify_ssl + self.default_queue = default_queue + self.default_spark_version = default_spark_version + self.extra_headers = extra_headers or {} + + @classmethod + def from_env(cls, prefix: str = "KUBEFLOW_SPARK_") -> "GatewayBackendConfig": + """Create config from environment variables. 
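+
+        Example (assumes KUBEFLOW_SPARK_GATEWAY_URL is already set in the environment):
+
+            config = GatewayBackendConfig.from_env()
+            backend = GatewayBackend(config)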
+ + Args: + prefix: Prefix for environment variables + + Returns: + GatewayBackendConfig instance + + Environment variables: + - {prefix}GATEWAY_URL (required) + - {prefix}USER + - {prefix}PASSWORD + - {prefix}AUTH_METHOD (basic|header|none) + - {prefix}DEFAULT_QUEUE + - {prefix}DEFAULT_SPARK_VERSION + """ + return cls( + gateway_url=os.getenv(f"{prefix}GATEWAY_URL", ""), + user=os.getenv(f"{prefix}USER"), + password=os.getenv(f"{prefix}PASSWORD"), + auth_method=AuthMethod(os.getenv(f"{prefix}AUTH_METHOD", "none").lower()), + timeout=int(os.getenv(f"{prefix}TIMEOUT", "30")), + verify_ssl=os.getenv(f"{prefix}VERIFY_SSL", "true").lower() == "true", + default_queue=os.getenv(f"{prefix}DEFAULT_QUEUE", "poc"), + default_spark_version=os.getenv(f"{prefix}DEFAULT_SPARK_VERSION", "3.5.0"), + ) diff --git a/kubeflow/spark/backends/operator.py b/kubeflow/spark/backends/operator.py new file mode 100644 index 000000000..424df55c2 --- /dev/null +++ b/kubeflow/spark/backends/operator.py @@ -0,0 +1,834 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Kubernetes Spark Operator backend implementation.""" + +from collections.abc import Iterator +from dataclasses import dataclass, field +import logging +import multiprocessing +import time +from typing import Any, Optional + +from kubernetes import client, config as k8s_config, watch + +from kubeflow.spark.backends.base import BatchSparkBackend +from kubeflow.spark.models import ( + ApplicationState, + ApplicationStatus, + SparkApplicationResponse, +) + +logger = logging.getLogger(__name__) + + +# Constants for Spark Operator +SPARK_OPERATOR_API_GROUP = "sparkoperator.k8s.io" +SPARK_OPERATOR_API_VERSION = "v1beta2" +SPARK_APPLICATION_PLURAL = "sparkapplications" +SPARK_APPLICATION_KIND = "SparkApplication" +DEFAULT_TIMEOUT = 60 # seconds + + +@dataclass +class OperatorBackendConfig: + """Configuration for Spark Operator backend. 
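+
+    Example (values are illustrative):
+
+        config = OperatorBackendConfig(namespace="spark-jobs", enable_ui=False)
+        backend = OperatorBackend(config)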
+ + Attributes: + namespace: Kubernetes namespace to use + context: Kubernetes context name + config_file: Path to kubeconfig file + client_configuration: Custom Kubernetes client configuration + service_account: Service account for Spark pods + image_pull_policy: Image pull policy (IfNotPresent, Always, Never) + default_spark_image: Default Docker image for Spark + timeout: Default timeout for API operations in seconds + enable_monitoring: Enable Prometheus monitoring + enable_ui: Enable Spark UI service + """ + + namespace: Optional[str] = None + context: Optional[str] = None + config_file: Optional[str] = None + client_configuration: Optional[client.Configuration] = None + service_account: str = "spark-operator-spark" + image_pull_policy: str = "IfNotPresent" + default_spark_image: str = "gcr.io/spark-operator/spark-py" + timeout: int = DEFAULT_TIMEOUT + enable_monitoring: bool = True + enable_ui: bool = True + extra_labels: dict[str, str] = field(default_factory=dict) + extra_annotations: dict[str, str] = field(default_factory=dict) + + +class OperatorBackend(BatchSparkBackend): + """Kubernetes Spark Operator backend. + + This backend uses the Kubeflow Spark Operator to manage Spark applications + on Kubernetes. It creates SparkApplication CRDs that the operator watches + and converts into Kubernetes pods. + + Example: + config = OperatorBackendConfig(namespace="spark-jobs") + backend = OperatorBackend(config) + response = backend.submit_application( + app_name="my-spark-job", + main_application_file="local:///app/main.py", + ... + ) + """ + + def __init__(self, config: OperatorBackendConfig): + """Initialize the Operator backend. + + Args: + config: OperatorBackendConfig instance + """ + self.config = config + + # Determine namespace + if self.config.namespace is None: + self.config.namespace = self._get_default_namespace() + + # Load Kubernetes configuration + if self.config.client_configuration is None: + if self.config.config_file or not self._is_running_in_k8s(): + k8s_config.load_kube_config( + config_file=self.config.config_file, + context=self.config.context, + ) + else: + k8s_config.load_incluster_config() + + # Initialize Kubernetes API clients + k8s_client = client.ApiClient(self.config.client_configuration) + self.custom_api = client.CustomObjectsApi(k8s_client) + self.core_api = client.CoreV1Api(k8s_client) + + logger.info(f"Initialized OperatorBackend with namespace: {self.config.namespace}") + + def submit_application( + self, + app_name: str, + main_application_file: str, + spark_version: str = "3.5.0", + app_type: str = "Python", + driver_cores: int = 1, + driver_memory: str = "1g", + executor_cores: int = 1, + executor_memory: str = "1g", + num_executors: int = 2, + queue: Optional[str] = None, + arguments: Optional[list[str]] = None, + python_version: str = "3", + spark_conf: Optional[dict[str, str]] = None, + hadoop_conf: Optional[dict[str, str]] = None, + env_vars: Optional[dict[str, str]] = None, + deps: Optional[dict[str, list[str]]] = None, + **kwargs: Any, + ) -> SparkApplicationResponse: + """Submit a Spark application using Spark Operator. + + Creates a SparkApplication CRD in Kubernetes which the Spark Operator + watches and converts into driver and executor pods. 
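+
+        A minimal usage sketch (paths and sizing are illustrative):
+
+            response = backend.submit_application(
+                app_name="spark-pi",
+                main_application_file="local:///opt/spark/examples/src/main/python/pi.py",
+                spark_version="4.0.0",
+                num_executors=2,
+            )
+            print(response.submission_id)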
+ + Args: + app_name: Name of the application (must be DNS-compliant) + main_application_file: Path to main application file + spark_version: Spark version to use + app_type: Application type (Python, Scala, Java, R) + driver_cores: Number of cores for driver + driver_memory: Memory for driver + executor_cores: Number of cores per executor + executor_memory: Memory per executor + num_executors: Number of executors + queue: Namespace to submit to (overrides config namespace) + arguments: Application arguments + python_version: Python version + spark_conf: Spark configuration properties + hadoop_conf: Hadoop configuration properties + env_vars: Environment variables + deps: Dependencies dict with keys: jars, pyFiles, files + **kwargs: Additional parameters (volumes, node_selector, tolerations, etc.) + + Returns: + SparkApplicationResponse with submission details + + Raises: + ValueError: If required parameters are invalid + RuntimeError: If submission fails + TimeoutError: If submission times out + """ + # Validate app_name is DNS-compliant + if not self._is_valid_k8s_name(app_name): + raise ValueError( + f"app_name '{app_name}' must be DNS-compliant " + "(lowercase alphanumeric characters, '-' or '.')" + ) + + # Determine target namespace + target_namespace = queue if queue else self.config.namespace + + # Build SparkApplication CRD + spark_app = self._build_spark_application_crd( + app_name=app_name, + main_application_file=main_application_file, + spark_version=spark_version, + app_type=app_type, + driver_cores=driver_cores, + driver_memory=driver_memory, + executor_cores=executor_cores, + executor_memory=executor_memory, + num_executors=num_executors, + arguments=arguments or [], + python_version=python_version, + spark_conf=spark_conf or {}, + hadoop_conf=hadoop_conf or {}, + env_vars=env_vars or {}, + deps=deps, + **kwargs, + ) + + # Submit to Kubernetes + try: + thread = self.custom_api.create_namespaced_custom_object( + group=SPARK_OPERATOR_API_GROUP, + version=SPARK_OPERATOR_API_VERSION, + namespace=target_namespace, + plural=SPARK_APPLICATION_PLURAL, + body=spark_app, + async_req=True, + ) + result = thread.get(self.config.timeout) + + logger.info(f"SparkApplication {target_namespace}/{app_name} created successfully") + + return SparkApplicationResponse( + submission_id=app_name, + app_name=app_name, + status="SUBMITTED", + message=f"SparkApplication created in namespace {target_namespace}", + ) + + except multiprocessing.TimeoutError as e: + raise TimeoutError( + f"Timeout creating SparkApplication {target_namespace}/{app_name}" + ) from e + except Exception as e: + raise RuntimeError( + f"Failed to create SparkApplication {target_namespace}/{app_name}: {e}" + ) from e + + def get_job(self, submission_id: str) -> ApplicationStatus: + """Get status of a Spark application. 
+ + Args: + submission_id: Name of the SparkApplication (same as app_name) + + Returns: + ApplicationStatus with current status + + Raises: + RuntimeError: If request fails + TimeoutError: If request times out + """ + try: + thread = self.custom_api.get_namespaced_custom_object( + group=SPARK_OPERATOR_API_GROUP, + version=SPARK_OPERATOR_API_VERSION, + namespace=self.config.namespace, + plural=SPARK_APPLICATION_PLURAL, + name=submission_id, + async_req=True, + ) + spark_app = thread.get(self.config.timeout) + + return self._parse_application_status(spark_app) + + except multiprocessing.TimeoutError as e: + raise TimeoutError( + f"Timeout getting SparkApplication {self.config.namespace}/{submission_id}" + ) from e + except Exception as e: + raise RuntimeError( + f"Failed to get SparkApplication {self.config.namespace}/{submission_id}: {e}" + ) from e + + def delete_job(self, submission_id: str) -> dict[str, Any]: + """Delete a Spark application. + + Args: + submission_id: Name of the SparkApplication to delete + + Returns: + Dictionary with deletion response + + Raises: + RuntimeError: If deletion fails + TimeoutError: If deletion times out + """ + try: + thread = self.custom_api.delete_namespaced_custom_object( + group=SPARK_OPERATOR_API_GROUP, + version=SPARK_OPERATOR_API_VERSION, + namespace=self.config.namespace, + plural=SPARK_APPLICATION_PLURAL, + name=submission_id, + async_req=True, + ) + result = thread.get(self.config.timeout) + + logger.info(f"SparkApplication {self.config.namespace}/{submission_id} deleted") + + return { + "status": "deleted", + "message": f"Application {submission_id} deleted", + } + + except multiprocessing.TimeoutError as e: + raise TimeoutError( + f"Timeout deleting SparkApplication {self.config.namespace}/{submission_id}" + ) from e + except Exception as e: + raise RuntimeError( + f"Failed to delete SparkApplication {self.config.namespace}/{submission_id}: {e}" + ) from e + + def get_job_logs( + self, + submission_id: str, + executor_id: Optional[str] = None, + follow: bool = False, + ) -> Iterator[str]: + """Get application logs from driver or executor pods. + + Args: + submission_id: Name of the SparkApplication + executor_id: Optional executor ID (e.g., "1", "2"). 
If None, returns driver logs + follow: Whether to stream logs in real-time + + Yields: + Log lines as strings + + Raises: + RuntimeError: If request fails + """ + # Determine pod name based on executor_id + if executor_id: + # Executor pod naming: - + pod_name = f"{submission_id}-{executor_id}" + container_name = "executor" + else: + # Driver pod naming: -driver + pod_name = f"{submission_id}-driver" + container_name = "spark-kubernetes-driver" + + try: + if follow: + # Stream logs in real-time + log_stream = watch.Watch().stream( + self.core_api.read_namespaced_pod_log, + name=pod_name, + namespace=self.config.namespace, + container=container_name, + follow=True, + ) + yield from log_stream + else: + # Get all logs at once + logs = self.core_api.read_namespaced_pod_log( + name=pod_name, + namespace=self.config.namespace, + container=container_name, + ) + yield from logs.splitlines() + + except client.exceptions.ApiException as e: + if e.status == 404: + logger.warning(f"Pod {pod_name} not found in namespace {self.config.namespace}") + return + elif e.status == 400 and ( + "waiting to start" in str(e.body) or "ContainerCreating" in str(e.body) + ): + # Pod exists but container is not ready yet + # Check if it's a "waiting to start" error + logger.warning( + f"Pod {pod_name} is not ready yet (ContainerCreating). " + "Wait for pod to be running before fetching logs." + ) + return + elif e.status == 400: + # Otherwise, it's a different 400 error + raise RuntimeError( + f"Failed to read logs for pod {self.config.namespace}/{pod_name}: {e}" + ) from e + raise RuntimeError( + f"Failed to read logs for pod {self.config.namespace}/{pod_name}: {e}" + ) from e + except Exception as e: + raise RuntimeError( + f"Failed to read logs for pod {self.config.namespace}/{pod_name}: {e}" + ) from e + + def list_jobs( + self, + namespace: Optional[str] = None, + labels: Optional[dict[str, str]] = None, + ) -> list[ApplicationStatus]: + """List Spark applications. + + Args: + namespace: Optional namespace filter (uses config namespace if None) + labels: Optional label filters + + Returns: + List of ApplicationStatus objects + + Raises: + RuntimeError: If request fails + TimeoutError: If request times out + """ + target_namespace = namespace or self.config.namespace + + try: + # Build label selector + label_selector = None + if labels: + label_selector = ",".join([f"{k}={v}" for k, v in labels.items()]) + + thread = self.custom_api.list_namespaced_custom_object( + group=SPARK_OPERATOR_API_GROUP, + version=SPARK_OPERATOR_API_VERSION, + namespace=target_namespace, + plural=SPARK_APPLICATION_PLURAL, + label_selector=label_selector, + async_req=True, + ) + result = thread.get(self.config.timeout) + + applications = [] + for item in result.get("items", []): + applications.append(self._parse_application_status(item)) + + return applications + + except multiprocessing.TimeoutError as e: + raise TimeoutError( + f"Timeout listing SparkApplications in namespace {target_namespace}" + ) from e + except Exception as e: + raise RuntimeError( + f"Failed to list SparkApplications in namespace {target_namespace}: {e}" + ) from e + + def wait_for_job_status( + self, + submission_id: str, + timeout: int = 3600, + polling_interval: int = 10, + ) -> ApplicationStatus: + """Wait for Spark application to complete. 
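+
+        A minimal usage sketch (the submission ID is illustrative):
+
+            status = backend.wait_for_job_status("spark-pi", timeout=1800)
+            if status.state == ApplicationState.COMPLETED:
+                print("Application finished successfully")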
+ + Args: + submission_id: Name of the SparkApplication + timeout: Maximum time to wait in seconds + polling_interval: Polling interval in seconds + + Returns: + Final ApplicationStatus + + Raises: + TimeoutError: If application doesn't complete within timeout + RuntimeError: If monitoring fails + """ + start_time = time.time() + + while True: + status = self.get_job(submission_id) + + # Check if application reached terminal state + if status.state in [ApplicationState.COMPLETED, ApplicationState.FAILED]: + return status + + # Check timeout + elapsed = time.time() - start_time + if elapsed >= timeout: + raise TimeoutError( + f"Application {submission_id} did not complete within {timeout}s. " + f"Last status: {status.state.value}" + ) + + logger.debug( + f"Application {submission_id} status: {status.state.value}. " + f"Waiting {polling_interval}s... ({int(elapsed)}s elapsed)" + ) + time.sleep(polling_interval) + + def _build_spark_application_crd( + self, + app_name: str, + main_application_file: str, + spark_version: str, + app_type: str, + driver_cores: int, + driver_memory: str, + executor_cores: int, + executor_memory: str, + num_executors: int, + arguments: list[str], + python_version: str, + spark_conf: dict[str, str], + hadoop_conf: dict[str, str], + env_vars: dict[str, str], + deps: Optional[dict[str, list[str]]], + **kwargs: Any, + ) -> dict[str, Any]: + """Build SparkApplication CRD specification. + + Args: + All parameters from submit_application + **kwargs: Additional parameters like volumes, node_selector, etc. + + Returns: + SparkApplication CRD dictionary + """ + # Build base CRD structure + spark_app: dict[str, Any] = { + "apiVersion": f"{SPARK_OPERATOR_API_GROUP}/{SPARK_OPERATOR_API_VERSION}", + "kind": SPARK_APPLICATION_KIND, + "metadata": { + "name": app_name, + "labels": { + "app": app_name, + "version": spark_version, + **self.config.extra_labels, + }, + "annotations": self.config.extra_annotations, + }, + "spec": { + "type": app_type, + "mode": "cluster", + "image": f"{self.config.default_spark_image}:{spark_version}", + "imagePullPolicy": self.config.image_pull_policy, + "mainApplicationFile": main_application_file, + "sparkVersion": spark_version, + "restartPolicy": self._build_restart_policy(kwargs.get("restart_policy")), + "driver": { + "cores": driver_cores, + "memory": driver_memory, + "serviceAccount": self.config.service_account, + "labels": {"version": spark_version, "component": "driver"}, + }, + "executor": { + "cores": executor_cores, + "instances": num_executors, + "memory": executor_memory, + "labels": {"version": spark_version, "component": "executor"}, + }, + }, + } + + # Add optional fields + if arguments: + spark_app["spec"]["arguments"] = arguments + + # Add main class for Scala/Java applications + if "main_class" in kwargs and kwargs["main_class"]: + spark_app["spec"]["mainClass"] = kwargs["main_class"] + + if spark_conf: + spark_app["spec"]["sparkConf"] = spark_conf + + if hadoop_conf: + spark_app["spec"]["hadoopConf"] = hadoop_conf + + # Add environment variables + if env_vars: + env_list = [{"name": k, "value": v} for k, v in env_vars.items()] + spark_app["spec"]["driver"]["env"] = env_list + spark_app["spec"]["executor"]["env"] = env_list + + # Add dependencies + if deps: + spark_app["spec"]["deps"] = deps + + # Add Python version for Python apps + if app_type == "Python": + spark_app["spec"]["pythonVersion"] = python_version + + # Add monitoring if enabled + if self.config.enable_monitoring: + spark_app["spec"]["monitoring"] = { + 
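+                    # Expose driver/executor metrics via the Prometheus JMX exporter.
+                    # The agent jar path below assumes an image that bundles the exporter
+                    # (e.g. the default spark-operator images); adjust it for custom images.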
"exposeDriverMetrics": True, + "exposeExecutorMetrics": True, + "prometheus": { + "jmxExporterJar": "/prometheus/jmx_prometheus_javaagent-0.11.0.jar", + "port": 8090, + }, + } + + # Add Spark UI service if enabled + if self.config.enable_ui: + spark_app["spec"]["sparkUIOptions"] = { + "servicePort": 4040, + "serviceType": "ClusterIP", # Required for service creation + } + + # Add volumes if specified + if "volumes" in kwargs: + spark_app["spec"]["volumes"] = kwargs["volumes"] + if "driver_volume_mounts" in kwargs: + spark_app["spec"]["driver"]["volumeMounts"] = kwargs["driver_volume_mounts"] + if "executor_volume_mounts" in kwargs: + spark_app["spec"]["executor"]["volumeMounts"] = kwargs["executor_volume_mounts"] + + # Add node selector if specified + if "node_selector" in kwargs: + spark_app["spec"]["driver"]["nodeSelector"] = kwargs["node_selector"] + spark_app["spec"]["executor"]["nodeSelector"] = kwargs["node_selector"] + + # Add tolerations if specified + if "tolerations" in kwargs: + spark_app["spec"]["driver"]["tolerations"] = kwargs["tolerations"] + spark_app["spec"]["executor"]["tolerations"] = kwargs["tolerations"] + + # Add resource limits if specified + if "driver_limits" in kwargs: + if "limits" not in spark_app["spec"]["driver"]: + spark_app["spec"]["driver"]["limits"] = {} + spark_app["spec"]["driver"]["limits"].update(kwargs["driver_limits"]) + + if "executor_limits" in kwargs: + if "limits" not in spark_app["spec"]["executor"]: + spark_app["spec"]["executor"]["limits"] = {} + spark_app["spec"]["executor"]["limits"].update(kwargs["executor_limits"]) + + # Add dynamic allocation if specified + if kwargs.get("enable_dynamic_allocation"): + spark_app["spec"]["dynamicAllocation"] = { + "enabled": True, + "initialExecutors": kwargs.get("initial_executors", num_executors), + "minExecutors": kwargs.get("min_executors", 1), + "maxExecutors": kwargs.get("max_executors", num_executors * 2), + } + + # Add time_to_live_seconds if specified + if "time_to_live_seconds" in kwargs and kwargs["time_to_live_seconds"]: + spark_app["spec"]["timeToLiveSeconds"] = kwargs["time_to_live_seconds"] + + # Add labels if specified + if "labels" in kwargs and kwargs["labels"]: + spark_app["metadata"]["labels"].update(kwargs["labels"]) + + return spark_app + + def _build_restart_policy(self, restart_policy: Optional[Any]) -> dict[str, Any]: + """Build restart policy dict from RestartPolicy object or default. 
+ + Args: + restart_policy: RestartPolicy object or None + + Returns: + Restart policy dictionary + """ + from kubeflow.spark.models import RestartPolicy, RestartPolicyType + + if restart_policy is None: + return {"type": "Never"} + + # If it's already a RestartPolicy object + if isinstance(restart_policy, RestartPolicy): + policy_dict = { + "type": restart_policy.type.value + if isinstance(restart_policy.type, RestartPolicyType) + else restart_policy.type + } + if restart_policy.on_failure_retries is not None: + policy_dict["onFailureRetries"] = restart_policy.on_failure_retries + if restart_policy.on_failure_retry_interval: + policy_dict["onFailureRetryInterval"] = restart_policy.on_failure_retry_interval + if restart_policy.on_submission_failure_retries is not None: + policy_dict["onSubmissionFailureRetries"] = ( + restart_policy.on_submission_failure_retries + ) + if restart_policy.on_submission_failure_retry_interval: + policy_dict["onSubmissionFailureRetryInterval"] = ( + restart_policy.on_submission_failure_retry_interval + ) + return policy_dict + + # If it's a string, use it as type + if isinstance(restart_policy, str): + return {"type": restart_policy} + + # Default + return {"type": "Never"} + + def _parse_application_status(self, spark_app: dict[str, Any]) -> ApplicationStatus: + """Parse SparkApplication CRD status into ApplicationStatus. + + Args: + spark_app: SparkApplication CRD dictionary + + Returns: + ApplicationStatus object + """ + metadata = spark_app.get("metadata", {}) + status = spark_app.get("status", {}) + app_state_dict = status.get("applicationState", {}) + + # Parse state + state_str = app_state_dict.get("state", "UNKNOWN") + try: + state = ApplicationState(state_str) + except ValueError: + logger.warning(f"Unknown application state: {state_str}") + state = ApplicationState.UNKNOWN + + return ApplicationStatus( + submission_id=metadata.get("name", ""), + app_id=status.get("sparkApplicationId"), + app_name=metadata.get("name"), + state=state, + submission_time=status.get("submissionTime"), + start_time=status.get("lastSubmissionAttemptTime"), + completion_time=status.get("terminationTime"), + driver_info=status.get("driverInfo"), + executor_state=status.get("executorState"), + ) + + def _is_valid_k8s_name(self, name: str) -> bool: + """Check if name is DNS-compliant for Kubernetes. + + Args: + name: Name to validate + + Returns: + True if valid, False otherwise + """ + import re + + # Kubernetes resource names must be lowercase alphanumeric, '-' or '.' + # and start/end with alphanumeric + pattern = r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$" + return bool(re.match(pattern, name)) and len(name) <= 253 + + def _get_default_namespace(self) -> str: + """Get default Kubernetes namespace. + + Returns: + Default namespace string + """ + import os + + # Try to get from environment + namespace = os.getenv("NAMESPACE") + if namespace: + return namespace + + # Try to read from service account + try: + with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f: + return f.read().strip() + except FileNotFoundError: + pass + + # Default to "default" + return "default" + + def _is_running_in_k8s(self) -> bool: + """Check if running inside a Kubernetes cluster. 
+ + Returns: + True if running in cluster, False otherwise + """ + import os + + return os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount/token") + + def wait_for_pod_ready( + self, + submission_id: str, + executor_id: Optional[str] = None, + timeout: int = 300, + ) -> bool: + """Wait for driver or executor pod to be ready. + + Args: + submission_id: Name of the SparkApplication + executor_id: Optional executor ID. If None, waits for driver pod + timeout: Maximum time to wait in seconds + + Returns: + True if pod becomes ready, False if timeout + + Raises: + RuntimeError: If pod check fails + """ + # Determine pod name + if executor_id: + pod_name = f"{submission_id}-{executor_id}" + else: + pod_name = f"{submission_id}-driver" + + start_time = time.time() + + while True: + try: + pod = self.core_api.read_namespaced_pod( + name=pod_name, namespace=self.config.namespace + ) + + # Check if pod is running and container is ready + if pod.status.phase == "Running" and pod.status.container_statuses: + # Check if containers are ready + for container_status in pod.status.container_statuses: + if container_status.ready: + logger.info(f"Pod {pod_name} is ready") + return True + + # Check if pod failed + if pod.status.phase in ["Failed", "Unknown"]: + logger.warning(f"Pod {pod_name} is in {pod.status.phase} state") + return False + + # Check timeout + elapsed = time.time() - start_time + if elapsed >= timeout: + logger.warning( + f"Timeout waiting for pod {pod_name} to be ready. " + f"Current phase: {pod.status.phase}" + ) + return False + + # Wait before next check + time.sleep(2) + + except client.exceptions.ApiException as e: + if e.status == 404: + # Pod doesn't exist yet + elapsed = time.time() - start_time + if elapsed >= timeout: + logger.warning(f"Timeout waiting for pod {pod_name} to be created") + return False + time.sleep(2) + continue + raise RuntimeError( + f"Failed to check pod {self.config.namespace}/{pod_name}: {e}" + ) from e + + def close(self): + """Close Kubernetes API client connections.""" + if hasattr(self, "custom_api") and self.custom_api.api_client: + self.custom_api.api_client.close() diff --git a/kubeflow/spark/base_client.py b/kubeflow/spark/base_client.py new file mode 100644 index 000000000..62a5b7598 --- /dev/null +++ b/kubeflow/spark/base_client.py @@ -0,0 +1,96 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base client class for Spark SDK. + +This module provides the abstract base class for all Spark clients, +implementing shared functionality like resource management, context +manager protocol, and logging. +""" + +import abc +import logging +from typing import Any + +from kubeflow.spark.backends.base import SparkBackend + + +class BaseSparkClient(abc.ABC): + """Abstract base class for Spark clients. 
+ + This class implements common functionality shared by all Spark client types: + - Resource management (close() method) + - Context manager protocol (__enter__/__exit__) + - Logging infrastructure + + Subclasses (BatchSparkClient, SparkSessionClient) implement specific + functionality for their use cases. + + This design follows the Template Method Pattern, where the base class + defines the skeleton of operations and subclasses fill in specific steps. + """ + + def __init__(self, backend: SparkBackend): + """Initialize the base client. + + Args: + backend: Spark backend instance (BatchSparkBackend or SessionSparkBackend) + """ + self._backend = backend + self._logger = logging.getLogger(self.__class__.__name__) + self._logger.info(f"Initialized {self.__class__.__name__} with {backend.__class__.__name__}") + + def close(self): + """Close the client and release all resources. + + This method delegates to the backend's close() method to clean up: + - Kubernetes API clients + - HTTP connections + - gRPC channels + - Active sessions + + It's safe to call this multiple times. + """ + try: + self._backend.close() + self._logger.info(f"{self.__class__.__name__} closed successfully") + except Exception as e: + self._logger.error(f"Error closing {self.__class__.__name__}: {e}") + raise + + def __enter__(self): + """Context manager entry. + + Returns: + Self for use in with statements + """ + return self + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): + """Context manager exit - ensures cleanup. + + Args: + exc_type: Exception type if an exception occurred + exc_val: Exception value if an exception occurred + exc_tb: Exception traceback if an exception occurred + """ + self.close() + + def __repr__(self) -> str: + """String representation. + + Returns: + String describing the client and backend + """ + return f"{self.__class__.__name__}(backend={self._backend.__class__.__name__})" diff --git a/kubeflow/spark/batch_client.py b/kubeflow/spark/batch_client.py new file mode 100644 index 000000000..a6a0c115d --- /dev/null +++ b/kubeflow/spark/batch_client.py @@ -0,0 +1,421 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Batch Spark client for managing Spark applications.""" + +from collections.abc import Iterator +from typing import Any, Optional, Union + +from kubeflow.spark.base_client import BaseSparkClient +from kubeflow.spark.backends.base import BatchSparkBackend +from kubeflow.spark.backends.gateway import ( + GatewayBackend, + GatewayBackendConfig, +) +from kubeflow.spark.backends.operator import ( + OperatorBackend, + OperatorBackendConfig, +) +from kubeflow.spark.models import ( + ApplicationStatus, + SparkApplicationResponse, +) + + +class BatchSparkClient(BaseSparkClient): + """Client for managing batch Spark applications. + + This client provides a high-level API for submitting and managing batch + Spark applications using either the Kubernetes Spark Operator or REST gateways. 
+ + Supported backends: + - **OperatorBackend**: Submits SparkApplication CRDs to Kubernetes (recommended) + - **GatewayBackend**: Submits jobs via REST API to Spark gateways (Livy, etc.) + + Example with Operator Backend: + ```python + from kubeflow.spark import BatchSparkClient, OperatorBackendConfig + + # Initialize client + config = OperatorBackendConfig(namespace="spark-jobs") + client = BatchSparkClient(backend_config=config) + + # Submit application + response = client.submit_application( + app_name="my-etl-job", + main_application_file="s3a://bucket/jobs/etl.py", + driver_cores=2, + driver_memory="4g", + executor_cores=4, + executor_memory="8g", + num_executors=10, + ) + + # Wait for completion + status = client.wait_for_job_status(response.submission_id) + print(f"Job completed with state: {status.state}") + + # Get logs + for line in client.get_job_logs(response.submission_id): + print(line) + ``` + + Example with Gateway Backend: + ```python + from kubeflow.spark import BatchSparkClient, GatewayBackendConfig + + config = GatewayBackendConfig( + gateway_url="http://livy-gateway:8998", + user="myuser" + ) + client = BatchSparkClient(backend_config=config) + ``` + + Context Manager: + ```python + with BatchSparkClient(backend_config=config) as client: + response = client.submit_application(...) + # Cleanup happens automatically + ``` + """ + + def __init__( + self, + backend_config: Union[OperatorBackendConfig, GatewayBackendConfig, None] = None, + ): + """Initialize Batch Spark client. + + Args: + backend_config: Backend configuration: + - OperatorBackendConfig: Kubernetes with Spark Operator (default) + - GatewayBackendConfig: REST API gateway + + Raises: + ValueError: If invalid backend configuration provided + """ + # Default to OperatorBackend + if backend_config is None: + backend_config = OperatorBackendConfig() + + # Initialize appropriate backend + if isinstance(backend_config, OperatorBackendConfig): + backend: BatchSparkBackend = OperatorBackend(backend_config) + elif isinstance(backend_config, GatewayBackendConfig): + backend = GatewayBackend(backend_config) + else: + raise ValueError( + f"Invalid backend config type for BatchSparkClient: {type(backend_config)}. " + "Expected OperatorBackendConfig or GatewayBackendConfig." + ) + + # Initialize base class + super().__init__(backend) + + def submit_application( + self, + app_name: Optional[str] = None, + main_application_file: str = "", + spark_version: str = "3.5.0", + app_type: str = "Python", + driver_cores: int = 1, + driver_memory: str = "1g", + executor_cores: int = 1, + executor_memory: str = "1g", + num_executors: int = 2, + queue: Optional[str] = None, + arguments: Optional[list[str]] = None, + python_version: str = "3", + spark_conf: Optional[dict[str, str]] = None, + hadoop_conf: Optional[dict[str, str]] = None, + env_vars: Optional[dict[str, str]] = None, + deps: Optional[dict[str, list[str]]] = None, + **kwargs: Any, + ) -> SparkApplicationResponse: + """Submit a Spark application for batch execution. + + Args: + app_name: Name of the application. If not provided, a unique name will be + auto-generated. Must be unique within the namespace. (optional) + main_application_file: Path to main application file + Supported formats: local://, s3a://, http://, etc. 
+ spark_version: Spark version (default: "3.5.0") + app_type: Application type: "Python", "Scala", "Java", "R" (default: "Python") + driver_cores: Number of CPU cores for driver (default: 1) + driver_memory: Memory for driver, e.g., "1g", "512m" (default: "1g") + executor_cores: Number of CPU cores per executor (default: 1) + executor_memory: Memory per executor, e.g., "1g", "2g" (default: "1g") + num_executors: Number of executors (default: 2) + queue: Queue/namespace for submission (backend-specific, optional) + arguments: Command-line arguments for the main file (optional) + python_version: Python version for PySpark: "2" or "3" (default: "3") + spark_conf: Spark configuration properties (spark.*), optional + hadoop_conf: Hadoop configuration properties, optional + env_vars: Environment variables for driver and executors, optional + deps: Dependencies dict with keys: "jars", "pyFiles", "files", optional + **kwargs: Additional backend-specific parameters (e.g., volumes, GPUs) + + Returns: + SparkApplicationResponse with submission_id and initial status + + Raises: + RuntimeError: If submission fails + TimeoutError: If submission times out + ValueError: If invalid parameters provided + + Example: + ```python + # With explicit name + response = client.submit_application( + app_name="data-processing", + main_application_file="s3a://my-bucket/jobs/process.py", + driver_cores=2, + driver_memory="4g", + ) + + # With auto-generated name (recommended) + response = client.submit_application( + main_application_file="s3a://my-bucket/jobs/process.py", + driver_cores=2, + driver_memory="4g", + ) + print(f"Submitted: {response.submission_id}") + ``` + """ + # Auto-generate name if not provided (similar to TrainerClient) + if app_name is None: + import secrets + import string + # Generate a random 12-character alphanumeric name + app_name = "spark-" + "".join( + secrets.choice(string.ascii_lowercase + string.digits) for _ in range(12) + ) + self._logger.info(f"Auto-generated application name: {app_name}") + + return self._backend.submit_application( + app_name=app_name, + main_application_file=main_application_file, + spark_version=spark_version, + app_type=app_type, + driver_cores=driver_cores, + driver_memory=driver_memory, + executor_cores=executor_cores, + executor_memory=executor_memory, + num_executors=num_executors, + queue=queue, + arguments=arguments, + python_version=python_version, + spark_conf=spark_conf, + hadoop_conf=hadoop_conf, + env_vars=env_vars, + deps=deps, + **kwargs, + ) + + def get_job(self, submission_id: str) -> ApplicationStatus: + """Get the Spark job object of a Spark application. + + Args: + submission_id: Submission ID returned from submit_application() + + Returns: + ApplicationStatus object with current state, timestamps, and metadata + + Raises: + RuntimeError: If request fails + ValueError: If submission_id not found + + Example: + ```python + status = client.get_job("spark-pi-12345") + print(f"State: {status.state}") + print(f"App ID: {status.app_id}") + ``` + """ + return self._backend.get_status(submission_id) + + def delete_job(self, submission_id: str) -> dict[str, Any]: + """Delete the Spark job. + + This terminates a running application or removes a completed one. 
+ + Args: + submission_id: Submission ID to delete + + Returns: + Dictionary with deletion response + + Raises: + RuntimeError: If deletion fails + ValueError: If submission_id not found + + Example: + ```python + response = client.delete_job("spark-pi-12345") + print(f"Deleted: {response}") + ``` + """ + return self._backend.delete_application(submission_id) + + def get_job_logs( + self, + submission_id: str, + executor_id: Optional[str] = None, + follow: bool = False, + ) -> Iterator[str]: + """Get application logs. + + Args: + submission_id: Submission ID + executor_id: Optional executor ID (if not provided, returns driver logs) + follow: If True, stream logs in real-time (tail -f behavior) + + Yields: + Log lines as strings + + Raises: + RuntimeError: If request fails + ValueError: If submission_id or executor_id not found + + Example: + ```python + # Get driver logs + for line in client.get_job_logs("spark-pi-12345"): + print(line) + + # Get specific executor logs + for line in client.get_job_logs("spark-pi-12345", executor_id="1"): + print(line) + + # Stream logs in real-time + for line in client.get_job_logs("spark-pi-12345", follow=True): + print(line) + ``` + """ + return self._backend.get_logs(submission_id, executor_id, follow) + + def list_jobs( + self, + namespace: Optional[str] = None, + labels: Optional[dict[str, str]] = None, + ) -> list[ApplicationStatus]: + """List Spark jobs with optional filtering. + + Args: + namespace: Optional namespace/queue filter + labels: Optional label filters (key-value pairs) + + Returns: + List of Spark jobs + + Raises: + RuntimeError: If request fails + + Example: + ```python + # List all jobs + apps = client.list_jobs() + + # List in specific namespace + apps = client.list_jobs(namespace="production") + + # Filter by labels + apps = client.list_jobs(labels={"team": "data-eng"}) + ``` + """ + return self._backend.list_applications(namespace, labels) + + def wait_for_job_status( + self, + submission_id: str, + timeout: int = 3600, + polling_interval: int = 10, + ) -> ApplicationStatus: + """Wait for Spark application to complete. + + This method blocks until the application reaches a terminal state + (COMPLETED, FAILED, SUBMISSION_FAILED, KILLED) or timeout is reached. + + Args: + submission_id: Submission ID to monitor + timeout: Maximum time to wait in seconds (default: 3600 = 1 hour) + polling_interval: Polling interval in seconds (default: 10) + + Returns: + Final ApplicationStatus + + Raises: + TimeoutError: If application doesn't complete within timeout + RuntimeError: If monitoring fails + ValueError: If submission_id not found + + Example: + ```python + # Wait with defaults (1 hour timeout) + status = client.wait_for_job_status("spark-pi-12345") + + # Custom timeout and polling + status = client.wait_for_job_status( + "spark-pi-12345", + timeout=1800, # 30 minutes + polling_interval=5, # Poll every 5 seconds + ) + + if status.state == ApplicationState.COMPLETED: + print("Success!") + else: + print(f"Failed with state: {status.state}") + ``` + """ + return self._backend.wait_for_completion(submission_id, timeout, polling_interval) + + def wait_for_pod_ready( + self, + submission_id: str, + executor_id: Optional[str] = None, + timeout: int = 300, + ) -> bool: + """Wait for driver or executor pod to be ready. + + Note: This method is only available when using OperatorBackend. 
+ + Args: + submission_id: Submission ID + executor_id: Optional executor ID (if not provided, waits for driver) + timeout: Maximum time to wait in seconds (default: 300 = 5 minutes) + + Returns: + True if pod becomes ready, False if timeout + + Raises: + NotImplementedError: If backend doesn't support this operation + RuntimeError: If request fails + + Example: + ```python + # Wait for driver pod + if client.wait_for_pod_ready("spark-pi-12345"): + print("Driver is ready") + + # Wait for specific executor + if client.wait_for_pod_ready("spark-pi-12345", executor_id="1"): + print("Executor 1 is ready") + ``` + """ + if isinstance(self._backend, OperatorBackend): + return self._backend.wait_for_pod_ready(submission_id, executor_id, timeout) + else: + raise NotImplementedError( + f"{self._backend.__class__.__name__} does not support wait_for_pod_ready(). " + "This method is only available with OperatorBackend." + ) diff --git a/kubeflow/spark/config.py b/kubeflow/spark/config.py new file mode 100644 index 000000000..24f34bf5b --- /dev/null +++ b/kubeflow/spark/config.py @@ -0,0 +1,11 @@ +"""Configuration for Spark Client.""" + +from enum import Enum + + +class AuthMethod(Enum): + """Authentication methods supported by Batch Processing Gateway.""" + + BASIC = "basic" + HEADER = "header" + NONE = "none" diff --git a/kubeflow/spark/models.py b/kubeflow/spark/models.py new file mode 100644 index 000000000..acde6bdef --- /dev/null +++ b/kubeflow/spark/models.py @@ -0,0 +1,831 @@ +"""Data models for Spark application requests and responses.""" + +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Optional + + +class ApplicationState(Enum): + """Spark application states matching Spark Operator CRD states.""" + + # Standard states from Spark Operator (v1beta2) + NEW = "NEW" + SUBMITTED = "SUBMITTED" + RUNNING = "RUNNING" + COMPLETED = "COMPLETED" + FAILED = "FAILED" + SUBMISSION_FAILED = "SUBMISSION_FAILED" + PENDING_RERUN = "PENDING_RERUN" + INVALIDATING = "INVALIDATING" + SUCCEEDING = "SUCCEEDING" + FAILING = "FAILING" + SUSPENDING = "SUSPENDING" + SUSPENDED = "SUSPENDED" + RESUMING = "RESUMING" + UNKNOWN = "UNKNOWN" + + +class RestartPolicyType(Enum): + """Restart policy types from operator.""" + + NEVER = "Never" + ON_FAILURE = "OnFailure" + ALWAYS = "Always" + + +class DeployMode(Enum): + """Deployment modes for Spark applications.""" + + CLUSTER = "cluster" + CLIENT = "client" + IN_CLUSTER_CLIENT = "in-cluster-client" + + +@dataclass +class RestartPolicy: + """Restart policy configuration (matches operator RestartPolicy). + + Attributes: + type: Type of restart policy + on_failure_retries: Number of times to retry on failure + on_failure_retry_interval: Interval in seconds between failure retries + on_submission_failure_retries: Number of times to retry on submission failure + on_submission_failure_retry_interval: Interval in seconds between submission retries + """ + + type: RestartPolicyType = RestartPolicyType.NEVER + on_failure_retries: Optional[int] = None + on_failure_retry_interval: int = 5 # Default from operator + on_submission_failure_retries: Optional[int] = None + on_submission_failure_retry_interval: int = 5 # Default from operator + + +@dataclass +class GPUSpec: + """GPU specification for driver or executor. 
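+
+    Example (illustrative; the resource name depends on the cluster's device plugin):
+
+        GPUSpec(name="nvidia.com/gpu", quantity=1)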
+ + Attributes: + name: GPU resource name (e.g., "nvidia.com/gpu", "amd.com/gpu") + quantity: Number of GPUs to request + """ + + name: str + quantity: int + + +@dataclass +class DynamicAllocation: + """Dynamic allocation configuration (Spark 3.0+). + + Attributes: + enabled: Whether dynamic allocation is enabled + initial_executors: Initial number of executors + min_executors: Minimum number of executors + max_executors: Maximum number of executors + shuffle_tracking_enabled: Enable shuffle tracking (default true if dynamic allocation enabled) + shuffle_tracking_timeout: Timeout in milliseconds for shuffle tracking + """ + + enabled: bool = False + initial_executors: Optional[int] = None + min_executors: Optional[int] = None + max_executors: Optional[int] = None + shuffle_tracking_enabled: Optional[bool] = True + shuffle_tracking_timeout: Optional[int] = None + + +@dataclass +class BatchSchedulerConfig: + """Batch scheduler configuration (Volcano, Yunikorn). + + Attributes: + queue: Resource queue name + priority_class_name: Kubernetes PriorityClass name + """ + + queue: Optional[str] = None + priority_class_name: Optional[str] = None + + +@dataclass +class PrometheusSpec: + """Prometheus JMX exporter configuration. + + Attributes: + jmx_exporter_jar: Path to Prometheus JMX exporter jar + port: Port for Prometheus JMX exporter (default 8090) + port_name: Port name (default "jmx-exporter") + config_file: Path to custom Prometheus config file + configuration: Prometheus configuration content + """ + + jmx_exporter_jar: str + port: int = 8090 + port_name: str = "jmx-exporter" + config_file: Optional[str] = None + configuration: Optional[str] = None + + +@dataclass +class MonitoringSpec: + """Monitoring configuration. + + Attributes: + expose_driver_metrics: Whether to expose driver metrics + expose_executor_metrics: Whether to expose executor metrics + metrics_properties: Content of metrics.properties file + metrics_properties_file: Path to metrics.properties file + prometheus: Prometheus configuration + """ + + expose_driver_metrics: bool = False + expose_executor_metrics: bool = False + metrics_properties: Optional[str] = None + metrics_properties_file: Optional[str] = None + prometheus: Optional[PrometheusSpec] = None + + +@dataclass +class SparkUIConfiguration: + """Spark UI service and ingress configuration. + + Attributes: + service_port: Service port (different from target port) + service_port_name: Service port name (default "spark-driver-ui-port") + service_type: Kubernetes service type (default ClusterIP) + service_annotations: Service annotations + service_labels: Service labels + ingress_annotations: Ingress annotations + ingress_tls: Ingress TLS configuration + """ + + service_port: Optional[int] = None + service_port_name: str = "spark-driver-ui-port" + service_type: str = "ClusterIP" + service_annotations: dict[str, str] = field(default_factory=dict) + service_labels: dict[str, str] = field(default_factory=dict) + ingress_annotations: dict[str, str] = field(default_factory=dict) + ingress_tls: Optional[list[dict[str, Any]]] = None + + +@dataclass +class SparkApplicationRequest: + """Request model for Spark application submission (enhanced to match operator v1beta2). 
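+
+    Example (minimal, illustrative request; the file path is a placeholder):
+
+        request = SparkApplicationRequest(
+            app_name="pi",
+            main_application_file="local:///opt/spark/examples/src/main/python/pi.py",
+            num_executors=2,
+        )
+        crd = request.to_dict()  # operator-ready SparkApplication dictionary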
+ + Attributes: + # === Basic Configuration === + app_name: Name of the Spark application + main_application_file: Path to main application file (S3 or local) + spark_version: Spark version to use + app_type: Application type (Python, Scala, Java, R) + + # === Resource Configuration === + driver_cores: Number of cores for driver + driver_memory: Memory for driver (e.g., "4g") + executor_cores: Number of cores per executor + executor_memory: Memory per executor (e.g., "8g") + num_executors: Number of executors + + # === Application Configuration === + arguments: Application arguments + main_class: Main class for Java/Scala applications + python_version: Python version (for PySpark apps) + spark_conf: Spark configuration properties + hadoop_conf: Hadoop configuration properties + env_vars: Environment variables + deps: Dependencies (jars, py files, files) + + # === Advanced Configuration (NEW) === + mode: Deployment mode (cluster, client, in-cluster-client) + image: Container image (overrides default) + image_pull_policy: Image pull policy (IfNotPresent, Always, Never) + image_pull_secrets: List of image pull secret names + + # === Lifecycle & Resilience (NEW) === + suspend: Suspend the application (pause execution) + restart_policy: Restart policy configuration + time_to_live_seconds: TTL for auto-cleanup after termination + + # === GPU Support (NEW) === + driver_gpu: GPU specification for driver + executor_gpu: GPU specification for executor + + # === Dynamic Allocation (NEW) === + dynamic_allocation: Dynamic allocation configuration + + # === Monitoring & Observability (NEW) === + monitoring: Monitoring configuration + spark_ui_options: Spark UI configuration + + # === Batch Scheduling (NEW) === + batch_scheduler: Batch scheduler name (volcano, yunikorn) + batch_scheduler_options: Batch scheduler configuration + + # === Networking & Security (NEW) === + service_account: Kubernetes service account + node_selector: Node selector for driver and executor + tolerations: Kubernetes tolerations + affinity: Kubernetes affinity rules + host_network: Use host networking + pod_security_context: Pod security context + security_context: Container security context + + # === Advanced Features (NEW) === + driver_pod_template: Full PodTemplateSpec for driver (Spark 3.0+) + executor_pod_template: Full PodTemplateSpec for executor (Spark 3.0+) + volumes: Kubernetes volumes + driver_volume_mounts: Driver volume mounts + executor_volume_mounts: Executor volume mounts + driver_sidecars: Sidecar containers for driver + executor_sidecars: Sidecar containers for executor + driver_init_containers: Init containers for driver + executor_init_containers: Init containers for executor + + # === Labels & Annotations (NEW) === + labels: Kubernetes labels + driver_labels: Driver-specific labels + executor_labels: Executor-specific labels + annotations: Kubernetes annotations + driver_annotations: Driver-specific annotations + executor_annotations: Executor-specific annotations + + # === Legacy (DEPRECATED, keeping for backward compat) === + queue: Queue to submit to (legacy - use namespace or batch_scheduler_options.queue) + """ + + # === Required Fields === + app_name: str + main_application_file: str + + # === Basic Configuration === + spark_version: str = "3.5.0" + app_type: str = "Python" + mode: DeployMode = DeployMode.CLUSTER + + # === Resource Configuration === + driver_cores: int = 1 + driver_memory: str = "1g" + executor_cores: int = 1 + executor_memory: str = "1g" + num_executors: int = 2 + + # === Application 
Configuration === + arguments: list[str] = field(default_factory=list) + main_class: Optional[str] = None + python_version: str = "3" + spark_conf: dict[str, str] = field(default_factory=dict) + hadoop_conf: dict[str, str] = field(default_factory=dict) + env_vars: dict[str, str] = field(default_factory=dict) + deps: Optional[dict[str, list[str]]] = None + + # === Image Configuration === + image: Optional[str] = None + image_pull_policy: str = "IfNotPresent" + image_pull_secrets: list[str] = field(default_factory=list) + + # === Lifecycle & Resilience === + suspend: Optional[bool] = None + restart_policy: RestartPolicy = field(default_factory=RestartPolicy) + time_to_live_seconds: Optional[int] = None + + # === GPU Support === + driver_gpu: Optional[GPUSpec] = None + executor_gpu: Optional[GPUSpec] = None + + # === Dynamic Allocation === + dynamic_allocation: Optional[DynamicAllocation] = None + + # === Monitoring & Observability === + monitoring: Optional[MonitoringSpec] = None + spark_ui_options: Optional[SparkUIConfiguration] = None + + # === Batch Scheduling === + batch_scheduler: Optional[str] = None + batch_scheduler_options: Optional[BatchSchedulerConfig] = None + + # === Networking & Security === + service_account: str = "spark-operator-spark" + node_selector: dict[str, str] = field(default_factory=dict) + tolerations: list[dict[str, Any]] = field(default_factory=list) + affinity: Optional[dict[str, Any]] = None + host_network: Optional[bool] = None + pod_security_context: Optional[dict[str, Any]] = None + security_context: Optional[dict[str, Any]] = None + + # === Pod Templates (Spark 3.0+) === + driver_pod_template: Optional[dict[str, Any]] = None + executor_pod_template: Optional[dict[str, Any]] = None + + # === Volumes === + volumes: list[dict[str, Any]] = field(default_factory=list) + driver_volume_mounts: list[dict[str, Any]] = field(default_factory=list) + executor_volume_mounts: list[dict[str, Any]] = field(default_factory=list) + + # === Sidecars & Init Containers === + driver_sidecars: list[dict[str, Any]] = field(default_factory=list) + executor_sidecars: list[dict[str, Any]] = field(default_factory=list) + driver_init_containers: list[dict[str, Any]] = field(default_factory=list) + executor_init_containers: list[dict[str, Any]] = field(default_factory=list) + + # === Labels & Annotations === + labels: dict[str, str] = field(default_factory=dict) + driver_labels: dict[str, str] = field(default_factory=dict) + executor_labels: dict[str, str] = field(default_factory=dict) + annotations: dict[str, str] = field(default_factory=dict) + driver_annotations: dict[str, str] = field(default_factory=dict) + executor_annotations: dict[str, str] = field(default_factory=dict) + + # === Legacy === + queue: str = "poc" + + def to_dict(self) -> dict[str, Any]: + """Convert request to dictionary for operator-compliant SparkApplication CRD. 
+ + Returns: + Dictionary representation matching operator's v1beta2 SparkApplication schema + """ + # === Build metadata === + metadata = {"name": self.app_name} + + if self.labels: + metadata["labels"] = self.labels.copy() + if self.annotations: + metadata["annotations"] = self.annotations.copy() + + # === Build spec === + spec: dict[str, Any] = { + "type": self.app_type, + "mode": self.mode.value if isinstance(self.mode, DeployMode) else self.mode, + "mainApplicationFile": self.main_application_file, + "sparkVersion": self.spark_version, + } + + # Image configuration + if self.image: + spec["image"] = self.image + else: + # Default image based on app type + if self.app_type.lower() == "python": + spec["image"] = f"gcr.io/spark-operator/spark-py:{self.spark_version}" + else: + spec["image"] = f"gcr.io/spark-operator/spark:{self.spark_version}" + + spec["imagePullPolicy"] = self.image_pull_policy + if self.image_pull_secrets: + spec["imagePullSecrets"] = self.image_pull_secrets + + # Main class for Java/Scala + if self.main_class: + spec["mainClass"] = self.main_class + + # Python version + if self.app_type.lower() == "python" and self.python_version: + spec["pythonVersion"] = self.python_version + + # === Lifecycle & Resilience === + if self.suspend is not None: + spec["suspend"] = self.suspend + + if self.time_to_live_seconds is not None: + spec["timeToLiveSeconds"] = self.time_to_live_seconds + + # Restart policy + restart_policy_dict = {"type": self.restart_policy.type.value} + if self.restart_policy.on_failure_retries is not None: + restart_policy_dict["onFailureRetries"] = self.restart_policy.on_failure_retries + if self.restart_policy.on_failure_retry_interval: + restart_policy_dict["onFailureRetryInterval"] = ( + self.restart_policy.on_failure_retry_interval + ) + if self.restart_policy.on_submission_failure_retries is not None: + restart_policy_dict["onSubmissionFailureRetries"] = ( + self.restart_policy.on_submission_failure_retries + ) + if self.restart_policy.on_submission_failure_retry_interval: + restart_policy_dict["onSubmissionFailureRetryInterval"] = ( + self.restart_policy.on_submission_failure_retry_interval + ) + spec["restartPolicy"] = restart_policy_dict + + # === Configuration === + if self.arguments: + spec["arguments"] = self.arguments + + if self.spark_conf: + spec["sparkConf"] = self.spark_conf.copy() + + if self.hadoop_conf: + spec["hadoopConf"] = self.hadoop_conf.copy() + + if self.deps: + spec["deps"] = self.deps + + # === Batch Scheduling === + if self.batch_scheduler: + spec["batchScheduler"] = self.batch_scheduler + + if self.batch_scheduler_options: + batch_opts = {} + if self.batch_scheduler_options.queue: + batch_opts["queue"] = self.batch_scheduler_options.queue + if self.batch_scheduler_options.priority_class_name: + batch_opts["priorityClassName"] = self.batch_scheduler_options.priority_class_name + if batch_opts: + spec["batchSchedulerOptions"] = batch_opts + + # === Monitoring === + if self.monitoring: + mon_spec = { + "exposeDriverMetrics": self.monitoring.expose_driver_metrics, + "exposeExecutorMetrics": self.monitoring.expose_executor_metrics, + } + if self.monitoring.metrics_properties: + mon_spec["metricsProperties"] = self.monitoring.metrics_properties + if self.monitoring.metrics_properties_file: + mon_spec["metricsPropertiesFile"] = self.monitoring.metrics_properties_file + if self.monitoring.prometheus: + prom_spec = { + "jmxExporterJar": self.monitoring.prometheus.jmx_exporter_jar, + "port": self.monitoring.prometheus.port, + "portName": 
self.monitoring.prometheus.port_name, + } + if self.monitoring.prometheus.config_file: + prom_spec["configFile"] = self.monitoring.prometheus.config_file + if self.monitoring.prometheus.configuration: + prom_spec["configuration"] = self.monitoring.prometheus.configuration + mon_spec["prometheus"] = prom_spec + spec["monitoring"] = mon_spec + + # === Spark UI === + if self.spark_ui_options: + ui_opts = {} + if self.spark_ui_options.service_port: + ui_opts["servicePort"] = self.spark_ui_options.service_port + if self.spark_ui_options.service_port_name: + ui_opts["servicePortName"] = self.spark_ui_options.service_port_name + if self.spark_ui_options.service_type: + ui_opts["serviceType"] = self.spark_ui_options.service_type + if self.spark_ui_options.service_annotations: + ui_opts["serviceAnnotations"] = self.spark_ui_options.service_annotations + if self.spark_ui_options.service_labels: + ui_opts["serviceLabels"] = self.spark_ui_options.service_labels + if self.spark_ui_options.ingress_annotations: + ui_opts["ingressAnnotations"] = self.spark_ui_options.ingress_annotations + if self.spark_ui_options.ingress_tls: + ui_opts["ingressTLS"] = self.spark_ui_options.ingress_tls + if ui_opts: + spec["sparkUIOptions"] = ui_opts + + # === Dynamic Allocation === + if self.dynamic_allocation and self.dynamic_allocation.enabled: + dyn_alloc = {"enabled": True} + if self.dynamic_allocation.initial_executors is not None: + dyn_alloc["initialExecutors"] = self.dynamic_allocation.initial_executors + if self.dynamic_allocation.min_executors is not None: + dyn_alloc["minExecutors"] = self.dynamic_allocation.min_executors + if self.dynamic_allocation.max_executors is not None: + dyn_alloc["maxExecutors"] = self.dynamic_allocation.max_executors + if self.dynamic_allocation.shuffle_tracking_enabled is not None: + dyn_alloc["shuffleTrackingEnabled"] = ( + self.dynamic_allocation.shuffle_tracking_enabled + ) + if self.dynamic_allocation.shuffle_tracking_timeout is not None: + dyn_alloc["shuffleTrackingTimeout"] = ( + self.dynamic_allocation.shuffle_tracking_timeout + ) + spec["dynamicAllocation"] = dyn_alloc + + # === Volumes === + if self.volumes: + spec["volumes"] = self.volumes + + # === Node Selector (spec-level) === + if self.node_selector: + spec["nodeSelector"] = self.node_selector + + # === Driver Spec === + driver_spec = { + "cores": self.driver_cores, + "memory": self.driver_memory, + "serviceAccount": self.service_account, + } + + # Driver labels & annotations + driver_labels = {"version": self.spark_version} + if self.driver_labels: + driver_labels.update(self.driver_labels) + driver_spec["labels"] = driver_labels + + if self.driver_annotations: + driver_spec["annotations"] = self.driver_annotations + + # Driver pod template (Spark 3.0+) + if self.driver_pod_template: + driver_spec["template"] = self.driver_pod_template + + # Driver GPU + if self.driver_gpu: + driver_spec["gpu"] = { + "name": self.driver_gpu.name, + "quantity": self.driver_gpu.quantity, + } + + # Driver volumes + if self.driver_volume_mounts: + driver_spec["volumeMounts"] = self.driver_volume_mounts + + # Driver environment + if self.env_vars: + driver_spec["env"] = [{"name": k, "value": v} for k, v in self.env_vars.items()] + + # Driver sidecars & init containers + if self.driver_sidecars: + driver_spec["sidecars"] = self.driver_sidecars + if self.driver_init_containers: + driver_spec["initContainers"] = self.driver_init_containers + + # Driver tolerations, affinity, security + if self.tolerations: + driver_spec["tolerations"] = 
self.tolerations + if self.affinity: + driver_spec["affinity"] = self.affinity + if self.pod_security_context: + driver_spec["podSecurityContext"] = self.pod_security_context + if self.security_context: + driver_spec["securityContext"] = self.security_context + if self.host_network is not None: + driver_spec["hostNetwork"] = self.host_network + + spec["driver"] = driver_spec + + # === Executor Spec === + executor_spec = { + "cores": self.executor_cores, + "instances": self.num_executors, + "memory": self.executor_memory, + } + + # Executor labels & annotations + executor_labels = {"version": self.spark_version} + if self.executor_labels: + executor_labels.update(self.executor_labels) + executor_spec["labels"] = executor_labels + + if self.executor_annotations: + executor_spec["annotations"] = self.executor_annotations + + # Executor pod template (Spark 3.0+) + if self.executor_pod_template: + executor_spec["template"] = self.executor_pod_template + + # Executor GPU + if self.executor_gpu: + executor_spec["gpu"] = { + "name": self.executor_gpu.name, + "quantity": self.executor_gpu.quantity, + } + + # Executor volumes + if self.executor_volume_mounts: + executor_spec["volumeMounts"] = self.executor_volume_mounts + + # Executor environment + if self.env_vars: + executor_spec["env"] = [{"name": k, "value": v} for k, v in self.env_vars.items()] + + # Executor sidecars & init containers + if self.executor_sidecars: + executor_spec["sidecars"] = self.executor_sidecars + if self.executor_init_containers: + executor_spec["initContainers"] = self.executor_init_containers + + # Executor tolerations, affinity, security (reuse from driver if not overridden) + if self.tolerations: + executor_spec["tolerations"] = self.tolerations + if self.affinity: + executor_spec["affinity"] = self.affinity + if self.pod_security_context: + executor_spec["podSecurityContext"] = self.pod_security_context + if self.security_context: + executor_spec["securityContext"] = self.security_context + if self.host_network is not None: + executor_spec["hostNetwork"] = self.host_network + + spec["executor"] = executor_spec + + # === Build final CRD === + return { + "apiVersion": "sparkoperator.k8s.io/v1beta2", + "kind": "SparkApplication", + "metadata": metadata, + "spec": spec, + } + + +@dataclass +class SparkApplicationResponse: + """Response model for Spark application submission. + + Attributes: + submission_id: Unique submission ID generated by gateway + app_name: Name of the application + status: Current status of the application + message: Additional message + """ + + submission_id: str + app_name: str + status: str = "SUBMITTED" + message: str = "" + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SparkApplicationResponse": + """Create response from API response dictionary. + + Args: + data: Dictionary from API response + + Returns: + SparkApplicationResponse instance + """ + return cls( + submission_id=data.get("submissionId", data.get("submission_id", "")), + app_name=data.get("appName", data.get("app_name", "")), + status=data.get("status", "SUBMITTED"), + message=data.get("message", ""), + ) + + +@dataclass +class ConnectBackendConfig: + """Configuration for Spark Connect backend. + + This backend enables remote connectivity to existing Spark clusters via + Spark Connect protocol (gRPC-based). 
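+
+    Example (illustrative; the URL is a placeholder for a real Spark Connect endpoint):
+
+        config = ConnectBackendConfig(
+            connect_url="sc://spark-connect.default.svc:15002",
+            use_ssl=True,
+        )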
+ + Attributes: + connect_url: Spark Connect URL (format: sc://host:port/;param1=value;param2=value) + token: Bearer token for authentication (enables SSL automatically) + use_ssl: Enable TLS/SSL for secure communication + user_id: User identifier for session management + session_id: Pre-defined session UUID for session sharing + grpc_max_message_size: Maximum gRPC message size in bytes + + # Auto-provisioning (for Kubeflow-managed clusters) + enable_auto_provision: Automatically provision Spark Connect server if not exists + auto_provision_config: SparkApplication config for auto-provisioned server + namespace: Kubernetes namespace for auto-provisioned server + + # Kubeflow integration + enable_monitoring: Enable metrics collection + artifact_staging_path: Path for staging artifacts (JARs, files, etc.) + timeout: Default timeout for operations in seconds + """ + + connect_url: str + token: Optional[str] = None + use_ssl: bool = True + user_id: Optional[str] = None + session_id: Optional[str] = None + grpc_max_message_size: int = 128 * 1024 * 1024 # 128MB default + + # Auto-provisioning + enable_auto_provision: bool = False + auto_provision_config: Optional["SparkApplicationRequest"] = None + namespace: str = "default" + + # Kubeflow integration + enable_monitoring: bool = True + artifact_staging_path: Optional[str] = None + timeout: int = 300 + + +@dataclass +class SessionMetrics: + """Metrics for a Spark Connect session. + + Attributes: + session_id: Session UUID + queries_executed: Number of queries executed + active_queries: Number of currently active queries + artifacts_uploaded: Number of artifacts uploaded + data_read_bytes: Total bytes read + data_written_bytes: Total bytes written + execution_time_ms: Total execution time in milliseconds + """ + + session_id: str + queries_executed: int = 0 + active_queries: int = 0 + artifacts_uploaded: int = 0 + data_read_bytes: int = 0 + data_written_bytes: int = 0 + execution_time_ms: int = 0 + + +@dataclass +class SessionInfo: + """Information about a Spark Connect session. + + Attributes: + session_id: Session UUID + app_name: Application name + user_id: User identifier + created_at: Session creation time + last_activity: Last activity timestamp + state: Session state (active, idle, closed) + metrics: Session metrics + """ + + session_id: str + app_name: str + user_id: Optional[str] = None + created_at: Optional[str] = None + last_activity: Optional[str] = None + state: str = "active" + metrics: Optional[SessionMetrics] = None + + +@dataclass +class ApplicationStatus: + """Status information for a Spark application. + + Attributes: + submission_id: Submission ID + app_id: Spark application ID + app_name: Application name + state: Current state + submission_time: Time of submission + start_time: Start time + completion_time: Completion time + driver_info: Driver pod information + executor_state: Executor states + """ + + submission_id: str + app_id: Optional[str] = None + app_name: Optional[str] = None + state: ApplicationState = ApplicationState.UNKNOWN + submission_time: Optional[str] = None + start_time: Optional[str] = None + completion_time: Optional[str] = None + driver_info: Optional[dict[str, Any]] = None + executor_state: Optional[dict[str, Any]] = None + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ApplicationStatus": + """Create status from API response dictionary. 
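+
+        Example (sketch of an operator-style payload):
+
+            ApplicationStatus.from_dict(
+                {"metadata": {"name": "pi"}, "status": {"applicationState": {"state": "RUNNING"}}}
+            )
+            # -> ApplicationStatus(submission_id="pi", state=ApplicationState.RUNNING, ...)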
+ + Args: + data: Dictionary from API response + + Returns: + ApplicationStatus instance + """ + # Support both Operator and Gateway response formats + if "status" in data and "applicationState" in data.get("status", {}): + # Operator format + state_str = data["status"]["applicationState"].get("state", "UNKNOWN") + app_id = data["status"].get("sparkApplicationId") + submission_time = data["status"].get("submissionTime") + start_time = data["status"].get("lastSubmissionAttemptTime") + completion_time = data["status"].get("terminationTime") + driver_info = data["status"].get("driverInfo") + executor_state = data["status"].get("executorState") + elif "status" in data and "appState" in data.get("status", {}): + # Operator format (alternative field name) + state_str = data["status"].get("appState", {}).get("state", "UNKNOWN") + app_id = data["status"].get("sparkApplicationId") + submission_time = None + start_time = data["status"].get("lastSubmissionAttemptTime") + completion_time = data["status"].get("terminationTime") + driver_info = data["status"].get("driverInfo") + executor_state = data["status"].get("executorState") + else: + # Gateway format or simple format + state_str = data.get("status", "UNKNOWN") + app_id = data.get("app_id") + submission_time = data.get("submission_time") + start_time = data.get("start_time") + completion_time = data.get("completion_time") + driver_info = data.get("driver_info") + executor_state = data.get("executor_state") + + try: + state = ApplicationState(state_str) + except ValueError: + state = ApplicationState.UNKNOWN + + return cls( + submission_id=data.get( + "submissionId", data.get("submission_id", data.get("metadata", {}).get("name", "")) + ), + app_id=app_id, + app_name=data.get("metadata", {}).get("name", data.get("app_name")), + state=state, + submission_time=submission_time, + start_time=start_time, + completion_time=completion_time, + driver_info=driver_info, + executor_state=executor_state, + ) diff --git a/kubeflow/spark/session.py b/kubeflow/spark/session.py new file mode 100644 index 000000000..9a67edec6 --- /dev/null +++ b/kubeflow/spark/session.py @@ -0,0 +1,343 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Managed Spark Connect session wrapper.""" + +import logging +from typing import TYPE_CHECKING, Any, Optional + +from kubeflow.spark.models import SessionInfo, SessionMetrics + +if TYPE_CHECKING: + from kubeflow.spark.backends.connect import ConnectBackend + + # Only import pyspark types for type checking + try: + from pyspark.sql import DataFrame, DataFrameReader, SparkSession + from pyspark.sql.streaming import DataStreamReader + except ImportError: + DataFrame = Any # type: ignore + DataFrameReader = Any # type: ignore + SparkSession = Any # type: ignore + DataStreamReader = Any # type: ignore + +logger = logging.getLogger(__name__) + + +class ManagedSparkSession: + """Kubeflow-managed Spark Connect session. 
+ + This class wraps a native PySpark Connect session and provides additional + Kubeflow-specific functionality like metrics collection, artifact management, + and pipeline integration. + + The underlying PySpark DataFrame API is accessible directly, allowing users + to write standard PySpark code while benefiting from Kubeflow enhancements. + + Example: + ```python + from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + + config = ConnectBackendConfig(connect_url="sc://spark-cluster:15002") + client = SparkSessionClient(backend_config=config) + + # Create session + session = client.create_session(app_name="data-analysis") + + # Use standard PySpark API + df = session.sql("SELECT * FROM table") + result = df.filter(df.status == "active").collect() + + # Kubeflow extensions + metrics = session.get_metrics() + session.export_to_pipeline_artifact(df, "/outputs/data.parquet") + + # Cleanup + session.close() + ``` + """ + + def __init__( + self, + session: "SparkSession", + session_id: str, + app_name: str, + backend: "ConnectBackend", + ): + """Initialize managed session. + + Args: + session: Native PySpark Connect session + session_id: Session UUID + app_name: Application name + backend: ConnectBackend instance for lifecycle management + """ + self._session = session + self._session_id = session_id + self._app_name = app_name + self._backend = backend + self._closed = False + self._metrics = SessionMetrics(session_id=session_id) + + logger.info(f"Created ManagedSparkSession: {session_id} (app: {app_name})") + + # ========================================================================= + # Properties + # ========================================================================= + + @property + def session_id(self) -> str: + """Get session UUID.""" + return self._session_id + + @property + def app_name(self) -> str: + """Get application name.""" + return self._app_name + + @property + def is_closed(self) -> bool: + """Check if session is closed.""" + return self._closed + + @property + def spark(self) -> "SparkSession": + """Get underlying PySpark session. + + Use this to access the full PySpark API directly. + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + return self._session + + # ========================================================================= + # Delegate PySpark DataFrame API + # ========================================================================= + + def sql(self, query: str) -> "DataFrame": + """Execute SQL query and return DataFrame. + + Args: + query: SQL query string + + Returns: + DataFrame with query results + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + self._metrics.queries_executed += 1 + return self._session.sql(query) + + @property + def read(self) -> "DataFrameReader": + """Get DataFrameReader for reading data sources.""" + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + return self._session.read + + @property + def readStream(self) -> "DataStreamReader": + """Get DataStreamReader for reading streaming sources.""" + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + return self._session.readStream + + def createDataFrame(self, data: Any, schema: Any = None) -> "DataFrame": + """Create DataFrame from data. + + Args: + data: Input data (list, pandas DataFrame, RDD, etc.) 
+ schema: Optional schema + + Returns: + DataFrame + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + return self._session.createDataFrame(data, schema) + + def table(self, tableName: str) -> "DataFrame": + """Get DataFrame for a table. + + Args: + tableName: Name of the table + + Returns: + DataFrame + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + return self._session.table(tableName) + + def range( + self, + start: int, + end: Optional[int] = None, + step: int = 1, + numPartitions: Optional[int] = None, + ) -> "DataFrame": + """Create DataFrame with range of numbers. + + Args: + start: Start of range (or end if `end` not provided) + end: End of range (optional) + step: Step size + numPartitions: Number of partitions + + Returns: + DataFrame + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + if end is None: + return self._session.range(start) + return self._session.range(start, end, step, numPartitions) + + # ========================================================================= + # Kubeflow Extensions + # ========================================================================= + + def upload_artifacts(self, *paths: str, pyfile: bool = False) -> None: + """Upload artifacts to Spark Connect session. + + Args: + *paths: File paths to upload (JARs, Python files, data files) + pyfile: If True, treat as Python files (added to sys.path) + + Example: + ```python + # Upload JARs + session.upload_artifacts("/path/to/lib.jar") + + # Upload Python packages + session.upload_artifacts("/path/to/package.zip", pyfile=True) + ``` + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + + try: + # Use PySpark's addArtifacts method + if hasattr(self._session, "addArtifacts"): + self._session.addArtifacts(*paths, pyfile=pyfile) + self._metrics.artifacts_uploaded += len(paths) + logger.info(f"Uploaded {len(paths)} artifacts to session {self._session_id}") + else: + logger.warning("Session does not support artifact upload (requires PySpark 3.4+)") + except Exception as e: + logger.error(f"Failed to upload artifacts: {e}") + raise + + def get_metrics(self) -> SessionMetrics: + """Get session metrics. + + Returns: + SessionMetrics with current statistics + """ + return self._metrics + + def get_info(self) -> SessionInfo: + """Get session information. + + Returns: + SessionInfo with session metadata + """ + return SessionInfo( + session_id=self._session_id, + app_name=self._app_name, + state="closed" if self._closed else "active", + metrics=self._metrics, + ) + + def export_to_pipeline_artifact( + self, df: "DataFrame", path: str, format: str = "parquet", **options: Any + ) -> None: + """Export DataFrame to Kubeflow Pipeline artifact. + + Args: + df: DataFrame to export + path: Output path for artifact + format: Output format (parquet, csv, json, etc.) 
+ **options: Additional write options + + Example: + ```python + df = session.sql("SELECT * FROM sales") + session.export_to_pipeline_artifact(df, "/outputs/sales.parquet") + ``` + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + + try: + writer = df.write.format(format) + for key, value in options.items(): + writer = writer.option(key, value) + writer.save(path) + logger.info(f"Exported DataFrame to {path} (format: {format})") + except Exception as e: + logger.error(f"Failed to export DataFrame: {e}") + raise + + def clone(self) -> "ManagedSparkSession": + """Clone the session with all state. + + Creates a new session that shares the same state (temp views, UDFs, etc.) + but has its own session ID. + + Returns: + New ManagedSparkSession instance + """ + if self._closed: + raise RuntimeError(f"Session {self._session_id} is closed") + + logger.info(f"Cloning session {self._session_id}") + return self._backend._clone_session(self) + + def close(self, release: bool = True) -> None: + """Close the session. + + Args: + release: If True, release session resources on server + """ + if self._closed: + logger.warning(f"Session {self._session_id} already closed") + return + + try: + if release: + # Stop the session + self._session.stop() + logger.info(f"Released session {self._session_id} on server") + self._closed = True + except Exception as e: + logger.error(f"Error closing session {self._session_id}: {e}") + raise + + # ========================================================================= + # Context Manager + # ========================================================================= + + def __enter__(self) -> "ManagedSparkSession": + """Context manager entry.""" + return self + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + """Context manager exit - ensures cleanup.""" + self.close() + + def __repr__(self) -> str: + """String representation.""" + status = "closed" if self._closed else "active" + return f"ManagedSparkSession(id={self._session_id}, app={self._app_name}, status={status})" diff --git a/kubeflow/spark/session_client.py b/kubeflow/spark/session_client.py new file mode 100644 index 000000000..e83b55c22 --- /dev/null +++ b/kubeflow/spark/session_client.py @@ -0,0 +1,251 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Session Spark client for interactive Spark sessions.""" + +from typing import Any + +from kubeflow.spark.base_client import BaseSparkClient +from kubeflow.spark.backends.connect import ( + ConnectBackend, + ConnectBackendConfig, +) +from kubeflow.spark.models import SessionInfo +from kubeflow.spark.session import ManagedSparkSession + + +class SparkSessionClient(BaseSparkClient): + """Client for managing interactive Spark sessions. + + This client provides a high-level API for creating and managing long-lived + Spark Connect sessions for interactive data analysis, exploratory workflows, + and notebook-style development. 
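+
+    Sessions are returned as ManagedSparkSession objects, which wrap the
+    underlying PySpark Connect session and track client-side metrics such as
+    the number of executed queries and uploaded artifacts.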
+ + Supported backends: + - **ConnectBackend**: Connects to Spark clusters via Spark Connect protocol (gRPC) + + Features: + - Interactive SQL queries + - DataFrame API access + - Artifact upload (JARs, Python files) + - Session metrics and monitoring + - Full PySpark API compatibility + + Example: + ```python + from kubeflow.spark import SparkSessionClient, ConnectBackendConfig + + # Initialize client + config = ConnectBackendConfig( + connect_url="sc://spark-cluster.default.svc:15002", + use_ssl=True, + ) + client = SparkSessionClient(backend_config=config) + + # Create session + session = client.create_session(app_name="data-exploration") + + # Use PySpark DataFrame API + df = session.sql("SELECT * FROM sales WHERE date >= '2024-01-01'") + result = df.groupBy("product").sum("amount").collect() + + # Upload artifacts + session.upload_artifacts("/path/to/lib.jar") + + # Get metrics + metrics = session.get_metrics() + print(f"Queries executed: {metrics.queries_executed}") + + # Cleanup + session.close() + ``` + + Context Manager: + ```python + with SparkSessionClient(backend_config=config) as client: + session = client.create_session("my-analysis") + # Use session... + # Cleanup happens automatically + ``` + + Notebook Workflow: + ```python + # Cell 1: Setup + client = SparkSessionClient(backend_config=config) + session = client.create_session("notebook-session") + + # Cell 2: Load data + df = session.read.parquet("s3a://bucket/data/") + + # Cell 3: Analysis + summary = df.describe() + summary.show() + + # Cell 4: Cleanup + session.close() + ``` + """ + + def __init__(self, backend_config: ConnectBackendConfig): + """Initialize Spark Session client. + + Args: + backend_config: ConnectBackendConfig with connection details + + Raises: + ValueError: If invalid backend configuration provided + ImportError: If pyspark[connect] is not installed + """ + if not isinstance(backend_config, ConnectBackendConfig): + raise ValueError( + f"Invalid backend config type for SparkSessionClient: {type(backend_config)}. " + "Expected ConnectBackendConfig." + ) + + # Initialize ConnectBackend + backend = ConnectBackend(backend_config) + + # Initialize base class + super().__init__(backend) + + def create_session( + self, + app_name: str, + **kwargs: Any, + ) -> ManagedSparkSession: + """Create a new Spark Connect session. + + This establishes a connection to a Spark Connect server and returns + a managed session that provides the full PySpark DataFrame API. + + Args: + app_name: Name for the session/application + **kwargs: Additional Spark configuration options + (passed to SparkSession.builder.config) + + Returns: + ManagedSparkSession instance for interactive operations + + Raises: + RuntimeError: If session creation fails + ConnectionError: If cannot connect to Spark Connect server + TimeoutError: If connection times out + + Example: + ```python + # Basic session + session = client.create_session(app_name="data-analysis") + + # Session with custom configuration + session = client.create_session( + app_name="data-analysis", + **{ + "spark.sql.shuffle.partitions": "200", + "spark.sql.adaptive.enabled": "true", + } + ) + + # Use session + df = session.sql("SELECT * FROM table") + result = df.collect() + + # Cleanup + session.close() + ``` + """ + return self._backend.create_session(app_name=app_name, **kwargs) + + def get_session_status(self, session_id: str) -> SessionInfo: + """Get status and metadata of a Spark Connect session. 
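+
+        The returned SessionInfo includes the session state and the
+        client-side metrics collected for that session (see the Example below).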
+ + Args: + session_id: Session UUID (from session.session_id) + + Returns: + SessionInfo with session metadata, state, and metrics + + Raises: + RuntimeError: If request fails + ValueError: If session_id not found + + Example: + ```python + # Create session + session = client.create_session("my-app") + + # Get status + info = client.get_session_status(session.session_id) + print(f"Session ID: {info.session_id}") + print(f"App name: {info.app_name}") + print(f"State: {info.state}") + print(f"Queries executed: {info.metrics.queries_executed}") + print(f"Artifacts uploaded: {info.metrics.artifacts_uploaded}") + ``` + """ + return self._backend.get_session_status(session_id) + + def list_sessions(self) -> list[SessionInfo]: + """List all active Spark Connect sessions. + + Returns: + List of SessionInfo objects for active sessions + + Raises: + RuntimeError: If request fails + + Example: + ```python + # List all sessions + sessions = client.list_sessions() + + for session_info in sessions: + print(f"Session: {session_info.session_id}") + print(f" App: {session_info.app_name}") + print(f" State: {session_info.state}") + print(f" Queries: {session_info.metrics.queries_executed}") + ``` + """ + return self._backend.list_sessions() + + def close_session(self, session_id: str, release: bool = True) -> dict[str, Any]: + """Close a Spark Connect session. + + Args: + session_id: Session UUID to close + release: If True, release session resources on server (default: True) + + Returns: + Dictionary with closure response + + Raises: + RuntimeError: If closure fails + ValueError: If session_id not found + + Example: + ```python + # Create session + session = client.create_session("my-app") + session_id = session.session_id + + # Do work... + + # Close session and release resources + response = client.close_session(session_id, release=True) + print(f"Closed: {response}") + + # Alternative: use session.close() directly + session.close() + ``` + """ + return self._backend.close_session(session_id, release) diff --git a/kubeflow/spark/test/__init__.py b/kubeflow/spark/test/__init__.py new file mode 100644 index 000000000..773da3500 --- /dev/null +++ b/kubeflow/spark/test/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Kubeflow Spark client.""" diff --git a/kubeflow/spark/test/test_connect_backend.py b/kubeflow/spark/test/test_connect_backend.py new file mode 100644 index 000000000..fe8e288c8 --- /dev/null +++ b/kubeflow/spark/test/test_connect_backend.py @@ -0,0 +1,342 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for Spark Connect backend and configuration.""" + +import pytest + +from kubeflow.spark.models import ConnectBackendConfig, SessionInfo, SessionMetrics + + +def _is_pyspark_available() -> bool: + """Check if PySpark Connect is available.""" + try: + import pyspark # noqa: F401 + + return True + except ImportError: + return False + + +class TestConnectBackendConfig: + """Tests for ConnectBackendConfig validation and URL building.""" + + def test_valid_basic_config(self): + """Test creating a basic valid configuration.""" + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + + assert config.connect_url == "sc://localhost:15002" + assert config.use_ssl is True + assert config.token is None + assert config.timeout == 300 + + def test_valid_config_with_authentication(self): + """Test configuration with authentication token.""" + config = ConnectBackendConfig( + connect_url="sc://cluster:15002", + token="test-token", + use_ssl=True, + ) + + assert config.token == "test-token" + assert config.use_ssl is True + + def test_valid_config_with_all_options(self): + """Test configuration with all optional parameters.""" + config = ConnectBackendConfig( + connect_url="sc://cluster:15002", + token="test-token", + use_ssl=True, + user_id="testuser", + session_id="test-session-123", + grpc_max_message_size=256 * 1024 * 1024, + enable_auto_provision=False, + namespace="spark-jobs", + enable_monitoring=True, + timeout=600, + ) + + assert config.connect_url == "sc://cluster:15002" + assert config.token == "test-token" + assert config.user_id == "testuser" + assert config.session_id == "test-session-123" + assert config.grpc_max_message_size == 256 * 1024 * 1024 + assert config.namespace == "spark-jobs" + assert config.timeout == 600 + + def test_url_with_parameters(self): + """Test URL with embedded parameters.""" + config = ConnectBackendConfig(connect_url="sc://cluster:15002/;use_ssl=true;token=abc123") + + assert config.connect_url == "sc://cluster:15002/;use_ssl=true;token=abc123" + + def test_kubernetes_service_url(self): + """Test Kubernetes service DNS format.""" + config = ConnectBackendConfig( + connect_url="sc://spark-connect.spark-ns.svc.cluster.local:15002" + ) + + assert config.connect_url == "sc://spark-connect.spark-ns.svc.cluster.local:15002" + + +class TestConnectBackendValidation: + """Tests for ConnectBackend URL validation.""" + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_invalid_url_missing_scheme(self): + """Test that invalid URL (missing sc://) is rejected.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="localhost:15002") + + with pytest.raises(ValueError, match="Invalid Spark Connect URL"): + ConnectBackend(config) + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_invalid_url_missing_port(self): + """Test that URL without port is rejected.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost") + + with 
pytest.raises(ValueError, match="Invalid Spark Connect URL"): + ConnectBackend(config) + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_empty_url(self): + """Test that empty URL is rejected.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="") + + with pytest.raises(ValueError, match="connect_url is required"): + ConnectBackend(config) + + +class TestConnectBackendURLBuilding: + """Tests for connection URL building logic.""" + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_url_building_with_ssl(self): + """Test URL building with SSL enabled.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002", use_ssl=True) + backend = ConnectBackend(config) + + url = backend._build_connection_url() + assert "use_ssl=true" in url + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_url_building_with_token(self): + """Test URL building with authentication token.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002", token="test-token") + backend = ConnectBackend(config) + + url = backend._build_connection_url() + assert "token=test-token" in url + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_url_building_preserves_existing_params(self): + """Test that existing URL parameters are preserved.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002/;custom_param=value") + backend = ConnectBackend(config) + + url = backend._build_connection_url() + assert "custom_param=value" in url + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_url_building_config_overrides_url_params(self): + """Test that config parameters override URL parameters.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig( + connect_url="sc://localhost:15002/;token=url-token", token="config-token" + ) + backend = ConnectBackend(config) + + url = backend._build_connection_url() + assert "token=config-token" in url + assert "token=url-token" not in url + + +class TestSessionMetrics: + """Tests for SessionMetrics model.""" + + def test_default_metrics(self): + """Test default metric values.""" + metrics = SessionMetrics(session_id="test-123") + + assert metrics.session_id == "test-123" + assert metrics.queries_executed == 0 + assert metrics.active_queries == 0 + assert metrics.artifacts_uploaded == 0 + assert metrics.data_read_bytes == 0 + assert metrics.data_written_bytes == 0 + assert metrics.execution_time_ms == 0 + + def test_metrics_with_values(self): + """Test metrics with custom values.""" + metrics = SessionMetrics( + session_id="test-123", + queries_executed=10, + active_queries=2, + artifacts_uploaded=5, + data_read_bytes=1024 * 1024, + data_written_bytes=512 * 1024, + execution_time_ms=5000, + ) + + assert metrics.queries_executed == 10 + assert metrics.active_queries == 2 + assert metrics.artifacts_uploaded == 5 + assert metrics.data_read_bytes == 1024 * 1024 + assert metrics.data_written_bytes == 512 * 1024 + assert metrics.execution_time_ms == 5000 + + +class TestSessionInfo: + """Tests for SessionInfo model.""" + + def 
test_basic_session_info(self): + """Test basic session info creation.""" + info = SessionInfo(session_id="test-123", app_name="test-app") + + assert info.session_id == "test-123" + assert info.app_name == "test-app" + assert info.state == "active" + assert info.user_id is None + assert info.metrics is None + + def test_session_info_with_metrics(self): + """Test session info with metrics.""" + metrics = SessionMetrics(session_id="test-123", queries_executed=5) + info = SessionInfo( + session_id="test-123", app_name="test-app", state="active", metrics=metrics + ) + + assert info.metrics is not None + assert info.metrics.queries_executed == 5 + + def test_session_info_with_all_fields(self): + """Test session info with all fields populated.""" + metrics = SessionMetrics(session_id="test-123") + info = SessionInfo( + session_id="test-123", + app_name="test-app", + user_id="testuser", + created_at="2024-01-01T00:00:00", + last_activity="2024-01-01T01:00:00", + state="active", + metrics=metrics, + ) + + assert info.session_id == "test-123" + assert info.app_name == "test-app" + assert info.user_id == "testuser" + assert info.created_at == "2024-01-01T00:00:00" + assert info.last_activity == "2024-01-01T01:00:00" + assert info.state == "active" + assert info.metrics == metrics + + +class TestConnectBackendBatchMethodsRaiseErrors: + """Test that batch-oriented methods raise NotImplementedError.""" + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_submit_application_raises_error(self): + """Test that submit_application raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with pytest.raises(NotImplementedError, match="batch application submission"): + backend.submit_application( + app_name="test", + main_application_file="test.py", + spark_version="3.5.0", + app_type="Python", + driver_cores=1, + driver_memory="1g", + executor_cores=1, + executor_memory="1g", + num_executors=1, + queue=None, + arguments=None, + python_version="3", + spark_conf=None, + hadoop_conf=None, + env_vars=None, + deps=None, + ) + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_get_status_raises_error(self): + """Test that get_status raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with pytest.raises(NotImplementedError, match="batch application status"): + backend.get_job("test-id") + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_delete_application_raises_error(self): + """Test that delete_application raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with pytest.raises(NotImplementedError, match="batch application deletion"): + backend.delete_job("test-id") + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_get_logs_raises_error(self): + """Test that get_logs raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with 
pytest.raises(NotImplementedError, match="logs retrieval"): + list(backend.get_job_logs("test-id")) + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_list_applications_raises_error(self): + """Test that list_applications raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with pytest.raises(NotImplementedError, match="listing applications"): + backend.list_jobs() + + @pytest.mark.skipif(not _is_pyspark_available(), reason="PySpark Connect not installed") + def test_wait_for_completion_raises_error(self): + """Test that wait_for_completion raises NotImplementedError.""" + from kubeflow.spark.backends.connect import ConnectBackend + + config = ConnectBackendConfig(connect_url="sc://localhost:15002") + backend = ConnectBackend(config) + + with pytest.raises(NotImplementedError, match="application completion"): + backend.wait_for_job_status("test-id") diff --git a/kubeflow/spark/test/test_connect_integration.py b/kubeflow/spark/test/test_connect_integration.py new file mode 100644 index 000000000..b512d923b --- /dev/null +++ b/kubeflow/spark/test/test_connect_integration.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python3 +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Integration tests for Spark Connect backend. + +These tests require a running Spark Connect server. + +Setup: +1. Install PySpark with Connect support: + pip install 'pyspark[connect]>=3.4.0' + +2. Start local Spark Connect server: + $SPARK_HOME/sbin/start-connect-server.sh \ + --packages org.apache.spark:spark-connect_2.12:3.5.0 + + Or using Docker: + docker run -p 15002:15002 apache/spark:3.5.0 \ + /opt/spark/sbin/start-connect-server.sh + +3. 
Run tests: + pytest kubeflow/spark/test/test_connect_integration.py -v + +Environment variables: +- SPARK_CONNECT_URL: Spark Connect URL (default: sc://localhost:15002) +- SKIP_INTEGRATION_TESTS: Set to skip these tests (useful in CI) +""" + +import os +import sys + +import pytest + + +def _is_pyspark_available() -> bool: + """Check if PySpark Connect is available.""" + try: + import pyspark # noqa: F401 + + return True + except ImportError: + return False + + +def _should_skip_integration_tests() -> bool: + """Check if integration tests should be skipped.""" + return os.getenv("SKIP_INTEGRATION_TESTS", "false").lower() == "true" + + +def _get_connect_url() -> str: + """Get Spark Connect URL from environment.""" + return os.getenv("SPARK_CONNECT_URL", "sc://localhost:15002") + + +pytestmark = pytest.mark.skipif( + not _is_pyspark_available() or _should_skip_integration_tests(), + reason="PySpark Connect not installed or integration tests disabled", +) + + +class TestConnectBackendIntegration: + """Integration tests for ConnectBackend with real Spark Connect server.""" + + def test_create_and_close_session(self): + """Test creating and closing a session.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False, timeout=30) + + client = SparkSessionClient(backend_config=config) + + try: + session = client.create_session(app_name="test-session") + assert session is not None + assert session.session_id is not None + assert session.app_name == "test-session" + assert not session.is_closed + + session.close() + assert session.is_closed + finally: + client.close() + + def test_simple_sql_query(self): + """Test executing a simple SQL query.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="sql-test") + + try: + df = session.sql("SELECT 1 AS id, 'test' AS name") + result = df.collect() + + assert len(result) == 1 + assert result[0].id == 1 + assert result[0].name == "test" + finally: + session.close() + + def test_create_dataframe_and_show(self): + """Test creating a DataFrame and showing data.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="dataframe-test") + + try: + data = [ + (1, "Alice", 28), + (2, "Bob", 35), + (3, "Carol", 42), + ] + df = session.createDataFrame(data, ["id", "name", "age"]) + + assert df.count() == 3 + + result = df.collect() + assert len(result) == 3 + assert result[0].name == "Alice" + assert result[1].age == 35 + + print("\nDataFrame content:") + df.show() + finally: + session.close() + + def test_dataframe_transformations(self): + """Test DataFrame transformations (filter, select, groupBy).""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="transform-test") + + try: + data = [ + (1, "Engineering", 100000), + (2, "Engineering", 120000), + (3, "Sales", 80000), + (4, "Sales", 90000), + (5, "Marketing", 85000), + ] + df = session.createDataFrame(data, ["id", 
"department", "salary"]) + + filtered = df.filter(df.salary > 85000) + assert filtered.count() == 4 + + selected = df.select("department", "salary") + assert len(selected.columns) == 2 + + grouped = df.groupBy("department").count() + result = grouped.collect() + assert len(result) == 3 + + print("\nGrouped by department:") + grouped.show() + finally: + session.close() + + def test_session_metrics(self): + """Test session metrics collection.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="metrics-test") + + try: + initial_metrics = session.get_metrics() + assert initial_metrics.queries_executed == 0 + + session.sql("SELECT 1") + session.sql("SELECT 2") + + updated_metrics = session.get_metrics() + assert updated_metrics.queries_executed == 2 + finally: + session.close() + + def test_multiple_sessions(self): + """Test creating multiple concurrent sessions.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session1 = client.create_session(app_name="session-1") + session2 = client.create_session(app_name="session-2") + + try: + assert session1.session_id != session2.session_id + + sessions = client.list_sessions() + assert len(sessions) == 2 + + df1 = session1.sql("SELECT 'session1' AS source") + df2 = session2.sql("SELECT 'session2' AS source") + + assert df1.collect()[0].source == "session1" + assert df2.collect()[0].source == "session2" + finally: + session1.close() + session2.close() + + def test_range_dataframe(self): + """Test creating range DataFrame.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="range-test") + + try: + df = session.range(0, 10, 2) + assert df.count() == 5 + + result = df.collect() + assert result[0].id == 0 + assert result[1].id == 2 + assert result[4].id == 8 + + print("\nRange DataFrame:") + df.show() + finally: + session.close() + + def test_context_manager(self): + """Test session context manager.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + client = SparkSessionClient(backend_config=config) + + with client.create_session(app_name="context-test") as session: + df = session.sql("SELECT 42 AS answer") + result = df.collect() + assert result[0].answer == 42 + + assert session.is_closed + + def test_get_session_info(self): + """Test getting session information.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="info-test") + + try: + info = session.get_info() + assert info.session_id == session.session_id + assert info.app_name == "info-test" + assert info.state == "active" + assert info.metrics is not None + + status = client.get_session_status(session.session_id) + assert status.session_id == session.session_id + assert status.app_name == "info-test" + finally: + 
session.close() + + +class TestConnectBackendErrorHandling: + """Test error handling in ConnectBackend.""" + + def test_connection_to_invalid_server(self): + """Test connection to non-existent server.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig( + connect_url="sc://nonexistent-host:99999", use_ssl=False, timeout=5 + ) + + with SparkSessionClient(backend_config=config) as client: + with pytest.raises(Exception): + client.create_session(app_name="fail-test") + + def test_query_on_closed_session(self): + """Test querying after session is closed.""" + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + with SparkSessionClient(backend_config=config) as client: + session = client.create_session(app_name="closed-test") + session.close() + + with pytest.raises(RuntimeError, match="closed"): + session.sql("SELECT 1") + + +def main(): + """Run integration tests manually.""" + print("=" * 80) + print("Spark Connect Integration Tests") + print("=" * 80) + print(f"\nConnect URL: {_get_connect_url()}") + print(f"PySpark available: {_is_pyspark_available()}") + print(f"Skip integration tests: {_should_skip_integration_tests()}") + + if not _is_pyspark_available(): + print("\nERROR: PySpark Connect not installed!") + print("Install with: pip install 'pyspark[connect]>=3.4.0'") + sys.exit(1) + + if _should_skip_integration_tests(): + print("\nINFO: Integration tests disabled (SKIP_INTEGRATION_TESTS=true)") + sys.exit(0) + + print("\n" + "=" * 80) + print("Running basic connectivity test...") + print("=" * 80) + + try: + from kubeflow.spark import ConnectBackendConfig, SparkSessionClient + + config = ConnectBackendConfig(connect_url=_get_connect_url(), use_ssl=False) + + print(f"\nConnecting to: {_get_connect_url()}") + + with SparkSessionClient(backend_config=config) as client: + print("āœ“ Client created successfully") + + session = client.create_session(app_name="manual-test") + print(f"āœ“ Session created: {session.session_id}") + + try: + print("\n" + "-" * 80) + print("Test 1: Simple SQL Query") + print("-" * 80) + df = session.sql("SELECT 1 AS id, 'Hello Spark Connect!' AS message") + result = df.collect() + print(f"āœ“ Query executed: {result[0].message}") + df.show() + + print("\n" + "-" * 80) + print("Test 2: Create DataFrame") + print("-" * 80) + data = [ + (1, "Alice", 28), + (2, "Bob", 35), + (3, "Carol", 42), + ] + df = session.createDataFrame(data, ["id", "name", "age"]) + print(f"āœ“ DataFrame created with {df.count()} rows") + df.show() + + print("\n" + "-" * 80) + print("Test 3: DataFrame Transformations") + print("-" * 80) + filtered = df.filter(df.age > 30) + print(f"āœ“ Filtered to {filtered.count()} rows (age > 30)") + filtered.show() + + print("\n" + "-" * 80) + print("Test 4: Session Metrics") + print("-" * 80) + metrics = session.get_metrics() + print(f"āœ“ Queries executed: {metrics.queries_executed}") + print(f"āœ“ Active queries: {metrics.active_queries}") + + print("\n" + "=" * 80) + print("All tests passed! 
āœ“") + print("=" * 80) + + finally: + session.close() + print("\nāœ“ Session closed") + + except Exception as e: + print(f"\nāœ— Test failed: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/kubeflow/spark/utils.py b/kubeflow/spark/utils.py new file mode 100644 index 000000000..631f6b571 --- /dev/null +++ b/kubeflow/spark/utils.py @@ -0,0 +1,154 @@ +"""Utility functions for Spark client.""" + +import logging +from typing import Any + +logger = logging.getLogger(__name__) + + +def format_memory(memory_mb: int) -> str: + """Format memory in MB to Kubernetes format. + + Args: + memory_mb: Memory in megabytes + + Returns: + Formatted memory string (e.g., "4096m", "4g") + + Example: + >>> format_memory(1024) + '1g' + >>> format_memory(512) + '512m' + """ + if memory_mb >= 1024 and memory_mb % 1024 == 0: + return f"{memory_mb // 1024}g" + return f"{memory_mb}m" + + +def parse_memory(memory_str: str) -> int: + """Parse Kubernetes memory format to MB. + + Args: + memory_str: Memory string (e.g., "4g", "512m") + + Returns: + Memory in megabytes + + Example: + >>> parse_memory("4g") + 4096 + >>> parse_memory("512m") + 512 + """ + memory_str = memory_str.lower().strip() + + if memory_str.endswith("g"): + return int(memory_str[:-1]) * 1024 + elif memory_str.endswith("m"): + return int(memory_str[:-1]) + elif memory_str.endswith("k"): + return int(memory_str[:-1]) // 1024 + else: + # Assume bytes + return int(memory_str) // (1024 * 1024) + + +def validate_spark_config(config: dict[str, Any]) -> bool: + """Validate Spark configuration. + + Args: + config: Spark configuration dictionary + + Returns: + True if valid + + Raises: + ValueError: If configuration is invalid + """ + required_fields = ["app_name", "main_application_file"] + + for field in required_fields: + if field not in config or not config[field]: + raise ValueError(f"Required field '{field}' is missing or empty") + + # Validate resource specifications + if "driver_memory" in config: + try: + parse_memory(config["driver_memory"]) + except Exception as e: + raise ValueError(f"Invalid driver_memory format: {e}") from e + + if "executor_memory" in config: + try: + parse_memory(config["executor_memory"]) + except Exception as e: + raise ValueError(f"Invalid executor_memory format: {e}") from e + + return True + + +def build_s3_path(bucket: str, prefix: str, filename: str) -> str: + """Build S3 path for artifacts. + + Args: + bucket: S3 bucket name + prefix: Prefix/folder path + filename: File name + + Returns: + Complete S3 path + + Example: + >>> build_s3_path("my-bucket", "artifacts/spark", "app.py") + 's3://my-bucket/artifacts/spark/app.py' + """ + prefix = prefix.strip("/") + if prefix: + return f"s3://{bucket}/{prefix}/{filename}" + return f"s3://{bucket}/{filename}" + + +def wait_for_completion( + client: "BatchSparkClient", + submission_id: str, + timeout: int = 3600, + poll_interval: int = 10, +) -> "ApplicationStatus": + """Wait for Spark application to complete. 
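+
+    Polls client.get_job(submission_id) every poll_interval seconds and
+    returns as soon as the application reaches COMPLETED or FAILED.
+
+    Illustrative usage (assumes an already-submitted application ID):
+
+        status = wait_for_completion(client, "my-app", timeout=600)
+        print(f"Final state: {status.state.value}")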
+ + Args: + client: BatchSparkClient instance + submission_id: Submission ID to monitor + timeout: Maximum time to wait in seconds + poll_interval: Polling interval in seconds + + Returns: + Final ApplicationStatus + + Raises: + TimeoutError: If application doesn't complete within timeout + """ + import time + + from kubeflow.spark.models import ApplicationState + + start_time = time.time() + + while True: + status = client.get_job(submission_id) + + if status.state in [ + ApplicationState.COMPLETED, + ApplicationState.FAILED, + ]: + return status + + elapsed = time.time() - start_time + if elapsed >= timeout: + raise TimeoutError(f"Application {submission_id} did not complete within {timeout}s") + + logger.info( + f"Application {submission_id} status: {status.state.value}. Waiting {poll_interval}s..." + ) + time.sleep(poll_interval) diff --git a/kubeflow/spark/validation.py b/kubeflow/spark/validation.py new file mode 100644 index 000000000..d8e74a2d3 --- /dev/null +++ b/kubeflow/spark/validation.py @@ -0,0 +1,461 @@ +"""Validation module for Spark applications (matches operator webhook logic). + +This module provides client-side validation that mirrors the Spark Operator's webhook +validation, allowing for fast failure before submission. + +Key validations: +- Spark version compatibility (e.g., pod templates require Spark 3.0+) +- Resource format validation (memory, CPU) +- Node selector conflicts +- Dynamic allocation configuration +- Port conflicts in driver ingress options +- Dependency paths +""" + +from dataclasses import dataclass, field +from enum import Enum +import logging +import re +from typing import Any, Optional + +from kubeflow.spark.models import SparkApplicationRequest + +logger = logging.getLogger(__name__) + + +class ValidationErrorType(Enum): + """Types of validation errors.""" + + SPARK_VERSION = "spark_version" + RESOURCE_FORMAT = "resource_format" + NODE_SELECTOR_CONFLICT = "node_selector_conflict" + DRIVER_INGRESS_PORTS = "driver_ingress_ports" + DYNAMIC_ALLOCATION = "dynamic_allocation" + DEPENDENCY_PATH = "dependency_path" + REQUIRED_FIELD = "required_field" + INVALID_VALUE = "invalid_value" + + +@dataclass +class ValidationError: + """A single validation error. + + Attributes: + type: Type of validation error + field: Field that failed validation + message: Human-readable error message + value: The invalid value (if applicable) + """ + + type: ValidationErrorType + field: str + message: str + value: Optional[Any] = None + + +@dataclass +class ValidationResult: + """Result of validation checks. + + Attributes: + valid: Whether validation passed + errors: List of validation errors + warnings: List of validation warnings (non-fatal) + """ + + valid: bool + errors: list[ValidationError] = field(default_factory=list) + warnings: list[str] = field(default_factory=list) + + def add_error(self, error: ValidationError): + """Add an error and mark result as invalid.""" + self.errors.append(error) + self.valid = False + + def add_warning(self, message: str): + """Add a non-fatal warning.""" + self.warnings.append(message) + + +class SparkVersionValidator: + """Validates Spark version compatibility (matches operator logic).""" + + @staticmethod + def compare_version(version1: str, version2: str) -> int: + """Compare two semantic versions. 
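+
+        For example, compare_version("3.5.0", "3.0.0") returns 1, while
+        compare_version("3.0", "3.0.0") returns 0 because trailing ".0"
+        components are stripped before comparison.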
+
+        Args:
+            version1: First version string (e.g., "3.5.0")
+            version2: Second version string (e.g., "3.0.0")
+
+        Returns:
+            -1 if version1 < version2, 0 if equal, 1 if version1 > version2
+        """
+
+        def normalize(v):
+            return [int(x) for x in re.sub(r"(\.0+)*$", "", v).split(".")]
+
+        try:
+            parts1 = normalize(version1)
+            parts2 = normalize(version2)
+
+            # Pad shorter version with zeros
+            max_len = max(len(parts1), len(parts2))
+            parts1.extend([0] * (max_len - len(parts1)))
+            parts2.extend([0] * (max_len - len(parts2)))
+
+            for p1, p2 in zip(parts1, parts2):
+                if p1 < p2:
+                    return -1
+                elif p1 > p2:
+                    return 1
+            return 0
+        except (ValueError, AttributeError) as e:
+            logger.warning(f"Failed to compare versions {version1} and {version2}: {e}")
+            return 0
+
+    def validate(self, request: SparkApplicationRequest) -> ValidationResult:
+        """Validate Spark version requirements.
+
+        Checks:
+        - Pod templates require Spark >= 3.0.0 (from operator webhook)
+        - Dynamic allocation features require Spark >= 3.0.0
+
+        Args:
+            request: Spark application request
+
+        Returns:
+            ValidationResult
+        """
+        result = ValidationResult(valid=True)
+
+        # Check pod template requirement (from operator)
+        if (request.driver_pod_template or request.executor_pod_template) and \
+                self.compare_version(request.spark_version, "3.0.0") < 0:
+            result.add_error(
+                ValidationError(
+                    type=ValidationErrorType.SPARK_VERSION,
+                    field="spark_version",
+                    message="Pod template feature requires Spark version 3.0.0 or higher",
+                    value=request.spark_version,
+                )
+            )
+
+        # Check dynamic allocation (Spark 3.0+)
+        if request.dynamic_allocation and request.dynamic_allocation.enabled and \
+                self.compare_version(request.spark_version, "3.0.0") < 0:
+            result.add_warning(
+                "Dynamic allocation on Kubernetes requires Spark 3.0.0+. "
+                f"Your version: {request.spark_version}"
+            )
+
+        return result
+
+
+class ResourceValidator:
+    """Validates resource specifications (memory, CPU)."""
+
+    # Regex patterns for resource formats
+    MEMORY_PATTERN = re.compile(r"^(\d+)(m|M|g|G|k|K|b|B)?$")
+    CORE_LIMIT_PATTERN = re.compile(r"^(\d+)(m)?$")
+
+    @classmethod
+    def validate_memory(cls, memory: str, field_name: str) -> Optional[ValidationError]:
+        """Validate memory format.
+
+        Args:
+            memory: Memory string (e.g., "4g", "512m")
+            field_name: Field name for error reporting
+
+        Returns:
+            ValidationError if invalid, None if valid
+        """
+        if not cls.MEMORY_PATTERN.match(memory):
+            return ValidationError(
+                type=ValidationErrorType.RESOURCE_FORMAT,
+                field=field_name,
+                message=(
+                    f"Invalid memory format: {memory}. Expected a number followed "
+                    "by a unit of m, g, k, or b (e.g., '4g', '512m')"
+                ),
+                value=memory,
+            )
+        return None
+
+    @classmethod
+    def validate_cores(cls, cores: int, field_name: str) -> Optional[ValidationError]:
+        """Validate CPU cores.
+
+        Args:
+            cores: Number of cores
+            field_name: Field name for error reporting
+
+        Returns:
+            ValidationError if invalid, None if valid
+        """
+        if cores < 1:
+            return ValidationError(
+                type=ValidationErrorType.RESOURCE_FORMAT,
+                field=field_name,
+                message=f"CPU cores must be >= 1, got: {cores}",
+                value=cores,
+            )
+        return None
+
+    def validate(self, request: SparkApplicationRequest) -> ValidationResult:
+        """Validate all resource specifications.
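+
+        Driver and executor memory strings are checked against MEMORY_PATTERN,
+        driver and executor cores must be >= 1, and num_executors must be >= 1
+        unless dynamic allocation is enabled.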
+ + Args: + request: Spark application request + + Returns: + ValidationResult + """ + result = ValidationResult(valid=True) + + # Validate driver resources + error = self.validate_memory(request.driver_memory, "driver_memory") + if error: + result.add_error(error) + + error = self.validate_cores(request.driver_cores, "driver_cores") + if error: + result.add_error(error) + + # Validate executor resources + error = self.validate_memory(request.executor_memory, "executor_memory") + if error: + result.add_error(error) + + error = self.validate_cores(request.executor_cores, "executor_cores") + if error: + result.add_error(error) + + # Validate number of executors + if request.num_executors < 1 and not ( + request.dynamic_allocation and request.dynamic_allocation.enabled + ): + result.add_error( + ValidationError( + type=ValidationErrorType.INVALID_VALUE, + field="num_executors", + message=("num_executors must be >= 1 (unless dynamic allocation is enabled)"), + value=request.num_executors, + ) + ) + + return result + + +class NodeSelectorValidator: + """Validates node selector configuration (matches operator webhook).""" + + def validate(self, request: SparkApplicationRequest) -> ValidationResult: + """Validate node selector conflicts. + + From operator webhook: + node selector cannot be defined at both SparkApplication and Driver/Executor + + Args: + request: Spark application request + + Returns: + ValidationResult + """ + result = ValidationResult(valid=True) + + # This check is handled differently in the SDK since we don't have separate + # driver.nodeSelector and executor.nodeSelector fields yet + # The node_selector field applies to both driver and executor + + if request.node_selector and len(request.node_selector) > 0: + result.add_warning( + "node_selector is applied to both driver and executor pods. " + "Use pod templates if you need different selectors per component." + ) + + return result + + +class DynamicAllocationValidator: + """Validates dynamic allocation configuration.""" + + def validate(self, request: SparkApplicationRequest) -> ValidationResult: + """Validate dynamic allocation settings. 
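+
+        The check passes immediately when dynamic allocation is not configured
+        or is disabled.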
+ + Checks: + - If enabled, min_executors <= initial_executors <= max_executors + - Shuffle tracking is enabled by default (operator behavior) + + Args: + request: Spark application request + + Returns: + ValidationResult + """ + result = ValidationResult(valid=True) + + if not request.dynamic_allocation or not request.dynamic_allocation.enabled: + return result + + dyn_alloc = request.dynamic_allocation + + # Validate executor bounds + if dyn_alloc.min_executors is not None and dyn_alloc.max_executors is not None and \ + dyn_alloc.min_executors > dyn_alloc.max_executors: + result.add_error( + ValidationError( + type=ValidationErrorType.DYNAMIC_ALLOCATION, + field="dynamic_allocation", + message=( + f"min_executors ({dyn_alloc.min_executors}) must be <= " + f"max_executors ({dyn_alloc.max_executors})" + ), + value=(f"min={dyn_alloc.min_executors}, max={dyn_alloc.max_executors}"), + ) + ) + + if dyn_alloc.initial_executors is not None: + if ( + dyn_alloc.min_executors is not None + and dyn_alloc.initial_executors < dyn_alloc.min_executors + ): + result.add_error( + ValidationError( + type=ValidationErrorType.DYNAMIC_ALLOCATION, + field="dynamic_allocation.initial_executors", + message=( + f"initial_executors ({dyn_alloc.initial_executors}) " + f"must be >= min_executors ({dyn_alloc.min_executors})" + ), + value=dyn_alloc.initial_executors, + ) + ) + + if ( + dyn_alloc.max_executors is not None + and dyn_alloc.initial_executors > dyn_alloc.max_executors + ): + result.add_error( + ValidationError( + type=ValidationErrorType.DYNAMIC_ALLOCATION, + field="dynamic_allocation.initial_executors", + message=( + f"initial_executors ({dyn_alloc.initial_executors}) " + f"must be <= max_executors ({dyn_alloc.max_executors})" + ), + value=dyn_alloc.initial_executors, + ) + ) + + # Warn if shuffle tracking is disabled (operator enables by default) + if dyn_alloc.shuffle_tracking_enabled is False: + result.add_warning( + "Shuffle tracking is disabled. You may need an external shuffle service. " + "See: https://spark.apache.org/docs/latest/running-on-kubernetes.html" + "#dynamic-resource-allocation" + ) + + return result + + +class SparkApplicationValidator: + """Main validator that orchestrates all validation checks.""" + + def __init__(self): + """Initialize validator with all sub-validators.""" + self.version_validator = SparkVersionValidator() + self.resource_validator = ResourceValidator() + self.node_selector_validator = NodeSelectorValidator() + self.dynamic_allocation_validator = DynamicAllocationValidator() + + def validate_all(self, request: SparkApplicationRequest) -> ValidationResult: + """Run all validation checks. 
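+
+        Runs the version, resource, node selector, and dynamic allocation
+        validators, merges their errors and warnings into a single result,
+        and logs a summary of any failures.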
+ + Args: + request: Spark application request + + Returns: + ValidationResult with all errors and warnings + """ + final_result = ValidationResult(valid=True) + + # Run all validators + validators = [ + self.version_validator, + self.resource_validator, + self.node_selector_validator, + self.dynamic_allocation_validator, + ] + + for validator in validators: + result = validator.validate(request) + final_result.errors.extend(result.errors) + final_result.warnings.extend(result.warnings) + + # Mark as invalid if any errors + if final_result.errors: + final_result.valid = False + + # Log results + if not final_result.valid: + logger.error(f"Validation failed with {len(final_result.errors)} errors:") + for error in final_result.errors: + logger.error(f" [{error.type.value}] {error.field}: {error.message}") + + if final_result.warnings: + logger.warning(f"Validation completed with {len(final_result.warnings)} warnings:") + for warning in final_result.warnings: + logger.warning(f" {warning}") + + return final_result + + def validate_and_raise(self, request: SparkApplicationRequest): + """Validate and raise exception if invalid. + + Args: + request: Spark application request + + Raises: + ValueError: If validation fails + """ + result = self.validate_all(request) + + if not result.valid: + error_messages = [f"{error.field}: {error.message}" for error in result.errors] + raise ValueError( + "Spark application validation failed:\n" + + "\n".join(f" - {msg}" for msg in error_messages) + ) + + +# Convenience function +def validate_spark_application(request: SparkApplicationRequest) -> ValidationResult: + """Validate a Spark application request. + + Args: + request: Spark application request to validate + + Returns: + ValidationResult + + Example: + ```python + from kubeflow.spark import SparkApplicationRequest + from kubeflow.spark.validation import validate_spark_application + + request = SparkApplicationRequest( + app_name="my-app", + main_application_file="local:///app/main.py", + spark_version="2.4.0", # Too old for pod templates! + driver_pod_template={...}, # Will fail validation + ) + + result = validate_spark_application(request) + if not result.valid: + for error in result.errors: + print(f"Error: {error.message}") + ``` + """ + validator = SparkApplicationValidator() + return validator.validate_all(request) diff --git a/pyproject.toml b/pyproject.toml index 570d16f90..37e661a47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,15 @@ docker = [ podman = [ "podman>=5.6.0" ] +spark-gateway = [ + "requests>=2.31.0", + "pyyaml>=6.0", +] +spark-connect = [ + "pyspark[connect]>=3.4.0", + "grpcio>=1.48.0", + "pyarrow>=10.0.0", +] [dependency-groups] dev = [