# Comprehensive Guide: Checking PySpark Version

This notebook demonstrates multiple ways to check the installed PySpark version in different environments.

## 1. Direct Module Import

In [None]:
# Method 1: Direct import
# Work out which message applies first, then print it once at the end.
try:
    import pyspark
    message = f"PySpark version: {pyspark.__version__}"
except AttributeError:
    # The module imported but exposes no __version__ attribute.
    message = "Version attribute not found"
except ImportError:
    # The package cannot be imported in this kernel's environment.
    message = "PySpark is not installed"
print(message)

## 2. Using SparkSession

In [None]:
# Method 2: SparkSession (most common in production)
# `spark` starts as None so the `finally` clause can tell whether a session
# was actually created before something failed.
spark = None
try:
    from pyspark.sql import SparkSession
    spark = SparkSession.builder \
        .appName("VersionCheck") \
        .getOrCreate()

    print(f"PySpark version: {spark.version}")
    # Ask the running context for its UI address instead of assuming port 4040:
    # Spark moves to another port when 4040 is busy, and the UI can be disabled.
    print(f"Spark UI: {spark.sparkContext.uiWebUrl or 'not available'}")

    # Additional Spark info
    print("\nSpark Configuration:")
    print(f"Master: {spark.sparkContext.master}")
    print(f"App Name: {spark.sparkContext.appName}")
    print(f"Application ID: {spark.sparkContext.applicationId}")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Release the session even when one of the lookups above raised.
    if spark is not None:
        spark.stop()

## 3. Using SparkContext

In [None]:
# Method 3: SparkContext (legacy approach)
# `sc` starts as None so the `finally` clause only stops a context that was
# actually obtained.
sc = None
try:
    from pyspark import SparkContext, SparkConf
    conf = SparkConf().setAppName("VersionCheck")
    sc = SparkContext.getOrCreate(conf)

    print(f"PySpark version: {sc.version}")
    print(f"Python version: {sc.pythonVer}")
    # Fetch the JVM-side Option once; it is read through the private `_jsc`
    # handle, so this lookup may break across Spark releases.
    spark_home = sc._jsc.sc().getSparkHome()
    print(f"Spark home: {spark_home.get() if spark_home.isDefined() else 'Not set'}")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Stop the context even when one of the lookups above raised.
    if sc is not None:
        sc.stop()

## 4. Shell Commands in Jupyter

In [None]:
# Method 4a: Using pip
# `pip show` prints the package metadata; grep keeps only the Name and Version lines.
# NOTE(review): `!pip` runs the pip found by the shell, which may not belong to the
# kernel's environment — confirm, or use the `%pip` magic instead.
!pip show pyspark | grep -E "Name:|Version:"

In [None]:
# Method 4b: Using pip list
# Filters the installed-package listing case-insensitively for "pyspark";
# no output means no matching package was listed.
!pip list | grep -i pyspark

In [None]:
# Method 4c: Using conda (if available)
# conda's stderr is discarded and its "#" header lines are filtered out; the echo
# fallback runs when grep selects no lines (conda missing, or no pyspark entry).
!conda list pyspark 2>/dev/null | grep -v "^#" || echo "Conda not available or PySpark not installed via conda"

## 5. System Information

In [None]:
# Method 5: Check system environment
import os
import sys

# Assemble the interpreter and environment report first, then emit it in one go.
report_lines = [
    "System Information:",
    f"Python version: {sys.version}",
    f"Python executable: {sys.executable}",
    f"\nEnvironment variables:",
    f"SPARK_HOME: {os.environ.get('SPARK_HOME', 'Not set')}",
    f"PYSPARK_PYTHON: {os.environ.get('PYSPARK_PYTHON', 'Not set')}",
    f"PYSPARK_DRIVER_PYTHON: {os.environ.get('PYSPARK_DRIVER_PYTHON', 'Not set')}",
]
print("\n".join(report_lines))

# Check if PySpark is in Python path (case-insensitive substring match on each entry)
pyspark_paths = list(filter(lambda entry: 'pyspark' in entry.lower(), sys.path))
if pyspark_paths:
    print(f"\nPySpark in Python path:")
    print("\n".join(f"  - {path}" for path in pyspark_paths))

## 6. Detailed Package Information

In [None]:
# Method 6: Get detailed package info
# Prefer the standard-library importlib.metadata (Python 3.8+). pkg_resources is
# deprecated and is absent from environments that do not install setuptools, so
# it is only used as a fallback on older interpreters.
try:
    try:
        from importlib import metadata as importlib_metadata
    except ImportError:
        importlib_metadata = None

    if importlib_metadata is not None:
        pyspark_pkg = importlib_metadata.distribution('pyspark')
        # pkg_resources' `.key` is the lower-cased project name; mirror that.
        pkg_name = pyspark_pkg.metadata['Name'].lower()
        pkg_location = pyspark_pkg.locate_file('')
        # `requires` is None when nothing is declared. Entries guarded by an
        # `extra == ...` marker are optional extras, which the pkg_resources
        # `requires()` call below does not report either.
        pkg_requires = [
            req for req in (pyspark_pkg.requires or [])
            if 'extra ==' not in req
        ]
    else:
        import pkg_resources
        pyspark_pkg = pkg_resources.get_distribution('pyspark')
        pkg_name = pyspark_pkg.key
        pkg_location = pyspark_pkg.location
        pkg_requires = pyspark_pkg.requires()

    print(f"Package: {pkg_name}")
    print(f"Version: {pyspark_pkg.version}")
    print(f"Location: {pkg_location}")

    # Get dependencies
    print("\nDependencies:")
    for req in pkg_requires:
        print(f"  - {req}")
except Exception as e:
    # Covers both "package not installed" and metadata lookup failures.
    print(f"Could not get package info: {e}")

## 7. Spark Configuration Details

In [None]:
# Method 7: Get Spark configuration details
# `spark` starts as None so the `finally` clause only stops a session that was
# actually created.
spark = None
try:
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName("ConfigCheck").getOrCreate()

    print("Spark Configuration:")
    conf = spark.sparkContext.getConf()
    # Only show settings whose key mentions one of these topics.
    keywords = ('version', 'python', 'memory', 'cores')
    for key, value in sorted(conf.getAll(), key=lambda x: x[0]):
        if any(keyword in key.lower() for keyword in keywords):
            print(f"{key}: {value}")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Release the session even when reading the configuration raised.
    if spark is not None:
        spark.stop()

## 8. Version Compatibility Check

In [None]:
# Method 8: Check version compatibility
import sys

def check_compatibility(pyspark_version=None, python_version=None):
    """Print whether a Python version is supported by a PySpark release line.

    Args:
        pyspark_version: PySpark version string such as ``"3.5.1"``. When
            omitted, the installed ``pyspark.__version__`` is used; if PySpark
            cannot be imported, "PySpark not installed" is printed and the
            function returns.
        python_version: Python ``"major.minor"`` string such as ``"3.10"``.
            When omitted, the running interpreter's version is used.

    Returns:
        None. The verdict is printed, not returned.
    """
    if pyspark_version is None:
        try:
            import pyspark
        except ImportError:
            print("PySpark not installed")
            return
        pyspark_version = pyspark.__version__

    if python_version is None:
        python_version = f"{sys.version_info.major}.{sys.version_info.minor}"

    print(f"PySpark version: {pyspark_version}")
    print(f"Python version: {python_version}")

    # Version compatibility matrix: supported Python minor versions per PySpark
    # release line. Spark 3.5 supports Python 3.8+ and Spark 3.4 supports 3.7+.
    # Verify against the Spark release documentation when adding new lines.
    compatibility = {
        "3.5": ["3.8", "3.9", "3.10", "3.11"],
        "3.4": ["3.7", "3.8", "3.9", "3.10", "3.11"],
        "3.3": ["3.7", "3.8", "3.9", "3.10"],
        "3.2": ["3.6", "3.7", "3.8", "3.9"],
        "3.1": ["3.6", "3.7", "3.8", "3.9"],
        "3.0": ["3.6", "3.7", "3.8"],
        "2.4": ["2.7", "3.4", "3.5", "3.6", "3.7"],
    }

    # Reduce "3.5.1" to its release line "3.5". Slicing (rather than indexing
    # [1]) keeps a version string without a dot from raising IndexError; it
    # simply falls through to the "not available" branch.
    pyspark_major = '.'.join(pyspark_version.split('.')[:2])

    supported_python = compatibility.get(pyspark_major)
    if supported_python is None:
        print(f"ℹ️  Compatibility information not available for PySpark {pyspark_version}")
    elif python_version in supported_python:
        print(f"✅ Python {python_version} is compatible with PySpark {pyspark_version}")
    else:
        print(f"⚠️  Python {python_version} may not be fully compatible with PySpark {pyspark_version}")
        print(f"   Recommended Python versions: {', '.join(supported_python)}")

# Run the check with no arguments and let it print its findings.
check_compatibility()

## 9. Docker Container Commands

In [None]:
# Docker commands to check PySpark version
# The help text is kept as data and emitted line by line; entries that start
# with "\n" produce the blank separator line before each numbered item.
docker_help_lines = (
    "Docker commands to check PySpark version:",
    "\n1. Check in running container:",
    '   docker exec <container_name> python -c "import pyspark; print(pyspark.__version__)"',
    "\n2. Check using spark-submit:",
    "   docker exec <container_name> spark-submit --version",
    "\n3. Check using pip:",
    "   docker exec <container_name> pip show pyspark",
    "\n4. Interactive check:",
    "   docker exec -it <container_name> pyspark",
    "   >>> spark.version",
    "\n5. Check in Docker image:",
    '   docker run --rm <image_name> python -c "import pyspark; print(pyspark.__version__)"',
)
for help_line in docker_help_lines:
    print(help_line)

## 10. Create a Reusable Version Check Function

In [None]:
# Create a reusable version check function
def get_pyspark_info():
    """Get comprehensive PySpark version and environment information.

    Returns:
        dict: Always contains ``installed`` (bool) plus ``spark_home_env`` and
        ``pyspark_python`` (the ``SPARK_HOME`` / ``PYSPARK_PYTHON`` environment
        variables, or ``None`` when unset). When PySpark imports, ``version``
        is added. When a SparkSession can be started, ``spark_version``,
        ``spark_home`` and ``master`` are added as each lookup succeeds. If
        that step fails, ``session_error`` holds the error message so the
        reason for the missing keys is visible.
    """
    import os

    info = {}

    try:
        import pyspark
    except ImportError:
        info['installed'] = False
    else:
        info['installed'] = True
        info['version'] = pyspark.__version__

        # Try to get more details from SparkSession. This stays best-effort:
        # a failure is recorded in the result rather than raised.
        spark = None
        try:
            from pyspark.sql import SparkSession
            spark = SparkSession.builder.appName("InfoCheck").getOrCreate()
            info['spark_version'] = spark.version
            # Fetch the JVM-side Option once instead of calling it twice.
            spark_home = spark.sparkContext._jsc.sc().getSparkHome()
            info['spark_home'] = spark_home.get() if spark_home.isDefined() else None
            info['master'] = spark.sparkContext.master
        except Exception as exc:
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            info['session_error'] = str(exc)
        finally:
            # Stop the session even when one of the lookups above failed.
            if spark is not None:
                try:
                    spark.stop()
                except Exception as exc:
                    # Keep the first recorded error if there already is one.
                    info.setdefault('session_error', str(exc))

    # Environment info
    info['spark_home_env'] = os.environ.get('SPARK_HOME')
    info['pyspark_python'] = os.environ.get('PYSPARK_PYTHON')

    return info

# Display the information
import json

# Gather the details once, serialise them as indented JSON, then print the text.
info = get_pyspark_info()
info_report = json.dumps(info, indent=2)
print(info_report)