## Exploration of PySpark

In [1]:
pip install pyspark==3.5.1

Collecting pyspark==3.5.1
  Using cached pyspark-3.5.1.tar.gz (317.0 MB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark==3.5.1)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Building wheels for collected packages: pyspark
  Building wheel^C
[?25canceled
[31mERROR: Operation cancelled by user[0m[31m
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install findspark

Note: you may need to restart the kernel to use updated packages.


In [5]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("example").getOrCreate()

25/03/02 15:10:23 WARN Utils: Your hostname, MacBook-Air-91.local resolves to a loopback address: 127.0.0.1; using 172.20.10.5 instead (on interface en0)
25/03/02 15:10:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/02 15:10:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
df = spark.createDataFrame(
    [
        ("sue", 32),
        ("li", 3),
        ("bob", 75),
        ("heo", 13),
    ],
    ["first_name", "age"],
)

In [9]:
df.show()

                                                                                

+----------+---+
|first_name|age|
+----------+---+
|       sue| 32|
|        li|  3|
|       bob| 75|
|       heo| 13|
+----------+---+



In [11]:
spark.version

'3.5.1'

In [13]:
#pip install pyspark==2.1.2

In [15]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [17]:
data = [
    (1, 2.0, 3.0, 10.0),
    (2, 3.0, 5.0, 15.0),
    (3, 4.0, 7.0, 20.0),
    (4, 5.0, 9.0, 25.0),
]
columns = ["id", "feature1", "feature2", "label"]
df = spark.createDataFrame(data, columns)

In [18]:
assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="features")
df = assembler.transform(df).select("features", "label")

In [20]:
lr = LinearRegression(featuresCol="features", labelCol="label")
model = lr.fit(df)

25/02/25 17:16:36 WARN Instrumentation: [fd061e9e] regParam is zero, which might cause numerical instability and overfitting.
25/02/25 17:16:37 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/02/25 17:16:37 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
25/02/25 17:16:37 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [22]:
print(f"Coefficients: {model.coefficients}")
print(f"Intercept: {model.intercept}")

Coefficients: [5.0,0.0]
Intercept: 0.0


## Test Spark 3

In [9]:
!pip install pyspark==3.0.2



In [1]:
!pip install sparkmeasure



In [3]:
!pip install findspark



In [4]:
import findspark
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
import time
from sparkmeasure import StageMetrics

In [25]:
### Configuration

spark_version = "3.0.2"  # Change this for each test
data_path = "sample_libsvm_data.txt"  # From spark examples
num_iterations = 100

# Initialize Spark with metrics collection
spark = SparkSession.builder \
    .appName(f"LR-Perf-Test-{spark_version}") \
    .config("spark.jars.packages", "ch.cern.sparkmeasure:spark-measure_2.12:0.23") \
    .config("spark.driver.extraJavaOptions", 
           "-Ddev.ludovic.netlib.blas.nativeLib=libopenblas.dylib " +
           "-Ddev.ludovic.netlib.blas.nativeLibPath=/opt/homebrew/Cellar/openblas/0.3.27_1/lib/") \
    .config("spark.executor.extraJavaOptions", 
           "-Ddev.ludovic.netlib.blas.nativeLib=libopenblas.dylib") \
    .getOrCreate()

stagemetrics = StageMetrics(spark)

In [18]:
%%sh
curl -o sample_libsvm_data.txt https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt
mkdir -p data/mllib
cp sample_libsvm_data.txt data/mllib/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  102k  100  102k    0     0   357k      0 --:--:-- --:--:-- --:--:--  358k


In [26]:
### Data Preparation

from pyspark.mllib.util import MLUtils

def load_data(spark_version, spark_session):
    # Use absolute path for reliability
    data_path = "/Users/nishkagovil/Downloads/data/mllib/sample_libsvm_data.txt"  # Update with your actual path
    
    if spark_version.startswith("1."):
        return MLUtils.loadLibSVMFile(spark_session.sparkContext, data_path)
    else:
        return (spark_session.read
                .format("libsvm")
                .option("numFeatures", "692")  # This dataset has 692 features
                .load(data_path))

dataset = load_data(spark_version, spark)

In [27]:
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [28]:
### Training & Metrics Collection

def train_model(data):
    start_time = time.time()
    
    if spark_version.startswith("1."):
        # Spark 1.0 (MLlib RDD API)
        model = LogisticRegressionWithSGD.train(
            data,
            iterations=num_iterations,
            regParam=0.01
        )
    else:
        # Spark 2.0+ (ML DataFrame API)
        lr = LogisticRegression(
            maxIter=num_iterations,
            regParam=0.01
        )
        model = lr.fit(data)
    
    training_time = time.time() - start_time
    return model, training_time

In [29]:
# Execute with metrics collection
stagemetrics.begin()
model, exec_time = train_model(dataset)
metrics_df = stagemetrics.end()

In [36]:
### Results Analysis

print(f"\n{'='*40}")
print(f"Spark {spark_version} Performance Metrics")
print(f"{'='*40}")

print(f"\nTraining Time: {exec_time:.2f}s")

if stagemetrics:
    try:
        # Create DF using current schema
        metrics_df = stagemetrics.create_stagemetrics_DF("PerfStageMetrics")
        
        if metrics_df and metrics_df.count() > 0:
            # Use actual existing columns from error message
            resource_metrics = metrics_df.select(
                "stageDuration", 
                "executorCpuTime",
                "executorRunTime", 
                "peakExecutionMemory"
            ).first()

            print(f"\nResource Utilization:")
            print(f"• Stage Duration: {resource_metrics['stageDuration']/1000:.2f}s")
            print(f"• CPU Time: {resource_metrics['executorCpuTime']/1000:.2f}s")
            print(f"• Peak Memory: {resource_metrics['peakExecutionMemory']/1024**2:.2f} MB")
            print(f"• Executor Runtime: {resource_metrics['executorRunTime']/1000:.2f}s")
        else:
            print("\nNo metrics collected - verify Spark jobs executed")
    except Exception as e:
        print(f"\nMetrics error: {str(e)}")
else:
    print("\nMetrics collector not initialized")



Spark 3.0.2 Performance Metrics

Training Time: 0.72s

Resource Utilization:
• Stage Duration: 0.03s
• CPU Time: 0.02s
• Peak Memory: 0.00 MB
• Executor Runtime: 0.02s


25/03/02 16:38:30 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [37]:
# Cleanup
spark.stop()

## Test Spark 2

In [1]:
!pip install pyspark==2.4.8



In [2]:
!pip install sparkmeasure



In [3]:
!pip install findspark



In [4]:
import findspark
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
import time
from sparkmeasure import StageMetrics

In [5]:
### Configuration

spark_version = "2.4.8"  # Change this for each test
data_path = "sample_libsvm_data.txt"  # From spark examples
num_iterations = 100

# Initialize Spark with metrics collection
spark = SparkSession.builder \
    .appName("LR-Perf-Test-2.4.8") \
    .config("spark.jars.packages", "ch.cern.sparkmeasure:spark-measure_2.11:0.16") \
    .getOrCreate()

stagemetrics = StageMetrics(spark)

In [6]:
# %%sh
# curl -o sample_libsvm_data.txt https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt
# mkdir -p data/mllib
# cp sample_libsvm_data.txt data/mllib/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  102k  100  102k    0     0   739k      0 --:--:-- --:--:-- --:--:--  746k


In [6]:
### Data Preparation

from pyspark.mllib.util import MLUtils

def load_data(spark_version, spark_session):
    # Use file:/// prefix for local paths
    data_path = "file:///Users/nishkagovil/Downloads/data/mllib/sample_libsvm_data.txt"
    
    if spark_version.startswith("1."):
        return MLUtils.loadLibSVMFile(spark_session.sparkContext, data_path)
    else:
        return (spark_session.read
                .format("libsvm")
                .option("numFeatures", "692")
                .load(data_path))
dataset = load_data(spark_version, spark)

In [7]:
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [8]:
### Training & Metrics Collection

def train_model(data):
    start_time = time.time()
    
    if spark_version.startswith("1."):
        # Spark 1.0 (MLlib RDD API)
        model = LogisticRegressionWithSGD.train(
            data,
            iterations=num_iterations,
            regParam=0.01
        )
    else:
        # Spark 2.0+ (ML DataFrame API)
        lr = LogisticRegression(
            maxIter=num_iterations,
            regParam=0.01
        )
        model = lr.fit(data)
    
    training_time = time.time() - start_time
    return model, training_time

In [9]:
# Execute with metrics collection
stagemetrics.begin()
model, exec_time = train_model(dataset)
metrics_df = stagemetrics.end()

In [10]:
### Results Analysis

print(f"\n{'='*40}")
print(f"Spark {spark_version} Performance Metrics")
print(f"{'='*40}")

print(f"\nTraining Time: {exec_time:.2f}s")

if stagemetrics:
    try:
        # Create DF using current schema
        metrics_df = stagemetrics.create_stagemetrics_DF("PerfStageMetrics")
        
        if metrics_df and metrics_df.count() > 0:
            # Use actual existing columns from error message
            resource_metrics = metrics_df.select(
                "stageDuration", 
                "executorCpuTime",
                "executorRunTime", 
                "peakExecutionMemory"
            ).first()

            print(f"\nResource Utilization:")
            print(f"• Stage Duration: {resource_metrics['stageDuration']/1000:.2f}s")
            print(f"• CPU Time: {resource_metrics['executorCpuTime']/1000:.2f}s")
            print(f"• Peak Memory: {resource_metrics['peakExecutionMemory']/1024**2:.2f} MB")
            print(f"• Executor Runtime: {resource_metrics['executorRunTime']/1000:.2f}s")
        else:
            print("\nNo metrics collected - verify Spark jobs executed")
    except Exception as e:
        print(f"\nMetrics error: {str(e)}")
else:
    print("\nMetrics collector not initialized")



Spark 2.4.8 Performance Metrics

Training Time: 4.27s

Resource Utilization:
• Stage Duration: 0.19s
• CPU Time: 0.13s
• Peak Memory: 0.00 MB
• Executor Runtime: 0.15s


In [11]:
# Cleanup
spark.stop()