# Pyspark Machine Learning

Using `pyspark` on Docker exposed `localhost` port to run machine learning jobs

## General Setup 

In [7]:
# This SparkSession is already initialized when launching with pyspark command
from pyspark.sql import SparkSession

# If you want to explicitly create it:
spark = SparkSession.builder \
    .appName("ClassificationExample") \
    .getOrCreate()

# Check if Spark is working
print(f"Spark version: {spark.version}")

# Simple example with classification
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create sample data
data = [(0, 1.0, 0.0, "cat"),
        (1, 0.0, 1.0, "dog"),
        (2, 0.0, 1.0, "dog"),
        (3, 1.0, 0.0, "cat")]
columns = ["id", "feature1", "feature2", "label"]
df = spark.createDataFrame(data, columns)

# Prepare features
feature_cols = ["feature1", "feature2"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Convert string labels to indices
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")

# Create and train model
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=10)

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[assembler, label_indexer, rf])

# Train model
model = pipeline.fit(df)

# Make predictions
predictions = model.transform(df)

# Select example rows to display
predictions.select("id", "label", "prediction").show()

# Evaluate model
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy}")

Spark version: 4.0.0-preview2
+---+-----+----------+
| id|label|prediction|
+---+-----+----------+
|  0|  cat|       0.0|
|  1|  dog|       1.0|
|  2|  dog|       1.0|
|  3|  cat|       0.0|
+---+-----+----------+

Test Accuracy = 1.0


Note that after the cell is executed need to wait a while to get the result to be shown.

## Include Metrics Computation

In [11]:
import time
import os
import psutil # You might need to install this: pip install psutil

# This SparkSession is already initialized when launching with pyspark command
from pyspark.sql import SparkSession

# If you want to explicitly create it:
spark = SparkSession.builder \
    .appName("ClassificationExampleWithMetrics") \
    .getOrCreate()

# Check if Spark is working
print(f"Spark version: {spark.version}")

# Simple example with classification
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create sample data
data = [(0, 1.0, 0.0, "cat"),
        (1, 0.0, 1.0, "dog"),
        (2, 0.0, 1.0, "dog"),
        (3, 1.0, 0.0, "cat")]
columns = ["id", "feature1", "feature2", "label"]
df = spark.createDataFrame(data, columns)

# Prepare features
feature_cols = ["feature1", "feature2"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Convert string labels to indices
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")

# Create model
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=10)

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[assembler, label_indexer, rf])

# --- Measure Training Time ---
start_train_time = time.time()
model = pipeline.fit(df)
end_train_time = time.time()
training_time = end_train_time - start_train_time
print(f"Training Time: {training_time:.4f} seconds")
# --- End Training Time Measurement ---

# --- Measure Prediction Time ---
start_pred_time = time.time()
predictions = model.transform(df)
end_pred_time = time.time()
prediction_time = end_pred_time - start_pred_time
print(f"Prediction Time: {prediction_time:.4f} seconds")
# --- End Prediction Time Measurement ---

# Select example rows to display
print("Sample Predictions:")
predictions.select("id", "label", "prediction", "indexedLabel").show() # Added indexedLabel for clarity

# --- Evaluate Model (Accuracy) ---
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.4f}")
# --- End Evaluation ---

# --- Measure Memory Usage (Driver Process) ---
# Note: This measures the memory usage of the Python driver process,
# not the total cluster memory used by Spark executors.
process = psutil.Process(os.getpid())
memory_info = process.memory_info()
memory_usage_mb = memory_info.rss / (1024 * 1024) # Resident Set Size in MB
print(f"Driver Memory Usage: {memory_usage_mb:.2f} MB")
# --- End Memory Usage Measurement ---

spark.stop()

Spark version: 4.0.0-preview2
Training Time: 4.1157 seconds
Prediction Time: 0.1216 seconds
Sample Predictions:
+---+-----+----------+------------+
| id|label|prediction|indexedLabel|
+---+-----+----------+------------+
|  0|  cat|       0.0|         0.0|
|  1|  dog|       1.0|         1.0|
|  2|  dog|       1.0|         1.0|
|  3|  cat|       0.0|         0.0|
+---+-----+----------+------------+

Test Accuracy = 1.0000
Driver Memory Usage: 195.71 MB


We can see that the training time, prediction time, and drive memory usage are being reported. We can use it to compare against the `scikit-learn` approach.