This workshop demonstrates how to apply an SVM classifier to a multi-class classification problem.
The dataset is the Iris dataset.

In [1]:
# Install pyspark and findspark
!pip install --ignore-install -q pyspark
# Install findspark library
!pip install --ignore-install -q findspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# findspark helps the kernel locate the Spark/PySpark installation
import findspark
# Make pyspark importable from this kernel before any pyspark import runs.
# NOTE(review): with a pip-installed pyspark this call is presumably optional —
# confirm before removing it.
findspark.init()

In [3]:
import sys

# Record the interpreter version the kernel is running, for reproducibility.
# (A bare `sys.version_info` expression in the middle of a cell is neither
# displayed nor stored, so only the printed version string is kept.)
print(sys.version)

3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]


In [4]:
# Mount Google Drive at /content/drive so files stored under it can be read
# by later cells. force_remount=True is passed so the mount is redone even if
# a previous mount exists.
# NOTE(review): this cell depends on google.colab and will only run inside Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.ml.classification import OneVsRest
from pyspark.ml.feature import VectorAssembler, StringIndexer


In [6]:
# Start (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("IrisSVM")
    .getOrCreate()
)

# Read the Iris CSV from the mounted Drive into a DataFrame.
# The first row is treated as the header and column types are inferred.
# Adjust the path below if your copy of the dataset lives elsewhere.
data = spark.read.csv(
    "/content/drive/MyDrive/iris-data.csv",
    header=True,
    inferSchema=True,
)


In [7]:
# Names of the numeric input columns that will be assembled into the feature vector
feature_columns = [
    "sepal length",
    "sepal width",
    "petal length",
    "petal width",
]



In [8]:
# Encode the string "class" column as a numeric "label" column.
# StringIndexer's default ordering is by descending class frequency, so the
# index given to each class depends on the data (NOTE(review): tie-breaking
# between equally frequent classes is version-dependent — confirm if it matters).
indexer = StringIndexer(inputCol="class", outputCol="label")

# Drop any "label" column left over from a previous execution of this cell.
# Without this, re-running the cell fails because the output column already
# exists. drop() is a no-op when the column is absent, so the first run is
# unaffected.
data = data.drop("label")
data = indexer.fit(data).transform(data)

In [9]:
# Combine the individual feature columns into the single "features" vector
# column consumed by the classifier.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Drop a stale "features" column (left by an earlier execution of this cell)
# before transforming; otherwise re-running the cell raises an
# "output column already exists" error. drop() is a no-op when the column is
# absent, so the first run behaves exactly as before.
data = assembler.transform(data.drop("features"))

In [10]:


# Hold out roughly 20% of the rows for testing; the fixed seed keeps the split repeatable
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=123)

# Linear SVM estimator trained against the "label" column, capped at 100 iterations
svm = LinearSVC(labelCol="label", maxIter=100)


In [11]:
# Wrap the SVM in a one-vs-rest (OvR) meta-classifier so it can be applied
# to the multi-class "label" column
ovr_classifier = OneVsRest(labelCol="label", classifier=svm)

In [12]:
# Fit the one-vs-rest classifier on the training split only; the resulting
# model is what later cells use to score the held-out test split
ovr_model = ovr_classifier.fit(train_data)

In [13]:
# Score the held-out test split with the fitted model. Later cells read the
# "prediction" and "label" columns from the resulting DataFrame.
predictions = ovr_model.transform(test_data)

In [14]:
# Compare the "prediction" column against the true "label" column and
# compute overall accuracy on the test predictions
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)


In [15]:
# Report test-set accuracy as a percentage with two decimal places
# (accuracy is scaled by 100 here before formatting)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 93.10%


In [16]:
# MulticlassMetrics works on an RDD, so keep only the two columns it needs
# (prediction first, then label) and expose them as an RDD
prediction_and_label = (
    predictions
    .select("prediction", "label")
    .rdd
)

In [17]:
# Build the metrics helper from the (prediction, label) RDD; it provides the
# confusion matrix and per-class recall used in the cells below
metrics = MulticlassMetrics(prediction_and_label)



In [18]:
# Print the confusion matrix as a dense array.
# NOTE(review): per the MulticlassMetrics docs, predicted classes are in columns
# and classes are ordered by ascending label index — confirm for this Spark version.
print("Confusion Matrix:")
print(metrics.confusionMatrix().toArray())

Confusion Matrix:
[[13.  0.  0.]
 [ 0.  6.  1.]
 [ 0.  1.  8.]]


In [19]:
# Get the recall for label index 0, reported below as the "Setosa" class.
# NOTE(review): this assumes the StringIndexer mapped Setosa to index 0; the
# index assigned to each class depends on the indexer's ordering of the data,
# so verify the mapping rather than relying on the hard-coded 0.
setosa_recall = metrics.recall(0)

# Print the recall for the "Setosa" class, rounded to two decimals
print(f"Recall for Setosa class: {setosa_recall:.2f}")

Recall for Setosa class: 1.00


In [20]:
# Stop the Spark session now that the analysis is finished; cells that use
# `spark` cannot be re-run afterwards without recreating the session
spark.stop()