In [1]:
import warnings
warnings.filterwarnings("ignore")


In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
spark = SparkSession.builder.appName("Iris Flower Classification").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/01 14:34:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/01 14:34:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
df = spark.read.csv("iris.csv", header=True, inferSchema=True)
df.show(5)

                                                                                

+-------------+------------+-------------+------------+-----------+
|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+-------------+------------+-------------+------------+-----------+
|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [6]:
df.printSchema()

root
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



In [7]:
indexer = StringIndexer(inputCol="Species", outputCol="label")
assembler = VectorAssembler(inputCols=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"], outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
lr = LogisticRegression(featuresCol="scaled_features", labelCol="label")

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import PipelineModel

In [9]:
pipeline = Pipeline(stages=[indexer, assembler, scaler, lr])

In [10]:
(train_data, test_data) = df.randomSplit([0.7, 0.3], seed=42)

In [11]:
pipeline_model = pipeline.fit(train_data)

23/12/01 14:34:30 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [12]:
print("Evaluation")

ps = [str(x).split("_")[0] for x in pipeline.getStages()]

print("Pipeline Stage 1 = ", ps[0])
print("Pipeline Stage 1 = ", ps[1])
print("Pipeline Stage 1 = ", ps[2])

print("Label Column = ", lr.getLabelCol())

Evaluation
Pipeline Stage 1 =  StringIndexer
Pipeline Stage 1 =  VectorAssembler
Pipeline Stage 1 =  StandardScaler
Label Column =  label


In [13]:
predictions = pipeline_model.transform(test_data)

In [14]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = ", accuracy)

Accuracy =  0.9782608695652174


In [15]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator.evaluate(predictions)

In [16]:
print("Precision = ", precision)

Precision =  0.9804347826086957


In [17]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator.evaluate(predictions)
print("Recall = ", recall)

Recall =  0.9782608695652174


In [18]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(predictions)
print("F1 Score = ", f1_score)

F1 Score =  0.9784581393513769


In [25]:
pipeline_model.write().save("Iris_Logistics_Regression")

                                                                                

In [26]:
loaded_pipeline_model = PipelineModel.load("./Iris_Logistics_Regression/")

In [27]:
predictions = loaded_pipeline_model.transform(test_data)

In [33]:
predictions.select("label", "prediction").show()

+-----+----------+
|label|prediction|
+-----+----------+
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  1.0|       1.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  1.0|       1.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
|  2.0|       2.0|
+-----+----------+
only showing top 20 rows



In [34]:
spark.stop()