### Data Loading

In [1]:
iris_df = spark.read.csv("/home/justin/Downloads/iris.data"
, header=False, inferSchema = True)

In [4]:
iris_df.show()

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|
|         4.8|        3.0|         1.4| 

In [3]:
from pyspark.sql.functions import *

iris_df = iris_df.select(col('_c0').alias('sepal_length'), 
                        col('_c1').alias('sepal_width'),
                        col('_c2').alias('petal_length'),
                        col('_c3').alias('petal_width'),
                        col('_c4').alias('species'))

### create feature column

In [6]:
from pyspark.ml.feature import VectorAssembler

viris_df = VectorAssembler(inputCols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], outputCol = 'features').transform(iris_df)



In [7]:
viris_df.show()

+------------+-----------+------------+-----------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|
+------------+-----------+------------+-----------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|
|         4.9|  

### convert species names into index names label

In [15]:
from pyspark.ml.feature import StringIndexer

iviris_df = StringIndexer(inputCol = 'species', outputCol = 'label').fit(viris_df).transform(viris_df)

In [16]:
iviris_df.show(300)

+------------+-----------+------------+-----------+---------------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|        species|         features|label|
+------------+-----------+------------+-----------+---------------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2|    Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|         4.9|        3.0|         1.4|        0.2|    Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|         4.7|        3.2|         1.3|        0.2|    Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
|         4.6|        3.1|         1.5|        0.2|    Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
|         5.0|        3.6|         1.4|        0.2|    Iris-setosa|[5.0,3.6,1.4,0.2]|  0.0|
|         5.4|        3.9|         1.7|        0.4|    Iris-setosa|[5.4,3.9,1.7,0.4]|  0.0|
|         4.6|        3.4|         1.4|        0.3|    Iris-setosa|[4.6,3.4,1.4,0.3]|  0.0|
|         5.0|        3.4|         1.5|        0.2|    Iris-setosa|[5.0,3.4,1.5,

In [24]:
iviris_df.count()

150

In [113]:
splits = iviris_df.randomSplit([0.6,0.4],1)
train_df = splits[0]
test_df = splits[1]

In [114]:
train_df.show()

+------------+-----------+------------+-----------+---------------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|        species|         features|label|
+------------+-----------+------------+-----------+---------------+-----------------+-----+
|         4.4|        2.9|         1.4|        0.2|    Iris-setosa|[4.4,2.9,1.4,0.2]|  0.0|
|         4.4|        3.0|         1.3|        0.2|    Iris-setosa|[4.4,3.0,1.3,0.2]|  0.0|
|         4.4|        3.2|         1.3|        0.2|    Iris-setosa|[4.4,3.2,1.3,0.2]|  0.0|
|         4.6|        3.2|         1.4|        0.2|    Iris-setosa|[4.6,3.2,1.4,0.2]|  0.0|
|         4.6|        3.4|         1.4|        0.3|    Iris-setosa|[4.6,3.4,1.4,0.3]|  0.0|
|         4.6|        3.6|         1.0|        0.2|    Iris-setosa|[4.6,3.6,1.0,0.2]|  0.0|
|         4.7|        3.2|         1.6|        0.2|    Iris-setosa|[4.7,3.2,1.6,0.2]|  0.0|
|         4.8|        3.0|         1.4|        0.1|    Iris-setosa|[4.8,3.0,1.4,

## Naive Based Classifier

In [115]:
from pyspark.ml.classification import NaiveBayes

In [142]:
nb = NaiveBayes(modelType = 'multinomial')

nbmodel = nb.fit(train_df)

In [143]:
pred_df = nbmodel.transform(test_df)

In [144]:
pred_df.show(5)

+------------+-----------+------------+-----------+-----------+-----------------+-----+--------------------+--------------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|label|       rawPrediction|         probability|prediction|
+------------+-----------+------------+-----------+-----------+-----------------+-----+--------------------+--------------------+----------+
|         4.3|        3.0|         1.1|        0.1|Iris-setosa|[4.3,3.0,1.1,0.1]|  0.0|[-9.9894495209670...|[0.71183175063155...|       0.0|
|         4.5|        2.3|         1.3|        0.3|Iris-setosa|[4.5,2.3,1.3,0.3]|  0.0|[-10.475670388034...|[0.52746779120113...|       0.0|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|[-11.445607205078...|[0.64305489612317...|       0.0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|[-11.238029954327...|[0.69419834250896...|       0.0|
|         4.8

In [145]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

nb_acc = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction', metricName = 'accuracy').evaluate(pred_df)

In [146]:
nb_acc

0.9807692307692307

## Multilayer Perceptron Classification

In [147]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [148]:
mlp = MultilayerPerceptronClassifier(layers = [4, 5, 5, 3], seed = 1)
mlp_model = mlp.fit(train_df)

In [149]:
mlp_pred = mlp_model.transform(test_df)

In [150]:
mlp_acc = MulticlassClassificationEvaluator(metricName = "accuracy").evaluate(mlp_pred)

In [151]:
mlp_acc

0.6923076923076923

## Decision Tree

In [152]:
from pyspark.ml.classification import DecisionTreeClassifier

In [153]:
dt = DecisionTreeClassifier(labelCol = 'label', featuresCol = 'features')
dt_model = dt.fit(train_df)

In [154]:
dt_pred = dt_model.transform(test_df)

In [155]:
dt_acc = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction', metricName = 'accuracy').evaluate(dt_pred)

In [156]:
dt_acc

0.9423076923076923