import libraries

In [11]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext
from pyspark import SparkFiles
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import os
import pandas as pd

In [2]:
# create spark and sql context
sc = SparkContext()
sqlContext = SQLContext(sc)

In [3]:
# load iris data as pandas dataframe and then convert it into spark dataframe
pandas_data = pd.read_csv("iris.csv")
data = sqlContext.createDataFrame(pandas_data)
data.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- variety: string (nullable = true)



In [4]:
data.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [5]:
#See standard parameters
data.describe().show()

+-------+------------------+------------------+------------------+------------------+---------+
|summary|      sepal_length|       sepal_width|      petal_length|       petal_width|  variety|
+-------+------------------+------------------+------------------+------------------+---------+
|  count|               150|               150|               150|               150|      150|
|   mean| 5.843333333333334|3.0573333333333337|3.7580000000000005|1.1993333333333331|     null|
| stddev|0.8280661279778632|0.4358662849366982|1.7652982332594662|0.7622376689603466|     null|
|    min|               4.3|               2.0|               1.0|               0.1|   Setosa|
|    max|               7.9|               4.4|               6.9|               2.5|Virginica|
+-------+------------------+------------------+------------------+------------------+---------+



## Decision Trees Pipeline

In [6]:
# Stage1
# convert the targets column to numerical
stringIndexer = StringIndexer(inputCol="variety", outputCol="num_variety")

# Stage2
# assemble features in one column
vectorAssembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],\
                                  outputCol="features")

# Stage3
# train a DecisionTree model.
dt_classifier = DecisionTreeClassifier(labelCol="num_variety", featuresCol="features")


# split data
(training_data, test_data) = data.randomSplit([0.8, 0.2])

# chain indexers and dt in a Pipeline
dt_pipeline = Pipeline(stages=[stringIndexer, vectorAssembler, dt_classifier])

# train model
dt_model = dt_pipeline.fit(training_data)

# make predictions.
dt_predictions = dt_model.transform(test_data)

# select example rows to display.
dt_predictions.select("prediction", "num_variety", "features").show(5)

# select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="num_variety", predictionCol="prediction", metricName="accuracy")

dt_accuracy = evaluator.evaluate(dt_predictions)
print("Test accuracy = ", dt_accuracy*100, "%")

treeModel = dt_model.stages[2]
# summary only
print(treeModel)


+----------+-----------+-----------------+
|prediction|num_variety|         features|
+----------+-----------+-----------------+
|       0.0|        0.0|[4.9,3.1,1.5,0.1]|
|       0.0|        0.0|[5.1,3.3,1.7,0.5]|
|       0.0|        0.0|[5.2,3.5,1.5,0.2]|
|       0.0|        0.0|[5.5,4.2,1.4,0.2]|
|       0.0|        0.0|[4.5,2.3,1.3,0.3]|
+----------+-----------+-----------------+
only showing top 5 rows

Test accuracy =  90.0 %
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_003f7d819650) of depth 5 with 13 nodes


## Random Forest Pipeline

In [7]:
# Stage3
# train a Random Forest model.
rf_classifier = RandomForestClassifier(labelCol="num_variety", featuresCol="features")


# chain indexers and dt in a Pipeline
rf_pipeline = Pipeline(stages=[stringIndexer, vectorAssembler, rf_classifier])

# train model
rf_model = rf_pipeline.fit(training_data)

# make predictions.
rf_predictions = rf_model.transform(test_data)

# select example rows to display.
rf_predictions.select("prediction", "num_variety", "features").show(5)

# compute accuracy
rf_accuracy = evaluator.evaluate(rf_predictions)
print("Test accuracy = ", rf_accuracy*100, "%")

forestModel = rf_model.stages[2]
# summary only
print(forestModel)


+----------+-----------+-----------------+
|prediction|num_variety|         features|
+----------+-----------+-----------------+
|       0.0|        0.0|[4.9,3.1,1.5,0.1]|
|       0.0|        0.0|[5.1,3.3,1.7,0.5]|
|       0.0|        0.0|[5.2,3.5,1.5,0.2]|
|       0.0|        0.0|[5.5,4.2,1.4,0.2]|
|       0.0|        0.0|[4.5,2.3,1.3,0.3]|
+----------+-----------+-----------------+
only showing top 5 rows

Test accuracy =  96.66666666666667 %
RandomForestClassificationModel (uid=RandomForestClassifier_1503000db8fd) with 20 trees


## Gradient Boosting Tree Classifier

In [13]:
# Stage3
# train a DecisionTree model.
lr_classifier = LogisticRegression(labelCol="num_variety", featuresCol="features", maxIter=10, regParam=0.3, elasticNetParam=0.8)

# chain indexers and gbt in a Pipeline
lr_pipeline = Pipeline(stages=[stringIndexer, vectorAssembler, lr_classifier])

# train model
lr_model = lr_pipeline.fit(training_data)

# make predictions.
lr_predictions = lr_model.transform(test_data)

# select example rows to display.
lr_predictions.select("prediction", "num_variety", "features").show(5)

lr_accuracy = evaluator.evaluate(lr_predictions)
print("Test accuracy = ", lr_accuracy*100, "%")

lrModel = lr_model.stages[2]
# summary only
print(lrModel)


+----------+-----------+-----------------+
|prediction|num_variety|         features|
+----------+-----------+-----------------+
|       0.0|        0.0|[4.9,3.1,1.5,0.1]|
|       0.0|        0.0|[5.1,3.3,1.7,0.5]|
|       0.0|        0.0|[5.2,3.5,1.5,0.2]|
|       0.0|        0.0|[5.5,4.2,1.4,0.2]|
|       0.0|        0.0|[4.5,2.3,1.3,0.3]|
+----------+-----------+-----------------+
only showing top 5 rows

Test accuracy =  60.0 %
LogisticRegressionModel: uid = LogisticRegression_3eb0c583d34a, numClasses = 3, numFeatures = 4


In [None]:
sc.stop()