import libraries

In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext
from pyspark import SparkFiles
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import os
import pandas as pd

In [2]:
# create spark and sql context
#sc.stop()
sc = SparkContext()
sqlContext = SQLContext(sc)

In [3]:
# load iris data as pandas dataframe and then convert it into spark dataframe
pandas_data = pd.read_csv("iris.csv")
data = sqlContext.createDataFrame(pandas_data)
data.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- variety: string (nullable = true)



In [4]:
data.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [5]:
#See standard parameters
data.describe().show()

+-------+------------------+------------------+------------------+------------------+---------+
|summary|      sepal_length|       sepal_width|      petal_length|       petal_width|  variety|
+-------+------------------+------------------+------------------+------------------+---------+
|  count|               150|               150|               150|               150|      150|
|   mean| 5.843333333333334|3.0573333333333337|3.7580000000000005|1.1993333333333331|     null|
| stddev|0.8280661279778632|0.4358662849366982|1.7652982332594662|0.7622376689603466|     null|
|    min|               4.3|               2.0|               1.0|               0.1|   Setosa|
|    max|               7.9|               4.4|               6.9|               2.5|Virginica|
+-------+------------------+------------------+------------------+------------------+---------+



## Decision Trees Pipeline

In [6]:
# Stage1
# convert the targets column to numerical
stringIndexer = StringIndexer(inputCol="variety", outputCol="num_variety")

# Stage2
# assemble features in one column
vectorAssembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],\
                                  outputCol="features")

# Stage3
# train a DecisionTree model.
dt_classifier = DecisionTreeClassifier(labelCol="num_variety", featuresCol="features")


# split data
(training_data, test_data) = data.randomSplit([0.8, 0.2])

# chain indexers and dt in a Pipeline
dt_pipeline = Pipeline(stages=[stringIndexer, vectorAssembler, dt_classifier])

# train model
dt_model = dt_pipeline.fit(training_data)

# make predictions.
dt_predictions = dt_model.transform(test_data)

# select example rows to display.
dt_predictions.select("prediction", "num_variety", "features").show(5)

# select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="num_variety", predictionCol="prediction", metricName="accuracy")

dt_accuracy = evaluator.evaluate(dt_predictions)
print("Test accuracy = ", dt_accuracy*100, "%")

treeModel = dt_model.stages[2]
# summary only
print(treeModel)


+----------+-----------+-----------------+
|prediction|num_variety|         features|
+----------+-----------+-----------------+
|       0.0|        0.0|[4.6,3.1,1.5,0.2]|
|       0.0|        0.0|[5.0,3.6,1.4,0.2]|
|       0.0|        0.0|[5.0,3.4,1.5,0.2]|
|       0.0|        0.0|[5.1,3.3,1.7,0.5]|
|       0.0|        0.0|[5.0,3.2,1.2,0.2]|
+----------+-----------+-----------------+
only showing top 5 rows

Test accuracy =  88.88888888888889 %
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_a259dabd8a70) of depth 5 with 15 nodes


## Random Forest Pipeline

In [7]:
# Stage3
# train a Random Forest model.
rf_classifier = RandomForestClassifier(labelCol="num_variety", featuresCol="features")


# chain indexers and dt in a Pipeline
rf_pipeline = Pipeline(stages=[stringIndexer, vectorAssembler, rf_classifier])

# train model
rf_model = rf_pipeline.fit(training_data)

# make predictions.
rf_predictions = rf_model.transform(test_data)

# select example rows to display.
rf_predictions.select("prediction", "num_variety", "features").show(5)

# compute accuracy
rf_accuracy = evaluator.evaluate(rf_predictions)
print("Test accuracy = ", rf_accuracy*100, "%")

forestModel = rf_model.stages[2]
# summary only
print(forestModel)


+----------+-----------+-----------------+
|prediction|num_variety|         features|
+----------+-----------+-----------------+
|       0.0|        0.0|[4.6,3.1,1.5,0.2]|
|       0.0|        0.0|[5.0,3.6,1.4,0.2]|
|       0.0|        0.0|[5.0,3.4,1.5,0.2]|
|       0.0|        0.0|[5.1,3.3,1.7,0.5]|
|       0.0|        0.0|[5.0,3.2,1.2,0.2]|
+----------+-----------+-----------------+
only showing top 5 rows

Test accuracy =  88.88888888888889 %
RandomForestClassificationModel (uid=RandomForestClassifier_2d1d080238c3) with 20 trees


## Gradient Boosting Tree Classifier

In [8]:
# Stage3
# train a DecisionTree model.
gbt_classifier = GBTClassifier(labelCol="num_variety", featuresCol="features", maxIter=10)

# chain indexers and gbt in a Pipeline
gbt_pipeline = Pipeline(stages=[stringIndexer, vectorAssembler, gbt_classifier])

# train model
gbt_model = gbt_pipeline.fit(training_data)

# make predictions.
gbt_predictions = gbt_model.transform(test_data)

# select example rows to display.
gbt_predictions.select("prediction", "num_variety", "features").show(5)

gbt_accuracy = evaluator.evaluate(gbt_predictions)
print("Test accuracy = ", gbt_accuracy*100, "%")

gbtModel = gbt_model.stages[2]
# summary only
print(gbtModel)


Py4JJavaError: An error occurred while calling o470.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 18 in stage 54.0 failed 1 times, most recent failure: Lost task 18.0 in stage 54.0 (TID 983, localhost, executor driver): java.lang.IllegalArgumentException: requirement failed: GBTClassifier was given dataset with invalid label 2.0.  Labels must be in {0,1}; note that GBTClassifier currently only supports binary classification.
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1$$anonfun$1$$anonfun$apply$1.apply(GBTClassifier.scala:167)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1$$anonfun$1$$anonfun$apply$1.apply(GBTClassifier.scala:165)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:222)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:357)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:308)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1213)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:118)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:106)
	at org.apache.spark.ml.regression.DecisionTreeRegressor$$anonfun$train$2.apply(DecisionTreeRegressor.scala:129)
	at org.apache.spark.ml.regression.DecisionTreeRegressor$$anonfun$train$2.apply(DecisionTreeRegressor.scala:124)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegressor.scala:124)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.boost(GradientBoostedTrees.scala:297)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.run(GradientBoostedTrees.scala:55)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1.apply(GBTClassifier.scala:206)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1.apply(GBTClassifier.scala:156)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:156)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:58)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalArgumentException: requirement failed: GBTClassifier was given dataset with invalid label 2.0.  Labels must be in {0,1}; note that GBTClassifier currently only supports binary classification.
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1$$anonfun$1$$anonfun$apply$1.apply(GBTClassifier.scala:167)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1$$anonfun$1$$anonfun$apply$1.apply(GBTClassifier.scala:165)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:222)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:357)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:308)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
