In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext
from pyspark import SparkFiles
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import os
import pandas as pd


In [2]:
#######################
##CREATE SPARK CONTEXT#
##CREATE SQL CONTEXT###
#######################
sc =SparkContext()
sqlContext = SQLContext(sc)


In [3]:
#LOAD IRIS DATA

data_dir="."
file = os.path.join(data_dir,"iris.csv")
panda_df = pd.read_csv(file)

iris_df=sqlContext.createDataFrame(panda_df)
iris_df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- variety: string (nullable = true)



#### Perform Data Analytics

In [4]:
#See standard parameters
iris_df.describe().show()

+-------+------------------+------------------+------------------+------------------+---------+
|summary|      sepal_length|       sepal_width|      petal_length|       petal_width|  variety|
+-------+------------------+------------------+------------------+------------------+---------+
|  count|               150|               150|               150|               150|      150|
|   mean| 5.843333333333334|3.0573333333333337|3.7580000000000005|1.1993333333333331|     null|
| stddev|0.8280661279778632|0.4358662849366982|1.7652982332594662|0.7622376689603466|     null|
|    min|               4.3|               2.0|               1.0|               0.1|   Setosa|
|    max|               7.9|               4.4|               6.9|               2.5|Virginica|
+-------+------------------+------------------+------------------+------------------+---------+



### Decision tree Classifier

#### Prepare data for ML

In [5]:
#Transform to a Data Frame for input to Machine Learing

#stage 1: change the categorical feature to numerical one 
stringIndexer = StringIndexer(inputCol="variety", outputCol="label")


#stage 2: generate one column "features" for all the features 
vectorAssembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],\
                                  outputCol="features")

#stage 3: create the model 
classifier_dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

#regroup the stages in a pipeline
pipeline_dt = Pipeline(stages=[stringIndexer, vectorAssembler, classifier_dt])

# split data
(data_train, data_test) = iris_df.randomSplit([0.75, 0.25])

#### Perform Machine Learning

In [6]:
# apply the pipeline to apply the transformation and train the model 
model_dt = pipeline_dt.fit(data_train)

# make predictions by transforming the features dataframe into predictions dataframe
predictions_dt = model_dt.transform(data_test)

print("nb nodes in model dt: ",model_dt.stages[-1].numNodes)
print("depth of model dt: ",model_dt.stages[-1].depth)

predictions_dt.select("prediction","label", "features").show()

#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
accuracy_dt = evaluator.evaluate(predictions_dt)    

print("Test accuracy = ", accuracy_dt*100, "%")

#Draw a confusion matrix
predictions_dt.groupBy("label","prediction").count().show()

nb nodes in model dt:  11
depth of model dt:  4
+----------+-----+-----------------+
|prediction|label|         features|
+----------+-----+-----------------+
|       1.0|  1.0|[4.7,3.2,1.3,0.2]|
|       1.0|  1.0|[4.9,3.0,1.4,0.2]|
|       1.0|  1.0|[5.4,3.9,1.7,0.4]|
|       1.0|  1.0|[4.8,3.4,1.6,0.2]|
|       1.0|  1.0|[4.3,3.0,1.1,0.1]|
|       1.0|  1.0|[5.7,4.4,1.5,0.4]|
|       1.0|  1.0|[5.8,4.0,1.2,0.2]|
|       1.0|  1.0|[5.1,3.3,1.7,0.5]|
|       1.0|  1.0|[5.1,3.8,1.5,0.3]|
|       1.0|  1.0|[5.4,3.4,1.7,0.2]|
|       1.0|  1.0|[4.5,2.3,1.3,0.3]|
|       1.0|  1.0|[5.0,3.5,1.6,0.6]|
|       1.0|  1.0|[5.1,3.8,1.9,0.4]|
|       2.0|  2.0|[6.6,2.9,4.6,1.3]|
|       2.0|  2.0|[5.6,2.5,3.9,1.1]|
|       2.0|  2.0|[5.6,3.0,4.5,1.5]|
|       0.0|  2.0|[5.9,3.2,4.8,1.8]|
|       2.0|  2.0|[6.1,2.8,4.0,1.3]|
|       2.0|  2.0|[6.3,2.5,4.9,1.5]|
|       2.0|  2.0|[6.4,2.9,4.3,1.3]|
+----------+-----+-----------------+
only showing top 20 rows

Test accuracy =  94.5945945945946 %
+-

### Random Forest Classifier

#### Prepare data for ML

In [7]:
#Transform to a Data Frame for input to Machine Learing

#stage 1: as before
#stage 2: as before 
                                  
#stage 3: create the model 
classifier_rf = RandomForestClassifier(labelCol="label", featuresCol="features")

#regroup the stages in a pipeline
pipeline_rf = Pipeline(stages=[stringIndexer, vectorAssembler, classifier_rf])

# split data
#(data_train, data_test) = iris_df.randomSplit([0.75, 0.25])

#### Perform Machine Learning

In [8]:
# apply the pipeline to apply the transformation and train the model 
model_rf = pipeline_rf.fit(data_train)

# make predictions by transforming the features dataframe into predictions dataframe
predictions_rf = model_rf.transform(data_test)

print("model info : ",model_rf.stages[-1])

predictions_rf.select("prediction","label", "features").show()

#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
accuracy_rf = evaluator.evaluate(predictions_rf)    

print("Test accuracy = ", accuracy_rf*100, "%")

#Draw a confusion matrix
predictions_rf.groupBy("label","prediction").count().show()

Py4JJavaError: An error occurred while calling o281.getParam.
: java.util.NoSuchElementException: Param bootstrap does not exist.
	at org.apache.spark.ml.param.Params.$anonfun$getParam$2(params.scala:729)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.ml.param.Params.getParam(params.scala:729)
	at org.apache.spark.ml.param.Params.getParam$(params.scala:727)
	at org.apache.spark.ml.PipelineStage.getParam(Pipeline.scala:43)
	at sun.reflect.GeneratedMethodAccessor186.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


### LogisticRegression Classifier 

Unfortunatly Pyspark does not handle multi class problems for the gradient boosting. I'll use the logistic regression instead.

#### Prepare data for ML

In [12]:
#Transform to a Data Frame for input to Machine Learing

#stage 1: as before
#stage 2: as before 
                                  
#stage 3: create the model 
classifier_gb = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.3, elasticNetParam=0.8)

#regroup the stages in a pipeline
pipeline_gb = Pipeline(stages=[stringIndexer, vectorAssembler, classifier_gb])

# split data
#(data_train, data_test) = iris_df.randomSplit([0.75, 0.25])

#### Perform Machine Learning

In [14]:
# apply the pipeline to apply the transformation and train the model 
model_gb = pipeline_gb.fit(data_train)

# make predictions by transforming the features dataframe into predictions dataframe
predictions_gb = model_gb.transform(data_test)

print("model info : ",model_gb.stages[-1])

predictions_gb.select("prediction","label", "features").show()

#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
accuracy_gb = evaluator.evaluate(predictions_gb)    

print("Test accuracy = ", accuracy_gb*100, "%")

#Draw a confusion matrix
predictions_gb.groupBy("label","prediction").count().show()

model info :  LogisticRegressionModel: uid=LogisticRegression_b6573008bbf6, numClasses=3, numFeatures=4
+----------+-----+-----------------+
|prediction|label|         features|
+----------+-----+-----------------+
|       1.0|  1.0|[4.7,3.2,1.3,0.2]|
|       1.0|  1.0|[4.9,3.0,1.4,0.2]|
|       1.0|  1.0|[5.4,3.9,1.7,0.4]|
|       1.0|  1.0|[4.8,3.4,1.6,0.2]|
|       1.0|  1.0|[4.3,3.0,1.1,0.1]|
|       1.0|  1.0|[5.7,4.4,1.5,0.4]|
|       1.0|  1.0|[5.8,4.0,1.2,0.2]|
|       1.0|  1.0|[5.1,3.3,1.7,0.5]|
|       1.0|  1.0|[5.1,3.8,1.5,0.3]|
|       1.0|  1.0|[5.4,3.4,1.7,0.2]|
|       1.0|  1.0|[4.5,2.3,1.3,0.3]|
|       1.0|  1.0|[5.0,3.5,1.6,0.6]|
|       1.0|  1.0|[5.1,3.8,1.9,0.4]|
|       0.0|  2.0|[6.6,2.9,4.6,1.3]|
|       0.0|  2.0|[5.6,2.5,3.9,1.1]|
|       0.0|  2.0|[5.6,3.0,4.5,1.5]|
|       0.0|  2.0|[5.9,3.2,4.8,1.8]|
|       0.0|  2.0|[6.1,2.8,4.0,1.3]|
|       0.0|  2.0|[6.3,2.5,4.9,1.5]|
|       0.0|  2.0|[6.4,2.9,4.3,1.3]|
+----------+-----+-----------------+
only sho