In [None]:
#Load the CSV file into a RDD
# `sc` is not defined in this notebook; it is assumed to be the SparkContext
# supplied by the notebook environment -- TODO confirm.
irisData = sc.textFile("data/iris.csv")
# Mark the raw-lines RDD for caching so the cells below can reuse it
irisData.persist()

In [None]:
# Drop the header row: keep only the lines in which the text "Sepal"
# does not occur anywhere.
dataLines = irisData.filter(lambda line: line.find("Sepal") == -1)
# Number of remaining data lines (shown as the cell output)
dataLines.count()

In [None]:
import math
from pyspark.mllib.linalg import Vectors

In [None]:
def transformToNumeric(inputStr):
    """Convert one comma-separated data line into a dense numeric vector.

    The fifth field (index 4) is the species name and is encoded as
    1.0 (default, setosa), 2.0 (versicolor) or 3.0 (virginica). The first
    four fields are parsed as floats.

    Returns a dense vector laid out as
    [species code, field 0, field 1, field 2, field 3].
    Raises IndexError when the line has fewer than five fields and
    ValueError when one of the first four fields is not a number.
    """
    fields = inputStr.split(",")

    # Any species name not listed here falls back to 1.0 (setosa)
    speciesCodes = {"versicolor": 2.0, "virginica": 3.0}
    speciesValue = speciesCodes.get(fields[4], 1.0)

    # The four measurement columns, in their original order
    measurements = [float(field) for field in fields[:4]]

    return Vectors.dense([speciesValue] + measurements)

In [None]:
#Change to a Vector
# Each data line becomes one dense vector: [species code, four measurements]
irisVectors = dataLines.map(transformToNumeric)
# Bring every vector back to the driver so the cell displays them
irisVectors.collect()

In [None]:
#Perform statistical Analysis
from pyspark.mllib.stat import Statistics
# Column-wise summary statistics over the vectors; column 0 is the numeric
# species code produced by transformToNumeric, columns 1-4 the measurements
irisStats=Statistics.colStats(irisVectors)
# Mean of each column
irisStats.mean()

In [None]:
# Variance of each column of irisVectors
irisStats.variance()

In [None]:
# Minimum of each column of irisVectors
irisStats.min()

In [None]:
# Maximum of each column of irisVectors
irisStats.max()

In [None]:
# Correlation matrix across all columns of irisVectors (species code included);
# no method argument is passed, so the library default is used
Statistics.corr(irisVectors)

In [None]:
#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

from pyspark.sql import SQLContext
# SQL entry point wrapping the existing SparkContext `sc`; used below to
# build DataFrames
sqlContext = SQLContext(sc)

In [None]:
def transformToLabeledPoint(inStr):
    """Convert one comma-separated data line into a (label, features) pair.

    Despite the name, this returns a plain tuple rather than a LabeledPoint:
    the first element is the species name (field 4) as a string, the second
    a dense vector built from fields 0, 2 and 3. Field 1 is left out.

    The feature fields are converted with float() explicitly, matching
    transformToNumeric, instead of relying on the vector constructor to
    coerce strings.

    Raises IndexError when the line has fewer than five fields and
    ValueError when a selected feature field is not a number.
    """
    attList = inStr.split(",")
    # Indices of the columns kept as features; column 1 is dropped
    featureValues = [float(attList[idx]) for idx in (0, 2, 3)]
    lp = (attList[4], Vectors.dense(featureValues))
    return lp

In [None]:
# Parse every data line into a (label, features) pair ...
irisLp = dataLines.map(transformToLabeledPoint)
# ... and wrap the pairs in a DataFrame with named columns
irisDF = sqlContext.createDataFrame(irisLp, schema=["label", "features"])
# Preview the first 10 rows
irisDF.select("label", "features").show(n=10)

In [None]:
#Find correlations between the species and each feature column
# Statistics.corr(x, y) needs two numeric RDDs, but the "label" column holds
# the species name as a string. Encode it with the same codes used by
# transformToNumeric (anything unrecognised falls back to 1.0, as it does there).
speciesCodes = {"setosa": 1.0, "versicolor": 2.0, "virginica": 3.0}

# Work on the DataFrame's underlying RDD of Rows rather than calling map()
# on the DataFrame itself.
irisRows = irisDF.rdd
numFeatures = irisDF.take(1)[0].features.size
labelRDD = irisRows.map(lambda lp: speciesCodes.get(lp.label, 1.0))
for i in range(numFeatures):
    # Bind i as a default argument so the lambda keeps this iteration's
    # index instead of looking up the loop variable when it finally runs.
    featureRDD = irisRows.map(lambda lp, i=i: float(lp.features[i]))
    corr = Statistics.corr(labelRDD, featureRDD, 'pearson')
    print('%d\t%g' % (i, corr))

In [None]:
#Indexing needed as pre-req for Decision Trees
from pyspark.ml.feature import StringIndexer
# Indexer that reads the string column "label" and writes a numeric column "indexed"
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
# Fit the indexer on the labels present in irisDF
si_model = stringIndexer.fit(irisDF)
# td = irisDF plus the new "indexed" column
td = si_model.transform(irisDF)
# Bring every row to the driver so the cell displays them
td.collect()

In [None]:
#Split into training and testing data
# A fixed seed makes the 90/10 split, and therefore every number computed
# from it below, reproducible across kernel restarts.
(trainingData, testData) = td.randomSplit([0.9, 0.1], seed=42)
# Only the last expression of a cell is displayed, so print the sizes
# explicitly instead of leaving them as bare expressions.
print("training rows: %d" % trainingData.count())
print("test rows: %d" % testData.count())
# Show the held-out rows as the cell output
testData.collect()

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Create the model: a decision tree capped at depth 2 that learns the
# numeric "indexed" label column.
dtClassifer = DecisionTreeClassifier(
    labelCol="indexed",
    maxDepth=2
)
# Train on the training split only
dtModel = dtClassifer.fit(trainingData)

# Number of nodes in the fitted tree (shown as the cell output)
dtModel.numNodes

In [None]:
# Depth of the fitted tree (the classifier was configured with maxDepth=2)
dtModel.depth

In [None]:
#Predict on the test data
# Score the held-out split: evaluating on the rows the tree was trained on
# would overstate how well the model generalises.
predictions = dtModel.transform(testData)
# Show predicted index next to the true index, label and features
# (a bare collect() in the middle of a cell would be computed and discarded).
predictions.select("prediction","indexed","label","features").show()
# NOTE(review): "precision" is kept as in the original; confirm it is a
# supported metricName for the Spark version this notebook runs on.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="indexed",
                                              metricName="precision")
# Last expression of the cell: the metric value is the cell output
evaluator.evaluate(predictions)

In [None]:
# Draw a confusion matrix
# Lookup table pairing each numeric index with its species label,
# pulled to the driver as a pandas DataFrame
indexedLabels = predictions.select("indexed", "label").distinct()
labelList = indexedLabels.toPandas()
# Row counts per (actual index, predicted index) combination
confusionCounts = predictions.groupBy("indexed", "prediction").count()
confusionCounts.show()