In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark import SparkFiles
import os

In [2]:
!pip install pandas
import pandas as pd

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [3]:
"""----------------------------------------------------------------------------
CREATE SPARK CONTEXT
CREATE SQL CONTEXT
----------------------------------------------------------------------------"""
sc = SparkContext()
sqlContext = SQLContext(sc)

22/02/19 00:28:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/19 00:28:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
"""----------------------------------------------------------------------------
LOAD IRIS DATA
----------------------------------------------------------------------------"""
data_dir="./data/"
file = os.path.join(data_dir,"iris.csv")
panda_df = pd.read_csv(file)

iris_df=sqlContext.createDataFrame(panda_df)
iris_df.printSchema()

#Add a numeric indexer for the label/target column
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="variety", outputCol="ind_variety")
si_model = stringIndexer.fit(iris_df)
irisNormDf = si_model.transform(iris_df)
irisNormDf.printSchema()
irisNormDf.select("variety","ind_variety").distinct().collect()
#irisNormDf.cache()

"""--------------------------------------------------------------------------
Perform Data Analytics
-------------------------------------------------------------------------"""

#See standard parameters
#irisNormDf.describe().show()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- variety: string (nullable = true)



                                                                                

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- variety: string (nullable = true)
 |-- ind_variety: double (nullable = false)



                                                                                

'--------------------------------------------------------------------------\nPerform Data Analytics\n-------------------------------------------------------------------------'

In [5]:
"""--------------------------------------------------------------------------
Prepare data for ML
-------------------------------------------------------------------------"""

#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

from pyspark.ml.linalg import Vectors
def transformToLabeledPoint(row) :
    lp = ( row["variety"], row["ind_variety"], \
                Vectors.dense([row["sepal_length"],\
                        row["sepal_width"], \
                        row["petal_length"], \
                        row["petal_width"]]))
    return lp

irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
irisLpDf = sqlContext.createDataFrame(irisLp,["species","label", "features"])
irisLpDf.select("species","label","features").show(50)
irisLpDf.cache()

                                                                                

+-------+-----+-----------------+
|species|label|         features|
+-------+-----+-----------------+
| Setosa|  0.0|[5.1,3.5,1.4,0.2]|
| Setosa|  0.0|[4.9,3.0,1.4,0.2]|
| Setosa|  0.0|[4.7,3.2,1.3,0.2]|
| Setosa|  0.0|[4.6,3.1,1.5,0.2]|
| Setosa|  0.0|[5.0,3.6,1.4,0.2]|
| Setosa|  0.0|[5.4,3.9,1.7,0.4]|
| Setosa|  0.0|[4.6,3.4,1.4,0.3]|
| Setosa|  0.0|[5.0,3.4,1.5,0.2]|
| Setosa|  0.0|[4.4,2.9,1.4,0.2]|
| Setosa|  0.0|[4.9,3.1,1.5,0.1]|
| Setosa|  0.0|[5.4,3.7,1.5,0.2]|
| Setosa|  0.0|[4.8,3.4,1.6,0.2]|
| Setosa|  0.0|[4.8,3.0,1.4,0.1]|
| Setosa|  0.0|[4.3,3.0,1.1,0.1]|
| Setosa|  0.0|[5.8,4.0,1.2,0.2]|
| Setosa|  0.0|[5.7,4.4,1.5,0.4]|
| Setosa|  0.0|[5.4,3.9,1.3,0.4]|
| Setosa|  0.0|[5.1,3.5,1.4,0.3]|
| Setosa|  0.0|[5.7,3.8,1.7,0.3]|
| Setosa|  0.0|[5.1,3.8,1.5,0.3]|
| Setosa|  0.0|[5.4,3.4,1.7,0.2]|
| Setosa|  0.0|[5.1,3.7,1.5,0.4]|
| Setosa|  0.0|[4.6,3.6,1.0,0.2]|
| Setosa|  0.0|[5.1,3.3,1.7,0.5]|
| Setosa|  0.0|[4.8,3.4,1.9,0.2]|
| Setosa|  0.0|[5.0,3.0,1.6,0.2]|
| Setosa|  0.0

DataFrame[species: string, label: double, features: vector]

In [6]:
"""--------------------------------------------------------------------------
Perform Machine Learning
-------------------------------------------------------------------------"""

#Split into training and testing data
(trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


#Create the model
dtClassifer = DecisionTreeClassifier(maxDepth=4, labelCol="label",\
                featuresCol="features")
from pyspark.ml import Pipeline
dtpipeline = Pipeline(stages=[dtClassifer])
dtModel = dtpipeline.fit(trainingData)
#dtModel = dtClassifer.fit(trainingData)

#print(dtModel.numNodes)
#print(dtModel.depth)

#Predict on the test data
predictions = dtModel.transform(testData)
predictions.select("prediction","species","label").collect()

#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
evaluator.evaluate(predictions)    

#Draw a confusion matrix
predictions.groupBy("label","prediction").count().show()

                                                                                

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|    6|
|  2.0|       2.0|    3|
|  2.0|       1.0|    1|
|  1.0|       2.0|    1|
|  0.0|       0.0|    4|
+-----+----------+-----+



In [7]:
evaluator.evaluate(predictions)  

0.8666666666666667

In [8]:
"""--------------------------------------------------------------------------
Perform Machine Learning RF
-------------------------------------------------------------------------"""

#Split into training and testing data
(trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
rfClassifer = RandomForestClassifier(maxDepth=4, labelCol="label",\
                featuresCol="features")
from pyspark.ml import Pipeline
rfpipeline = Pipeline(stages=[rfClassifer])
rfModel = rfpipeline.fit(trainingData)
#rfModel = rfClassifer.fit(trainingData)

#print(dtModel.numNodes)
#print(dtModel.depth)

#Predict on the test data
rfpredictions = rfModel.transform(testData)
rfpredictions.select("prediction","species","label").collect()

#Evaluate accuracy
rfevaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
rfevaluator.evaluate(rfpredictions)    

#Draw a confusion matrix
rfpredictions.groupBy("label","prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|    6|
|  2.0|       2.0|    4|
|  0.0|       0.0|    6|
+-----+----------+-----+



In [9]:
rfevaluator.evaluate(rfpredictions)

1.0

In [10]:
"""--------------------------------------------------------------------------
Perform Machine Learning GB
-------------------------------------------------------------------------"""

from pyspark.sql.functions import when, lit
irisLpDfBis = irisLpDf
irisLpDfBis = irisLpDfBis.withColumn('label', when(irisLpDfBis.label==2.0, 
lit(1.0)).otherwise(irisLpDfBis.label))
#irisLpDfBis.select("species","label","features").show(500)
#irisLpDfBisB = irisLpDfBis.withColumn('label', when(irisLpDfBis.label==2.0, 
#lit(0.0)).otherwise(irisLpDfBis.label))
#irisLpDfBisC = irisLpDfBis.withColumn('label', when(irisLpDfBis.label==0.0, 
#lit(1.0)).otherwise(irisLpDfBis.label))

#Split into training and testing data
(trainingDataBis, testDataBis) = irisLpDfBis.randomSplit([0.9, 0.1])
trainingDataBis.count()
testDataBis.count()
testDataBis.collect()

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
gbtClassifer = GBTClassifier(maxDepth=4, labelCol="label",\
                featuresCol="features")
from pyspark.ml import Pipeline
gbtpipeline = Pipeline(stages=[gbtClassifer])
gbtModel = gbtpipeline.fit(trainingDataBis)
#gbtModel = gbtClassifer.fit(trainingData)

#print(dtModel.numNodes)
#print(dtModel.depth)

#Predict on the test data
gbtpredictions = gbtModel.transform(testDataBis)
gbtpredictions.select("prediction","species","label").collect()

#Evaluate accuracy
gbtevaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
gbtevaluator.evaluate(gbtpredictions)    

#Draw a confusion matrix
gbtpredictions.groupBy("label","prediction").count().show()

22/02/19 00:28:50 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/02/19 00:28:50 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|    9|
|  0.0|       0.0|    6|
+-----+----------+-----+



In [14]:
gbtevaluator.evaluate(gbtpredictions)

1.0

In [12]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import OneVsRest

#Create the model
gbtClassiferBis = GBTClassifier(maxDepth=4, labelCol="label",\
                featuresCol="features")
ovr = OneVsRest(classifier=gbtClassiferBis)
ovr.setPredictionCol("prediction")
gbtModelBis = ovr.fit(irisLpDf)

In [13]:
#Predict on the test data
gbtpredictionsBis = gbtModelBis.transform(testData)
gbtpredictionsBis.select("prediction","species","label").collect()

#Evaluate accuracy
#gbtevaluatorBis = MulticlassClassificationEvaluator(predictionCol="prediction", \
#                    labelCol="label",metricName="accuracy")
gbtevaluator.evaluate(gbtpredictionsBis)

#Draw a confusion matrix
gbtpredictionsBis.groupBy("label","prediction").count().show()

                                                                                

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|    6|
|  2.0|       2.0|    4|
|  0.0|       0.0|    6|
+-----+----------+-----+



In [15]:
gbtevaluator.evaluate(gbtpredictionsBis)

                                                                                

1.0

In [18]:
print("GBT accuracy: ", gbtevaluator.evaluate(gbtpredictionsBis))
print("RF accuracy:  ", rfevaluator.evaluate(rfpredictions))
print("DT accuracy:  ", evaluator.evaluate(predictions))

                                                                                

GBT accuracy:  1.0
RF accuracy:   1.0
DT accuracy:   0.8666666666666667


22/02/19 03:27:49 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 7843331 ms exceeds timeout 120000 ms
22/02/19 03:27:49 WARN SparkContext: Killing executors is not supported by current scheduler.


from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[gbtClassifer])
  
model = pipeline.fit(trainingData)
gbt2prediction = model.transform(trainingData)
gbt2prediction.printSchema()

gbt2evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
gbt2evaluator.evaluate(gbt2predictions) 

from pyspark.sql.functions import when, lit
irisLpDfBis = irisLpDf
irisLpDfBis = irisLpDfBis.withColumn('label', when(irisLpDfBis.label==2.0, 
lit('1.0')).otherwise(irisLpDfBis.label))
irisLpDfBis.select("species","label","features").show(500)