In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark import SparkFiles
import os
import pandas as pd

In [2]:
"""----------------------------------------------------------------------------
CREATE SPARK CONTEXT
CREATE SQL CONTEXT
----------------------------------------------------------------------------"""
sc =SparkContext()
sqlContext = SQLContext(sc)

22/01/29 12:59:41 WARN Utils: Your hostname, TABLET-UD6BNBK5 resolves to a loopback address: 127.0.1.1; using 172.30.100.210 instead (on interface eth0)
22/01/29 12:59:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/01/29 12:59:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
"""----------------------------------------------------------------------------
LOAD IRIS DATA
----------------------------------------------------------------------------"""
data_dir=""
file = os.path.join(data_dir,"iris.csv")
panda_df = pd.read_csv(file)

iris_df=sqlContext.createDataFrame(panda_df)
iris_df.printSchema()

#Add a numeric indexer for the label/target column
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="variety", outputCol="ind_variety")
si_model = stringIndexer.fit(iris_df)
irisNormDf = si_model.transform(iris_df)
irisNormDf.printSchema()
irisNormDf.select("variety","ind_variety").distinct().collect()
#irisNormDf.cache()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- variety: string (nullable = true)



                                                                                

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- variety: string (nullable = true)
 |-- ind_variety: double (nullable = false)



                                                                                

[Row(variety='Virginica', ind_variety=2.0),
 Row(variety='Versicolor', ind_variety=1.0),
 Row(variety='Setosa', ind_variety=0.0)]

In [4]:
"""--------------------------------------------------------------------------
Perform Data Analytics
-------------------------------------------------------------------------"""

#See standard parameters
irisNormDf.describe().show()

+-------+------------------+------------------+------------------+------------------+---------+------------------+
|summary|      sepal_length|       sepal_width|      petal_length|       petal_width|  variety|       ind_variety|
+-------+------------------+------------------+------------------+------------------+---------+------------------+
|  count|               150|               150|               150|               150|      150|               150|
|   mean| 5.843333333333334|3.0573333333333332|3.7580000000000005|1.1993333333333331|     null|               1.0|
| stddev|0.8280661279778632|0.4358662849366984| 1.765298233259466|0.7622376689603464|     null|0.8192319205190405|
|    min|               4.3|               2.0|               1.0|               0.1|   Setosa|               0.0|
|    max|               7.9|               4.4|               6.9|               2.5|Virginica|               2.0|
+-------+------------------+------------------+------------------+--------------

In [5]:
"""--------------------------------------------------------------------------
Prepare data for ML
-------------------------------------------------------------------------"""

#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

from pyspark.ml.linalg import Vectors
def transformToLabeledPoint(row) :
    lp = ( row["variety"], row["ind_variety"], \
                Vectors.dense([row["sepal_length"],\
                        row["sepal_width"], \
                        row["petal_length"], \
                        row["petal_width"]]))
    return lp

irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
irisLpDf = sqlContext.createDataFrame(irisLp,["species","label", "features"])
irisLpDf.select("species","label","features").show(50)
irisLpDf.cache()

+-------+-----+-----------------+
|species|label|         features|
+-------+-----+-----------------+
| Setosa|  0.0|[5.1,3.5,1.4,0.2]|
| Setosa|  0.0|[4.9,3.0,1.4,0.2]|
| Setosa|  0.0|[4.7,3.2,1.3,0.2]|
| Setosa|  0.0|[4.6,3.1,1.5,0.2]|
| Setosa|  0.0|[5.0,3.6,1.4,0.2]|
| Setosa|  0.0|[5.4,3.9,1.7,0.4]|
| Setosa|  0.0|[4.6,3.4,1.4,0.3]|
| Setosa|  0.0|[5.0,3.4,1.5,0.2]|
| Setosa|  0.0|[4.4,2.9,1.4,0.2]|
| Setosa|  0.0|[4.9,3.1,1.5,0.1]|
| Setosa|  0.0|[5.4,3.7,1.5,0.2]|
| Setosa|  0.0|[4.8,3.4,1.6,0.2]|
| Setosa|  0.0|[4.8,3.0,1.4,0.1]|
| Setosa|  0.0|[4.3,3.0,1.1,0.1]|
| Setosa|  0.0|[5.8,4.0,1.2,0.2]|
| Setosa|  0.0|[5.7,4.4,1.5,0.4]|
| Setosa|  0.0|[5.4,3.9,1.3,0.4]|
| Setosa|  0.0|[5.1,3.5,1.4,0.3]|
| Setosa|  0.0|[5.7,3.8,1.7,0.3]|
| Setosa|  0.0|[5.1,3.8,1.5,0.3]|
| Setosa|  0.0|[5.4,3.4,1.7,0.2]|
| Setosa|  0.0|[5.1,3.7,1.5,0.4]|
| Setosa|  0.0|[4.6,3.6,1.0,0.2]|
| Setosa|  0.0|[5.1,3.3,1.7,0.5]|
| Setosa|  0.0|[4.8,3.4,1.9,0.2]|
| Setosa|  0.0|[5.0,3.0,1.6,0.2]|
| Setosa|  0.0

DataFrame[species: string, label: double, features: vector]

In [6]:
"""--------------------------------------------------------------------------
Perform Machine Learning
-------------------------------------------------------------------------"""

#Split into training and testing data
(trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

                                                                                

[Row(species='Setosa', label=0.0, features=DenseVector([4.8, 3.4, 1.9, 0.2])),
 Row(species='Setosa', label=0.0, features=DenseVector([5.2, 3.5, 1.5, 0.2])),
 Row(species='Setosa', label=0.0, features=DenseVector([4.8, 3.0, 1.4, 0.3])),
 Row(species='Setosa', label=0.0, features=DenseVector([4.9, 3.6, 1.4, 0.1])),
 Row(species='Setosa', label=0.0, features=DenseVector([5.1, 3.8, 1.6, 0.2])),
 Row(species='Setosa', label=0.0, features=DenseVector([5.1, 3.8, 1.9, 0.4])),
 Row(species='Setosa', label=0.0, features=DenseVector([5.3, 3.7, 1.5, 0.2])),
 Row(species='Versicolor', label=1.0, features=DenseVector([7.0, 3.2, 4.7, 1.4])),
 Row(species='Versicolor', label=1.0, features=DenseVector([5.2, 2.7, 3.9, 1.4])),
 Row(species='Versicolor', label=1.0, features=DenseVector([5.7, 2.8, 4.5, 1.3])),
 Row(species='Versicolor', label=1.0, features=DenseVector([6.6, 2.9, 4.6, 1.3])),
 Row(species='Versicolor', label=1.0, features=DenseVector([6.8, 2.8, 4.8, 1.4])),
 Row(species='Versicolor', label

In [17]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
dtClassifer = DecisionTreeClassifier(maxDepth=4, labelCol="label",\
                featuresCol="features")
dtModel = dtClassifer.fit(trainingData)

rfClasifier = RandomForestClassifier(maxDepth=4, labelCol="label",\
                featuresCol="features")
rfModel = rfClasifier.fit(trainingData)

# gbtClassifier = GBTClassifier(maxDepth=4, labelCol="label",\
#                 featuresCol="features")
# gbtModel = gbtClassifier.fit(trainingData)

# print(rfModel.numNodes)
# print(rfModel.depth)

In [18]:
#Predict on the test data
predictions = dtModel.transform(testData)
predictions.select("prediction","species","label").collect()

predictions_rf = rfModel.transform(testData)
predictions_rf.select("prediction","species","label").collect()

[Row(prediction=0.0, species='Setosa', label=0.0),
 Row(prediction=0.0, species='Setosa', label=0.0),
 Row(prediction=0.0, species='Setosa', label=0.0),
 Row(prediction=0.0, species='Setosa', label=0.0),
 Row(prediction=0.0, species='Setosa', label=0.0),
 Row(prediction=0.0, species='Setosa', label=0.0),
 Row(prediction=0.0, species='Setosa', label=0.0),
 Row(prediction=1.0, species='Versicolor', label=1.0),
 Row(prediction=1.0, species='Versicolor', label=1.0),
 Row(prediction=1.0, species='Versicolor', label=1.0),
 Row(prediction=1.0, species='Versicolor', label=1.0),
 Row(prediction=1.0, species='Versicolor', label=1.0),
 Row(prediction=1.0, species='Versicolor', label=1.0),
 Row(prediction=2.0, species='Virginica', label=2.0),
 Row(prediction=2.0, species='Virginica', label=2.0),
 Row(prediction=2.0, species='Virginica', label=2.0)]

# Decision Tree

In [21]:
#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
evaluator.evaluate(predictions)    

#Draw a confusion matrix
# predictions.groupBy("label","prediction").count().show()

Exception ignored in: <function JavaWrapper.__del__ at 0x7f986d021d30>
Traceback (most recent call last):
  File "/home/aympab/local/anaconda3/envs/hadoop-spark/lib/python3.9/site-packages/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'RandomForestClassifier' object has no attribute '_java_obj'


1.0

# Random Forest

In [20]:
#Evaluate accuracy
evaluator_rf = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
evaluator_rf.evaluate(predictions_rf)    

#Draw a confusion matrix
# predictions_rf.groupBy("label","prediction").count().show()

1.0