In [1]:
# Always needs to be done in Rasberry Pi
import findspark
findspark.init('/home/baxman/spark-2.4.7-bin-hadoop2.7')
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('trees').getOrCreate()

In [5]:
# Import Classification from MLlib
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)

In [6]:
# Import Pipeline
from pyspark.ml import Pipeline

In [7]:
# Import data
data = spark.read.format('libsvm').load('/home/baxman/Codes/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/sample_libsvm_data.txt')

In [8]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [10]:
# Split data
training,test = data.randomSplit([0.7,0.3])

In [11]:
# Define three classifiers
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
gbt = GBTClassifier()

In [12]:
# Fit models
dtc_model = dtc.fit(training)
rfc_model = rfc.fit(training)
gbt_model = gbt.fit(training)

In [13]:
# Get predictions from test data
dtc_predictions = dtc_model.transform(test)
rfc_predictions = rfc_model.transform(test)
gbt_predictions = gbt_model.transform(test)

In [14]:
# Show predictions
dtc_predictions.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[121,122,123...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[154,155,156...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[154,155,156...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[155,156,180...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[234,235,237...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(69

In [15]:
gbt_predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[121,122,123...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[123,124,125...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127,128,129...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127,128,129...|[1.43551560054612...|[0.94639568583144...|       0.0|
|  0.0|(692,[152,153,154...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[152,153,154...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[154,155,156...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[154

In [18]:
# Evaluate

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator(metricName = 'accuracy')

In [20]:
print('DTC Evaluation on Accuracy:')
acc_eval.evaluate(dtc_predictions)

DTC Evaluation on Accuracy:


1.0

In [21]:
print('RFC Evaluation on Accuracy:')
acc_eval.evaluate(rfc_predictions)

RFC Evaluation on Accuracy:


0.96

In [22]:
print('GBT Evaluation on Accuracy:')
acc_eval.evaluate(gbt_predictions)

GBT Evaluation on Accuracy:


1.0

In [23]:
# Show feature importances
rfc_model.featureImportances

SparseVector(692, {99: 0.0023, 207: 0.0034, 267: 0.0018, 271: 0.0295, 289: 0.0025, 291: 0.0367, 300: 0.0366, 313: 0.0051, 316: 0.0052, 318: 0.0027, 323: 0.0573, 350: 0.0567, 354: 0.0013, 371: 0.0076, 378: 0.058, 379: 0.0034, 400: 0.0449, 405: 0.0463, 414: 0.0065, 432: 0.0325, 434: 0.0477, 441: 0.006, 455: 0.0075, 456: 0.0402, 461: 0.0449, 462: 0.0447, 464: 0.005, 468: 0.0029, 481: 0.0105, 490: 0.0034, 496: 0.0331, 511: 0.1948, 512: 0.0448, 517: 0.0026, 546: 0.0151, 548: 0.0025, 570: 0.0036, 573: 0.0022, 579: 0.0263, 593: 0.0019, 597: 0.0013, 620: 0.0034, 622: 0.0022, 632: 0.0013, 636: 0.0054, 658: 0.0064})