In [1]:
import findspark
findspark.init('/usr/local/spark')
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('mytree').getOrCreate()

In [2]:
data=spark.read.csv('read2.csv', inferSchema=True, header=True)

In [3]:
data.printSchema()

root
 |-- fLength: double (nullable = true)
 |-- fWidth: double (nullable = true)
 |-- fSize: double (nullable = true)
 |-- fConc: double (nullable = true)
 |-- fConc1: double (nullable = true)
 |-- fAsym: double (nullable = true)
 |-- fM3Long: double (nullable = true)
 |-- fM3Trans: double (nullable = true)
 |--  fAlpha: double (nullable = true)
 |-- fDist: double (nullable = true)
 |-- class: integer (nullable = true)



In [4]:
data.head(1)

[Row(fLength=28.7967, fWidth=16.0021, fSize=2.6449, fConc=0.3918, fConc1=0.1982, fAsym=27.7004, fM3Long=22.011, fM3Trans=-8.2027,  fAlpha=40.092, fDist=81.8828, class=1)]

In [5]:
from pyspark.ml.feature import VectorAssembler

In [6]:
data.columns

['fLength',
 'fWidth',
 'fSize',
 'fConc',
 'fConc1',
 'fAsym',
 'fM3Long',
 'fM3Trans',
 ' fAlpha',
 'fDist',
 'class']

In [7]:
assembler = VectorAssembler(inputCols=['fLength',
 'fWidth',
 'fSize',
 'fConc',
 'fConc1',
 'fAsym',
 'fM3Long',
 'fM3Trans',
 ' fAlpha',
 'fDist',
 'class'],outputCol='features')
#g=1, h=0
#convert class from string type to int type for Vector Assembler function

In [8]:
output=assembler.transform(data)

In [9]:
from pyspark.ml.feature import StringIndexer

In [10]:
indexer = StringIndexer(inputCol='class', outputCol='classIndex')

In [11]:
output_fixed=indexer.fit(output).transform(output)
#pipeline needed if you repeat your code alot

In [13]:
output_fixed.printSchema()
#features and classIndex column created

root
 |-- fLength: double (nullable = true)
 |-- fWidth: double (nullable = true)
 |-- fSize: double (nullable = true)
 |-- fConc: double (nullable = true)
 |-- fConc1: double (nullable = true)
 |-- fAsym: double (nullable = true)
 |-- fM3Long: double (nullable = true)
 |-- fM3Trans: double (nullable = true)
 |--  fAlpha: double (nullable = true)
 |-- fDist: double (nullable = true)
 |-- class: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- classIndex: double (nullable = false)



In [14]:
final_data=output_fixed.select('features', 'classIndex')

In [15]:
train_data, test_data=final_data.randomSplit([0.7,0.3])

In [16]:
from pyspark.ml.classification import (DecisionTreeClassifier, GBTClassifier, RandomForestClassifier)

In [17]:
from pyspark.ml import Pipeline

In [30]:
dtc = DecisionTreeClassifier(labelCol='classIndex', featuresCol='features')
rfc = RandomForestClassifier(numTrees=150, labelCol='classIndex', featuresCol='features')
gbt = GBTClassifier(labelCol='classIndex', featuresCol='features')
#shift tab to see GBTClassifer paramaters

In [31]:
dtc_model=dtc.fit(train_data)
rfc_model=rfc.fit(train_data)
gbt_model=gbt.fit(train_data)

In [32]:
dtc_preds=dtc_model.transform(test_data)
rfc_preds=rfc_model.transform(test_data)
gbt_preds=gbt_model.transform(test_data)

In [33]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [34]:
my_binary_eval=BinaryClassificationEvaluator(labelCol='classIndex')

In [35]:
print('DTC')
print(my_binary_eval.evaluate(dtc_preds))

DTC
1.0


In [36]:
print('RFC')
print(my_binary_eval.evaluate(rfc_preds))

RFC
1.0


In [37]:
gbt_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- classIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [38]:
rfc_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- classIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [39]:
my_binary_eval2=BinaryClassificationEvaluator(labelCol='classIndex',
                                             rawPredictionCol='prediction')

In [40]:
print('GBT')
print(my_binary_eval2.evaluate(gbt_preds))

GBT
1.0


In [47]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [48]:
acc_eval=MulticlassClassificationEvaluator(labelCol="classIndex", metricName='accuracy')

In [49]:
dtc_model.featureImportances

SparseVector(11, {10: 1.0})

In [50]:
rfc_model.featureImportances
#fDist (10) sees to be most significant in affecting class type, followed by fM3Trans(8)

SparseVector(11, {0: 0.0448, 1: 0.0244, 2: 0.013, 3: 0.0024, 4: 0.0016, 5: 0.0031, 6: 0.0105, 7: 0.0013, 8: 0.0787, 9: 0.003, 10: 0.8173})

In [51]:
gbt_model.featureImportances

SparseVector(11, {1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 1.0})