In [1]:
def setHadoopConfig(name, tenant=None, username=None, password=None):
    """Register a swift2d object-store service in the Spark Hadoop configuration.

    Writes the ``fs.swift2d.service.<name>.*`` keys on the Hadoop configuration
    of the notebook-provided SparkContext ``sc``.

    Credentials are deliberately NOT embedded in the notebook. Each one is
    taken from the matching keyword argument or, when that is ``None``, from
    an environment variable:

        tenant   -> SWIFT_TENANT
        username -> SWIFT_USERNAME
        password -> SWIFT_PASSWORD

    Any credential that was previously committed in this notebook should be
    treated as leaked and rotated.

    Args:
        name: Service name; becomes part of every configuration key.
        tenant: Tenant/project id, or None to read ``SWIFT_TENANT``.
        username: User id, or None to read ``SWIFT_USERNAME``.
        password: Password, or None to read ``SWIFT_PASSWORD``.

    Raises:
        ValueError: If a credential is neither passed nor set in the environment.
    """
    import os  # local import so the cell does not depend on a separate imports cell

    def _credential(explicit, env_var):
        # Prefer the explicit argument, then the environment.
        value = explicit if explicit is not None else os.environ.get(env_var)
        if not value:
            raise ValueError(
                "Missing Swift credential: pass it as an argument or set the "
                "environment variable " + env_var)
        return value

    # Resolve everything up front so a missing credential fails before any
    # key is written, instead of leaving a half-populated configuration.
    tenant = _credential(tenant, "SWIFT_TENANT")
    username = _credential(username, "SWIFT_USERNAME")
    password = _credential(password, "SWIFT_PASSWORD")

    prefix = "fs.swift2d.service." + name
    hconf = sc._jsc.hadoopConfiguration()
    hconf.set(prefix + '.auth.url', 'https://identity.open.softlayer.com/v3/auth/tokens')
    hconf.set(prefix + '.auth.endpoint.prefix', 'endpoints')
    hconf.set(prefix + '.tenant', tenant)
    hconf.set(prefix + '.username', username)
    hconf.set(prefix + '.password', password)
    hconf.setInt(prefix + '.http.port', 8080)
    hconf.set(prefix + '.region', 'dallas')
    hconf.setBoolean(prefix + '.public', False)

# Service name: keys the fs.swift2d.service.<name>.* settings and is reused
# below when building the swift2d:// URL of the input data.
name = "keystone"
setHadoopConfig(name)

In [2]:
# Load the input dataset from parquet through the swift2d service configured
# under `name` (presumably "MGH" is the object-store container; confirm).
seven_cases = spark.read.parquet(
    "swift2d://MGH." + name + "/tempParq/7cases.parquet"
)

In [3]:
# Row count of the loaded dataset, as a sanity check on the read.
seven_cases.count()

247625

In [4]:
# Treat the stored "prediction" column as the class to learn (renamed to
# "clusterid") and keep only it plus the feature vector column.
# NOTE(review): presumably "prediction" holds cluster assignments from an
# earlier clustering step; confirm against the notebook that wrote the file.
seven_cases = (
    seven_cases
    .withColumnRenamed("prediction", "clusterid")
    .select("clusterid", "pcaFeatures")
)

In [5]:
# Hold out roughly 30% of the rows for testing. The seed is fixed so the split,
# and every metric computed from it below, is repeatable across kernel
# restarts instead of changing on each run.
(trainingData, testData) = seven_cases.randomSplit([0.7, 0.3], seed=42)

In [None]:
# DataFrames are immutable: repartition() returns a NEW DataFrame. The result
# must be bound back to the name, otherwise the 600-way partitioning is
# discarded and training runs on the original partitioning.
trainingData = trainingData.repartition(600)
trainingData

DataFrame[clusterid: int, pcaFeatures: vector]

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Fixed seed shared by the forest and the cross-validator so that tree
# construction and fold assignment repeat from run to run.
RANDOM_SEED = 42

# Map raw cluster ids to the numeric "label" column expected by the classifier.
# Fitted on the full dataset so ids present in either split get an index.
labelIndexer = StringIndexer(inputCol="clusterid", outputCol="label").fit(seven_cases)

rf = RandomForestClassifier(labelCol="label", featuresCol="pcaFeatures",
                            seed=RANDOM_SEED)

# Stage 0 indexes the label, stage 1 is the forest; later cells rely on this order.
pipeline = Pipeline(stages=[labelIndexer, rf])

# Hyper-parameter grid: only maxBins actually varies (2 candidates).
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [40]) \
    .addGrid(rf.maxBins, [20, 60]) \
    .addGrid(rf.maxDepth, [20]) \
    .build()

# NOTE(review): the evaluator is left at its defaults, so model selection uses
# the default metric (f1 per the PySpark docs) while the test cell reports
# accuracy; confirm that selecting on the default metric is intended.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=3,
                          seed=RANDOM_SEED)

# 3-fold cross-validation over the grid on the training split.
model = crossval.fit(trainingData)

# Score the held-out split with the fitted cross-validator model.
predictions = model.transform(testData)

In [11]:
# Peek at a few (prediction, true label) pairs from the scored test split.
predictions.select("prediction", "label").show(10)

# Measure accuracy on the test predictions and report the complementary error rate.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

+----------+-----+
|prediction|label|
+----------+-----+
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
+----------+-----+
only showing top 10 rows

Test Error = 0.0313526


trees = 100
bins = 100
depth = 20
Test Error = 0.027172

trees = 40
bins = 60
depth = 20
Test Error = 0.0313526

In [12]:
# Number of test rows per true (indexed) label, collected to the driver.
labs = (
    predictions
    .groupBy("label")
    .count()
    .collect()
)

In [13]:
# Number of test rows per predicted class, collected to the driver.
preds = (
    predictions
    .groupBy("prediction")
    .count()
    .collect()
)

In [14]:
# Predicted-class counts ordered by class index, for side-by-side comparison
# with the true-label counts.
sorted(preds)

[Row(prediction=0.0, count=13821),
 Row(prediction=1.0, count=11836),
 Row(prediction=2.0, count=10352),
 Row(prediction=3.0, count=8496),
 Row(prediction=4.0, count=5769),
 Row(prediction=5.0, count=5756),
 Row(prediction=6.0, count=5159),
 Row(prediction=7.0, count=3401),
 Row(prediction=8.0, count=3370),
 Row(prediction=9.0, count=2871),
 Row(prediction=10.0, count=1037),
 Row(prediction=11.0, count=732),
 Row(prediction=12.0, count=564),
 Row(prediction=13.0, count=361),
 Row(prediction=14.0, count=339),
 Row(prediction=15.0, count=324),
 Row(prediction=16.0, count=53),
 Row(prediction=17.0, count=43)]

In [15]:
# True-label counts ordered by label index, for side-by-side comparison with
# the predicted-class counts.
sorted(labs)

[Row(label=0.0, count=13898),
 Row(label=1.0, count=11769),
 Row(label=2.0, count=10193),
 Row(label=3.0, count=8380),
 Row(label=4.0, count=5837),
 Row(label=5.0, count=5673),
 Row(label=6.0, count=5098),
 Row(label=7.0, count=3501),
 Row(label=8.0, count=3453),
 Row(label=9.0, count=3034),
 Row(label=10.0, count=1019),
 Row(label=11.0, count=734),
 Row(label=12.0, count=557),
 Row(label=13.0, count=370),
 Row(label=14.0, count=342),
 Row(label=15.0, count=324),
 Row(label=16.0, count=55),
 Row(label=17.0, count=47)]

In [17]:
# Pull the winning pipeline out of the cross-validator; stage 1 is the fitted
# random forest (stage 0 is the label indexer).
best_pipeline = model.bestModel
bestRf = best_pipeline.stages[1]

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(bestRf.numClasses)
print(bestRf.numFeatures)
print(bestRf.trees)
print(bestRf.featureImportances)
# NOTE(review): accessed as an attribute, as in the original; PySpark exposes
# getNumTrees as a property on tree-ensemble models. Confirm for the Spark
# version in use.
print(bestRf.getNumTrees)

18
100
[DecisionTreeClassificationModel (uid=dtc_08d94c629896) of depth 20 with 14071 nodes, DecisionTreeClassificationModel (uid=dtc_f86779e276ed) of depth 20 with 13947 nodes, DecisionTreeClassificationModel (uid=dtc_a7e02c932128) of depth 20 with 15637 nodes, DecisionTreeClassificationModel (uid=dtc_f9a12c2eb8c2) of depth 20 with 13973 nodes, DecisionTreeClassificationModel (uid=dtc_0a7a53ac8cc7) of depth 20 with 15239 nodes, DecisionTreeClassificationModel (uid=dtc_1afac78b0577) of depth 20 with 15371 nodes, DecisionTreeClassificationModel (uid=dtc_b1e915a1b368) of depth 20 with 13441 nodes, DecisionTreeClassificationModel (uid=dtc_989897185f1c) of depth 20 with 13337 nodes, DecisionTreeClassificationModel (uid=dtc_d41968608749) of depth 20 with 15645 nodes, DecisionTreeClassificationModel (uid=dtc_a45dc2ec4b15) of depth 20 with 15435 nodes, DecisionTreeClassificationModel (uid=dtc_6d55825f640c) of depth 20 with 15149 nodes, DecisionTreeClassificationModel (uid=dtc_5ad865ff0d96) of