In [None]:
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.sql.Row

In [None]:
// Read the CSV as text: `input` holds one String per line of the file.
val input = sc.textFile("/asnumbers.csv")
// Convert every line into a (label, features) pair so it can become a two-column DataFrame.
val data = input.map { row =>
  val fields = row.split(',')
  // The first 12 columns (indices 0-11) form the dense feature vector.
  val features = Vectors.dense(fields.take(12).map(_.toDouble))
  // The label is read from column index 13.
  // NOTE(review): column index 12 is never used -- confirm this is intentional.
  val label = fields(13).toDouble
  (label, features)
}
// Random split with weights 0.7 / 0.3: first part for training, second for testing.
val splits = data.randomSplit(Array(0.7, 0.3))
val trainingRDD = splits(0)
val testRDD = splits(1)
// Name the tuple columns "label" and "features" when converting to DataFrames.
val training = trainingRDD.toDF("label", "features")
val test = testRDD.toDF("label", "features")

In [None]:
// Create the LogisticRegression estimator and print the documentation of its parameters.
val lr = new LogisticRegression()
println(s"LogisticRegression parameters:\n${lr.explainParams()}\n")
// Configure the estimator in place: maximum number of iterations, then regularization parameter.
lr.setMaxIter(50)
lr.setRegParam(0.001)

In [None]:
// Fit the estimator on the training DataFrame, then print the parameter map
// extracted from the fitted model's parent.
val model = lr.fit(training)
println(s"Params: ${model.parent.extractParamMap}")

In [None]:
// Parameter overrides can also be collected in ParamMaps and handed to fit().
// map1: max iterations, regularization parameter and decision threshold.
val map1 = ParamMap(lr.maxIter -> 100, lr.regParam -> 0.01, lr.threshold -> 0.4)
// map2: name of the prediction output column.
val map2 = ParamMap(lr.predictionCol -> "pred")
// Combine both maps into one.
val sum = map1 ++ map2

In [None]:
// Fit again on the same training data, this time passing the combined ParamMap,
// and print the parameter map extracted from the new model's parent.
val modelupdate = lr.fit(training, sum)
println(s"Update to params: ${modelupdate.parent.extractParamMap}")

In [None]:
// Apply the model to the test DataFrame, collect the selected columns to the driver,
// and print one line per row. The prediction is read from the column named "pred".
modelupdate.transform(test).select("features", "label", "probability", "pred").collect().foreach { 
    case Row(features: Vector, label: Double, probability: Vector, pred: Double) =>
        // Interpolate `pred`, the name bound by the pattern above; no value named
        // `prediction` is in scope here.
        println(s"($features, $label) -> probability=$probability, prediction=$pred")
  }