For comments.

jeanlyn · May 1, 2015 · 1a0b9a4 · 1a0b9a4
1 parent bc397f2
commit 1a0b9a4
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 35 deletions.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
@@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml.Transformer
 import org.apache.spark.ml.attribute.BinaryAttribute
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
 import org.apache.spark.ml.util.SchemaUtils
 import org.apache.spark.sql._
 import org.apache.spark.sql.functions._
@@ -62,7 +62,10 @@ final class Binarizer extends Transformer with HasInputCol with HasOutputCol {
     val map = extractParamMap(paramMap)
     val td = map(threshold)
     val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 }
-    dataset.withColumn(map(outputCol), binarizer(col(map(inputCol))))
+    val outputColName = map(outputCol)
+    val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata()
+    dataset.select(col("*"),
+      binarizer(col(map(inputCol))).as(outputColName, metadata))
   }
 
   override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = {
@@ -75,7 +78,7 @@ final class Binarizer extends Transformer with HasInputCol with HasOutputCol {
     require(inputFields.forall(_.name != outputColName),
       s"Output column $outputColName already exists.")
 
-    val attr = BinaryAttribute.defaultAttr.withName(map(outputCol))
+    val attr = BinaryAttribute.defaultAttr.withName(outputColName)
     val outputFields = inputFields :+ attr.toStructField()
     StructType(outputFields)
   }

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BinarizerSuite.scala
@@ -27,50 +27,43 @@ import org.apache.spark.sql.{DataFrame, Row, SQLContext}
 class BinarizerSuite extends FunSuite with MLlibTestSparkContext {
 
   @transient var data: Array[Double] = _
-  @transient var dataFrame: DataFrame = _
-  @transient var binarizer: Binarizer = _
-  @transient val threshold = 0.2
-  @transient var defaultBinarized: Array[Double] = _
-  @transient var thresholdBinarized: Array[Double] = _
+  @transient var sqlContext: SQLContext = _
 
   override def beforeAll(): Unit = {
     super.beforeAll()
-
+    sqlContext = new SQLContext(sc)
     data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4)
-    defaultBinarized = data.map(x => if (x > 0.0) 1.0 else 0.0)
-    thresholdBinarized = data.map(x => if (x > threshold) 1.0 else 0.0)
+  }
+
+  test("Binarize continuous features with default parameter") {
+    val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0)
+    val dataFrame: DataFrame = sqlContext.createDataFrame(
+      data.zip(defaultBinarized)).toDF("feature", "expected")
 
-    val sqlContext = new SQLContext(sc)
-    dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(BinarizerSuite.FeatureData))
-    binarizer = new Binarizer()
+    val binarizer: Binarizer = new Binarizer()
       .setInputCol("feature")
       .setOutputCol("binarized_feature")
-  }
 
-  def collectResult(result: DataFrame): Array[Double] = {
-    result.select("binarized_feature").collect().map {
-      case Row(feature: Double) => feature
+    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
+      case Row(x: Double, y: Double) =>
+        assert(x === y, "The feature value is not correct after binarization.")
     }
   }
 
-  def assertValues(lhs: Array[Double], rhs: Array[Double]): Unit = {
-    assert((lhs, rhs).zipped.forall { (x1, x2) =>
-      x1 === x2
-    }, "The feature value is not correct after binarization.")
-  }
+  test("Binarize continuous features with setter") {
+    val threshold: Double = 0.2
+    val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) 
+    val dataFrame: DataFrame = sqlContext.createDataFrame(
+        data.zip(thresholdBinarized)).toDF("feature", "expected")
 
-  test("Binarize continuous features with default parameter") {
-    val result = collectResult(binarizer.transform(dataFrame))
-    assertValues(result, defaultBinarized)
-  }
+    val binarizer: Binarizer = new Binarizer()
+      .setInputCol("feature")
+      .setOutputCol("binarized_feature")
+      .setThreshold(threshold)
 
-  test("Binarize continuous features with setter") {
-    binarizer.setThreshold(threshold)
-    val result = collectResult(binarizer.transform(dataFrame))
-    assertValues(result, thresholdBinarized)
+    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
+      case Row(x: Double, y: Double) =>
+        assert(x === y, "The feature value is not correct after binarization.")
+    }
   }
 }
-
-private object BinarizerSuite {
-  case class FeatureData(feature: Double)
-}