Skip to content

Commit

Permalink
For comments.
Browse files Browse the repository at this point in the history
  • Loading branch information
viirya committed May 1, 2015
1 parent bc397f2 commit 1a0b9a4
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.BinaryAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.SchemaUtils
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
Expand Down Expand Up @@ -62,7 +62,10 @@ final class Binarizer extends Transformer with HasInputCol with HasOutputCol {
val map = extractParamMap(paramMap)
val td = map(threshold)
val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 }
dataset.withColumn(map(outputCol), binarizer(col(map(inputCol))))
val outputColName = map(outputCol)
val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata()
dataset.select(col("*"),
binarizer(col(map(inputCol))).as(outputColName, metadata))
}

override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = {
Expand All @@ -75,7 +78,7 @@ final class Binarizer extends Transformer with HasInputCol with HasOutputCol {
require(inputFields.forall(_.name != outputColName),
s"Output column $outputColName already exists.")

val attr = BinaryAttribute.defaultAttr.withName(map(outputCol))
val attr = BinaryAttribute.defaultAttr.withName(outputColName)
val outputFields = inputFields :+ attr.toStructField()
StructType(outputFields)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,50 +27,43 @@ import org.apache.spark.sql.{DataFrame, Row, SQLContext}
class BinarizerSuite extends FunSuite with MLlibTestSparkContext {

@transient var data: Array[Double] = _
@transient var dataFrame: DataFrame = _
@transient var binarizer: Binarizer = _
@transient val threshold = 0.2
@transient var defaultBinarized: Array[Double] = _
@transient var thresholdBinarized: Array[Double] = _
@transient var sqlContext: SQLContext = _

override def beforeAll(): Unit = {
super.beforeAll()

sqlContext = new SQLContext(sc)
data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4)
defaultBinarized = data.map(x => if (x > 0.0) 1.0 else 0.0)
thresholdBinarized = data.map(x => if (x > threshold) 1.0 else 0.0)
}

test("Binarize continuous features with default parameter") {
val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0)
val dataFrame: DataFrame = sqlContext.createDataFrame(
data.zip(defaultBinarized)).toDF("feature", "expected")

val sqlContext = new SQLContext(sc)
dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(BinarizerSuite.FeatureData))
binarizer = new Binarizer()
val binarizer: Binarizer = new Binarizer()
.setInputCol("feature")
.setOutputCol("binarized_feature")
}

def collectResult(result: DataFrame): Array[Double] = {
result.select("binarized_feature").collect().map {
case Row(feature: Double) => feature
binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
case Row(x: Double, y: Double) =>
assert(x === y, "The feature value is not correct after binarization.")
}
}

def assertValues(lhs: Array[Double], rhs: Array[Double]): Unit = {
assert((lhs, rhs).zipped.forall { (x1, x2) =>
x1 === x2
}, "The feature value is not correct after binarization.")
}
test("Binarize continuous features with setter") {
val threshold: Double = 0.2
val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
val dataFrame: DataFrame = sqlContext.createDataFrame(
data.zip(thresholdBinarized)).toDF("feature", "expected")

test("Binarize continuous features with default parameter") {
val result = collectResult(binarizer.transform(dataFrame))
assertValues(result, defaultBinarized)
}
val binarizer: Binarizer = new Binarizer()
.setInputCol("feature")
.setOutputCol("binarized_feature")
.setThreshold(threshold)

test("Binarize continuous features with setter") {
binarizer.setThreshold(threshold)
val result = collectResult(binarizer.transform(dataFrame))
assertValues(result, thresholdBinarized)
binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
case Row(x: Double, y: Double) =>
assert(x === y, "The feature value is not correct after binarization.")
}
}
}

private object BinarizerSuite {
case class FeatureData(feature: Double)
}

0 comments on commit 1a0b9a4

Please sign in to comment.