In [None]:
# Binarizer Example
# Reference: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer

""" In a classification problem where the input data is continuous and the expected output is binary,
the Binarizer solves the task in a few lines of code.
For example, given marks data, one may want to classify students into Pass and Fail.
The aim here is to determine each student's pass/fail status by comparing the mark against a threshold (here, the pass mark)."""

In [2]:
from __future__ import print_function
from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer

In [3]:
from pyspark.sql.types import *

In [4]:
# Build the SparkSession used by every later cell (or reuse one that is
# already running in this kernel).
# NOTE(review): the config entry looks like the placeholder key/value from the
# Spark examples rather than a real setting -- confirm it is still wanted.
spark = (
    SparkSession.builder
    .appName("BinarizerExample")
    .config("spark.some.config.option", "some-value")  # key typo fixed: was "optyion"
    .getOrCreate()
)

In [5]:
# Toy marks data: one (Name, Mark) row per student, marks given as floats.
df = spark.createDataFrame(
    data=[("Hari", 34.0), ("Mah", 80.0), ("Swar", 90.0)],
    schema=["Name", "Mark"]
)

In [6]:
# Pass mark is 35: per the Spark docs, values greater than the threshold are
# binarized to 1.0 and values at or below it to 0.0.
br = Binarizer(
    inputCol="Mark",
    outputCol="b_mark",
    threshold=35.0
)

In [7]:
# Apply the binarizer; the result carries the original columns plus "b_mark".
brdf = br.transform(dataset=df)

In [8]:
# Echo the threshold the transformer was configured with ...
print("Binarizer output with Threshold = %f" % (br.getThreshold(),))
# ... then display the DataFrame that includes the binarized column.
brdf.show()

Binarizer output with Threshold = 35.000000
+----+----+------+
|Name|Mark|b_mark|
+----+----+------+
|Hari|34.0|   0.0|
| Mah|80.0|   1.0|
|Swar|90.0|   1.0|
+----+----+------+

