In [None]:
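# Example Notebook for Bucketizer in PySpark
# Reference: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Bucketizer

""" Bucketizer transforms a column of continuous values into a column of bucket indices.
The bucket ranges are defined by the split values: n + 1 splits produce n buckets.
A bucket defined by the splits x, y holds values in the range [x, y), except the last
bucket, which also includes y.
The splits must be strictly increasing and there must be at least three of them;
their count may be odd or even (this example uses seven splits, giving six buckets)."""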

In [1]:
from __future__ import print_function

In [2]:
# import SparkSession, the entry point used to create the Spark session and DataFrames
from pyspark.sql import SparkSession

In [3]:
#importing the bucketizer feature
from pyspark.ml.feature import Bucketizer

In [4]:
# Create the SparkSession for this notebook, or reuse one that already exists
# (getOrCreate). The config entry is a placeholder key/value pair.
spark = (
    SparkSession.builder
    .appName("Bucketizerexample")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [5]:
# Bucket boundaries: n + 1 strictly increasing split points define n buckets.
# The seven points below give six buckets; the infinite end points make the
# first and last buckets open-ended so every finite value falls into one.
splits = [
    float("-inf"),
    -10.9,
    -10.1,
    0.0,
    10.2,
    10.5,
    float("inf"),
]

In [6]:
# Sample rows: each row is a one-element tuple, loaded into a single
# column named "values".
data = [
    (-100.9,),
    (-50.5,),
    (-10.8,),
    (0.0,),
    (10.2,),
    (10.8,),
    (102.9,),
]
df = spark.createDataFrame(data, schema=["values"])

In [7]:
# Configure the transformer: read the "values" column, cut it at `splits`,
# and write the result to a new "Bucket_value" column.
bucket = Bucketizer(
    inputCol="values",
    outputCol="Bucket_value",
    splits=splits,
)

In [8]:
# Apply the configured Bucketizer to df; the result is bound to a new name,
# so df itself is not reassigned.
bucketed_data = bucket.transform(df)

In [9]:
# n + 1 split points delimit n buckets, hence the "- 1".
bucket_count = len(bucket.getSplits()) - 1
print("Bucketizer output with %d buckets" % bucket_count)

# Render the input values next to their assigned bucket index.
bucketed_data.show()

Bucketizer output with 6 buckets
+------+------------+
|values|Bucket_value|
+------+------------+
|-100.9|         0.0|
| -50.5|         0.0|
| -10.8|         1.0|
|   0.0|         3.0|
|  10.2|         4.0|
|  10.8|         5.0|
| 102.9|         5.0|
+------+------------+

