# Spark Chi-Squared Selector

In [1]:
# Reference example script: https://github.com/apache/spark/blob/master/examples/src/main/python/ml/chisq_selector_example.py

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

In [3]:
# Obtain the SparkSession for this notebook under the application name
# "ChiSqSelectorExample" (getOrCreate hands back a session either way).
spark = (
    SparkSession.builder
    .appName("ChiSqSelectorExample")
    .getOrCreate()
)

In [4]:
# Toy dataset of three rows: an integer id, a dense 4-element feature vector,
# and a 0.0/1.0 "clicked" value that is later used as the label column.
df = spark.createDataFrame(
    [
        (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
        (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
        (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0),
    ],
    ["id", "features", "clicked"],
)

In [5]:
# Configure the selector to keep a single feature (numTopFeatures=1): it reads
# the "features" column, uses "clicked" as the label, and writes the reduced
# vector to "selectedFeatures".
# NOTE(review): newer Spark releases appear to mark ChiSqSelector as deprecated
# in favour of UnivariateFeatureSelector -- confirm against the Spark version in use.
selector = ChiSqSelector(
    numTopFeatures=1,
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="clicked",
)

In [6]:
# Fit the selector on df, then apply the fitted model to the same DataFrame.
result = (
    selector
    .fit(df)
    .transform(df)
)

In [7]:
# Report how many features the selector was configured to keep, then display
# the transformed DataFrame.
print(
    "ChiSqSelector output with top %d features selected"
    % selector.getNumTopFeatures()
)
result.show()

ChiSqSelector output with top 1 features selected
+---+------------------+-------+----------------+
| id|          features|clicked|selectedFeatures|
+---+------------------+-------+----------------+
|  7|[0.0,0.0,18.0,1.0]|    1.0|          [18.0]|
|  8|[0.0,1.0,12.0,0.0]|    0.0|          [12.0]|
|  9|[1.0,0.0,15.0,0.1]|    0.0|          [15.0]|
+---+------------------+-------+----------------+



In [8]:
# Shut down the Spark session now that the example is finished; cells that
# use `spark` cannot be re-run after this without re-creating the session.
spark.stop()