# Encoding categorical features in Spark


# 1.- String Indexing
String Indexer encodes a column of string labels/categories to a column of indices. The ordering of the indices is done on the basis of popularity and the range is [0, numOfLabels)

In [1]:
val df = spark.createDataFrame(Seq((0, "apples"), (1, "oranges"), (2, "banana"), (3, "apples"), (4, "banana"), (5, "oranges"), (6, "apples"))).toDF("id", "category")

Intitializing Scala interpreter ...

Spark Web UI available at http://DESKTOP-FQ2BOOJ:4040
SparkContext available as 'sc' (version = 2.4.0, master = local[*], app id = local-1557763667635)
SparkSession available as 'spark'


df: org.apache.spark.sql.DataFrame = [id: int, category: string]


In [2]:
df.show

+---+--------+
| id|category|
+---+--------+
|  0|  apples|
|  1| oranges|
|  2|  banana|
|  3|  apples|
|  4|  banana|
|  5| oranges|
|  6|  apples|
+---+--------+



In [3]:
import org.apache.spark.ml.feature.StringIndexer

val indexer = new StringIndexer().setInputCol("category").setOutputCol("category_index")

val df_indexed1 = indexer.fit(df).transform(df).show

+---+--------+--------------+
| id|category|category_index|
+---+--------+--------------+
|  0|  apples|           0.0|
|  1| oranges|           1.0|
|  2|  banana|           2.0|
|  3|  apples|           0.0|
|  4|  banana|           2.0|
|  5| oranges|           1.0|
|  6|  apples|           0.0|
+---+--------+--------------+



import org.apache.spark.ml.feature.StringIndexer
indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_71614f945471
df_indexed1: Unit = ()


Let's add another column to our toy dataframe

In [5]:
val df = spark.createDataFrame(Seq((0, "apples", "red"), (1, "oranges", "orange"), (2, "banana", "yellow"), (3, "apples", "red"), (4, "banana", "yellow"), (5, "oranges", "orange"), 
                                   (6, "apples", "red"))).toDF("id", "category", "colour")                                   

df: org.apache.spark.sql.DataFrame = [id: int, category: string ... 1 more field]


In [6]:
df.show

+---+--------+------+
| id|category|colour|
+---+--------+------+
|  0|  apples|   red|
|  1| oranges|orange|
|  2|  banana|yellow|
|  3|  apples|   red|
|  4|  banana|yellow|
|  5| oranges|orange|
|  6|  apples|   red|
+---+--------+------+



In [7]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

val features = df.columns.filterNot(_.contains("id"))

// here's how we encode several categorical features at once
val encodedFeatures = features.flatMap{name =>
    
    val indexer = new StringIndexer().setInputCol(name).setOutputCol(name + "_index").setHandleInvalid("keep")
    Array(indexer)
    
}

val pipeline = new Pipeline().setStages(encodedFeatures)

val indexer_model = pipeline.fit(df)

val df_transformed = indexer_model.transform(df)

df_transformed.show

+---+--------+------+--------------+------------+
| id|category|colour|category_index|colour_index|
+---+--------+------+--------------+------------+
|  0|  apples|   red|           0.0|         0.0|
|  1| oranges|orange|           1.0|         1.0|
|  2|  banana|yellow|           2.0|         2.0|
|  3|  apples|   red|           0.0|         0.0|
|  4|  banana|yellow|           2.0|         2.0|
|  5| oranges|orange|           1.0|         1.0|
|  6|  apples|   red|           0.0|         0.0|
+---+--------+------+--------------+------------+



import org.apache.spark.ml.{Pipeline, PipelineModel}
features: Array[String] = Array(category, colour)
encodedFeatures: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_c95aa955927b, strIdx_a190bc56261e)
pipeline: org.apache.spark.ml.Pipeline = pipeline_1eb6bbcdc149
indexer_model: org.apache.spark.ml.PipelineModel = pipeline_1eb6bbcdc149
df_transformed: org.apache.spark.sql.DataFrame = [id: int, category: string ... 3 more fields]


# 2.- One-Hot Encoding

This method is generally used when we need to use categorical features but the algorithm expects continuous features (e.g. regression). The Spark one hot encoder takes the indexed label/category from the string indexer and then encodes it into a sparse vector.

In [8]:
import org.apache.spark.ml.feature.OneHotEncoderEstimator

val features = df.columns.filterNot(_.contains("id"))

val encodedFeatures = features.flatMap{name =>
    
    val indexer = new StringIndexer().setInputCol(name).setOutputCol(name + "_index").setHandleInvalid("keep")
    
    val oneHotEncoder = new OneHotEncoderEstimator()
         .setInputCols(Array(name + "_index"))
         .setOutputCols(Array(name + "_vec"))
         .setDropLast(false)
    
    Array(indexer, oneHotEncoder)
    
}

val pipeline = new Pipeline().setStages(encodedFeatures)

val indexer_model = pipeline.fit(df)

val df_transformed = indexer_model.transform(df)

df_transformed.show

+---+--------+------+--------------+-------------+------------+-------------+
| id|category|colour|category_index| category_vec|colour_index|   colour_vec|
+---+--------+------+--------------+-------------+------------+-------------+
|  0|  apples|   red|           0.0|(4,[0],[1.0])|         0.0|(4,[0],[1.0])|
|  1| oranges|orange|           1.0|(4,[1],[1.0])|         1.0|(4,[1],[1.0])|
|  2|  banana|yellow|           2.0|(4,[2],[1.0])|         2.0|(4,[2],[1.0])|
|  3|  apples|   red|           0.0|(4,[0],[1.0])|         0.0|(4,[0],[1.0])|
|  4|  banana|yellow|           2.0|(4,[2],[1.0])|         2.0|(4,[2],[1.0])|
|  5| oranges|orange|           1.0|(4,[1],[1.0])|         1.0|(4,[1],[1.0])|
|  6|  apples|   red|           0.0|(4,[0],[1.0])|         0.0|(4,[0],[1.0])|
+---+--------+------+--------------+-------------+------------+-------------+



import org.apache.spark.ml.feature.OneHotEncoderEstimator
features: Array[String] = Array(category, colour)
encodedFeatures: Array[org.apache.spark.ml.Estimator[_ >: org.apache.spark.ml.feature.OneHotEncoderModel with org.apache.spark.ml.feature.StringIndexerModel <: org.apache.spark.ml.Model[_ >: org.apache.spark.ml.feature.OneHotEncoderModel with org.apache.spark.ml.feature.StringIndexerModel <: org.apache.spark.ml.Transformer with org.apache.spark.ml.param.shared.HasHandleInvalid with org.apache.spark.ml.util.MLWritable] with org.apache.spark.ml.param.shared.HasHandleInvalid with org.apache.spark.ml.util.MLWritable] with org.apache.spark.ml.param.shared.HasHandleInvalid with org.apache.spark.ml.util.DefaultParamsWritable{def copy(extra: org.apache.spark.ml.param.ParamMap): org.apache...

The sparse vectors seen in the above dataframe have 4 different components. The first component which is a 0 indicates that it is a sparse vector. The second component talks about the size of the vector. The third component talks about the indices where the vector is populated while the fourth component talks about what values these are. This truncates the vector and is really efficient when you have really large vector representations. This truncates the vector and is really efficient when you have really large vector representations.

It's fairly simple to see the dense vector representation for the above sparse vectors using a udf: 


In [9]:
import org.apache.spark.ml.linalg.{Vector, Vectors}

val sparseToDense = udf((v: Vector) => v.toDense)

val df_denseVectors = df_transformed.withColumn("dense_category_vec", sparseToDense($"category_vec")) 

df_denseVectors.show

+---+--------+------+--------------+-------------+------------+-------------+------------------+
| id|category|colour|category_index| category_vec|colour_index|   colour_vec|dense_category_vec|
+---+--------+------+--------------+-------------+------------+-------------+------------------+
|  0|  apples|   red|           0.0|(4,[0],[1.0])|         0.0|(4,[0],[1.0])| [1.0,0.0,0.0,0.0]|
|  1| oranges|orange|           1.0|(4,[1],[1.0])|         1.0|(4,[1],[1.0])| [0.0,1.0,0.0,0.0]|
|  2|  banana|yellow|           2.0|(4,[2],[1.0])|         2.0|(4,[2],[1.0])| [0.0,0.0,1.0,0.0]|
|  3|  apples|   red|           0.0|(4,[0],[1.0])|         0.0|(4,[0],[1.0])| [1.0,0.0,0.0,0.0]|
|  4|  banana|yellow|           2.0|(4,[2],[1.0])|         2.0|(4,[2],[1.0])| [0.0,0.0,1.0,0.0]|
|  5| oranges|orange|           1.0|(4,[1],[1.0])|         1.0|(4,[1],[1.0])| [0.0,1.0,0.0,0.0]|
|  6|  apples|   red|           0.0|(4,[0],[1.0])|         0.0|(4,[0],[1.0])| [1.0,0.0,0.0,0.0]|
+---+--------+------+---------

import org.apache.spark.ml.linalg.{Vector, Vectors}
sparseToDense: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))
df_denseVectors: org.apache.spark.sql.DataFrame = [id: int, category: string ... 6 more fields]


# 3.- Vector Assembler

Most machine learning algorithms in Spark expect a single encoded numerical vector as the input. In order to do that, we use something called the vector assembler. Its job is to combine the raw features and features generated from various transforms into a single feature vector. It accepts boolean, numerical and vector type inputs. Lets modify our earlier code to combine the category and the color vectors to a single feature vector.

Vector assembler generally will always feature in your workflow when you want to combine all your features before training or scoring your model.

In [11]:
import org.apache.spark.ml.feature.VectorAssembler

val features = df.columns.filterNot(_.contains("id"))

val encodedFeatures = features.flatMap{name =>
    
    val indexer = new StringIndexer().setInputCol(name).setOutputCol(name + "_index").setHandleInvalid("keep")
    
    val oneHotEncoder = new OneHotEncoderEstimator()
         .setInputCols(Array(name + "_index"))
         .setOutputCols(Array(name + "_vec"))
         .setDropLast(false)
    
    Array(indexer, oneHotEncoder)
    
}

val pipeline = new Pipeline().setStages(encodedFeatures)

val indexer_model = pipeline.fit(df)

val df_transformed = indexer_model.transform(df)

val vecFeatures = df_transformed.columns.filter(_.contains("vec")).toArray

val vectorAssembler = new VectorAssembler().setInputCols(vecFeatures).setOutputCol("features")
    
val pipelineVectorAssembler = new Pipeline().setStages(Array(vectorAssembler))

val result_df = pipelineVectorAssembler.fit(df_transformed).transform(df_transformed)

result_df.show

+---+--------+------+--------------+-------------+------------+-------------+-------------------+
| id|category|colour|category_index| category_vec|colour_index|   colour_vec|           features|
+---+--------+------+--------------+-------------+------------+-------------+-------------------+
|  0|  apples|   red|           0.0|(4,[0],[1.0])|         0.0|(4,[0],[1.0])|(8,[0,4],[1.0,1.0])|
|  1| oranges|orange|           1.0|(4,[1],[1.0])|         1.0|(4,[1],[1.0])|(8,[1,5],[1.0,1.0])|
|  2|  banana|yellow|           2.0|(4,[2],[1.0])|         2.0|(4,[2],[1.0])|(8,[2,6],[1.0,1.0])|
|  3|  apples|   red|           0.0|(4,[0],[1.0])|         0.0|(4,[0],[1.0])|(8,[0,4],[1.0,1.0])|
|  4|  banana|yellow|           2.0|(4,[2],[1.0])|         2.0|(4,[2],[1.0])|(8,[2,6],[1.0,1.0])|
|  5| oranges|orange|           1.0|(4,[1],[1.0])|         1.0|(4,[1],[1.0])|(8,[1,5],[1.0,1.0])|
|  6|  apples|   red|           0.0|(4,[0],[1.0])|         0.0|(4,[0],[1.0])|(8,[0,4],[1.0,1.0])|
+---+--------+------

import org.apache.spark.ml.feature.VectorAssembler
features: Array[String] = Array(category, colour)
encodedFeatures: Array[org.apache.spark.ml.Estimator[_ >: org.apache.spark.ml.feature.OneHotEncoderModel with org.apache.spark.ml.feature.StringIndexerModel <: org.apache.spark.ml.Model[_ >: org.apache.spark.ml.feature.OneHotEncoderModel with org.apache.spark.ml.feature.StringIndexerModel <: org.apache.spark.ml.Transformer with org.apache.spark.ml.param.shared.HasHandleInvalid with org.apache.spark.ml.util.MLWritable] with org.apache.spark.ml.param.shared.HasHandleInvalid with org.apache.spark.ml.util.MLWritable] with org.apache.spark.ml.param.shared.HasHandleInvalid with org.apache.spark.ml.util.DefaultParamsWritable{def copy(extra: org.apache.spark.ml.param.ParamMap): org.apache.spark....

# 4.- Vector Indexer

Vector indexer allows us to skip the one hot encoding stage when encoding categorical features. The algorithm performs an inference based on the values of the features, automatically generating the desired feature vector

In [12]:
import org.apache.spark.ml.feature.VectorIndexer

val features = df.columns.filterNot(_.contains("id"))

// no hot-one encoding this time
val encodedFeatures = features.flatMap{name =>
    
    val indexer = new StringIndexer().setInputCol(name).setOutputCol(name + "_index").setHandleInvalid("keep")
    
    Array(indexer)
    
}

val pipeline1 = new Pipeline().setStages(encodedFeatures)

val indexer_model1 = pipeline1.fit(df)

val df_transformed_bis = indexer_model1.transform(df)

//////////////////////////////////////////////////////////////////////////////

// we take the string indexed features
val features_bis = df_transformed_bis.columns.filter(_.contains("index")).toArray

// assembler
val vectorAssembler = new VectorAssembler().setInputCols(features_bis).setOutputCol("features_bis")

// vector indexer
val vectorIndexer = new VectorIndexer().setInputCol("features_bis").setOutputCol("indexed_features_bis").setMaxCategories(2)

// adding to the pipeline
val pipeline2 = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer))

val indexer_model2 = pipeline2.fit(df_transformed_bis)

val result2_df = indexer_model2.transform(df_transformed_bis)

result2_df.show

+---+--------+------+--------------+------------+------------+--------------------+
| id|category|colour|category_index|colour_index|features_bis|indexed_features_bis|
+---+--------+------+--------------+------------+------------+--------------------+
|  0|  apples|   red|           0.0|         0.0|   (2,[],[])|           (2,[],[])|
|  1| oranges|orange|           1.0|         1.0|   [1.0,1.0]|           [1.0,1.0]|
|  2|  banana|yellow|           2.0|         2.0|   [2.0,2.0]|           [2.0,2.0]|
|  3|  apples|   red|           0.0|         0.0|   (2,[],[])|           (2,[],[])|
|  4|  banana|yellow|           2.0|         2.0|   [2.0,2.0]|           [2.0,2.0]|
|  5| oranges|orange|           1.0|         1.0|   [1.0,1.0]|           [1.0,1.0]|
|  6|  apples|   red|           0.0|         0.0|   (2,[],[])|           (2,[],[])|
+---+--------+------+--------------+------------+------------+--------------------+



import org.apache.spark.ml.feature.VectorIndexer
features: Array[String] = Array(category, colour)
encodedFeatures: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_8dd7346f7fc3, strIdx_3311e22d1bb1)
pipeline1: org.apache.spark.ml.Pipeline = pipeline_ffad86ceb189
indexer_model1: org.apache.spark.ml.PipelineModel = pipeline_ffad86ceb189
df_transformed_bis: org.apache.spark.sql.DataFrame = [id: int, category: string ... 3 more fields]
features_bis: Array[String] = Array(category_index, colour_index)
vectorAssembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_2a2a13c0b904
vectorIndexer: org.apache.spark.ml.feature.VectorIndexer = vecIdx_ce46856b8a25
pipeline2: org.apache.spark.ml.Pipeline = pipeline_f4a8043f4eb1
indexer_model2: org.apache.spark.ml.PipelineMod...

Column "features_bis" is the output of the assembler but the categorical features are not encoded yet. The vector indexer will do this, producing "indexed_features_bis" as the final feature vector