# Categorical Handling Example

## Read the data and convert to a dataframe

In [1]:
import numpy as np
from pyspark.sql.types import StructType, StructField, LongType
from pyspark.sql import SparkSession
import pyspark.sql
import pyspark.sql.functions

In [2]:
# (Re)load the autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [3]:
# Store the kernel's current working directory.
# `!pwd` captures the shell output as a list of lines; keep the first line.
PWD = !pwd
PWD = PWD[0]

In [4]:
# Start (or reuse) a Spark session with a local master and keep a handle on
# its SparkContext for building RDDs.
app_name = "categorical_notebook"
master = "local[*]"
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

In [5]:
# Toy data: ten 2-tuples whose first element takes the values 1, 2, 3 and 4
# with frequencies 1, 2, 3 and 4 respectively; the second element is always 1.
smallRDD2 = sc.parallelize([(1,1), (2,1), (2,1), (3,1), (3,1), (3,1), (4,1), (4,1), (4,1), (4,1)])
# Number of elements in each tuple.
numColumns = 2

In [6]:
# Name the columns field_0 .. field_<numColumns - 1>, each a nullable long,
# and build the DataFrame from the RDD with that schema.
structFieldList = [StructField('field_' + str(num), LongType(), True) for num in range(numColumns)]
schema = StructType(structFieldList)
testDF = spark.createDataFrame(smallRDD2, schema)

## View the fields of interest

In [7]:
# Print the values of the field_0 column.
testDF.select(['field_0']).show()

+-------+
|field_0|
+-------+
|      1|
|      2|
|      2|
|      3|
|      3|
|      3|
|      4|
|      4|
|      4|
|      4|
+-------+



## Define functions

In [8]:
def oheForRow(value):
    """One-hot encode a single value against the global ``valueDict``.

    The result has one slot per key of ``valueDict`` (in the dict's iteration
    order) followed by a final slot reserved for rare / unseen values.

    Args:
        value: The category value to encode.

    Returns:
        list: ``len(valueDict) + 1`` zeros and ones. When ``value`` is a key
        of ``valueDict`` the slot of each key equal to it is 1 and the last
        slot is 0; otherwise only the last slot is 1.
    """
    knownValues = valueDict

    # Rare / unseen value: only the trailing slot is set.
    if value not in knownValues:
        return [0] * len(knownValues) + [1]

    # Seen value: flag the matching key, one slot per known value.
    encoded = [1 if known == value else 0 for known in knownValues]

    # Trailing rare / unseen slot stays 0.
    encoded.append(0)
    return encoded

In [9]:
def oheDataFrame_fit(field, topN, df):
    """Record the ``topN`` most frequent values of ``field`` in ``valueDict``.

    Args:
        field: Name of the column of ``df`` whose values are counted.
        topN: Maximum number of distinct values to keep.
        df: DataFrame to scan; it is read through ``df.rdd``.

    Returns:
        None. Each selected value is added to the global ``valueDict`` as a
        key mapped to ``True``; keys already present are left in place.
    """
    # Count the occurrences of every distinct value in the column.
    onesPerRow = df.rdd.map(lambda row: (row[field], 1))
    countsByValue = onesPerRow.reduceByKey(lambda left, right: left + right)

    # Ordering by the negated count puts the most frequent values first.
    mostFrequent = countsByValue.takeOrdered(topN, key=lambda valueCount: -valueCount[1])

    # Register the selected values for one-hot encoding.
    valueDict.update((valueCount[0], True) for valueCount in mostFrequent)

In [10]:
def oheDataFrame_transform(field, df):
    """Replace ``field`` in ``df`` with one-hot encoded columns.

    One column ``<field>_<n>`` is produced per key of the global ``valueDict``
    (in the dict's iteration order), followed by ``<field>_UnkwnRare`` for
    values that are not keys of ``valueDict``. The encoding of each value
    comes from ``oheForRow``.

    Every row is encoded together with its own remaining columns in a single
    pass over ``df.rdd``. This avoids joining two DataFrames on
    ``monotonically_increasing_id()`` values generated independently on each
    side, which is only correct when both sides happen to produce identical
    ids and does not keep the input row order.

    Args:
        field: Name of the column of ``df`` to encode.
        df: Source DataFrame. Assumed not to already have a column named
            ``id`` (that name is used for a temporary row index).

    Returns:
        A new DataFrame holding the columns of ``df`` other than ``field``,
        followed by the encoded columns.
    """
    # Columns of the input that are carried over unchanged.
    keptFields = [f for f in df.schema.fields if f.name != field]
    keptNames = [f.name for f in keptFields]

    # One column per known value, then the rare / unseen column.
    oheFields = [StructField(field + '_' + str(num), LongType(), True)
                 for num in range(len(valueDict))]
    oheFields.append(StructField(field + '_UnkwnRare', LongType(), True))

    # Temporary row index (position of the row within df.rdd). It is dropped
    # from the returned DataFrame, as before; it is still created because the
    # notebook later calls orderBy('id') on the result.
    # NOTE(review): confirm whether that downstream ordering is still wanted.
    idField = StructField('id', LongType(), True)
    schema = StructType(keptFields + oheFields + [idField])

    def encodeRow(indexedRow):
        # indexedRow is a (row, index) pair produced by zipWithIndex().
        row, rowId = indexedRow
        return [row[name] for name in keptNames] + list(oheForRow(row[field])) + [rowId]

    encodedRDD = df.rdd.zipWithIndex().map(encodeRow)
    return spark.createDataFrame(encodedRDD, schema).drop('id')

## Example

Create the dictionary that will hold the values to encode (each one becomes a new field), and choose the field to encode

In [11]:
# Global dictionary of values to encode: filled by oheDataFrame_fit and read
# by oheForRow and oheDataFrame_transform.
valueDict = {}
# Name of the column to one-hot encode.
fieldToEncode = 'field_0'

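Fit on the field to select its most frequent values (here the top 3), then transform the DataFrame so that each selected value becomes a one-hot encoded (OHE) field and all other values share a single unknown/rare field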

In [12]:
# Record the 3 most frequent values of the field in valueDict.
oheDataFrame_fit(fieldToEncode, 3, testDF)

In [13]:
# Replace the field with its one-hot encoded columns.
testDF3 = oheDataFrame_transform(fieldToEncode, testDF)

Display the DataFrame with the OHE fields

In [14]:
# NOTE(review): `id` is not a column of testDF3 (oheDataFrame_transform drops
# it before returning); this ordering relies on Spark still resolving the
# dropped column — confirm this is intended.
testDF3.orderBy('id').show()

+-------+---------+---------+---------+-----------------+
|field_1|field_0_0|field_0_1|field_0_2|field_0_UnkwnRare|
+-------+---------+---------+---------+-----------------+
|      1|        0|        0|        0|                1|
|      1|        0|        0|        1|                0|
|      1|        0|        0|        1|                0|
|      1|        0|        1|        0|                0|
|      1|        0|        1|        0|                0|
|      1|        0|        1|        0|                0|
|      1|        1|        0|        0|                0|
|      1|        1|        0|        0|                0|
|      1|        1|        0|        0|                0|
|      1|        1|        0|        0|                0|
+-------+---------+---------+---------+-----------------+

