# Categorical Handling Example

## Read the data and convert it to a DataFrame

In [81]:
import os

import numpy as np
import pyspark.sql
import pyspark.sql.functions
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType

In [82]:
# (Re)load IPython's autoreload extension and switch it to mode 2 so that
# edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [83]:
# store path to notebook
# Taken from the kernel's current working directory. Using os.getcwd() avoids
# spawning a shell (`!pwd`), works on platforms without a `pwd` binary, and
# binds PWD to a str exactly once instead of first to a list and then to a str.
PWD = os.getcwd()

In [84]:
# Create (or reuse) the Spark session for this notebook and keep a handle on
# its SparkContext for the RDD examples below.
app_name = "categorical_notebook"
master = "local[*]"
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

In [85]:
# Toy data: (value, 1) pairs in which the value v occurs v times for v = 1..4,
# i.e. (1,1), (2,1), (2,1), (3,1), (3,1), (3,1), (4,1), (4,1), (4,1), (4,1).
smallRDD2 = sc.parallelize(
    [(value, 1) for value in (1, 2, 3, 4) for occurrence in range(value)]
)
# Number of fields in each tuple above.
numColumns = 2

In [86]:
# One nullable LongType column per tuple position, named field_0, field_1, ...
structFieldList = [
    StructField(name='field_' + str(num), dataType=LongType(), nullable=True)
    for num in range(numColumns)
]
schema = StructType(fields=structFieldList)
# Turn the RDD of tuples into a DataFrame with that schema.
testDF = spark.createDataFrame(data=smallRDD2, schema=schema)

## View the fields of interest

In [87]:
# Display only the categorical column that will be encoded.
testDF.select('field_0').show()

+-------+
|field_0|
+-------+
|      1|
|      2|
|      2|
|      3|
|      3|
|      3|
|      4|
|      4|
|      4|
|      4|
+-------+



## Once all null observations have been removed, add a primary key index

In [88]:
# Add an 'id' column to every row; createOHEDataFrame later joins on it.
# Spark documents these ids as unique and increasing but NOT consecutive,
# which is why the values shown below jump between blocks of rows.
testDF2 = testDF.withColumn('id', pyspark.sql.functions.monotonically_increasing_id())

In [95]:
# Display the DataFrame together with the newly added id column.
testDF2.show()

+-------+-------+-----------+
|field_0|field_1|         id|
+-------+-------+-----------+
|      1|      1|          0|
|      2|      1|          1|
|      2|      1| 8589934592|
|      3|      1| 8589934593|
|      3|      1|17179869184|
|      3|      1|17179869185|
|      4|      1|25769803776|
|      4|      1|25769803777|
|      4|      1|25769803778|
|      4|      1|25769803779|
+-------+-------+-----------+



## Define functions

In [90]:
def createOneHotEncodingForRow(value, values=None):
    """Return the one-hot encoding of ``value`` as a list of 0/1 integers.

    The result has one slot per known category, in the iteration order of the
    category collection, followed by one extra trailing slot that flags a
    rare / unseen value.

    Parameters
    ----------
    value
        The category value of the current row.
    values : mapping or sequence, optional
        The known categories. Only membership, length and iteration order are
        used, so a dict (keys are the categories) or a list both work. When
        omitted, the notebook-level ``valueDict`` is used, which keeps existing
        one-argument calls working.

    Returns
    -------
    list of int
        ``len(values) + 1`` entries containing exactly one 1.
    """
    # `is None` (not truthiness) so that an explicitly passed empty collection
    # is honoured instead of silently falling back to the global.
    categories = valueDict if values is None else values

    # Encode the rare / unseen value: every known slot is 0, the last one is 1.
    if value not in categories:
        return [0] * len(categories) + [1]

    # Encode a value that has been seen; the trailing rare / unseen slot is 0.
    return [1 if key == value else 0 for key in categories] + [0]

In [91]:
def createOHEDataFrame(field, topN, df):
    """One-hot encode ``field`` of ``df`` using its ``topN`` most frequent values.

    Each of the ``topN`` most frequent values gets its own column named
    ``<field>_0``, ``<field>_1``, ... (most frequent first); every other value
    is flagged in the extra ``<field>_UnkwnRare`` column. The original
    ``field`` column is dropped from the result.

    Parameters
    ----------
    field : str
        Name of the categorical column to encode.
    topN : int
        How many of the most frequent values get a dedicated column.
    df : pyspark.sql.DataFrame
        Input frame. It must already contain an ``id`` column that identifies
        each row, because the encoded columns are joined back on it.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` without ``field`` plus the one-hot columns.

    Raises
    ------
    ValueError
        If ``df`` has no ``id`` column.

    Notes
    -----
    The notebook-level ``valueDict`` is reset and then filled with the selected
    values, because ``createOneHotEncodingForRow`` reads the categories from it.
    """
    if 'id' not in df.columns:
        raise ValueError("df must contain an 'id' column to join the encoded fields on")

    # Find the frequency of items in the category, most frequent first
    fieldFreqRDD = (
        df.select(field).rdd
        .map(lambda x: (x[field], 1))
        .reduceByKey(lambda x, y: x + y)
        .sortBy(lambda x: -x[1])
    )

    # Use the top N frequent values
    valueCountList = fieldFreqRDD.take(topN)

    # Start from an empty dictionary: keys left over from an earlier call
    # (e.g. for a different field) would otherwise get columns of their own.
    valueDict.clear()
    for pair in valueCountList:
        valueDict[pair[0]] = True

    # Encode every row and carry the row's own id along with the encoding.
    # Generating a second monotonically_increasing_id() on the encoded frame
    # is not safe: those ids depend on how the data is partitioned, so they
    # are not guaranteed to match the ids already present in df.
    oheRDD = df.rdd.map(
        lambda row: [row['id']] + createOneHotEncodingForRow(row[field])
    )

    # Schema of the encoded frame: id, one column per kept value, rare/unseen
    structFieldList = [StructField('id', LongType(), True)]
    structFieldList += [
        StructField(field + '_' + str(num), LongType(), True)
        for num in range(len(valueDict))
    ]
    structFieldList.append(StructField(field + '_UnkwnRare', LongType(), True))
    schema = StructType(structFieldList)
    oheDF = spark.createDataFrame(oheRDD, schema)

    # Join the original DataFrame with the OHE DataFrame on the shared id
    updatedDF = df.join(oheDF, ["id"])

    # Drop the original field that was OHE
    updatedDF = updatedDF.drop(field)

    return updatedDF

## Example

Create the dictionary that will hold the category values to be encoded as new fields

In [92]:
# Shared lookup of the values that get their own one-hot column:
# createOHEDataFrame adds the keys, createOneHotEncodingForRow reads them.
valueDict = {}

Call the function to one-hot encode the field, keeping the specified number of most frequent values (all other values share the rare/unseen field)

In [93]:
# Encode field_0, giving its 3 most frequent values a column each.
testDF3 = createOHEDataFrame(field='field_0', topN=3, df=testDF2)

Display the DataFrame with the OHE fields

In [94]:
# Sort by id so the rows appear in the same order as in the testDF2 listing,
# which makes it easy to compare each encoding with its original value.
testDF3.orderBy('id').show()

+-----------+-------+---------+---------+---------+-----------------+
|         id|field_1|field_0_0|field_0_1|field_0_2|field_0_UnkwnRare|
+-----------+-------+---------+---------+---------+-----------------+
|          0|      1|        0|        0|        0|                1|
|          1|      1|        0|        0|        1|                0|
| 8589934592|      1|        0|        0|        1|                0|
| 8589934593|      1|        0|        1|        0|                0|
|17179869184|      1|        0|        1|        0|                0|
|17179869185|      1|        0|        1|        0|                0|
|25769803776|      1|        1|        0|        0|                0|
|25769803777|      1|        1|        0|        0|                0|
|25769803778|      1|        1|        0|        0|                0|
|25769803779|      1|        1|        0|        0|                0|
+-----------+-------+---------+---------+---------+-----------------+

