# Categorical Handling Example (one-hot encoding (OHE) and feature hashing)

## Read the data and convert to a dataframe

In [51]:
import os

import numpy as np
import pandas as pd
import pyspark.sql
import pyspark.sql.functions
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType

In [52]:
# Load (or reload) IPython's autoreload extension and select mode 2 so that
# edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [53]:
# Store the path to the notebook's working directory.
# os.getcwd() replaces the IPython-only `!pwd` shell capture, so the cell no
# longer depends on a `pwd` executable and also runs as plain Python.
PWD = os.getcwd()

In [54]:
# Start (or reuse) the Spark session for this notebook and keep a handle on
# its SparkContext for building RDDs.
app_name = "categorical_notebook"
master = "local[*]"
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

In [55]:
# Ten example rows, one tuple per row.
smallRDD2 = sc.parallelize([
    (1, 1, 'a', 3, 121),
    (2, 2, 'b', 3, 230),
    (2, 3, 'c', 1, 1056),
    (3, 4, 'a', 2, 4587),
    (3, 5, 'b', 2, 9546),
    (3, 6, 'c', 4, 1218),
    (4, 7, 'a', 4, 463),
    (4, 8, 'a', 4, 571),
    (4, 9, 'c', 4, 32),
    (4, 10, 'c', 4, 799),
])
# NOTE(review): every tuple above has 5 elements and numColumns is not read by
# any cell visible here -- confirm that 4 is the intended value.
numColumns = 4

In [56]:
# One StructField(name, type, nullable) per tuple position.
# field_11 is declared as a string column although the example tuples hold
# integers in that position.
structFieldList = [
    StructField('field_0', LongType(), True),
    StructField('field_1', LongType(), True),
    StructField('field_2', StringType(), True),
    StructField('field_11', StringType(), True),
    StructField('field_12', LongType(), True),
]
schema = StructType(structFieldList)
testDF = spark.createDataFrame(smallRDD2, schema)

## View the fields of interest

In [57]:
# Print the example rows so the columns used below (field_2, field_11,
# field_12 and the label field_0) can be inspected.
testDF.show()

+-------+-------+-------+--------+--------+
|field_0|field_1|field_2|field_11|field_12|
+-------+-------+-------+--------+--------+
|      1|      1|      a|       3|     121|
|      2|      2|      b|       3|     230|
|      2|      3|      c|       1|    1056|
|      3|      4|      a|       2|    4587|
|      3|      5|      b|       2|    9546|
|      3|      6|      c|       4|    1218|
|      4|      7|      a|       4|     463|
|      4|      8|      a|       4|     571|
|      4|      9|      c|       4|      32|
|      4|     10|      c|       4|     799|
+-------+-------+-------+--------+--------+



## Define functions

In [58]:
def convertRowToArray(row):
    """Encode one row as a ``(features, label)`` pair.

    Fields are visited in the order given by ``row.asDict()``. Each field is
    handled by the first rule that matches, using the module-level
    configuration dictionaries:

    * field in ``valueDict``: one-hot vector of length
      ``cardinalityDict[field]``. A value that is not a key of
      ``valueDict[field]`` (rare/unknown) becomes an all-zero vector.
    * field in ``featureHashingDict``: vector of length ``cardinality - 1``
      with a 1 at position ``value % cardinality``. The last bucket
      (``cardinality - 1``) and a missing (``None``) value are encoded as all
      zeros.
    * ``'field_0'``: kept as the label ``Y`` and not added to the features.
    * any other field: its value is appended unchanged.

    Args:
        row: object exposing ``asDict()`` (for example a ``pyspark.sql.Row``).

    Returns:
        tuple: ``(X, Y)`` where ``X`` is a 1-D NumPy array holding the encoded
        features and ``Y`` is the value of ``'field_0'``, or ``None`` when no
        label was found (previously this case raised ``UnboundLocalError``).
    """
    rowDict = row.asDict()

    # Collect the encoded pieces and concatenate once at the end instead of
    # re-copying the whole vector with np.append for every field. The leading
    # empty float array keeps the result a float vector and makes an empty
    # row valid input.
    pieces = [np.array([])]

    # Default label for rows without a 'field_0' entry.
    Y = None

    # Iterate over fields in the row
    for field, value in rowDict.items():

        # If the field is categorical and to be OHE
        if field in valueDict:
            ohe = np.zeros(cardinalityDict[field])
            categories = valueDict[field]

            # Known values get a 1 at their index; rare/unknown values keep
            # the all-zero encoding.
            if value in categories:
                ohe[categories[value]] = 1
            pieces.append(ohe)

        # If the field is categorical and to be feature hashed
        elif field in featureHashingDict:

            # To encode N buckets use N-1 positions, e.g. 3 -> (00, 10, 01)
            cardinality = featureHashingDict[field]
            hashEnc = np.zeros(cardinality - 1)

            # A missing value cannot be reduced modulo cardinality; leave it
            # as all zeros, like an unknown one-hot value.
            if value is not None:
                index = value % cardinality
                if index < cardinality - 1:
                    hashEnc[index] = 1
            pieces.append(hashEnc)

        # Set the actual value (Y) if the field is field_0
        elif field == 'field_0':
            Y = value

        # If the field is not categorical, then just use the existing value
        else:
            # np.ravel turns a scalar into a 1-element array, as np.append did
            pieces.append(np.ravel(value))

    return (np.concatenate(pieces), Y)

In [59]:
def ohe_fit(field, topN, df):
    """Learn the one-hot index of the ``topN`` most frequent values of ``field``.

    Counts how often each value of ``field`` occurs in ``df``, takes the
    ``topN`` values with the highest counts and numbers them 0, 1, 2, ... in
    the order they are returned. The resulting ``{value: index}`` mapping is
    stored in the module-level ``valueDict`` under ``field``; nothing is
    returned.

    Args:
        field: name of the column to fit.
        topN: maximum number of distinct values to keep.
        df: DataFrame whose ``rdd`` is scanned.
    """
    # (value, count) pairs for the column
    countsRDD = (
        df.rdd
        .map(lambda record: (record[field], 1))
        .reduceByKey(lambda left, right: left + right)
    )

    # Ordering by the negated count puts the most frequent values first
    mostFrequent = countsRDD.takeOrdered(topN, key=lambda pair: -pair[1])

    # Position in the frequency ranking becomes the one-hot index
    valueDict[field] = {
        pair[0]: position for position, pair in enumerate(mostFrequent)
    }

In [60]:
def ohe_transform(df):
    """Encode every row of ``df`` with ``convertRowToArray``.

    Args:
        df: DataFrame whose ``rdd`` is mapped row by row.

    Returns:
        The RDD obtained by mapping ``convertRowToArray`` over ``df.rdd``;
        each element is the tuple returned for one row.
    """
    def encodeRow(row):
        # Looked up at call time, so a redefined convertRowToArray is honoured
        return convertRowToArray(row)

    return df.rdd.map(encodeRow)

## Example

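Create the lookup dictionaries used by the encoders: `valueDict` (for each one-hot-encoded field, a mapping from category value to its index, filled in by `ohe_fit`), `cardinalityDict` (the length of each field's one-hot vector) and `featureHashingDict` (the cardinality used for each feature-hashed field).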

In [61]:
# Encoder configuration read by convertRowToArray
valueDict = dict()                      # field -> {category value: one-hot index}; filled by ohe_fit
cardinalityDict = dict()                # field -> length of that field's one-hot vector
featureHashingDict = dict(field_12=10)  # field -> cardinality (modulus) used when hashing it

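List the fields to one-hot encode together with the number of most frequent values (top N) to keep for each, then fit the encoder for each field on the DataFrame.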

In [62]:
# (field name, size) pairs; the size is used both as the one-hot vector length
# and as the number of most frequent values kept for the field
oheFieldsAndSizes = [
    ('field_2', 10),
    ('field_11', 3),
]

In [63]:
# Record each field's one-hot vector length, then learn its value -> index map
for oheField, oheSize in oheFieldsAndSizes:
    cardinalityDict[oheField] = oheSize
    ohe_fit(oheField, oheSize, testDF)

In [64]:
# Encode every row of the example DataFrame; each element of the resulting RDD
# is the (X, Y) tuple produced by convertRowToArray.
testRDD = ohe_transform(testDF)

In [65]:
# Materialize the encoded rows as a Python list and display them
# (fine for this 10-row example).
testRDD.collect()

[(array([1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0.,
         0., 0., 0., 0., 0., 0.]), 1),
 (array([2., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
         0., 0., 0., 0., 0., 0.]), 2),
 (array([3., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 1., 0., 0.]), 2),
 (array([4., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 1., 0.]), 3),
 (array([5., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 1., 0., 0.]), 3),
 (array([6., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1.]), 3),
 (array([7., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0.]), 4),
 (array([8., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 0., 0.]), 4),
 (array([9., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,