# Custom Scaling Example

We need to do custom scaling due to the following limitations:

1. MinMaxScaler in MLlib does not work for us, because it operates on Vector columns rather than on individual numeric columns
2. MinMaxScaler in Scikit Learn works on Pandas DataFrames, but Pandas DataFrames are held in memory on a single machine and do not scale to large datasets
3. Using UDFs in Spark SQL is too cumbersome

## Read the data and convert to a dataframe

In [1]:
import numpy as np
from pyspark.sql.types import StructType, StructField, LongType
from pyspark.sql import SparkSession
import pyspark.sql

In [2]:
# Load (or reload) IPython's autoreload extension and select mode 2, which
# re-imports modules before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [3]:
# Store the current working directory (presumably the notebook's folder).
# `!pwd` runs the shell command via IPython and yields its output lines,
# so the first element is the directory path itself.
PWD = !pwd
PWD = PWD[0]

In [4]:
# Start a Spark session on the local master "local[*]", or reuse an
# existing one if it is already running.
app_name = "scaling_notebook"
master = "local[*]"
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

In [5]:
# Uncomment to regenerate the small sample files from data/train_100K.txt.
# Note: the second command writes the first 20 lines into test_10.txt.
#!head -n 10 data/train_100K.txt > data/train_10.txt
#!head -n 20 data/train_100K.txt > data/test_10.txt

In [6]:
# Load the sample file as an RDD with one element per line of text.
smallRDD = sc.textFile('data/train_10.txt')

In [7]:
def ConvertNumber(idx, num):
    """Parse one text token of a data row into an integer.

    Args:
        idx: Zero-based position of the token within its row.
        num: The token text; an empty string marks a missing value.

    Returns:
        None for an empty token. Otherwise the token parsed as hexadecimal
        when ``idx`` is greater than 13, and as decimal for all other
        positions.
    """
    # Guard clause: missing values are represented as None.
    if num == '':
        return None

    # Positions above 13 are parsed as base-16 text, the rest as base-10.
    return int(num, 16) if idx > 13 else int(num)

In [8]:
# Split every line on tabs and convert each token with ConvertNumber,
# passing the token's position so it can choose the number base.
# The parsed RDD is cached because later cells read it more than once.
smallRDD2 = smallRDD.map(
    lambda line: [
        ConvertNumber(position, token)
        for position, token in enumerate(line.split('\t'))
    ]
).cache()

In [9]:
# Fetch the first parsed row and use its length as the column count.
smallRDDRow = smallRDD2.take(1)
numColumns = len(smallRDDRow[0])

In [10]:
# Build a schema of nullable 64-bit integer columns named field_0,
# field_1, ... (one per parsed column), then wrap the RDD in a DataFrame.
structFieldList = [
    StructField('field_{}'.format(column_index), LongType(), True)
    for column_index in range(numColumns)
]
schema = StructType(structFieldList)
testDF = spark.createDataFrame(smallRDD2, schema)

## View the fields of interest

In [11]:
# Show three columns before scaling; field_2 and field_5 are the ones
# scaled later in the notebook.
testDF.select(['field_0', 'field_2', 'field_5']).show()

+-------+-------+-------+
|field_0|field_2|field_5|
+-------+-------+-------+
|      0|      1|   1382|
|      0|      0|    102|
|      0|      0|    767|
|      0|    893|   4392|
|      0|     -1|      2|
|      0|     -1|  12824|
|      0|      1|   3168|
|      1|      4|      0|
|      0|     44|  19010|
|      0|     35|  33737|
+-------+-------+-------+



## Define scaling functions

In [12]:
def scaleRow(row):
    """Min-max scale the fitted fields of a single Spark Row.

    Reads the module-level ``scaleDict``, which maps a field name to a
    ``(minimum, delta)`` pair, and replaces each such field of ``row`` with
    ``(value - minimum) / delta`` clipped to the range 0-1.

    Args:
        row: A ``pyspark.sql.Row`` that contains every field named in
            ``scaleDict``.

    Returns:
        A new ``pyspark.sql.Row`` with the same fields. Fields that are not
        in ``scaleDict`` are copied unchanged, and missing (``None``)
        values stay ``None``.
    """
    rowDict = row.asDict()

    # Scale by subtracting the min, and dividing by the delta
    for field, (lowest, delta) in scaleDict.items():
        value = rowDict[field]

        # Nullable columns can hold None; leave missing values untouched
        # instead of failing on ``None - int``.
        if value is None:
            continue

        # A column that was constant when fitted has a delta of zero.
        # Divide by 1 in that case to avoid ZeroDivisionError; the clipping
        # below still keeps the result inside 0-1.
        scaled = float(value - lowest) / (delta or 1)

        # The test set may have data that is outside the range in the
        # training set. Limit the range to 0-1.
        rowDict[field] = min(max(scaled, 0.0), 1.0)

    return pyspark.sql.Row(**rowDict)

In [13]:
def scaleDataFrame_fit(fields, df):
    """Record the minimum and range of each field for min-max scaling.

    Computes the minimum and maximum of every column named in ``fields``
    and stores ``(minimum, maximum - minimum)`` under that field name in
    the module-level ``scaleDict``, replacing any earlier entry.

    Args:
        fields: List of column names to fit.
        df: Spark DataFrame that contains those columns.

    Returns:
        None. The results are written to ``scaleDict``.

    Raises:
        ValueError: If the summary reports no minimum or maximum for a
            field (presumably because the column holds only nulls).
    """
    # summary() produces one row per requested statistic, labelled in its
    # 'summary' column. Collecting the two rows once and indexing them by
    # that label avoids caching a DataFrame that is never unpersisted,
    # avoids one filter job per statistic, and side-steps the name clash
    # between the 'summary' column and the DataFrame.summary method.
    summaryRows = df.select(fields).summary(['min', 'max']).collect()
    rowsByStat = {summaryRow['summary']: summaryRow for summaryRow in summaryRows}
    minRow = rowsByStat['min']
    maxRow = rowsByStat['max']

    for field in fields:
        if minRow[field] is None or maxRow[field] is None:
            raise ValueError(
                "Cannot fit scaling for field '{}': no min/max available".format(field)
            )

        # The statistics are converted with int(), as the fields are
        # integer columns. Local names avoid shadowing min() and max().
        lowest = int(minRow[field])
        highest = int(maxRow[field])
        scaleDict[field] = (lowest, highest - lowest)

In [14]:
def scaleDataFrame_transform(df):
    """Return a copy of ``df`` with the fitted fields min-max scaled.

    Every row is passed through ``scaleRow``, which scales the fields
    currently present in the module-level ``scaleDict``.

    Args:
        df: Spark DataFrame that contains every field in ``scaleDict``.

    Returns:
        A new DataFrame with the same column names, order and nullability
        as ``df``. Scaled fields become double columns; all other columns
        keep their original type.
    """
    # Imported locally so this function does not depend on the notebook's
    # import cell, which does not bring in DoubleType.
    from pyspark.sql.types import DoubleType, StructField, StructType

    # Pass an explicit schema rather than letting toDF() re-infer one from
    # sampled rows: inference can fail when a nullable column is None in
    # every sampled row, and it takes the column order from the Row
    # objects instead of from the input DataFrame.
    scaledSchema = StructType([
        StructField(column.name, DoubleType(), column.nullable)
        if column.name in scaleDict else column
        for column in df.schema.fields
    ])
    return df.rdd.map(scaleRow).toDF(scaledSchema)

## Example

First define an empty scaling dictionary

In [15]:
# Maps field name -> (min, max - min). Filled in by scaleDataFrame_fit and
# read by scaleRow when rows are scaled.
scaleDict = {}

Scale the selected fields

In [18]:
# Fit the min and range of the two fields on testDF, then apply the
# scaling to the same DataFrame.
scaleDataFrame_fit(['field_2', 'field_5'], testDF)
testDF2 = scaleDataFrame_transform(testDF)

See the results

In [19]:
# Show the same three columns after scaling; field_0 was not fitted, so it
# is left unscaled.
testDF2.select(['field_0', 'field_2', 'field_5']).show()

+-------+--------------------+--------------------+
|field_0|             field_2|             field_5|
+-------+--------------------+--------------------+
|      0|0.002237136465324...|  0.0409639268458962|
|      0|0.001118568232662...|0.003023386786021...|
|      0|0.001118568232662...| 0.02273468298900317|
|      0|                 1.0|  0.1301834780804458|
|      0|                 0.0|5.928209384355455...|
|      0|                 0.0|  0.3801167857248718|
|      0|0.002237136465324...| 0.09390283664819041|
|      1|0.005592841163310962|                 0.0|
|      0|0.050335570469798654|   0.563476301982986|
|      0|0.040268456375838924|                 1.0|
+-------+--------------------+--------------------+

