# Custom Scaling Example

We need to do custom scaling due to the following limitations:

1. MinMaxScaler in MLlib does not work for us, because it operates on Vector columns rather than on plain numeric columns
2. MinMaxScaler in scikit-learn works on pandas DataFrames, but pandas DataFrames do not scale to large datasets
3. Using UDFs in Spark SQL is too cumbersome

## Read the data and convert to a dataframe

In [1]:
import numpy as np
from pyspark.sql.types import StructType, StructField, LongType, FloatType
from pyspark.sql import SparkSession
import pyspark.sql

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
# store path to notebook
PWD = !pwd
PWD = PWD[0]

In [4]:
# Create (or reuse) a Spark session running in local mode ("local[*]")
app_name = "scaling_notebook"
master = "local[*]"
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
sc = spark.sparkContext

In [17]:
#!head -n 10 data/train_100K.txt > data/train_10.txt
#!head -n 20 data/train_100K.txt > data/test_10.txt

In [18]:
#smallRDD = sc.textFile('data/train_10.txt')

In [5]:
def ConvertNumber(idx, num):
    """Parse a raw field into a float, mapping the empty string to None.

    Args:
        idx: Accepted for the caller's convenience; not used by this function.
        num: Raw field value. Anything other than '' is passed to float().

    Returns:
        float(num), or None when num is the empty string.
    """
    return None if num == '' else float(num)

In [6]:
# Ten hand-written three-value rows used as toy input for the example
smallRDD2 = sc.parallelize([
    (1.0, 1.0, 3.0),
    (2.0, 2.0, 3.0),
    (2.0, 3.0, 1.0),
    (3.0, 4.0, 2.0),
    (3.0, 5.0, 2.0),
    (3.0, 6.0, 4.0),
    (4.0, 7.0, 4.0),
    (4.0, 8.0, 4.0),
    (4.0, 9.0, 4.0),
    (4.0, 10.0, 4.0),
])

In [7]:
# Fetch a single row and count its elements to get the number of columns
smallRDDRow = smallRDD2.take(1)
numColumns = len(smallRDDRow[0])

In [8]:
# Every column is declared as a nullable float
structFieldList = [
    StructField(columnName, FloatType(), True)
    for columnName in ('field_1', 'field_2', 'field_11')
]
schema = StructType(structFieldList)
testDF = spark.createDataFrame(smallRDD2, schema)

In [10]:
testDF.show()

+-------+-------+--------+
|field_1|field_2|field_11|
+-------+-------+--------+
|    1.0|    1.0|     3.0|
|    2.0|    2.0|     3.0|
|    2.0|    3.0|     1.0|
|    3.0|    4.0|     2.0|
|    3.0|    5.0|     2.0|
|    3.0|    6.0|     4.0|
|    4.0|    7.0|     4.0|
|    4.0|    8.0|     4.0|
|    4.0|    9.0|     4.0|
|    4.0|   10.0|     4.0|
+-------+-------+--------+



## View the fields of interest

In [11]:
testDF.select(['field_1', 'field_2', 'field_11']).show()

+-------+-------+--------+
|field_1|field_2|field_11|
+-------+-------+--------+
|    1.0|    1.0|     3.0|
|    2.0|    2.0|     3.0|
|    2.0|    3.0|     1.0|
|    3.0|    4.0|     2.0|
|    3.0|    5.0|     2.0|
|    3.0|    6.0|     4.0|
|    4.0|    7.0|     4.0|
|    4.0|    8.0|     4.0|
|    4.0|    9.0|     4.0|
|    4.0|   10.0|     4.0|
+-------+-------+--------+



## Define scaling functions

In [28]:
def scaleRow(row):
    """Standardize the fitted fields of one Row using the global scaleDict.

    scaleDict maps a field name to a (mean, stddev) tuple, as stored by
    scaleDataFrame_fit. Each such field is replaced by
    (value - mean) / stddev; fields not present in scaleDict pass through
    unchanged.

    Args:
        row: A pyspark.sql.Row containing every field named in scaleDict.

    Returns:
        A new pyspark.sql.Row with the same fields and the scaled values.
    """
    rowDict = row.asDict()

    for field, (mean, stddev) in scaleDict.items():
        value = rowDict[field]
        if value is None:
            # Keep missing values missing instead of raising TypeError on
            # None - float.
            continue
        # A constant column has stddev 0; divide by 1 so the result is the
        # centred value instead of a ZeroDivisionError.
        divisor = stddev if stddev != 0 else 1.0
        rowDict[field] = float(value - mean) / divisor

    return pyspark.sql.Row(**rowDict)

In [13]:
def scaleDataFrame_fit(fields, df):
    """Compute mean and stddev per field and store them in the global scaleDict.

    Args:
        fields: List of column names in df to compute statistics for.
        df: Spark DataFrame containing those columns.

    Returns:
        None. For every field, scaleDict[field] is set to (mean, stddev).
    """
    # summary() yields one row per requested statistic, labelled in its
    # 'summary' column. Collecting those rows once avoids running a separate
    # Spark job per statistic and leaves no cached DataFrame behind.
    statRows = {
        row['summary']: row
        for row in df.select(fields).summary(['mean', 'stddev']).collect()
    }
    meanRow = statRows['mean']
    stddevRow = statRows['stddev']

    for field in fields:
        # The collected statistics are converted to float before storing
        scaleDict[field] = (float(meanRow[field]), float(stddevRow[field]))

In [14]:
def scaleDataFrame_transform(df):
    """Return a new DataFrame with scaleRow applied to every row of df.

    Args:
        df: Spark DataFrame to scale.

    Returns:
        The DataFrame produced by calling toDF() on the scaled rows.
    """
    # NOTE(review): the column order of the result may differ from df on some
    # Spark versions; callers can re-select the original order -- confirm.
    scaledRows = df.rdd.map(scaleRow)
    return scaledRows.toDF()

## Example

First define an empty scaling dictionary

In [15]:
# Global store shared by the scaling functions: scaleDataFrame_fit writes
# field -> (mean, stddev) entries and scaleRow reads them
scaleDict = {}

Fit the mean and standard deviation of the selected fields, then scale the DataFrame

In [31]:
originalColumnOrder = testDF.columns
originalColumnOrder

['field_1', 'field_2', 'field_11']

In [32]:
# Fit the statistics for the three fields, then scale every row of testDF
scaleDataFrame_fit(['field_1','field_2','field_11'], testDF)
testDF2 = scaleDataFrame_transform(testDF)

See the results

In [33]:
testDF2 = testDF2.select(originalColumnOrder)

In [34]:
testDF2.show()

+-------------------+--------------------+--------------------+
|            field_1|             field_2|            field_11|
+-------------------+--------------------+--------------------+
|-1.8973665961010275| -1.4863010829205867|-0.09086737992230748|
|-0.9486832980505138| -1.1560119533826787|-0.09086737992230748|
|-0.9486832980505138| -0.8257228238447705| -1.9082149783684554|
|                0.0|-0.49543369430686224| -0.9995411791453815|
|                0.0| -0.1651445647689541| -0.9995411791453815|
|                0.0|  0.1651445647689541|  0.8178064193007665|
| 0.9486832980505138| 0.49543369430686224|  0.8178064193007665|
| 0.9486832980505138|  0.8257228238447705|  0.8178064193007665|
| 0.9486832980505138|  1.1560119533826787|  0.8178064193007665|
| 0.9486832980505138|  1.4863010829205867|  0.8178064193007665|
+-------------------+--------------------+--------------------+

