<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import" data-toc-modified-id="Import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import</a></span></li><li><span><a href="#Example" data-toc-modified-id="Example-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Example</a></span></li><li><span><a href="#Example-1" data-toc-modified-id="Example-1-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Example 1</a></span><ul class="toc-item"><li><span><a href="#UDF" data-toc-modified-id="UDF-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>UDF</a></span><ul class="toc-item"><li><span><a href="#V1-with-Decorator" data-toc-modified-id="V1-with-Decorator-3.1.1"><span class="toc-item-num">3.1.1&nbsp;&nbsp;</span>V1 with Decorator</a></span></li><li><span><a href="#V2-Without-Decorator" data-toc-modified-id="V2-Without-Decorator-3.1.2"><span class="toc-item-num">3.1.2&nbsp;&nbsp;</span>V2 Without Decorator</a></span></li></ul></li><li><span><a href="#Pandas-UDF" data-toc-modified-id="Pandas-UDF-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Pandas UDF</a></span><ul class="toc-item"><li><span><a href="#V1-with-Decorator" data-toc-modified-id="V1-with-Decorator-3.2.1"><span class="toc-item-num">3.2.1&nbsp;&nbsp;</span>V1 with Decorator</a></span></li><li><span><a href="#V2-Without-Decorator" data-toc-modified-id="V2-Without-Decorator-3.2.2"><span class="toc-item-num">3.2.2&nbsp;&nbsp;</span>V2 Without Decorator</a></span></li></ul></li></ul></li></ul></div>

# UDF and Pandas_UDF

Reference (Spark 2.3.0 SQL programming guide): https://spark.apache.org/docs/2.3.0/sql-programming-guide.html#grouped-map

## Import 

In [4]:
import pandas as pd 
import numpy as np

from pyspark.sql.functions import pandas_udf,udf, col
from pyspark.sql.types import *

In [1]:
import findspark
# findspark.find() only returns the Spark home path and changes nothing;
# init() is the call that makes pyspark importable by updating sys.path.
findspark.init()

from pyspark import SparkContext
# getOrCreate() reuses an already-running context, so re-running this cell
# does not fail the way a second SparkContext() call does.
sc = SparkContext.getOrCreate()

from pyspark.sql import SQLContext
sqlcontext = SQLContext(sc)

print('Spark Version:',sc.version)

Spark Version: 2.3.0


In [2]:
# Smoke test: run a trivial job to confirm the SparkContext is usable (expected result: 10)
sc.parallelize(range(10)).count()

10

## Example 1 

Given a column containing the strings 'True' and 'False', convert each value to its integer form (1 or 0).

In [40]:
# Three-row pandas frame holding the boolean labels as plain strings
pdf = pd.DataFrame(['True', 'False', 'True'], columns=['value'])

# Same data as a Spark DataFrame, used by every example below
sdf = sqlcontext.createDataFrame(pdf)
sdf.show()

+-----+
|value|
+-----+
| True|
|False|
| True|
+-----+



### UDF 

#### V1 with Decorator

In [51]:
@udf('integer')
def convert_boolean_udf(i):
    '''
    Row-at-a-time UDF: each call receives a single value (a string here).

    Gives 1 for 'True', 0 for 'False', and None for any other input.
    '''
    # i arrives as a plain Python string
    if i == 'True':
        return 1
    if i == 'False':
        return 0
    return None
    
(
    sdf
    .withColumn('output', convert_boolean_udf(sdf['value']))
    .select('output')
    .show(4)
)

+------+
|output|
+------+
|     1|
|     0|
|     1|
+------+



#### V2 Without Decorator

In [54]:
def convert_boolean(i):
    '''
    Convert one value per call: 'True' becomes 1 and 'False' becomes 0.

    Any other input yields None.
    '''
    if i == 'True':
        return 1
    return 0 if i == 'False' else None

# Build the UDF by wrapping the plain function explicitly instead of using the decorator.
# NOTE: this rebinds convert_boolean_udf, replacing the decorated version defined earlier.
convert_boolean_udf = udf(convert_boolean, returnType=IntegerType())

In [55]:
(
    sdf
    .select(convert_boolean_udf(col("value")).alias('value'))
    .show()
)

+-----+
|value|
+-----+
|    1|
|    0|
|    1|
+-----+



### Pandas UDF

#### V1 with Decorator

In [60]:
@pandas_udf('integer')
def convert_boolean_pudf(i):
    '''
    Pandas UDF: the argument is a pandas Series rather than a single value.

    Each element is converted with convert_boolean.
    '''
    # type(i) is pandas.core.series.Series
    converted = i.apply(convert_boolean)
    return converted

(
    sdf
    .withColumn('output', convert_boolean_pudf(sdf['value']))
    .select('output')
    .show(3)
)

+------+
|output|
+------+
|     1|
|     0|
|     1|
+------+



#### V2 Without Decorator

In [61]:
def convert_boolean_series(i):
    '''
    Apply convert_boolean to every element of a pandas Series.

    Returns the Series produced by i.apply.
    '''
    converted = i.apply(convert_boolean)
    return converted

# Wrap the plain Series function explicitly instead of decorating it
convert_boolean_pudf = pandas_udf(
    convert_boolean_series,
    returnType=IntegerType(),
)

(
    sdf
    .withColumn('output', convert_boolean_pudf(sdf['value']))
    .select('output')
    .show(3)
)

+------+
|output|
+------+
|     1|
|     0|
|     1|
+------+



In [64]:
# Sanity check outside Spark: the function wrapped by pandas_udf should also work on plain pandas.
# DataFrame.apply passes each column to the function as a Series.
pdf.apply(convert_boolean_series)

Unnamed: 0,value
0,1
1,0
2,1
