In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import pandas_udf

# Create (or reuse) the SparkSession that every later cell refers to as `spark`.
spark = (
    SparkSession.builder
    .appName('pandas_UDF_as_decorator')
    .getOrCreate()
)

/usr/local/lib/python3.7/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/17 23:24:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
@pandas_udf("col1 string, col2 long")
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
    """Extend a struct column with a derived ``col2`` field.

    Args:
        s1: batch of the long column.
        s2: batch of the string column.
        s3: batch of the struct column, received as a pandas.DataFrame.

    Returns:
        A new DataFrame with the columns of ``s3`` plus ``col2``, where
        ``col2 = s1 + len(s2)`` element-wise, matching the declared
        ``"col1 string, col2 long"`` return schema.
    """
    # assign() builds a new frame instead of writing into the batch that was
    # passed in, so the input argument is left untouched.
    return s3.assign(col2=s1 + s2.str.len())

# Sample data: a long column, a string column and a struct column with a
# single string field (col1).
df = spark.createDataFrame(
    data=[
        [1, "a string", ("a nested string",)],
        [3, "a single word", ("a simple sentence",)],
    ],
    schema="long_col long, string_col string, struct_col struct<col1:string>",
)

df.printSchema()

root
 |-- long_col: long (nullable = true)
 |-- string_col: string (nullable = true)
 |-- struct_col: struct (nullable = true)
 |    |-- col1: string (nullable = true)



In [3]:
# Display the sample rows; the struct column is rendered as {col1}.
df.show()

                                                                                

+--------+-------------+-------------------+
|long_col|   string_col|         struct_col|
+--------+-------------+-------------------+
|       1|     a string|  {a nested string}|
|       3|a single word|{a simple sentence}|
+--------+-------------+-------------------+



In [4]:
# The UDF result is a single struct column with the fields declared in its
# return schema: col1 (string) and col2 (long).
df.select(
    func("long_col", "string_col", "struct_col")
).printSchema()

root
 |-- func(long_col, string_col, struct_col): struct (nullable = true)
 |    |-- col1: string (nullable = true)
 |    |-- col2: long (nullable = true)



In [5]:
# Give the struct result a short alias and print it without truncating values.
(
    df.select(func("long_col", "string_col", "struct_col").alias("new_colum"))
    .show(truncate=False)
)

                                                                                

+-----------------------+
|new_colum              |
+-----------------------+
|{a nested string, 9}   |
|{a simple sentence, 16}|
+-----------------------+



### Series to Series
The function takes one or more pandas.Series and outputs one pandas.Series. The output of the function should always be of the same length as the input.

In [6]:
@pandas_udf("string")
def to_upper(s: pd.Series) -> pd.Series:
    """Upper-case every string of the batch (Series in, Series of equal length out)."""
    upper_cased = s.str.upper()
    return upper_cased

df = spark.createDataFrame(
    data=[("John Doe",), ("Juan Garcia",)],
    schema=("name",),
)
# Apply the UDF to the `name` column and show the full upper-cased values.
df.select(to_upper("name").alias('upper')).show(truncate=False)

+-----------+
|upper      |
+-----------+
|JOHN DOE   |
|JUAN GARCIA|
+-----------+



In [7]:
@pandas_udf("first string, last string")
def split_expand(s: pd.Series) -> pd.DataFrame:
    """Split each name on whitespace into the two-field struct ``(first, last)``.

    ``str.split(expand=True)`` yields as many columns as the longest name has
    tokens (three for "Juan Garcia Smith"), which does not match the two-field
    return schema. The result is therefore normalised to exactly two columns:
    tokens beyond the second are dropped and a missing second token is null.

    Args:
        s: batch of name strings.

    Returns:
        DataFrame with the columns ``first`` and ``last``, one row per input.
    """
    tokens = s.str.split(expand=True)
    # Keep positional columns 0 and 1 only. reindex pads a missing column with
    # NaN; cast back to object so both columns keep the dtype str.split produces.
    first_two = tokens.reindex(columns=[0, 1]).astype(object)
    # Name the columns after the fields of the declared return schema.
    first_two.columns = ["first", "last"]
    return first_two

df = spark.createDataFrame(
    data=[("John Doe",), ("Juan Garcia Smith",)],
    schema=("name",),
)
# The second name has three tokens; only the first two fit the struct schema.
df.select(split_expand("name").alias('splitted')).show()

+--------------+
|      splitted|
+--------------+
|   {John, Doe}|
|{Juan, Garcia}|
+--------------+



### Iterator of Series to Iterator of Series
The function takes an iterator of pandas.Series and outputs an iterator of pandas.Series. In this case, the created pandas UDF requires exactly one input column when it is called as a PySpark column. The total length of the output must equal the total length of the input; as long as that holds, the function may prefetch data from the input iterator.

It is also useful when the UDF needs to initialize some state before processing the data, although internally it works identically to the Series to Series case.

In [8]:
from typing import Iterator
@pandas_udf("long")
def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Yield each incoming batch with one added to every element."""
    yield from (batch + 1 for batch in iterator)

df = spark.createDataFrame(pd.DataFrame({"v": [1, 2, 3]}))
# Run the iterator UDF over column `v`.
df.select(plus_one(df["v"]).alias('plus_one')).show()

+--------+
|plus_one|
+--------+
|       2|
|       3|
|       4|
+--------+



### Iterator of Multiple Series to Iterator of Series
The function takes an iterator of tuples of multiple pandas.Series and outputs an iterator of pandas.Series. In this case, the created pandas UDF requires as many input columns as there are series in the tuple when it is called as a PySpark column. Otherwise, it has the same characteristics and restrictions as the Iterator of Series to Iterator of Series case.

In [9]:
from typing import Iterator, Tuple
from pyspark.sql.functions import struct, col
@pandas_udf("long")
def multiply(iterator: Iterator[Tuple[pd.Series, pd.DataFrame]]) -> Iterator[pd.Series]:
    """For every (Series, DataFrame) batch pair, yield the Series times the frame's ``v`` column."""
    for values, struct_frame in iterator:
        yield values * struct_frame["v"]

df = spark.createDataFrame(pd.DataFrame({"v": [1, 2, 3]}))
# Pass `v` twice: once as a plain column and once wrapped in a struct, to match
# the (pd.Series, pd.DataFrame) tuple declared in the UDF's type hint.
df.withColumn('output', multiply(col("v"), struct("v"))).show()

+---+------+
|  v|output|
+---+------+
|  1|     1|
|  2|     4|
|  3|     9|
+---+------+



### Series to Scalar
The function takes one or more pandas.Series and returns a scalar value. The returnType should be a primitive data type, and the returned scalar can be either a Python primitive type (e.g. int or float) or a NumPy data type (e.g. numpy.int64 or numpy.float64). In the type hint, the return annotation `Any` should ideally be replaced by the specific scalar type, e.g. `float` as in the example below.

In [10]:
@pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    """Reduce a Series of values to its arithmetic mean (Series -> scalar)."""
    series_mean = v.mean()
    return series_mean

df = spark.createDataFrame(
    data=[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
    schema=("id", "v"),
)
df.show()
# Aggregate per id with the Series-to-scalar UDF.
df.groupby("id").agg(mean_udf(df["v"])).show()

+---+----+
| id|   v|
+---+----+
|  1| 1.0|
|  1| 2.0|
|  2| 3.0|
|  2| 5.0|
|  2|10.0|
+---+----+

+---+-----------+
| id|mean_udf(v)|
+---+-----------+
|  1|        1.5|
|  2|        6.0|
+---+-----------+



This UDF can also be used as a window function, as shown below:

In [11]:
from pyspark.sql import Window
@pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    """Return the mean of the values in ``v``.

    Same definition as in the Series to Scalar example; repeated here so the
    window example can be read on its own.
    """
    window_mean = v.mean()
    return window_mean

df = spark.createDataFrame(
    data=[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
    schema=("id", "v"),
)

# Window: PARTITION BY id ORDER BY v ROWS BETWEEN 1 PRECEDING AND CURRENT ROW.
# rowsBetween(start, end) takes offsets relative to the current row, both
# inclusive: -1 is the previous row, Window.currentRow (0) is the row itself.
w = Window.partitionBy('id').orderBy('v').rowsBetween(-1, Window.currentRow)
df.withColumn('mean_v', mean_udf("v").over(w)).show()

+---+----+------+
| id|   v|mean_v|
+---+----+------+
|  1| 1.0|   1.0|
|  1| 2.0|   1.5|
|  2| 3.0|   3.0|
|  2| 5.0|   4.0|
|  2|10.0|   7.5|
+---+----+------+



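How to read the result: the rows are partitioned by `id` and ordered by `v`, and each row's `mean_v` is the mean of the previous row (if any) and the current row within its partition.

- id = 1, v = (1.0, 2.0): mean(1.0) = 1.0, mean(1.0, 2.0) = 1.5
- id = 2, v = (3.0, 5.0, 10.0): mean(3.0) = 3.0, mean(3.0, 5.0) = 4.0, mean(5.0, 10.0) = 7.5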

### Pandas UDF types 

In [13]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Same (id, v) sample data as in the aggregation examples above.
df = spark.createDataFrame(
    data=[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
    schema=("id", "v"),
)


# Wrapping a pandas UDF inside a regular function.
# `my_function` is a plain Python function (not a UDF): it takes the PySpark
# DataFrame plus the arguments used by the grouped pandas computation, and
# defines the pandas UDF inside its own scope so the UDF can see them.
def my_function(df, by="id", column="v", value=1.0):
    """Subtract ``group_key * value`` from ``column`` within each ``by`` group.

    Args:
        df: Spark DataFrame containing the ``by`` and ``column`` columns.
        by: name of the grouping column (declared as ``long`` in the schema).
        column: name of the column to adjust (declared as ``double``).
        value: factor the group key is multiplied by before subtracting.

    Returns:
        The Spark DataFrame produced by applying the grouped-map UDF to every
        group, with the schema ``"<by> long, <column> double"``.
    """
    # Output schema that the pandas.DataFrame returned by subtract_value is mapped onto.
    schema = "{} long, {} double".format(by, column)

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def subtract_value(pdf):
        # pdf is a pandas.DataFrame
        v = pdf[column]
        g = pdf[by]
        # Store the result under the name held in `column`; a literal `v=`
        # keyword would only be correct when column == "v".
        return pdf.assign(**{column: v - g * value})

    # subtract_value takes no extra arguments of its own; `by`, `column` and
    # `value` are read from the enclosing my_function scope.
    return df.groupby(by).apply(subtract_value)

# With value=2.0 every v becomes v - id * 2.0 (e.g. id=2, v=10.0 -> 6.0).
my_function(df, by="id", column="v", value=2.0).show()



+---+----+
| id|   v|
+---+----+
|  1|-1.0|
|  1| 0.0|
|  2|-1.0|
|  2| 1.0|
|  2| 6.0|
+---+----+



The example below defines a Pandas UDF that simply adds one to each value: the function pandas_plus_one is decorated with pandas_udf, and the Pandas UDF type is specified as PandasUDFType.SCALAR.

In [14]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf('double', PandasUDFType.SCALAR)
def pandas_plus_one(v):
    """Add one to every element; `v` is a pandas Series and so is the result."""
    incremented = v + 1
    return incremented

# ids 0..9 plus one; the UDF's declared return type is 'double'.
spark.range(10).select(pandas_plus_one("id")).show()



+-------------------+
|pandas_plus_one(id)|
+-------------------+
|                1.0|
|                2.0|
|                3.0|
|                4.0|
|                5.0|
|                6.0|
|                7.0|
|                8.0|
|                9.0|
|               10.0|
+-------------------+



In [15]:
@pandas_udf('long', PandasUDFType.SCALAR)
def pandas_plus_one_long(v):
    """Add one to every element; `v` is a pandas Series and so is the result."""
    incremented = v + 1
    return incremented

# Same computation as pandas_plus_one, but the declared return type is 'long'.
spark.range(10).select(pandas_plus_one_long("id")).show()

+------------------------+
|pandas_plus_one_long(id)|
+------------------------+
|                       1|
|                       2|
|                       3|
|                       4|
|                       5|
|                       6|
|                       7|
|                       8|
|                       9|
|                      10|
+------------------------+



### New Pandas APIs with Python Type Hints

In [16]:
# Pandas UDF
import pandas as pd
from pyspark.sql.functions import pandas_udf, log2, col

@pandas_udf('long')
def pandas_plus_one(s: pd.Series) -> pd.Series:
    """Series-to-Series UDF declared through type hints: add one to each element."""
    return s.add(1)

# Internally, pandas_plus_one("id") is treated exactly like a SQL expression,
# so it can be combined with other columns, functions and expressions.
# NOTE(review): the null in the first output row presumably comes from
# LOG2(0) for id = 0 -- confirm.
spark.range(10).select(
    pandas_plus_one(col("id") - 1) + log2("id") + 1).show()

+--------------------------------------------+
|((pandas_plus_one((id - 1)) + LOG2(id)) + 1)|
+--------------------------------------------+
|                                        null|
|                                         2.0|
|                                         4.0|
|                           5.584962500721156|
|                                         7.0|
|                           8.321928094887362|
|                           9.584962500721156|
|                          10.807354922057604|
|                                        12.0|
|                          13.169925001442312|
+--------------------------------------------+



In [17]:
# Pandas Function API
from typing import Iterator
import pandas as pd


def pandas_plus_one(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    """Lazily add one to every value of each DataFrame batch in ``iterator``."""
    return (frame + 1 for frame in iterator)


# Here pandas_plus_one is just a regular Python function. mapInPandas is
# logically treated as a separate SQL query plan rather than a SQL expression,
# so its result cannot be combined directly with other expressions.
spark.range(10).mapInPandas(pandas_plus_one, schema="id long").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
+---+

