# DL API Pydoc

In [1]:
# Display the active SparkSession. `spark` is not created in any visible cell;
# presumably it is injected by the PySpark kernel/shell — TODO confirm.
spark

### Example (tensor column)

In [2]:
import numpy as np

from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.types import ArrayType, FloatType

In [3]:
def make_mnist_fn():
    """Build the MNIST predict function handed to ``predict_batch_udf``.

    The Keras model is loaded here, once per python worker, and captured by
    the returned closure so that every batch reuses the cached model.
    """
    import tensorflow as tf

    mnist_model = tf.keras.models.load_model(
        '/home/leey/devpub/leewyang/sparkext/examples/tensorflow/mnist_model'
    )

    def predict(inputs: np.ndarray) -> np.ndarray:
        """Run the cached model on one batch of tasks/partitions.

        inputs.shape  = [batch_size, 784]
        outputs.shape = [batch_size, 10], return_type = ArrayType(FloatType())
        """
        predictions = mnist_model.predict(inputs)
        return predictions

    return predict

# Wrap the predict-function factory as a batch-inference UDF; each input row
# is declared as a flat tensor of 784 values.
mnist_udf = predict_batch_udf(
    make_mnist_fn,
    return_type=ArrayType(FloatType()),
    batch_size=100,
    input_tensor_shapes=[[784]],
)

# Read the mnist_test parquet dataset and preview the first five rows.
# NOTE(review): this is an absolute, machine-specific path — adjust it for
# your environment before running.
df = spark.read.parquet(
    "/home/leey/devpub/leewyang/sparkext/examples/tensorflow/mnist_test"
)
df.show(5)

                                                                                

+--------------------+
|                data|
+--------------------+
|[0.0, 0.0, 0.0, 0...|
|[0.0, 0.0, 0.0, 0...|
|[0.0, 0.0, 0.0, 0...|
|[0.0, 0.0, 0.0, 0...|
|[0.0, 0.0, 0.0, 0...|
+--------------------+
only showing top 5 rows



In [4]:
# Apply the MNIST UDF to the "data" column and preview the predictions.
(
    df
    .withColumn("preds", mnist_udf("data"))
    .show(5)
)

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|                data|               preds|
+--------------------+--------------------+
|[0.0, 0.0, 0.0, 0...|[1.4888417, -9.10...|
|[0.0, 0.0, 0.0, 0...|[-7.9047585, -3.5...|
|[0.0, 0.0, 0.0, 0...|[13.611437, -14.1...|
|[0.0, 0.0, 0.0, 0...|[17.657278, -18.9...|
|[0.0, 0.0, 0.0, 0...|[-9.834987, -10.8...|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

### Example (scalar column)

In [5]:
import numpy as np
import pandas as pd
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.types import FloatType

# Single-column DataFrame built from np.arange(100); the unnamed pandas
# column shows up in Spark as column "0".
df = spark.createDataFrame(
    pd.DataFrame(np.arange(100))
)
df.show(5)

[Stage 3:>                                                          (0 + 1) / 1]

+---+
|  0|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows



                                                                                

In [6]:
def make_times_two_fn():
    """Return a predict function that doubles every value of a scalar column."""

    def predict(inputs: np.ndarray) -> np.ndarray:
        """Multiply one batch by two.

        inputs.shape  = [batch_size]
        outputs.shape = [batch_size], return_type = FloatType()
        """
        doubled = inputs * 2
        return doubled

    return predict

# No input_tensor_shapes here: the UDF is applied to a single scalar column.
times_two_udf = predict_batch_udf(
    make_times_two_fn,
    return_type=FloatType(),
    batch_size=10,
)

# Rebuild the 0..99 single-column DataFrame and append the doubled values.
df = spark.createDataFrame(
    pd.DataFrame(np.arange(100))
)
(
    df
    .withColumn("x2", times_two_udf("0"))
    .show(5)
)

+---+---+
|  0| x2|
+---+---+
|  0|0.0|
|  1|2.0|
|  2|4.0|
|  3|6.0|
|  4|8.0|
+---+---+
only showing top 5 rows



                                                                                

### Example (multiple scalar columns)

In [7]:
import numpy as np
import pandas as pd
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import array

# 1000 consecutive float64 values laid out as rows of four.
data = (
    np.arange(0, 1000, dtype=np.float64)
    .reshape(-1, 4)
)

# Name the four columns and hand the pandas frame over to Spark.
pdf = pd.DataFrame(
    data,
    columns=['a','b','c','d'],
)
df = spark.createDataFrame(pdf)
df.show(5)

+----+----+----+----+
|   a|   b|   c|   d|
+----+----+----+----+
| 0.0| 1.0| 2.0| 3.0|
| 4.0| 5.0| 6.0| 7.0|
| 8.0| 9.0|10.0|11.0|
|12.0|13.0|14.0|15.0|
|16.0|17.0|18.0|19.0|
+----+----+----+----+
only showing top 5 rows



In [8]:
def make_sum_fn():
    """Return a predict function that sums each row of a 2-D batch."""

    def predict(inputs: np.ndarray) -> np.ndarray:
        """Sum along axis 1.

        inputs.shape  = [batch_size, 4]
        outputs.shape = [batch_size], return_type = FloatType()
        """
        row_totals = np.sum(inputs, axis=1)
        return row_totals

    return predict

# The UDF consumes one array column whose rows are declared as 4-element
# tensors.
sum_udf = predict_batch_udf(
    make_sum_fn,
    return_type=FloatType(),
    batch_size=10,
    input_tensor_shapes=[[4]],
)

# Pack the four scalar columns into a single array column before calling
# the UDF.
(
    df
    .withColumn("sum", sum_udf(array("a", "b", "c", "d")))
    .show(5)
)

[Stage 6:>                                                          (0 + 1) / 1]

+----+----+----+----+----+
|   a|   b|   c|   d| sum|
+----+----+----+----+----+
| 0.0| 1.0| 2.0| 3.0| 6.0|
| 4.0| 5.0| 6.0| 7.0|22.0|
| 8.0| 9.0|10.0|11.0|38.0|
|12.0|13.0|14.0|15.0|54.0|
|16.0|17.0|18.0|19.0|70.0|
+----+----+----+----+----+
only showing top 5 rows



                                                                                

In [9]:
def make_sum_fn():
    """Return a predict function that adds four scalar columns element-wise."""

    def predict(x1: np.ndarray, x2: np.ndarray, x3: np.ndarray, x4: np.ndarray) -> np.ndarray:
        """Add the four batches together.

        xN.shape      = [batch_size]
        outputs.shape = [batch_size], return_type = FloatType()
        """
        # Same left-to-right accumulation as x1 + x2 + x3 + x4.
        total = x1 + x2
        total = total + x3
        total = total + x4
        return total

    return predict

# No input_tensor_shapes: each of the four scalar columns is passed to
# predict as its own argument.
sum_udf = predict_batch_udf(
    make_sum_fn,
    return_type=FloatType(),
    batch_size=10,
)

(
    df
    .withColumn("sum", sum_udf("a", "b", "c", "d"))
    .show(5)
)

+----+----+----+----+----+
|   a|   b|   c|   d| sum|
+----+----+----+----+----+
| 0.0| 1.0| 2.0| 3.0| 6.0|
| 4.0| 5.0| 6.0| 7.0|22.0|
| 8.0| 9.0|10.0|11.0|38.0|
|12.0|13.0|14.0|15.0|54.0|
|16.0|17.0|18.0|19.0|70.0|
+----+----+----+----+----+
only showing top 5 rows



                                                                                

### Example (multiple tensor columns)

In [10]:
import numpy as np
import pandas as pd
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField
from typing import Mapping

# 1000 consecutive float64 values laid out as rows of four, with named columns.
data = (
    np.arange(0, 1000, dtype=np.float64)
    .reshape(-1, 4)
)
pdf = pd.DataFrame(
    data,
    columns=['a','b','c','d'],
)

# Two list-valued columns: t1 keeps all four values of each row, t2 drops
# column 'd' and so keeps three.
pdf_tensor = pd.DataFrame({
    't1': pdf.values.tolist(),
    't2': pdf.drop(columns='d').values.tolist(),
})
df = spark.createDataFrame(pdf_tensor)
df.show(5)

+--------------------+------------------+
|                  t1|                t2|
+--------------------+------------------+
|[0.0, 1.0, 2.0, 3.0]|   [0.0, 1.0, 2.0]|
|[4.0, 5.0, 6.0, 7.0]|   [4.0, 5.0, 6.0]|
|[8.0, 9.0, 10.0, ...|  [8.0, 9.0, 10.0]|
|[12.0, 13.0, 14.0...|[12.0, 13.0, 14.0]|
|[16.0, 17.0, 18.0...|[16.0, 17.0, 18.0]|
+--------------------+------------------+
only showing top 5 rows



In [11]:
def make_multi_sum_fn():
    """Return a predict function that sums two tensor columns row by row.

    Meant to be passed as the first argument of ``predict_batch_udf``.
    """

    def predict(x1: np.ndarray, x2: np.ndarray) -> np.ndarray:
        """Add the per-row sum of ``x1`` to the per-row sum of ``x2``.

        x1.shape      = [batch_size, 4]
        x2.shape      = [batch_size, 3]
        outputs.shape = [batch_size], return_type = FloatType()
        """
        # The result is a single 1-D array (one float per row), not a mapping,
        # so the return annotation is np.ndarray.
        return np.sum(x1, axis=1) + np.sum(x2, axis=1)

    return predict

# Multiple tensor columns with input_tensor_shapes: predict receives one
# numpy array per input column (x1 for "t1", x2 for "t2").
multi_sum_udf = predict_batch_udf(make_multi_sum_fn,
                                  return_type=FloatType(),
                                  batch_size=5,
                                  input_tensor_shapes=[[4], [3]])

(
    df
    .withColumn("sum", multi_sum_udf("t1", "t2"))
    .show(5)
)

+--------------------+------------------+-----+
|                  t1|                t2|  sum|
+--------------------+------------------+-----+
|[0.0, 1.0, 2.0, 3.0]|   [0.0, 1.0, 2.0]|  9.0|
|[4.0, 5.0, 6.0, 7.0]|   [4.0, 5.0, 6.0]| 37.0|
|[8.0, 9.0, 10.0, ...|  [8.0, 9.0, 10.0]| 65.0|
|[12.0, 13.0, 14.0...|[12.0, 13.0, 14.0]| 93.0|
|[16.0, 17.0, 18.0...|[16.0, 17.0, 18.0]|121.0|
+--------------------+------------------+-----+
only showing top 5 rows



                                                                                

### Example (multiple outputs)

In [12]:
def make_multi_sum_fn():
    """Return a predict function that yields two named outputs, column-wise."""

    def predict_columnar(x1, x2):
        """Return a dict with one array of per-row sums for each input.

        x1.shape = [batch_size, 4]
        x2.shape = [batch_size, 3]
        """
        first_totals = np.sum(x1, axis=1)
        second_totals = np.sum(x2, axis=1)
        # return_type = StructType()
        return {"sum1": first_totals, "sum2": second_totals}

    return predict_columnar

# The struct fields are named after the keys of the dict that
# predict_columnar returns.
sum_cols = predict_batch_udf(
    make_multi_sum_fn,
    return_type=StructType([
        StructField("sum1", FloatType(), nullable=True),
        StructField("sum2", FloatType(), nullable=True),
    ]),
    batch_size=5,
    input_tensor_shapes=[[4], [3]],
)

# Expand the struct column into top-level sum1 / sum2 columns for display.
(
    df
    .withColumn("sum", sum_cols("t1", "t2"))
    .select("t1", "t2", "sum.*")
    .show(5)
)

+--------------------+------------------+----+----+
|                  t1|                t2|sum1|sum2|
+--------------------+------------------+----+----+
|[0.0, 1.0, 2.0, 3.0]|   [0.0, 1.0, 2.0]| 6.0| 3.0|
|[4.0, 5.0, 6.0, 7.0]|   [4.0, 5.0, 6.0]|22.0|15.0|
|[8.0, 9.0, 10.0, ...|  [8.0, 9.0, 10.0]|38.0|27.0|
|[12.0, 13.0, 14.0...|[12.0, 13.0, 14.0]|54.0|39.0|
|[16.0, 17.0, 18.0...|[16.0, 17.0, 18.0]|70.0|51.0|
+--------------------+------------------+----+----+
only showing top 5 rows



                                                                                

In [13]:
def make_multi_sum_fn():
    """Return a predict function that yields two named outputs, row-wise."""

    def predict_row(x1: np.ndarray, x2: np.ndarray) -> list[Mapping[str, float]]:
        """Return one dict per row holding the sums of that row in x1 and x2.

        x1.shape = [batch_size, 4]
        x2.shape = [batch_size, 3]
        """
        rows = []
        # The row count is taken from x1; x2 is indexed in lock-step.
        for i in range(len(x1)):
            rows.append({'sum1': np.sum(x1[i]), 'sum2': np.sum(x2[i])})
        return rows

    return predict_row

# Same struct return type as the columnar variant; here predict_row returns a
# list with one dict per row.
multi_sum_udf = predict_batch_udf(
    make_multi_sum_fn,
    return_type=StructType([
        StructField("sum1", FloatType(), nullable=True),
        StructField("sum2", FloatType(), nullable=True),
    ]),
    batch_size=5,
    input_tensor_shapes=[[4], [3]],
)

# Expand the struct column into top-level sum1 / sum2 columns for display.
(
    df
    .withColumn("sum", multi_sum_udf("t1", "t2"))
    .select("t1", "t2", "sum.*")
    .show(5)
)

+--------------------+------------------+----+----+
|                  t1|                t2|sum1|sum2|
+--------------------+------------------+----+----+
|[0.0, 1.0, 2.0, 3.0]|   [0.0, 1.0, 2.0]| 6.0| 3.0|
|[4.0, 5.0, 6.0, 7.0]|   [4.0, 5.0, 6.0]|22.0|15.0|
|[8.0, 9.0, 10.0, ...|  [8.0, 9.0, 10.0]|38.0|27.0|
|[12.0, 13.0, 14.0...|[12.0, 13.0, 14.0]|54.0|39.0|
|[16.0, 17.0, 18.0...|[16.0, 17.0, 18.0]|70.0|51.0|
+--------------------+------------------+----+----+
only showing top 5 rows



                                                                                

In [14]:
def make_multi_times_two_fn():
    """Return a predict function that doubles two tensor columns."""

    def predict(x1: np.ndarray, x2: np.ndarray) -> Mapping[str, np.ndarray]:
        """Return both inputs multiplied by two, keyed by output name.

        x1.shape = [batch_size, 4]
        x2.shape = [batch_size, 3]
        """
        doubled_t1 = x1 * 2
        doubled_t2 = x2 * 2
        return {"t1x2": doubled_t1, "t2x2": doubled_t2}

    return predict

# Each struct field is itself an array column, matching the tensors returned
# under the "t1x2" and "t2x2" keys.
multi_times_two_udf = predict_batch_udf(
    make_multi_times_two_fn,
    return_type=StructType([
        StructField("t1x2", ArrayType(FloatType()), nullable=True),
        StructField("t2x2", ArrayType(FloatType()), nullable=True),
    ]),
    batch_size=5,
    input_tensor_shapes=[[4], [3]],
)

# Expand the struct column into top-level t1x2 / t2x2 columns for display.
(
    df
    .withColumn("x2", multi_times_two_udf("t1", "t2"))
    .select("t1", "t2", "x2.*")
    .show(5)
)

                                                                                

+--------------------+------------------+--------------------+------------------+
|                  t1|                t2|                t1x2|              t2x2|
+--------------------+------------------+--------------------+------------------+
|[0.0, 1.0, 2.0, 3.0]|   [0.0, 1.0, 2.0]|[0.0, 2.0, 4.0, 6.0]|   [0.0, 2.0, 4.0]|
|[4.0, 5.0, 6.0, 7.0]|   [4.0, 5.0, 6.0]|[8.0, 10.0, 12.0,...| [8.0, 10.0, 12.0]|
|[8.0, 9.0, 10.0, ...|  [8.0, 9.0, 10.0]|[16.0, 18.0, 20.0...|[16.0, 18.0, 20.0]|
|[12.0, 13.0, 14.0...|[12.0, 13.0, 14.0]|[24.0, 26.0, 28.0...|[24.0, 26.0, 28.0]|
|[16.0, 17.0, 18.0...|[16.0, 17.0, 18.0]|[32.0, 34.0, 36.0...|[32.0, 34.0, 36.0]|
+--------------------+------------------+--------------------+------------------+
only showing top 5 rows



In [15]:
import numpy as np
import pandas as pd
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField
from typing import Mapping

# One-row DataFrame: t1 holds a 4-element array, t2 a 3-element array.
df = spark.createDataFrame(
    [
        [[0.0, 1.0, 2.0, 3.0], [0.0, 1.0, 2.0]],
    ],
    schema=["t1", "t2"],
)

def make_multi_sum_fn():
    """Return a predict function that sums two 2-D batches row by row."""

    def predict(x1: np.ndarray, x2: np.ndarray) -> np.ndarray:
        """Add the axis-1 sum of ``x1`` to the axis-1 sum of ``x2``."""
        first_totals = np.sum(x1, axis=1)
        second_totals = np.sum(x2, axis=1)
        return first_totals + second_totals

    return predict

# batch_size=1: the DataFrame above holds a single row.
multi_sum_udf = predict_batch_udf(make_multi_sum_fn,
                                  return_type=FloatType(),
                                  batch_size=1,
                                  input_tensor_shapes=[[4], [3]])

# No alias is given, so the result column is named after the inner function
# and its arguments.
(
    df
    .select(multi_sum_udf("t1", "t2"))
    .collect()
)

                                                                                

[Row(predict(t1, t2)=9.0)]