# Scratch Notebook

## Create pandas/spark dataframes for testing

In [None]:
import numpy as np
import pandas as pd

from pyspark.sql.functions import col, pandas_udf, spark_partition_id, struct
from pyspark.sql.types import *
from typing import Iterator, Union

In [None]:
# 1000 consecutive float64 values (0..999) laid out as rows of 4, i.e. 250 rows.
data = np.arange(0, 1000, dtype=np.float64).reshape(-1, 4)
# One pandas column per value in a row, named a-d.
pdf = pd.DataFrame(data, columns=['a','b','c','d'])

In [None]:
# Scalar case: pdf_scalar is the same object as pdf (no copy), mirrored as a Spark DataFrame.
# NOTE(review): `spark` is not defined in this notebook; presumably the session is injected
# by the environment -- confirm.
pdf_scalar = pdf
df_scalar = spark.createDataFrame(pdf_scalar)

# Tensor case: each row of the scalar frame becomes one list stored in column 't1'.
pdf_tensor = pd.DataFrame()
pdf_tensor['t1'] = pdf_scalar.values.tolist()
df_tensor1 = spark.createDataFrame(pdf_tensor)

# Second list column 't2' is built from the same rows with column 'd' dropped, so
# df_tensor2 carries both 't1' and 't2'.
pdf_tensor['t2'] = pdf_scalar.drop(columns='d').values.tolist()
df_tensor2 = spark.createDataFrame(pdf_tensor)

### pandas dataframes

In [None]:
pdf_scalar

In [None]:
pdf_tensor

In [None]:
foo = pdf_scalar['a']

In [None]:
type(foo)

In [None]:
foo.iloc[0]

### spark dataframes

In [None]:
df_scalar.show()

In [None]:
df_tensor1.show()

In [None]:
df_tensor2.show()

### spark partitions

In [None]:
from pyspark.sql.functions import spark_partition_id
# Show how many rows land in each Spark partition. df_scalar is used because `df` is only
# created later in the notebook, so referencing it here fails on a top-to-bottom run.
df_scalar.withColumn("partition_id", spark_partition_id()).groupBy("partition_id").count().show()

## Test tensor columns

### pDF -> pDF | returnType=StructType | return pDF => OK

In [None]:
df_tensor2.schema

In [None]:
@pandas_udf(returnType=df_tensor2.schema)
def predict(inputs: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    """Pass every incoming DataFrame batch through unchanged.

    Before a batch is yielded, its type, length, columns and dtypes are printed.
    """

    def _report(template, value):
        # One diagnostic line per call, printed immediately so ordering is preserved.
        print(template.format(value))

    for frame in inputs:
        _report("===== batch: {}", type(frame))
        _report("===== len(batch: {}", len(frame))
        _report("===== batch.columns: {}", frame.columns)
        _report("===== batch.dtypes:\n{}", frame.dtypes)
        yield frame

In [None]:
# Pack every column of df_tensor2 into one struct argument, store the UDF result in a new
# "preds" column, and collect everything to pandas for display.
columns = df_tensor2.columns
preds = df_tensor2.withColumn("preds", predict(struct(*columns))).toPandas()
preds

### pDF -> pDF | returnType=ArrayType(DoubleType()) | return pS => OK

In [None]:
df_tensor2.schema

In [None]:
@pandas_udf(returnType=ArrayType(DoubleType()))
def predict(inputs: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    """Yield the 't1' column of each DataFrame batch after printing batch diagnostics.

    The annotation declares DataFrame output while a single column is yielded; this
    cell exercises exactly that combination.
    """
    for frame in inputs:
        frame_type = type(frame)
        print("===== batch: {}".format(frame_type))
        row_count = len(frame)
        print("===== len(batch: {}".format(row_count))
        column_index = frame.columns
        print("===== batch.columns: {}".format(column_index))
        column_dtypes = frame.dtypes
        print("===== batch.dtypes:\n{}".format(column_dtypes))
        tensor_column = frame['t1']
        yield tensor_column

In [None]:
# Apply the array-returning UDF to a struct of all df_tensor2 columns and collect the
# result (original columns plus "preds") to pandas.
columns = df_tensor2.columns
preds = df_tensor2.withColumn("preds", predict(struct(*columns))).toPandas()
preds

### pDF -> pS | returnType=StructType | return pDF => OK

In [None]:
df_tensor2.schema

In [None]:
@pandas_udf(returnType=df_tensor2.schema)
def predict(inputs: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
    """Echo each DataFrame batch while the return annotation declares Series.

    Each batch's type, length, columns and dtypes are printed before it is yielded
    unchanged.
    """

    def _describe(frame):
        # Emit the four diagnostic lines for one batch, in a fixed order.
        print("===== batch: {}".format(type(frame)))
        print("===== len(batch: {}".format(len(frame)))
        print("===== batch.columns: {}".format(frame.columns))
        print("===== batch.dtypes:\n{}".format(frame.dtypes))

    for frame in inputs:
        _describe(frame)
        yield frame

In [None]:
# Run the struct-returning UDF over all df_tensor2 columns (passed as one struct) and
# pull the resulting frame into pandas.
columns = df_tensor2.columns
preds = df_tensor2.withColumn("preds", predict(struct(*columns))).toPandas()
preds

### pDF -> pS | returnType=ArrayType(DoubleType()) | return pS => OK

In [None]:
df_tensor2.schema

In [None]:
@pandas_udf(returnType=ArrayType(DoubleType()))
def predict(inputs: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
    """Yield the 't1' column (a Series) of each DataFrame batch.

    This is the "pDF -> pS" variant of the experiment: DataFrame batches come in and
    the return annotation declares Series, which is also what is yielded. Each batch's
    type, length, columns and dtypes are printed before its 't1' column is emitted.
    """
    for batch in inputs:
        print("===== batch: {}".format(type(batch)))
        print("===== len(batch: {}".format(len(batch)))
        print("===== batch.columns: {}".format(batch.columns))
        print("===== batch.dtypes:\n{}".format(batch.dtypes))
        yield batch['t1']

In [None]:
# Feed all df_tensor2 columns to the UDF as a single struct, attach its output as
# "preds", and materialize the result in pandas.
columns = df_tensor2.columns
preds = df_tensor2.withColumn("preds", predict(struct(*columns))).toPandas()
preds

### Union[pDF, pS] -> Union[pDF, pS] | returnType=StructType() | return pS => FAIL

In [None]:
df_tensor2.schema

In [None]:
# Alternative signatures for this experiment (kept disabled):
# def predict(inputs: Iterator[pd.DataFrame]) -> Iterator[Union[pd.DataFrame, pd.Series]]:
# def predict(inputs: Iterator[pd.Series]) -> Iterator[Union[pd.DataFrame, pd.Series]]:
@pandas_udf(returnType=df_tensor2.schema)
def predict(
    inputs: Iterator[Union[pd.DataFrame, pd.Series]],
) -> Iterator[Union[pd.DataFrame, pd.Series]]:
    """Echo each batch, with Union type hints on both the input and the output.

    The section heading records this combination as FAIL.
    NOTE(review): the failure is presumably raised by pandas_udf's type-hint
    inference when the cell is executed -- confirm against the PySpark version in use.
    """
    for batch in inputs:
        print("===== batch: {}".format(type(batch)))
        print("===== len(batch: {}".format(len(batch)))
        print("===== batch.columns: {}".format(batch.columns))
        print("===== batch.dtypes:\n{}".format(batch.dtypes))
        yield batch

## Test scalar columns

### pDF -> pDF | returnType=StructType | return pDF => OK

In [None]:
df_scalar.schema

In [None]:
@pandas_udf(returnType=df_scalar.schema)
def predict(inputs: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    """Return each scalar-column DataFrame batch as-is, printing diagnostics first.

    The printed diagnostics are the batch's type, length, columns and dtypes.
    """

    def _emit(template, value):
        # Format and print a single diagnostic line right away.
        print(template.format(value))

    for chunk in inputs:
        _emit("===== batch: {}", type(chunk))
        _emit("===== len(batch: {}", len(chunk))
        _emit("===== batch.columns: {}", chunk.columns)
        _emit("===== batch.dtypes:\n{}", chunk.dtypes)
        yield chunk

In [None]:
# Wrap all scalar columns in one struct, apply the UDF as a new "preds" column, and
# collect the outcome to pandas.
columns = df_scalar.columns
preds = df_scalar.withColumn("preds", predict(struct(*columns))).toPandas()
preds

### pDF -> pDF | returnType=DoubleType() | return pS => OK

In [None]:
df_scalar.schema

In [None]:
@pandas_udf(returnType=DoubleType())
def predict(inputs: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    """Yield column 'a' of each DataFrame batch after printing batch diagnostics.

    The annotation declares DataFrame output while a single column is yielded; this
    cell exercises exactly that combination.
    """
    for chunk in inputs:
        chunk_type = type(chunk)
        print("===== batch: {}".format(chunk_type))
        n_rows = len(chunk)
        print("===== len(batch: {}".format(n_rows))
        names = chunk.columns
        print("===== batch.columns: {}".format(names))
        kinds = chunk.dtypes
        print("===== batch.dtypes:\n{}".format(kinds))
        first_column = chunk['a']
        yield first_column

In [None]:
# Apply the double-returning UDF to a struct of all scalar columns and collect the
# frame (inputs plus "preds") to pandas.
columns = df_scalar.columns
preds = df_scalar.withColumn("preds", predict(struct(*columns))).toPandas()
preds

### pDF -> pS | returnType=StructType | return pDF => OK

In [None]:
df_scalar.schema

In [None]:
@pandas_udf(returnType=df_scalar.schema)
def predict(inputs: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
    """Echo each scalar DataFrame batch while the return annotation declares Series.

    Type, length, columns and dtypes of every batch are printed before it is yielded
    unchanged.
    """

    def _describe(chunk):
        # Print the four diagnostic lines for one batch, in a fixed order.
        print("===== batch: {}".format(type(chunk)))
        print("===== len(batch: {}".format(len(chunk)))
        print("===== batch.columns: {}".format(chunk.columns))
        print("===== batch.dtypes:\n{}".format(chunk.dtypes))

    for chunk in inputs:
        _describe(chunk)
        yield chunk

In [None]:
# Pass every scalar column to the UDF as one struct and bring the result, including the
# new "preds" column, into pandas.
columns = df_scalar.columns
preds = df_scalar.withColumn("preds", predict(struct(*columns))).toPandas()
preds

### pDF -> pS | returnType=DoubleType() | return pS => OK

In [None]:
df_scalar.schema

In [None]:
@pandas_udf(returnType=DoubleType())
def predict(inputs: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
    """Yield column 'a' (a Series) of each incoming DataFrame batch.

    Diagnostics about the batch (type, length, columns, dtypes) are printed first.
    """

    def _log(template, value):
        # Print one formatted diagnostic line immediately.
        print(template.format(value))

    for chunk in inputs:
        _log("===== batch: {}", type(chunk))
        _log("===== len(batch: {}", len(chunk))
        _log("===== batch.columns: {}", chunk.columns)
        _log("===== batch.dtypes:\n{}", chunk.dtypes)
        yield chunk['a']

In [None]:
# Invoke the Series-returning UDF on a struct of all scalar columns and collect the
# result to pandas for inspection.
columns = df_scalar.columns
preds = df_scalar.withColumn("preds", predict(struct(*columns))).toPandas()
preds

### Union[pDF, pS] -> Union[pDF, pS] | returnType=StructType() | return pS => FAIL

In [None]:
@pandas_udf(returnType=df_scalar.schema)
def predict(
    inputs: Iterator[Union[pd.DataFrame, pd.Series]],
) -> Iterator[Union[pd.DataFrame, pd.Series]]:
    """Echo each batch, with Union type hints on both the input and the output.

    Each batch's type, length, columns and dtypes are printed before it is yielded.
    """

    def _describe(chunk):
        # Print the four diagnostic lines for one batch, in a fixed order.
        print("===== batch: {}".format(type(chunk)))
        print("===== len(batch: {}".format(len(chunk)))
        print("===== batch.columns: {}".format(chunk.columns))
        print("===== batch.dtypes:\n{}".format(chunk.dtypes))

    for chunk in inputs:
        _describe(chunk)
        yield chunk

In [None]:
from inspect import signature
from pyspark.sql.pandas.typehints import infer_eval_type
from typing import get_type_hints, Any, Callable, Optional, Tuple

In [None]:
def check_iterator_annotation(
    annotation: Any, parameter_check_func: Optional[Callable[[Any], bool]] = None
) -> bool:
    """Tell whether ``annotation`` is an ``Iterator[...]`` type hint.

    The hint is identified by name: its ``_name`` attribute, or ``__name__`` when
    ``_name`` is absent. When ``parameter_check_func`` is given, every entry of
    ``annotation.__args__`` must also satisfy it.
    """
    fallback_name = getattr(annotation, "__name__", None)
    hint_name = getattr(annotation, "_name", fallback_name)
    if not hint_name == "Iterator":
        return False
    if parameter_check_func is None:
        return True
    return all(parameter_check_func(arg) for arg in annotation.__args__)

In [None]:
def check_union_annotation(
    annotation: Any, parameter_check_func: Optional[Callable[[Any], bool]] = None
) -> bool:
    """Tell whether ``annotation`` is a ``Union[...]`` type hint.

    When ``parameter_check_func`` is given, every member listed in
    ``annotation.__args__`` must also satisfy it.
    """
    import typing

    # '__origin__' is consulted only for Union. For other hints it is not reliable because
    # its value changed between Python versions (abc.Iterator in 3.7, typing.Iterator in 3.6).
    hint_origin = getattr(annotation, "__origin__", None)
    if not hint_origin == typing.Union:
        return False
    if parameter_check_func is None:
        return True
    return all(parameter_check_func(member) for member in annotation.__args__)

In [None]:
def check_tuple_annotation(
    annotation: Any, parameter_check_func: Optional[Callable[[Any], bool]] = None
) -> bool:
    """Tell whether ``annotation`` is a ``Tuple[...]`` type hint.

    When ``parameter_check_func`` is given, every entry of ``annotation.__args__``
    must also satisfy it.
    """
    # Tuple exposes '_name' whereas other types expose '__name__', so '_name' is read
    # first and '__name__' only serves as the fallback.
    fallback_name = getattr(annotation, "__name__", None)
    hint_name = getattr(annotation, "_name", fallback_name)
    if not hint_name == "Tuple":
        return False
    # The name matched; now validate the generic parameters if a checker was supplied.
    if parameter_check_func is None:
        return True
    return all(parameter_check_func(arg) for arg in annotation.__args__)

In [None]:
def predict(inputs: Iterator[Union[pd.Series, pd.DataFrame]]) -> Iterator[pd.DataFrame]:
    """Yield every batch from ``inputs`` unchanged, printing diagnostics first.

    This undecorated copy exists so its signature can be inspected by the cells that
    follow. For each batch the type, length, columns and dtypes are printed.
    """

    def _report(template, value):
        # Print one formatted diagnostic line immediately.
        print(template.format(value))

    for chunk in inputs:
        _report("===== batch: {}", type(chunk))
        _report("===== len(batch: {}", len(chunk))
        _report("===== batch.columns: {}", chunk.columns)
        _report("===== batch.dtypes:\n{}", chunk.dtypes)
        yield chunk

In [None]:
sig = signature(predict)
sig

In [None]:
type_hints = get_type_hints(predict)
type_hints

In [None]:
# Map each annotated parameter name to its resolved type hint, falling back to the raw
# signature annotation when get_type_hints() has no entry for that name.
annotations = {}
for param in sig.parameters.values():
    if param.annotation is not param.empty:
        annotations[param.name] = type_hints.get(param.name, param.annotation)
annotations

In [None]:
# Type hints of the annotated parameters, listed in signature order.
parameters_sig = [
    annotations[parameter] for parameter in sig.parameters if parameter in annotations]
parameters_sig

In [None]:
return_annotation = type_hints.get("return", sig.return_annotation)
return_annotation

In [None]:
# True when every parameter hint is Series, DataFrame, or a Union made only of those two,
# and the return hint is Series or DataFrame.
is_series_or_frame = all(
    a == pd.Series  # Series
    or a == pd.DataFrame  # DataFrame
    or check_union_annotation(  # Union[DataFrame, Series]
        a, parameter_check_func=lambda na: na == pd.Series or na == pd.DataFrame
    )
    for a in parameters_sig
) and (return_annotation == pd.Series or return_annotation == pd.DataFrame)
is_series_or_frame

In [None]:
# True when the single parameter is Iterator[Tuple[...]] whose tuple members are each
# Ellipsis, Series, DataFrame, or a Union of Series/DataFrame, and the return hint is an
# Iterator of DataFrame or Series.
is_iterator_tuple_series_or_frame = (
    len(parameters_sig) == 1
    and check_iterator_annotation(  # Iterator
        parameters_sig[0],
        parameter_check_func=lambda a: check_tuple_annotation(  # Tuple
            a,
            parameter_check_func=lambda ta: (
                ta == Ellipsis  # ...
                or ta == pd.Series  # Series
                or ta == pd.DataFrame  # DataFrame
                or check_union_annotation(  # Union[DataFrame, Series]
                    ta, parameter_check_func=lambda na: (na == pd.Series or na == pd.DataFrame)
                )
            ),
        ),
    )
    and check_iterator_annotation(
        return_annotation, parameter_check_func=lambda a: a == pd.DataFrame or a == pd.Series
    )
)
is_iterator_tuple_series_or_frame

In [None]:
# True when the single parameter is an Iterator of Series, DataFrame, or a Union of those
# two, and the return hint is an Iterator of DataFrame or Series.
is_iterator_series_or_frame = (
    len(parameters_sig) == 1
    and check_iterator_annotation(
        parameters_sig[0],
        parameter_check_func=lambda a: (
            a == pd.Series  # Series
            or a == pd.DataFrame  # DataFrame
            or check_union_annotation(  # Union[DataFrame, Series]
                a, parameter_check_func=lambda ua: ua == pd.Series or ua == pd.DataFrame
            )
        ),
    )
    and check_iterator_annotation(
        return_annotation, parameter_check_func=lambda a: a == pd.DataFrame or a == pd.Series
    )
)
is_iterator_series_or_frame

### pS -> pS | returnType=ArrayType(DoubleType()) | return pS => OK

In [None]:
from pyspark.sql.functions import array

In [None]:
df_scalar.schema

In [None]:
@pandas_udf(returnType=ArrayType(DoubleType()))
def predict(inputs: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Echo each Series batch unchanged after printing its type, length and first item."""
    for series in inputs:
        series_type = type(series)
        print("===== batch: {}".format(series_type))
        n_items = len(series)
        print("===== len(batch: {}".format(n_items))
        # The DataFrame-oriented diagnostics stay disabled for Series input:
        # print("===== batch.columns: {}".format(batch.columns))
        # print("===== batch.dtypes:\n{}".format(batch.dtypes))
        head = series[0]
        print("===== batch[0]:\n{}".format(head))
        yield series

In [None]:
# Combine all scalar columns into one array column (pyspark `array`) and pass it to the
# Series-based UDF; the result is collected to pandas.
columns = df_scalar.columns
preds = df_scalar.withColumn("preds", predict(array(columns))).toPandas()
preds

## Test caching

In [None]:
import numpy as np
import pandas as pd
import threading
import time

from pyspark.ml.functions import batch_infer_udf
from pyspark.sql.functions import struct, pandas_udf
from pyspark.sql.types import *
from typing import Iterator

In [None]:
# 1000 consecutive float64 values (0..999) arranged as rows of 4.
data = np.arange(0, 1000, dtype=np.float64).reshape(-1, 4)

# 4 scalar columns
pdf = pd.DataFrame(data, columns=["a", "b", "c", "d"])
# Spark copy of the pandas frame, used by the caching experiment.
df = spark.createDataFrame(pdf)

In [None]:
def predict_batch_fn():
    """Build a predict function that answers every input with one fixed random value.

    The random value is drawn once, when this factory runs, and captured by the
    returned closure.
    """
    # Stand-in for loading a model; meant to happen only once per worker process.
    cached_value = np.random.random()

    def predict(inputs):
        """Return a list holding the captured value once per element of ``inputs``."""
        return [cached_value for _ in inputs]

    return predict

In [None]:
identity = batch_infer_udf(predict_batch_fn, return_type=DoubleType())

In [None]:
%%time
# results should be the same
df1 = df.withColumn("preds", identity(struct("a"))).toPandas()

In [None]:
%%time
df2 = df.withColumn("preds", identity(struct("a"))).toPandas()

In [None]:
%%time
df2 = df.withColumn("preds", identity(struct("a"))).toPandas()

In [None]:
%%time
df2 = df.withColumn("preds", identity(struct("a"))).toPandas()

In [None]:
%%time
df2 = df.withColumn("preds", identity(struct("a"))).toPandas()

## Test executors

In [None]:
import numpy as np
import pandas as pd
import threading
import time

from pyspark.ml.functions import batch_infer_udf
from pyspark.sql.functions import struct, pandas_udf
from pyspark.sql.types import *
from typing import Iterator

In [None]:
# 1000 consecutive float64 values (0..999) arranged as rows of 4.
data = np.arange(0, 1000, dtype=np.float64).reshape(-1, 4)

# 4 scalar columns
pdf = pd.DataFrame(data, columns=["a", "b", "c", "d"])
# Spark copy of the pandas frame, used by the executor experiment.
df = spark.createDataFrame(pdf)

In [None]:
def myfn(it):
    """Print TensorFlow diagnostics and the result of a small matrix product.

    ``it`` is accepted so the function can be handed to ``foreachPartition``; it is
    never read.
    """
    import tensorflow as tf

    tf_version = tf.__version__
    print(">>>> {}".format(tf_version))
    gpu_devices = tf.config.list_physical_devices('GPU')
    print(gpu_devices)
    tf.debugging.set_log_device_placement(True)

    # Multiply a 2x3 constant by a 3x2 constant.
    lhs = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    rhs = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    product = tf.matmul(lhs, rhs)
    print(product)

In [None]:
df.foreachPartition(myfn)

In [None]:
@pandas_udf(returnType=FloatType())
def myudf(it: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    """Yield each incoming batch unchanged after printing the TensorFlow version.

    NOTE(review): the declared return type is FloatType() while whole input frames are
    yielded -- confirm this is intended before using this UDF.
    """
    for chunk in it:
        # Imported inside the loop, i.e. once per batch on whichever process runs this.
        import tensorflow as tf

        tf_version = tf.__version__
        print(tf_version)
        yield chunk

In [None]:
foo = df.withColumn("preds", struct("a"))

In [None]:
foo.show()

In [None]:
print("==== df1:\n{}".format(df1))
print("==== df2:\n{}".format(df2))

In [None]:
from pyspark.sql.functions import spark_partition_id
# Tag each row with its partition id, add the `identity` UDF output as "preds", then show
# the row count for every (partition_id, preds) pair.
df.withColumn("partition_id", spark_partition_id()).withColumn("preds", identity(struct("a"))).groupBy("partition_id", "preds").count().show()

## Test zip

In [4]:
import pandas as pd

In [36]:
foo = pd.Series([0,1,2,3,4,5,6,7,8])
bar = pd.Series(['a','b','c','d','e','f','g','h','i'])

In [37]:
test = (foo, bar)
test

(0    0
 1    1
 2    2
 3    3
 4    4
 5    5
 6    6
 7    7
 8    8
 dtype: int64,
 0    a
 1    b
 2    c
 3    d
 4    e
 5    f
 6    g
 7    h
 8    i
 dtype: object)

In [31]:
def batch(iterable, batch_size):
    """Zip several sequences row-wise and yield the rows in lists of ``batch_size``.

    Args:
        iterable: A collection of sequences (for example a tuple of pandas Series)
            that is zipped element-wise; each zipped tuple is one row.
        batch_size: Maximum number of rows per yielded list; must be at least 1.

    Yields:
        Lists of row tuples. Every list holds ``batch_size`` rows except possibly the
        last, which holds the remainder. Empty lists are never yielded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when iteration starts).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    acc = []
    for row in zip(*iterable):
        # Keep each row as one tuple; `acc += row` would flatten it into the list.
        acc.append(row)
        # Flush on a full batch, not on `i % batch_size == 0`, which fires at row 0.
        if len(acc) == batch_size:
            yield acc
            acc = []
    # Emit the remainder only when there is one.
    if acc:
        yield acc

In [38]:
import itertools
import more_itertools

def batch(iterable, batch_size):
    """Zip the sequences in ``iterable`` row-wise and yield the rows in chunks.

    Chunking is delegated to ``more_itertools.chunked`` with ``batch_size`` as the
    chunk length.
    """
    rows = zip(*iterable)
    yield from more_itertools.chunked(rows, batch_size)

In [39]:
for x in batch(test, 2):
    print(x)
    print("====")

[(0, 'a'), (1, 'b')]
====
[(2, 'c'), (3, 'd')]
====
[(4, 'e'), (5, 'f')]
====
[(6, 'g'), (7, 'h')]
====
[(8, 'i')]
====


In [50]:
pdf = pd.concat(test, axis=1)

In [51]:
pdf

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e
5,5,f
6,6,g
7,7,h
8,8,i


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/leey/devpub/leewyang/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/leey/devpub/leewyang/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/leey/devpub/leewyang/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
