In [1]:
from typing import Iterator
from pyspark.sql import SparkSession, DataFrame

from pyspark.sql.types import ArrayType, DoubleType, IntegerType, StringType
from pyspark.sql.functions import udf, col, pandas_udf, mean as pyspark_mean, PandasUDFType, countDistinct

import math
import pandas as pd
import databricks.koalas as ks
from pyarrow.parquet import ParquetDataset

import time

In [2]:
DATA_PATH: str = '/media/maja/Samsung_T51/quasar'

In [3]:
spark = SparkSession.builder \
    .master("local") \
    .appName("UDF_Benchmark") \
    .config("spark.rdd.compress", True) \
    .config("spark.sql.execution.arrow.pyspark.enabled", True) \
    .config("spark.sql.parquet.enableVectorizedReader", True) \
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", 10000) \
    .getOrCreate()

In [52]:
df = spark.read.parquet(f'{DATA_PATH}/spectra.parquet', mergeSchema=True)
df_csv = spark.read.csv(f'{DATA_PATH}/spectra-sample.csv').cache()

In [19]:
df = df.filter(df['_corrupt_record'].isNull())

In [53]:
@udf(ArrayType(DoubleType()))
def replace_nans(arr: ArrayType(DoubleType())) -> ArrayType(DoubleType()):
    return [x if x is not None else 0.0 for x in arr]

df: DataFrame = df.withColumn("continuum", replace_nans(df["continuum"].cast(ArrayType(DoubleType())))) \
    .withColumn("spectrum", replace_nans(df["spectrum"].cast(ArrayType(DoubleType()))))

In [17]:
df.select(['dec', 'fiber', 'mjd', 'plate', 'ra', 'size', 'z']).limit(100000).write.option("header", "true").csv(f'{DATA_PATH}/spectra-sample.csv')

In [55]:
df_csv.head()

Row(_c0='dec', _c1='fiber', _c2='mjd', _c3='plate', _c4='ra', _c5='size', _c6='z')

In [56]:
df_csv = df_csv.select(col('_c2').cast(DoubleType()).alias('mjd'), 
                       col('_c6').cast(DoubleType()).alias('z'))

In [57]:
df_csv = df_csv.dropna()
df_csv.count()

99996

# Simple operation

## Pyspark UDF

In [22]:
@udf('double')
def add_one_tenth(x):
    return x+0.1

In [36]:
startudf = time.time()
df.select(add_one_tenth(col("z"))).show()
endudf = time.time()
print(f'Time PySpark UDF: {endudf-startudf}')

+------------------+
|  add_one_tenth(z)|
+------------------+
|2.8785515785217286|
|2.8372965812683106|
|0.6703451037406921|
| 2.435253953933716|
|0.8849816679954529|
|0.7128542423248291|
| 2.975859498977661|
| 0.709883964061737|
| 2.485312557220459|
| 4.079956865310669|
| 2.336691474914551|
| 3.592419719696045|
|3.1562567710876466|
|2.1594024658203126|
|0.8447280287742615|
|2.1348730087280274|
|2.8504024505615235|
| 2.392560338973999|
|0.7440633535385132|
|0.8248033285140991|
+------------------+
only showing top 20 rows

Time PySpark UDF: 0.12053942680358887


In [37]:
startudf = time.time()
df_csv.select(add_one_tenth(col('z'))).show()
endudf = time.time()
print(f'Time PySpark UDF for CSV: {endudf-startudf}')

+------------------+
|  add_one_tenth(z)|
+------------------+
|2.8785515785217286|
|2.8372965812683106|
|0.6703451037406921|
| 2.435253953933716|
|0.8849816679954529|
|0.7128542423248291|
| 2.975859498977661|
| 0.709883964061737|
| 2.485312557220459|
| 4.079956865310669|
| 2.336691474914551|
| 3.592419719696045|
|3.1562567710876466|
|2.1594024658203126|
|0.8447280287742615|
|2.1348730087280274|
|2.8504024505615235|
| 2.392560338973999|
|0.7440633535385132|
|0.8248033285140991|
+------------------+
only showing top 20 rows

Time PySpark UDF for CSV: 0.12468457221984863


In [40]:
df.select(add_one_tenth(col("z"))).explain()

== Physical Plan ==
*(2) Project [pythonUDF0#1870 AS add_one_tenth(z)#1868]
+- BatchEvalPython [add_one_tenth(z#626)], [pythonUDF0#1870]
   +- *(1) Project [z#626]
      +- *(1) Filter isnull(_corrupt_record#612)
         +- *(1) ColumnarToRow
            +- FileScan parquet [_corrupt_record#612,z#626] Batched: true, DataFilters: [isnull(_corrupt_record#612)], Format: Parquet, Location: InMemoryFileIndex[file:/media/maja/Samsung_T51/quasar/spectra.parquet], PartitionFilters: [], PushedFilters: [IsNull(_corrupt_record)], ReadSchema: struct<_corrupt_record:string,z:double>




In [41]:
df_csv.select(add_one_tenth(col("z"))).explain()

== Physical Plan ==
*(2) Project [pythonUDF0#1944 AS add_one_tenth(z)#1872]
+- BatchEvalPython [add_one_tenth(cast(_c6#666 as double))], [pythonUDF0#1944]
   +- *(1) Filter AtLeastNNulls(n, cast(_c6#666 as double))
      +- InMemoryTableScan [_c6#666], [AtLeastNNulls(n, cast(_c6#666 as double))]
            +- InMemoryRelation [_c0#660, _c1#661, _c2#662, _c3#663, _c4#664, _c5#665, _c6#666], StorageLevel(disk, memory, deserialized, 1 replicas)
                  +- FileScan csv [_c0#130,_c1#131,_c2#132,_c3#133,_c4#134,_c5#135,_c6#136] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/media/maja/Samsung_T51/quasar/spectra-sample.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string,_c2:string,_c3:string,_c4:string,_c5:string,_c6:string>




105ms, 114ms, 106ms, 105ms, 97ms

## Scala UDF

## Pandas UDF

In [42]:
@pandas_udf('double')
def pandas_add_one_tenth(x: pd.Series) -> pd.Series:
    return x+0.1

In [44]:
startpandasudf = time.time()
df.select(pandas_add_one_tenth(col("z"))).show()
endpandasudf = time.time()
print(f'Time Pandas UDF: {endpandasudf-startpandasudf}')

+-----------------------+
|pandas_add_one_tenth(z)|
+-----------------------+
|     2.8785515785217286|
|     2.8372965812683106|
|     0.6703451037406921|
|      2.435253953933716|
|     0.8849816679954529|
|     0.7128542423248291|
|      2.975859498977661|
|      0.709883964061737|
|      2.485312557220459|
|      4.079956865310669|
|      2.336691474914551|
|      3.592419719696045|
|     3.1562567710876466|
|     2.1594024658203126|
|     0.8447280287742615|
|     2.1348730087280274|
|     2.8504024505615235|
|      2.392560338973999|
|     0.7440633535385132|
|     0.8248033285140991|
+-----------------------+
only showing top 20 rows

Time Pandas UDF: 0.36875438690185547


In [45]:
df.select(pandas_add_one_tenth(col("z"))).explain()

== Physical Plan ==
*(2) Project [pythonUDF0#2001 AS pandas_add_one_tenth(z)#1999]
+- ArrowEvalPython [pandas_add_one_tenth(z#626)], [pythonUDF0#2001], 200
   +- *(1) Project [z#626]
      +- *(1) Filter isnull(_corrupt_record#612)
         +- *(1) ColumnarToRow
            +- FileScan parquet [_corrupt_record#612,z#626] Batched: true, DataFilters: [isnull(_corrupt_record#612)], Format: Parquet, Location: InMemoryFileIndex[file:/media/maja/Samsung_T51/quasar/spectra.parquet], PartitionFilters: [], PushedFilters: [IsNull(_corrupt_record)], ReadSchema: struct<_corrupt_record:string,z:double>




In [46]:
startpandasudf = time.time()
df_csv.select(pandas_add_one_tenth(col("z"))).show()
endpandasudf = time.time()
print(f'Time Pandas UDF: {endpandasudf-startpandasudf}')

+-----------------------+
|pandas_add_one_tenth(z)|
+-----------------------+
|     2.8785515785217286|
|     2.8372965812683106|
|     0.6703451037406921|
|      2.435253953933716|
|     0.8849816679954529|
|     0.7128542423248291|
|      2.975859498977661|
|      0.709883964061737|
|      2.485312557220459|
|      4.079956865310669|
|      2.336691474914551|
|      3.592419719696045|
|     3.1562567710876466|
|     2.1594024658203126|
|     0.8447280287742615|
|     2.1348730087280274|
|     2.8504024505615235|
|      2.392560338973999|
|     0.7440633535385132|
|     0.8248033285140991|
+-----------------------+
only showing top 20 rows

Time Pandas UDF: 0.34230995178222656


In [47]:
df_csv.select(pandas_add_one_tenth(col("z"))).explain()

== Physical Plan ==
*(2) Project [pythonUDF0#2189 AS pandas_add_one_tenth(z)#2117]
+- ArrowEvalPython [pandas_add_one_tenth(cast(_c6#666 as double))], [pythonUDF0#2189], 200
   +- *(1) Filter AtLeastNNulls(n, cast(_c6#666 as double))
      +- InMemoryTableScan [_c6#666], [AtLeastNNulls(n, cast(_c6#666 as double))]
            +- InMemoryRelation [_c0#660, _c1#661, _c2#662, _c3#663, _c4#664, _c5#665, _c6#666], StorageLevel(disk, memory, deserialized, 1 replicas)
                  +- FileScan csv [_c0#130,_c1#131,_c2#132,_c3#133,_c4#134,_c5#135,_c6#136] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/media/maja/Samsung_T51/quasar/spectra-sample.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string,_c2:string,_c3:string,_c4:string,_c5:string,_c6:string>




330ms, 344ms, 345ms, 344ms, 342ms

# Complex operation

In [None]:
import statistics

## PySpark UDF

In [None]:
@udf(ArrayType(DoubleType()))
def udf_mean(ax: ArrayType(DoubleType())) -> ArrayType(DoubleType()):
    mn = statistics.mean(ax)
    return [y-mn for y in ax]

In [None]:
startudf = time.time()
df.select(col('continuum')).select(udf_mean(col("continuum"))).show()
endudf = time.time()
print(f'Time PySpark UDF: {endudf-startudf}')

781ms, 787ms, 761ms, 740ms, 737ms

## Pandas UDF

In [None]:
@pandas_udf(ArrayType(DoubleType()))
def pandas_udf_mean(ax: pd.Series) -> pd.Series:
    def subtract_statistics(x):
        mn = statistics.mean(x)
        return [y-mn for y in x]
    return ax.apply(subtract_statistics)

In [None]:
startpandasudf = time.time()
df.select(col('continuum')).select(pandas_udf_mean(col("continuum"))).show()
endpandasudf = time.time()
print(f'Time Pandas UDF: {endpandasudf-startpandasudf}')

5.95s, 5.93s, 6.10s, 5.99s, 6.11s

# Operation over a row

## Pyspark UDF

In [58]:
@udf(DoubleType())
def udf_column_multiplication(x: DoubleType(), y: DoubleType()) -> DoubleType():
    return x*y

In [59]:
startudf = time.time()
df.select(udf_column_multiplication(col("z"), col("mjd"))).show()
endudf = time.time()
print(f'Time PySpark UDF: {endudf-startudf}')

+---------------------------------+
|udf_column_multiplication(z, mjd)|
+---------------------------------+
|               153487.18919754028|
|               151208.26314926147|
|               31505.863530635834|
|               128999.42841529846|
|                43362.38734006882|
|                33854.06834602356|
|                 158862.478723526|
|               33689.990174770355|
|               131764.66566085815|
|               219852.81723976135|
|               123554.83707427979|
|               192806.01546525955|
|                168726.7675614357|
|               113693.43193054199|
|               41138.776309490204|
|               112406.38500213623|
|               151932.23136901855|
|               126565.37863373756|
|                 35556.8055588007|
|                40014.21735727787|
+---------------------------------+
only showing top 20 rows

Time PySpark UDF: 0.11775493621826172


In [61]:
startudf = time.time()
df.select(udf_column_multiplication(col("z"), col("mjd"))).show()
endudf = time.time()
print(f'Time PySpark UDF: {endudf-startudf}')

+---------------------------------+
|udf_column_multiplication(z, mjd)|
+---------------------------------+
|               153487.18919754028|
|               151208.26314926147|
|               31505.863530635834|
|               128999.42841529846|
|                43362.38734006882|
|                33854.06834602356|
|                 158862.478723526|
|               33689.990174770355|
|               131764.66566085815|
|               219852.81723976135|
|               123554.83707427979|
|               192806.01546525955|
|                168726.7675614357|
|               113693.43193054199|
|               41138.776309490204|
|               112406.38500213623|
|               151932.23136901855|
|               126565.37863373756|
|                 35556.8055588007|
|                40014.21735727787|
+---------------------------------+
only showing top 20 rows

Time PySpark UDF: 0.11924123764038086


99ms, 89ms, 91ms, 91ms, 94ms

## Pandas UDF

In [62]:
@pandas_udf(DoubleType())
def pandas_column_multiplication(x: pd.Series, y: pd.Series) -> pd.Series:
    return x*y

In [63]:
df.select(pandas_column_multiplication(col("z"), col("mjd"))).show()

+------------------------------------+
|pandas_column_multiplication(z, mjd)|
+------------------------------------+
|                  153487.18919754028|
|                  151208.26314926147|
|                  31505.863530635834|
|                  128999.42841529846|
|                   43362.38734006882|
|                   33854.06834602356|
|                    158862.478723526|
|                  33689.990174770355|
|                  131764.66566085815|
|                  219852.81723976135|
|                  123554.83707427979|
|                  192806.01546525955|
|                   168726.7675614357|
|                  113693.43193054199|
|                  41138.776309490204|
|                  112406.38500213623|
|                  151932.23136901855|
|                  126565.37863373756|
|                    35556.8055588007|
|                   40014.21735727787|
+------------------------------------+
only showing top 20 rows



In [64]:
df_csv.select(pandas_column_multiplication(col("z"), col("mjd"))).show()

+------------------------------------+
|pandas_column_multiplication(z, mjd)|
+------------------------------------+
|                  153487.18919754028|
|                  151208.26314926147|
|                  31505.863530635834|
|                  128999.42841529846|
|                   43362.38734006882|
|                   33854.06834602356|
|                    158862.478723526|
|                  33689.990174770355|
|                  131764.66566085815|
|                  219852.81723976135|
|                  123554.83707427979|
|                  192806.01546525955|
|                   168726.7675614357|
|                  113693.43193054199|
|                  41138.776309490204|
|                  112406.38500213623|
|                  151932.23136901855|
|                  126565.37863373756|
|                    35556.8055588007|
|                   40014.21735727787|
+------------------------------------+
only showing top 20 rows



399ms, 386ms, 405ms, 401ms, 398ms

In [None]:
df.head(10000).write.csv(f'{DATA_PATH}/quasar/spectra')