In [None]:
!pip3 install pyarrow==0.17.1
!pip3 install pandas==1.0.4
!pip3 install numpy==1.18.4

In [1]:
from typing import Iterator
from pyspark import SparkContext, SQLContext
from pyspark.sql import DataFrame

from pyspark.sql.types import ArrayType, DoubleType, IntegerType
from pyspark.sql.functions import udf, col, pandas_udf, mean as pyspark_mean

from pandas import Series

import math

In [2]:
DATA_PATH: str = '/media/maja/Samsung_T51/quasar'

In [3]:
sc: SparkContext = SparkContext(appName='UDF Benchmark')
sql: SQLContext = SQLContext(sparkContext=sc)

In [4]:
df: DataFrame = sql.read.parquet(f'{DATA_PATH}/spectra-split/spectra-00.parquet')

In [5]:
@udf(ArrayType(DoubleType()))
def replace_nans(arr: ArrayType(DoubleType())) -> ArrayType(DoubleType()):
    return [x if x is not None else 0.0 for x in arr]

df: DataFrame = df.withColumn("continuum", replace_nans(df["continuum"].cast(ArrayType(DoubleType())))) \
    .withColumn("spectrum", replace_nans(df["spectrum"].cast(ArrayType(DoubleType()))))

# Simple operation

## Pyspark UDF

In [None]:
@udf('double')
def add_one_tenth(x: DoubleType()) -> DoubleType():
    return x+0.1

In [None]:
df.select(add_one_tenth(col("zerr"))).show()

1.82s, 1.83s, 2.06s, 2.10s, 1.98s, 2.02s

## Scala UDF

## Pandas UDF

In [8]:
@pandas_udf("double")
def pandas_add_one_tenth(x: Series) -> Series:
    return x+0.1

In [9]:
df.select(pandas_add_one_tenth(col("zerr"))).show()

+--------------------------+
|pandas_add_one_tenth(zerr)|
+--------------------------+
|       0.10017811526777223|
|        0.1006066409405321|
|         0.100527334632352|
|       0.10050413532881067|
|       0.10117024872452021|
|        0.1002694987342693|
|       0.10058358878595755|
|       0.10028059619944543|
|       0.10033464186708443|
|       0.10006479036528618|
|       0.10043243065010757|
|       0.10029585268930533|
|       0.10046873549581506|
|       0.10064944464247674|
|       0.10002571633376647|
|          418.502587890625|
|        0.1006547129014507|
|       0.10084083812544123|
|       0.10076503457967192|
|       0.10123053300194443|
+--------------------------+
only showing top 20 rows



2.27s, 2.11s, 2.21s

# Complex operation

In [6]:
import statistics

## PySpark UDF

In [17]:
@udf(ArrayType(DoubleType()))
def udf_mean(ax: ArrayType(DoubleType())) -> ArrayType(DoubleType()):
    mn = statistics.mean(ax)
    return [y-mn for y in ax]

In [18]:
df.select(udf_mean(col("continuum"))).show()

+--------------------+
| udf_mean(continuum)|
+--------------------+
|[2.79667835243599...|
|[5.47788933854912...|
|[1.40903803289305...|
|[2.78555591488440...|
|[2.60475770446768...|
|[2.13302572664774...|
|[3.89495111345354...|
|[1.36330158011475...|
|[2.21373749174633...|
|[-4.4439965138789...|
|[2.90998045899855...|
|[-1.8891068039334...|
|[3.67987040756589...|
|[1.11581323916359...|
|[-2.1260869299019...|
|[7.77628477482626...|
|[3.16409160196432...|
|[1.61474805027984...|
|[3.40600726473766...|
|[1.10089945855926...|
+--------------------+
only showing top 20 rows



1.73s, 1.69s, 2.71s, 1.70s, 2.64s

## Pandas UDF

In [30]:
@pandas_udf(ArrayType(DoubleType()))
def pandas_mean(iterator: Series) -> Series:
    def subtract_mean(x):
        mn = statistics.mean(x)
        return [y-mn for y in x]
    return iterator.apply(subtract_mean)

In [31]:
df.select(pandas_mean(col("continuum"))).show()

+----------------------+
|pandas_mean(continuum)|
+----------------------+
|  [2.79667835243599...|
|  [5.47788933854912...|
|  [1.40903803289305...|
|  [2.78555591488440...|
|  [2.60475770446768...|
|  [2.13302572664774...|
|  [3.89495111345354...|
|  [1.36330158011475...|
|  [2.21373749174633...|
|  [-4.4439965138789...|
|  [2.90998045899855...|
|  [-1.8891068039334...|
|  [3.67987040756589...|
|  [1.11581323916359...|
|  [-2.1260869299019...|
|  [7.77628477482626...|
|  [3.16409160196432...|
|  [1.61474805027984...|
|  [3.40600726473766...|
|  [1.10089945855926...|
+----------------------+
only showing top 20 rows



9.20s, 8.00s, 7.54s, 7.10s, 10.1s

# Operation over a row

## Pyspark UDF

In [10]:
@udf(DoubleType())
def udf_column_mean(x: DoubleType(), y: DoubleType()) -> DoubleType():
    return x*y

In [15]:
df.select(udf_column_mean(col("z"), pyspark_mean(col("z")))).show()

AnalysisException: grouping expressions sequence is empty, and '`z`' is not an aggregate function. Wrap '(udf_column_mean(z, avg(z)) AS `udf_column_mean(z, avg(z))`)' in windowing function(s) or wrap '`z`' in first() (or first_value) if you don't care which value you get.;;
Aggregate [udf_column_mean(z#14, avg(z#14)) AS udf_column_mean(z, avg(z))#85]
+- Project [_corrupt_record#0, _id#1, continuum#33, dec#3, fiber#4L, mjd#5L, name#6, plate#7L, ra#8, size#9L, spectraSetOID#10, replace_nans(cast(spectrum#11 as array<double>)) AS spectrum#51, subtype#12, type#13, z#14, zerr#15]
   +- Project [_corrupt_record#0, _id#1, replace_nans(cast(continuum#2 as array<double>)) AS continuum#33, dec#3, fiber#4L, mjd#5L, name#6, plate#7L, ra#8, size#9L, spectraSetOID#10, spectrum#11, subtype#12, type#13, z#14, zerr#15]
      +- Relation[_corrupt_record#0,_id#1,continuum#2,dec#3,fiber#4L,mjd#5L,name#6,plate#7L,ra#8,size#9L,spectraSetOID#10,spectrum#11,subtype#12,type#13,z#14,zerr#15] parquet


In [None]:
sc.stop()