Notebook dedicado a mostrar ejemplos de creación e invocación de UDFs en spark sobre DataFrames y Spark SQL

In [1]:
import spark.implicits._
val columns = Seq("X","f(x)=2*X")
val numRecords = 100
val df = spark.createDataFrame((0 until numRecords).map(x => (x, 2*x))).toDF(columns:_*)
df.show()

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.36:4042
SparkContext available as 'sc' (version = 3.0.2, master = local[*], app id = local-1615757241189)
SparkSession available as 'spark'


+---+--------+
|  X|f(x)=2*X|
+---+--------+
|  0|       0|
|  1|       2|
|  2|       4|
|  3|       6|
|  4|       8|
|  5|      10|
|  6|      12|
|  7|      14|
|  8|      16|
|  9|      18|
| 10|      20|
| 11|      22|
| 12|      24|
| 13|      26|
| 14|      28|
| 15|      30|
| 16|      32|
| 17|      34|
| 18|      36|
| 19|      38|
+---+--------+
only showing top 20 rows



import spark.implicits._
columns: Seq[String] = List(X, f(x)=2*X)
numRecords: Int = 100
df: org.apache.spark.sql.DataFrame = [X: int, f(x)=2*X: int]


In [2]:
val squared = (y: Long) => {
  y * y
}

spark.udf.register("squared", squared)

squared: Long => Long = $Lambda$2446/181673764@12050e01
res1: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$2446/181673764@12050e01,LongType,List(Some(class[value[0]: bigint])),Some(squared),false,true)


In [3]:
// UDFs en SparkSQL

df.createOrReplaceTempView("testUDF")

spark.sql("SELECT X, `f(x)=2*X` as Y, squared(`f(x)=2*X`) as `Y^2` FROM testUDF").show()

+---+---+----+
|  X|  Y| Y^2|
+---+---+----+
|  0|  0|   0|
|  1|  2|   4|
|  2|  4|  16|
|  3|  6|  36|
|  4|  8|  64|
|  5| 10| 100|
|  6| 12| 144|
|  7| 14| 196|
|  8| 16| 256|
|  9| 18| 324|
| 10| 20| 400|
| 11| 22| 484|
| 12| 24| 576|
| 13| 26| 676|
| 14| 28| 784|
| 15| 30| 900|
| 16| 32|1024|
| 17| 34|1156|
| 18| 36|1296|
| 19| 38|1444|
+---+---+----+
only showing top 20 rows



In [4]:
// UDFs en DataFrames

import org.apache.spark.sql.functions.{col, udf}

val squaredOnDF = udf(squared)

df.select(col("X"), col("`f(x)=2*X`").as("Y"), squaredOnDF(col("`f(x)=2*X`")).as("Y^2")).show()

+---+---+----+
|  X|  Y| Y^2|
+---+---+----+
|  0|  0|   0|
|  1|  2|   4|
|  2|  4|  16|
|  3|  6|  36|
|  4|  8|  64|
|  5| 10| 100|
|  6| 12| 144|
|  7| 14| 196|
|  8| 16| 256|
|  9| 18| 324|
| 10| 20| 400|
| 11| 22| 484|
| 12| 24| 576|
| 13| 26| 676|
| 14| 28| 784|
| 15| 30| 900|
| 16| 32|1024|
| 17| 34|1156|
| 18| 36|1296|
| 19| 38|1444|
+---+---+----+
only showing top 20 rows



import org.apache.spark.sql.functions.{col, udf}
squaredOnDF: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$2446/181673764@12050e01,LongType,List(Some(class[value[0]: bigint])),None,false,true)


In [5]:
val columns = Seq("name","age","weight", "height")
val data = Seq(("Jose", 40, 74, 1.63), ("María Isabel", 38, 58, 1.65), ("Antonio", 27, 60, 1.68), ("Norma", 63, 65, 1.54))
val df = data.toDF(columns:_*)
df.show()

+------------+---+------+------+
|        name|age|weight|height|
+------------+---+------+------+
|        Jose| 40|    74|  1.63|
|María Isabel| 38|    58|  1.65|
|     Antonio| 27|    60|  1.68|
|       Norma| 63|    65|  1.54|
+------------+---+------+------+



columns: Seq[String] = List(name, age, weight, height)
data: Seq[(String, Int, Int, Double)] = List((Jose,40,74,1.63), (María Isabel,38,58,1.65), (Antonio,27,60,1.68), (Norma,63,65,1.54))
df: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]


In [6]:
// UDFs en DataFrames
// función para el calculo del indice de masa corporal
val calculate_imc = (weight: Integer, height: Float) => weight/height
val imc = udf(calculate_imc)

df.select(col("name"), col("age"), col("weight"), col("height"), imc(col("weight"), col("height")).as("IMC")).show()

+------------+---+------+------+---------+
|        name|age|weight|height|      IMC|
+------------+---+------+------+---------+
|        Jose| 40|    74|  1.63|45.398773|
|María Isabel| 38|    58|  1.65|35.151516|
|     Antonio| 27|    60|  1.68|35.714287|
|       Norma| 63|    65|  1.54|42.207794|
+------------+---+------+------+---------+



calculate_imc: (Integer, Float) => Float = $Lambda$2595/608032175@3ca618b8
imc: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$2595/608032175@3ca618b8,FloatType,List(Some(class[value[0]: int]), Some(class[value[0]: float])),None,false,true)


In [7]:
// UDFs en SparkSQL
// función para el calculo del indice de masa corporal
spark.udf.register("imc", calculate_imc)

df.createOrReplaceTempView("testUDF2")
spark.sql("SELECT name, age, weight, height, imc(weight, height) as IMC FROM testUDF2").show()

+------------+---+------+------+---------+
|        name|age|weight|height|      IMC|
+------------+---+------+------+---------+
|        Jose| 40|    74|  1.63|45.398773|
|María Isabel| 38|    58|  1.65|35.151516|
|     Antonio| 27|    60|  1.68|35.714287|
|       Norma| 63|    65|  1.54|42.207794|
+------------+---+------+------+---------+

