In [1]:
%%init_spark
launcher.num_executors = 4
launcher.executor_cores = 2
launcher.driver_memory = '4g'
launcher.conf.set("spark.sql.catalogImplementation", "hive")
launcher.conf.set("spark.jars.packages", "net.liftweb:lift-json_2.12:3.5.0")
launcher.conf.set("spark.jars", "/opt/spark-hive_2.12-3.4.0.jar,/opt/scala-project_2.12-0.1.0-SNAPSHOT.jar")

In [2]:
spark

Intitializing Scala interpreter ...

Spark Web UI available at http://localhost:8088/proxy/application_1682380615897_0014
SparkContext available as 'sc' (version = 3.4.0, master = yarn, app id = application_1682380615897_0014)
SparkSession available as 'spark'


res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@2074de99


In [3]:
spark.version

res1: String = 3.4.0


## Define Data Frame

In [4]:
import org.apache.spark.sql.types.{IntegerType, StringType, DoubleType, MapType, ArrayType, StructField, StructType}
import org.apache.spark.sql.Row

val movies = Seq(
Row(1, "The Super Mario Bros. Movie", Map("currency_code" -> "USD", "amount" -> "3.95"), "2023-04-10 06:14:23.298", List(Map("Jack Black" -> "Bowser", "Chris Pratt" -> "Mario", "Charlie Day" -> "Luigi", "Anya Taylor" -> "Princess Peach"))),
Row(2, "Top Gun: Maverick", Map("currency_code" -> "USD", "amount" -> "3.95"), "2023-04-10 06:14:23.298", List(Map("Tom Cruise" -> "Pete Mitchell", "Miles Teller" -> "Bradley", "Val Kilmer" -> "Tom Kazansky", "Jennifer Connelly" -> "Penny Benjamin"), Map("Glen Powell" -> "Lt. Jake Seresin"))),
Row(3, null, Map("currency_code" -> "USD", "amount" -> "3.95"), "2023-04-10 06:14:23.298", List())
)

val schema = List(
    StructField("id", IntegerType, true),
    StructField("title", StringType, true),
    StructField("price", MapType(StringType, StringType, true), true),
    StructField("release_date", StringType, true),
    StructField("cast", ArrayType(MapType(StringType, StringType, true), true))
)

var moviesDf = spark.createDataFrame(
    spark.sparkContext.parallelize(movies),
    StructType(schema)
)

moviesDf = moviesDf.withColumn("release_date", to_timestamp(col("release_date"),"MM-dd-yyyy HH:mm:ss.SSSS"))

moviesDf.printSchema()
moviesDf.show(false)

root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- price: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- release_date: timestamp (nullable = true)
 |-- cast: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)

+---+---------------------------+--------------------------------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |title                      |price                                 |release_date|cast                                                                                                                                                        |
+---+---------------------------+--------------------------------------+------------+----------------------------------

import org.apache.spark.sql.types.{IntegerType, StringType, DoubleType, MapType, ArrayType, StructField, StructType}
import org.apache.spark.sql.Row
movies: Seq[org.apache.spark.sql.Row] = List([1,The Super Mario Bros. Movie,Map(currency_code -> USD, amount -> 3.95),2023-04-10 06:14:23.298,List(Map(Jack Black -> Bowser, Chris Pratt -> Mario, Charlie Day -> Luigi, Anya Taylor -> Princess Peach))], [2,Top Gun: Maverick,Map(currency_code -> USD, amount -> 3.95),2023-04-10 06:14:23.298,List(Map(Tom Cruise -> Pete Mitchell, Miles Teller -> Bradley, Val Kilmer -> Tom Kazansky, Jennifer Connelly -> Penny Benjamin), Map(Glen Powell -> Lt. Jake Seresin))], [3,null,Map(currency_code -> USD, amount -> 3.95),2023-04-10 06:14:23.298,List()])
schema: List[org.apache.spark.sql.types.StructField] = Lis...


## Defining an UDF

In [5]:
import scala.collection.mutable.WrappedArray
import org.apache.spark.sql.functions.udf
import net.liftweb.json._
import net.liftweb.json.Serialization.write

def udfStringifyCast() = {
    udf((movieCast: WrappedArray[Map[String, String]]) => {
        var newCastList = List[String]()
        for (element <- movieCast) {
            newCastList = newCastList :+ element.map(_.productIterator.mkString(":")).mkString("|")
        }
        newCastList
    })
}

def udfLowerCase() = {
    udf((columnValue: String) => {
        println(s"O que temos? $columnValue")
        if (columnValue == null) "" else columnValue.toLowerCase()
    })
}

def udfJsonify() = {
    udf((column: Map[String, String]) => {
        implicit val formats = DefaultFormats
        write(Map(
            "currency_code" -> column("currency_code"),
            "amount" -> column("amount")
        ))
    })
}


import scala.collection.mutable.WrappedArray
import org.apache.spark.sql.functions.udf
import net.liftweb.json._
import net.liftweb.json.Serialization.write
udfStringifyCast: ()org.apache.spark.sql.expressions.UserDefinedFunction
udfLowerCase: ()org.apache.spark.sql.expressions.UserDefinedFunction
udfJsonify: ()org.apache.spark.sql.expressions.UserDefinedFunction


In [6]:
val df = moviesDf
    .withColumn("cast_stringify", udfStringifyCast()($"cast"))
    .withColumn("title_lowercase", udfLowerCase()($"title"))
    .withColumn("price_jsonify", udfJsonify()($"price"))

df.show
df.printSchema

+---+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+
| id|               title|               price|release_date|                cast|      cast_stringify|     title_lowercase|       price_jsonify|
+---+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+
|  1|The Super Mario B...|{currency_code ->...|        null|[{Jack Black -> B...|[Jack Black:Bowse...|the super mario b...|{"currency_code":...|
|  2|   Top Gun: Maverick|{currency_code ->...|        null|[{Tom Cruise -> P...|[Tom Cruise:Pete ...|   top gun: maverick|{"currency_code":...|
|  3|                null|{currency_code ->...|        null|                  []|                  []|                    |{"currency_code":...|
+---+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-------

df: org.apache.spark.sql.DataFrame = [id: int, title: string ... 6 more fields]


## Importing a UDF

In [8]:
import com.myudfs.MathUDFs

import com.myudfs.MathUDFs


In [9]:
df.withColumn("greatherThanZero", MathUDFs.isGreaterThanZeroUDF($"id")).select("greatherThanZero", "title").show

+----------------+--------------------+
|greatherThanZero|               title|
+----------------+--------------------+
|            true|The Super Mario B...|
|            true|   Top Gun: Maverick|
|            true|                null|
+----------------+--------------------+



In [15]:
spark.udf.register("multiply", com.myudfs.MathUDFs.multiplyBy(2))

res11: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction(com.myudfs.MathUDFs$$$Lambda$4261/0x000000084199f840@18365020,IntegerType,List(Some(class[value[0]: int])),Some(class[value[0]: int]),Some(multiply),false,true)


In [16]:
df.createOrReplaceTempView("movies")

In [None]:
// needs to setup hive metastore
spark.sql("""
    SELECT isGreaterThanZero(id), name
    FROM movies
    """).show()