Notebook dedicado a explorar las distintas formas que existen para añadir columnas con valores constantes a un DataFrame:
* Función lit
* Función typedLit
* Utilizando SparkSQL

In [1]:
import spark.implicits._
// Column names, listed in the same order as the fields of each tuple in `data`.
val columns = Seq("name", "age", "weight", "height")

// One tuple per person: (name, age, weight, height).
val data = Seq(
  ("Jose", 40, 74, 1.63),
  ("María Isabel", 38, 58, 1.65),
  ("Antonio", 27, 60, 1.68),
  ("Norma", 63, 65, 1.54)
)

// Build the DataFrame from the Seq through the implicit Seq.toDF conversion.
// Column types are inferred from the tuple element types; the column names
// are supplied as varargs.
val df = data.toDF(columns: _*)
df.show()

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.36:4040
SparkContext available as 'sc' (version = 3.0.2, master = local[*], app id = local-1615721746736)
SparkSession available as 'spark'


+------------+---+------+------+
|        name|age|weight|height|
+------------+---+------+------+
|        Jose| 40|    74|  1.63|
|María Isabel| 38|    58|  1.65|
|     Antonio| 27|    60|  1.68|
|       Norma| 63|    65|  1.54|
+------------+---+------+------+



import spark.implicits._
columns: Seq[String] = List(name, age, weight, height)
data: Seq[(String, Int, Int, Double)] = List((Jose,40,74,1.63), (María Isabel,38,58,1.65), (Antonio,27,60,1.68), (Norma,63,65,1.54))
df: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]


In [2]:
import org.apache.spark.sql.functions.lit
                                       
// Add a new "type" column that holds the same constant value on every row.
// `lit` turns the plain Scala value into a Column expression for withColumn.
val df2 = {
  val personType = lit("Person")
  df.withColumn("type", personType)
}
df2.show()

+------------+---+------+------+------+
|        name|age|weight|height|  type|
+------------+---+------+------+------+
|        Jose| 40|    74|  1.63|Person|
|María Isabel| 38|    58|  1.65|Person|
|     Antonio| 27|    60|  1.68|Person|
|       Norma| 63|    65|  1.54|Person|
+------------+---+------+------+------+



import org.apache.spark.sql.functions.lit
df2: org.apache.spark.sql.DataFrame = [name: string, age: int ... 3 more fields]


In [3]:
import org.apache.spark.sql.functions.{col, typedLit, when}
                                       
// Add a new "citizenships" column built from constant values with typedLit.
// Unlike lit, typedLit can handle parameterized Scala types such as List, Seq and Map.
val df3 = {
  val isJose = col("name") === "Jose"
  val dualCitizenship = typedLit(Seq("Venezuelan", "Spanish"))
  val singleCitizenship = typedLit(Seq("Venezuelan"))

  // The row named "Jose" gets both citizenships; every other row gets only one.
  df2.withColumn("citizenships", when(isJose, dualCitizenship).otherwise(singleCitizenship))
}
// truncate = false prints the full cell contents instead of shortening long values.
df3.show(truncate = false)

+------------+---+------+------+------+---------------------+
|name        |age|weight|height|type  |citizenships         |
+------------+---+------+------+------+---------------------+
|Jose        |40 |74    |1.63  |Person|[Venezuelan, Spanish]|
|María Isabel|38 |58    |1.65  |Person|[Venezuelan]         |
|Antonio     |27 |60    |1.68  |Person|[Venezuelan]         |
|Norma       |63 |65    |1.54  |Person|[Venezuelan]         |
+------------+---+------+------+------+---------------------+



import org.apache.spark.sql.functions.{col, typedLit, when}
df3: org.apache.spark.sql.DataFrame = [name: string, age: int ... 4 more fields]


In [4]:
// Add a new column with a constant value using Spark SQL.
// df3 is registered as the temporary view "persons" so the query can refer to it by name.
df3.createOrReplaceTempView("persons")
val df4 = {
  // The quoted SQL literal 'SPAIN' is projected on every row under the alias `location`.
  val query = "SELECT name, age, weight, height, type, citizenships, 'SPAIN' AS location FROM persons"
  spark.sql(query)
}
df4.show(truncate = false)

+------------+---+------+------+------+---------------------+--------+
|name        |age|weight|height|type  |citizenships         |location|
+------------+---+------+------+------+---------------------+--------+
|Jose        |40 |74    |1.63  |Person|[Venezuelan, Spanish]|SPAIN   |
|María Isabel|38 |58    |1.65  |Person|[Venezuelan]         |SPAIN   |
|Antonio     |27 |60    |1.68  |Person|[Venezuelan]         |SPAIN   |
|Norma       |63 |65    |1.54  |Person|[Venezuelan]         |SPAIN   |
+------------+---+------+------+------+---------------------+--------+



df4: org.apache.spark.sql.DataFrame = [name: string, age: int ... 5 more fields]
