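Notebook dedicado a mostrar distintas operaciones sobre las columnas de un DataFrame:
- Seleccionar columnas
- Eliminar columnas
- Añadir columnas
- Actualizar columnas
- Renombrar columnas
- Comparar los planes de ejecución de drop / withColumn frente a select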

In [1]:
import spark.implicits._
// Column names, listed in the same order as the fields of each tuple in `data`
val columns: Seq[String] = List("name", "age", "weight", "height")
// Sample rows, one tuple per person: (name, age, weight, height)
val data: Seq[(String, Int, Int, Double)] = List(
  ("Jose", 40, 74, 1.63),
  ("María Isabel", 38, 58, 1.65),
  ("Antonio", 27, 60, 1.68),
  ("Norma", 63, 65, 1.54)
)

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.36:4043
SparkContext available as 'sc' (version = 3.0.2, master = local[*], app id = local-1615722820654)
SparkSession available as 'spark'


import spark.implicits._
columns: Seq[String] = List(name, age, weight, height)
data: Seq[(String, Int, Int, Double)] = List((Jose,40,74,1.63), (María Isabel,38,58,1.65), (Antonio,27,60,1.68), (Norma,63,65,1.54))


In [2]:
// Build a DataFrame from the in-memory Seq through the `toDF` syntax provided by spark.implicits._
// The schema is inferred from the data; the column names are supplied as varargs from `columns`.
val df = data.toDF(columns: _*)
// Print the inferred schema first, then the rows
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- weight: integer (nullable = false)
 |-- height: double (nullable = false)

+------------+---+------+------+
|        name|age|weight|height|
+------------+---+------+------+
|        Jose| 40|    74|  1.63|
|María Isabel| 38|    58|  1.65|
|     Antonio| 27|    60|  1.68|
|       Norma| 63|    65|  1.54|
+------------+---+------+------+



df: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]


In [3]:
// Projection: keep only a subset of the columns, in the order listed
val df1 = df.select($"name", $"age", $"height")
df1.show()

+------------+---+------+
|        name|age|height|
+------------+---+------+
|        Jose| 40|  1.63|
|María Isabel| 38|  1.65|
|     Antonio| 27|  1.68|
|       Norma| 63|  1.54|
+------------+---+------+



df1: org.apache.spark.sql.DataFrame = [name: string, age: int ... 1 more field]


In [4]:
// Remove a single column by name; `df` itself is left untouched and a new DataFrame is returned
val dfDrop = df.drop("age")
dfDrop.show()

+------------+------+------+
|        name|weight|height|
+------------+------+------+
|        Jose|    74|  1.63|
|María Isabel|    58|  1.65|
|     Antonio|    60|  1.68|
|       Norma|    65|  1.54|
+------------+------+------+



dfDrop: org.apache.spark.sql.DataFrame = [name: string, weight: int ... 1 more field]


In [5]:
import org.apache.spark.sql.functions.{lit, when, col}

// Add a column that holds the same literal value on every row
val df2 = df.withColumn("type", lit("Person"))
df2.show()

// Add a column whose value is derived from an existing column, using a `when` / `otherwise` expression
val df3 = df2.withColumn(
  "sex",
  when(col("name") === "Jose" || col("name") === "Antonio", "Male")
    .otherwise("Female")
)
df3.show()

+------------+---+------+------+------+
|        name|age|weight|height|  type|
+------------+---+------+------+------+
|        Jose| 40|    74|  1.63|Person|
|María Isabel| 38|    58|  1.65|Person|
|     Antonio| 27|    60|  1.68|Person|
|       Norma| 63|    65|  1.54|Person|
+------------+---+------+------+------+

+------------+---+------+------+------+------+
|        name|age|weight|height|  type|   sex|
+------------+---+------+------+------+------+
|        Jose| 40|    74|  1.63|Person|  Male|
|María Isabel| 38|    58|  1.65|Person|Female|
|     Antonio| 27|    60|  1.68|Person|  Male|
|       Norma| 63|    65|  1.54|Person|Female|
+------------+---+------+------+------+------+



import org.apache.spark.sql.functions.{lit, when, col}
df2: org.apache.spark.sql.DataFrame = [name: string, age: int ... 3 more fields]
df3: org.apache.spark.sql.DataFrame = [name: string, age: int ... 4 more fields]


In [6]:
// Update a column: calling withColumn with the name of an existing column replaces its contents
val df4 = df3.withColumn("type", lit("Persona"))
// Show the rows first, then the schema
df4.show()
df4.printSchema()

+------------+---+------+------+-------+------+
|        name|age|weight|height|   type|   sex|
+------------+---+------+------+-------+------+
|        Jose| 40|    74|  1.63|Persona|  Male|
|María Isabel| 38|    58|  1.65|Persona|Female|
|     Antonio| 27|    60|  1.68|Persona|  Male|
|       Norma| 63|    65|  1.54|Persona|Female|
+------------+---+------+------+-------+------+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- weight: integer (nullable = false)
 |-- height: double (nullable = false)
 |-- type: string (nullable = false)
 |-- sex: string (nullable = false)



df4: org.apache.spark.sql.DataFrame = [name: string, age: int ... 4 more fields]


In [7]:
// Update a column with a value computed from its own current contents, here by applying a cast.
// The printed schema shows how the DataFrame schema changes (weight: integer -> double).
val df5 = df4.withColumn(
  "weight",
  col("weight").cast("Double")
)
df5.show()
df5.printSchema()

+------------+---+------+------+-------+------+
|        name|age|weight|height|   type|   sex|
+------------+---+------+------+-------+------+
|        Jose| 40|  74.0|  1.63|Persona|  Male|
|María Isabel| 38|  58.0|  1.65|Persona|Female|
|     Antonio| 27|  60.0|  1.68|Persona|  Male|
|       Norma| 63|  65.0|  1.54|Persona|Female|
+------------+---+------+------+-------+------+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- weight: double (nullable = false)
 |-- height: double (nullable = false)
 |-- type: string (nullable = false)
 |-- sex: string (nullable = false)



df5: org.apache.spark.sql.DataFrame = [name: string, age: int ... 4 more fields]


In [8]:
// Rename a single column.
// Note that this starts from df4 (not df5), so `weight` keeps its integer type here.
val df6 = df4.withColumnRenamed(existingName = "type", newName = "tipo")
df6.show()

+------------+---+------+------+-------+------+
|        name|age|weight|height|   tipo|   sex|
+------------+---+------+------+-------+------+
|        Jose| 40|    74|  1.63|Persona|  Male|
|María Isabel| 38|    58|  1.65|Persona|Female|
|     Antonio| 27|    60|  1.68|Persona|  Male|
|       Norma| 63|    65|  1.54|Persona|Female|
+------------+---+------+------+-------+------+



df6: org.apache.spark.sql.DataFrame = [name: string, age: int ... 4 more fields]


In [9]:
// Rename several columns at once: toDF takes the full list of new names,
// in the same order as the existing columns
val newColumns: Seq[String] = List("newName", "newAge", "newWeight", "newHeight")
val df7 = df.toDF(newColumns: _*)
df7.printSchema()
df7.show()

root
 |-- newName: string (nullable = true)
 |-- newAge: integer (nullable = false)
 |-- newWeight: integer (nullable = false)
 |-- newHeight: double (nullable = false)

+------------+------+---------+---------+
|     newName|newAge|newWeight|newHeight|
+------------+------+---------+---------+
|        Jose|    40|       74|     1.63|
|María Isabel|    38|       58|     1.65|
|     Antonio|    27|       60|     1.68|
|       Norma|    63|       65|     1.54|
+------------+------+---------+---------+



newColumns: Seq[String] = List(newName, newAge, newWeight, newHeight)
df7: org.apache.spark.sql.DataFrame = [newName: string, newAge: int ... 2 more fields]


In [10]:
// drop vs. select
// Removing one or more columns can be achieved either with drop
// or with a select of the subset of columns to keep.

// NOTE: comparing both outputs shows that the logical and physical plans are almost identical
df6.drop("sex").explain(extended = true)
df6.select("name", "age", "weight", "height", "tipo").explain(extended = true)

== Parsed Logical Plan ==
Project [name#13, age#14, weight#15, height#16, tipo#193]
+- Project [name#13, age#14, weight#15, height#16, type#129 AS tipo#193, sex#97]
   +- Project [name#13, age#14, weight#15, height#16, Persona AS type#129, sex#97]
      +- Project [name#13, age#14, weight#15, height#16, type#70, CASE WHEN ((name#13 = Jose) OR (name#13 = Antonio)) THEN Male ELSE Female END AS sex#97]
         +- Project [name#13, age#14, weight#15, height#16, Person AS type#70]
            +- Project [_1#4 AS name#13, _2#5 AS age#14, _3#6 AS weight#15, _4#7 AS height#16]
               +- LocalRelation [_1#4, _2#5, _3#6, _4#7]

== Analyzed Logical Plan ==
name: string, age: int, weight: int, height: double, tipo: string
Project [name#13, age#14, weight#15, height#16, tipo#193]
+- Project [name#13, age#14, weight#15, height#16, type#129 AS tipo#193, sex#97]
   +- Project [name#13, age#14, weight#15, height#16, Persona AS type#129, sex#97]
      +- Project [name#13, age#14, weight#15, hei

In [11]:
// withColumn vs. select
// Adding one or more columns can be achieved with withColumn (as shown in the earlier cells)
// or with a single select that lists the existing columns plus the new ones.

// NOTE: both plans are similar, but select is the better choice: each withColumn call adds
// one more projection to the plan, so excessive use of it (e.g. many chained calls) can end
// in a StackOverflowError.
df6.withColumn("newColumn1", lit(1)).withColumn("newColumn2", lit(2)).explain(true)
df6.select(
  col("name"), col("age"), col("weight"), col("height"), col("tipo"), col("sex"),
  lit(1).as("newColumn1"), lit(2).as("newColumn2")
).explain(true)

== Parsed Logical Plan ==
Project [name#13, age#14, weight#15, height#16, tipo#193, sex#97, newColumn1#260, 2 AS newColumn2#268]
+- Project [name#13, age#14, weight#15, height#16, tipo#193, sex#97, 1 AS newColumn1#260]
   +- Project [name#13, age#14, weight#15, height#16, type#129 AS tipo#193, sex#97]
      +- Project [name#13, age#14, weight#15, height#16, Persona AS type#129, sex#97]
         +- Project [name#13, age#14, weight#15, height#16, type#70, CASE WHEN ((name#13 = Jose) OR (name#13 = Antonio)) THEN Male ELSE Female END AS sex#97]
            +- Project [name#13, age#14, weight#15, height#16, Person AS type#70]
               +- Project [_1#4 AS name#13, _2#5 AS age#14, _3#6 AS weight#15, _4#7 AS height#16]
                  +- LocalRelation [_1#4, _2#5, _3#6, _4#7]

== Analyzed Logical Plan ==
name: string, age: int, weight: int, height: double, tipo: string, sex: string, newColumn1: int, newColumn2: int
Project [name#13, age#14, weight#15, height#16, tipo#193, sex#97, newCo