Notebook dedicado a explorar distintos métodos para trabajar con columnas de datos que contienen arrays. Daremos un vistazo a las funciones:
* explode
* explode_outer
* array_join
* array_contains
* array_max
* array_min
* array_remove
* array_repeat
* filter
* size

In [3]:
import org.apache.spark.sql.types.{StringType, StructField, IntegerType, ArrayType, StructType}
import org.apache.spark.sql.Row

// Sample people used throughout the notebook. Each Row holds, in order:
// names, age, programming languages and citizenships. Two rows have an
// empty language list, so later cells can show how each array function
// treats an empty array.
val data: Seq[Row] = List(
  Row(
    List("Jose", "Antonio"),
    40,
    List("Java", "Scala", "Python", "C", "C++"),
    List("Spanish", "Venezuelan")
  ),
  Row(
    List("María", "Isabel"),
    38,
    List.empty[String],
    List("Spanish", "Venezuelan")
  ),
  Row(
    List("Antonio", "Jose"),
    28,
    List("Javascript"),
    List("Venezuelan")
  ),
  Row(
    List("Norma", "Elena"),
    63,
    List.empty[String],
    List("Spanish", "Venezuelan", "Italian")
  )
)

// Schema for the sample rows: three string-array columns plus an integer
// age. Every column is declared nullable.
val schema = StructType(
  Seq(
    StructField("names", ArrayType(StringType), nullable = true),
    StructField("age", IntegerType, nullable = true),
    StructField("prog_languages", ArrayType(StringType), nullable = true),
    StructField("citizenships", ArrayType(StringType), nullable = true)
  )
)

// Build the DataFrame from an RDD of the sample rows and the explicit schema.
// The RDD is kept local to the block so that `df` is the only new binding.
val df = {
  val rowsRdd = spark.sparkContext.parallelize(data)
  spark.createDataFrame(rowsRdd, schema)
}

// Print the rows (long cells are truncated by default).
df.show()

+---------------+---+--------------------+--------------------+
|          names|age|      prog_languages|        citizenships|
+---------------+---+--------------------+--------------------+
|[Jose, Antonio]| 40|[Java, Scala, Pyt...|[Spanish, Venezue...|
|[María, Isabel]| 38|                  []|[Spanish, Venezue...|
|[Antonio, Jose]| 28|        [Javascript]|        [Venezuelan]|
| [Norma, Elena]| 63|                  []|[Spanish, Venezue...|
+---------------+---+--------------------+--------------------+



import org.apache.spark.sql.types.{StringType, StructField, IntegerType, ArrayType, StructType}
import org.apache.spark.sql.Row
data: Seq[org.apache.spark.sql.Row] = List([List(Jose, Antonio),40,List(Java, Scala, Python, C, C++),List(Spanish, Venezuelan)], [List(María, Isabel),38,List(),List(Spanish, Venezuelan)], [List(Antonio, Jose),28,List(Javascript),List(Venezuelan)], [List(Norma, Elena),63,List(),List(Spanish, Venezuelan, Italian)])
schema: org.apache.spark.sql.types.StructType = StructType(StructField(names,ArrayType(StringType,true),true), StructField(age,IntegerType,true), StructField(prog_languages,ArrayType(StringType,true),true), StructField(citizenships,ArrayType(StringType,true),true))
df: org.apache.spark.sql.DataFrame = [names: array<string>, age: int ... 2 more fields]


In [4]:
import org.apache.spark.sql.functions._

// explode: one output row per element of prog_languages. Rows whose array
// is empty produce no output at all, so people without languages disappear.
// array_join concatenates the names with a single space.
df
  .select(
    array_join($"names", " "),
    $"age",
    explode($"prog_languages")
  )
  .show()

+--------------------+---+----------+
|array_join(names,  )|age|       col|
+--------------------+---+----------+
|        Jose Antonio| 40|      Java|
|        Jose Antonio| 40|     Scala|
|        Jose Antonio| 40|    Python|
|        Jose Antonio| 40|         C|
|        Jose Antonio| 40|       C++|
|        Antonio Jose| 28|Javascript|
+--------------------+---+----------+



import org.apache.spark.sql.functions._


In [5]:
// explode_outer: like explode, but a row whose array is empty is kept and
// gets a single output row with null in the exploded column.
df
  .select(
    array_join($"names", " "),
    $"age",
    explode_outer($"prog_languages")
  )
  .show()

+--------------------+---+----------+
|array_join(names,  )|age|       col|
+--------------------+---+----------+
|        Jose Antonio| 40|      Java|
|        Jose Antonio| 40|     Scala|
|        Jose Antonio| 40|    Python|
|        Jose Antonio| 40|         C|
|        Jose Antonio| 40|       C++|
|        María Isabel| 38|      null|
|        Antonio Jose| 28|Javascript|
|         Norma Elena| 63|      null|
+--------------------+---+----------+



In [6]:
// array_contains: true when the array holds the given value. An empty
// array yields false. The alias was misspelled "constains_Java"; it is now
// "contains_Java", and the rendered table below reflects the new header.
df
  .select(
    array_join($"names", " ").alias("name"),
    array_contains($"prog_languages", "Java").alias("contains_Java")
  )
  .show()

+------------+-------------+
|        name|contains_Java|
+------------+-------------+
|Jose Antonio|         true|
|María Isabel|        false|
|Antonio Jose|        false|
| Norma Elena|        false|
+------------+-------------+



In [7]:
// array_contains used as a row filter: keep only people with "Jose" among
// their names. The filter runs before the projection because `names` is not
// one of the selected columns; filtering afterwards only worked because the
// analyzer re-adds the missing column behind the scenes.
// The alias was misspelled "constains_Java"; it is now "contains_Java", and
// the rendered table below reflects the new header.
df
  .where(array_contains($"names", "Jose"))
  .select(
    array_join($"names", " ").alias("name"),
    array_contains($"prog_languages", "Java").alias("contains_Java")
  )
  .show()

+------------+-------------+
|        name|contains_Java|
+------------+-------------+
|Jose Antonio|         true|
|Antonio Jose|        false|
+------------+-------------+



In [8]:
// array_max / array_min: largest and smallest element of an array. For
// string arrays the comparison follows string ordering, and an empty array
// yields null.
df
  .select(
    array_max($"names"),
    $"age",
    array_min($"prog_languages")
  )
  .show()

+----------------+---+-------------------------+
|array_max(names)|age|array_min(prog_languages)|
+----------------+---+-------------------------+
|            Jose| 40|                        C|
|           María| 38|                     null|
|            Jose| 28|               Javascript|
|           Norma| 63|                     null|
+----------------+---+-------------------------+



In [9]:
// array_repeat: builds an array holding the value repeated N times (here
// the age, three times).
// array_remove: drops every element equal to the given value.
// Truncation is disabled so the full arrays are printed.
df
  .select(
    array_join($"names", "-"),
    $"age",
    array_repeat($"age", 3),
    array_remove($"citizenships", "Spanish")
  )
  .show(truncate = false)

+--------------------+---+--------------------+-----------------------------------+
|array_join(names, -)|age|array_repeat(age, 3)|array_remove(citizenships, Spanish)|
+--------------------+---+--------------------+-----------------------------------+
|Jose-Antonio        |40 |[40, 40, 40]        |[Venezuelan]                       |
|María-Isabel        |38 |[38, 38, 38]        |[Venezuelan]                       |
|Antonio-Jose        |28 |[28, 28, 28]        |[Venezuelan]                       |
|Norma-Elena         |63 |[63, 63, 63]        |[Venezuelan, Italian]              |
+--------------------+---+--------------------+-----------------------------------+



In [10]:
// size: number of elements in the array (0 for an empty one).
// filter: keeps only the elements for which the predicate column is true;
// here, the citizenships equal to "Venezuelan".
df
  .select(
    array_join($"names", "-"),
    size($"prog_languages"),
    filter($"citizenships", citizenship => citizenship === "Venezuelan")
  )
  .show()

+--------------------+--------------------+---------------------------------------------------------+
|array_join(names, -)|size(prog_languages)|filter(citizenships, lambdafunction((x = Venezuelan), x))|
+--------------------+--------------------+---------------------------------------------------------+
|        Jose-Antonio|                   5|                                             [Venezuelan]|
|        María-Isabel|                   0|                                             [Venezuelan]|
|        Antonio-Jose|                   1|                                             [Venezuelan]|
|         Norma-Elena|                   0|                                             [Venezuelan]|
+--------------------+--------------------+---------------------------------------------------------+

