Notebook dedicado a explorar distintos métodos de parsing de datos en formato JSON

In [1]:
import spark.implicits._
// Column names for the sample DataFrame: a numeric id plus the raw JSON payload.
val columns = Seq("ID", "JSON_value")

// One JSON document per row, kept as a plain string.
// Note: the "sex" values are wrapped in single quotes, which is not strict JSON;
// the outputs recorded further down show that Spark still extracts them.
val data = Seq(
  1 -> """{"name": "Jose", "age": 40, "weight": 74, "height": 1.63, "sex": 'm', "citizenship": ["Venezuelan", "Spanish"]}""",
  2 -> """{"name": "María Isabel", "age": 38, "weight": 58, "height": 1.65, "sex": 'f', "citizenship": ["Venezuelan"]}""",
  3 -> """{"name": "Antonio", "age": 27, "weight": 60, "height": 1.68, "sex": 'm', "citizenship": ["Venezuelan"]}""",
  4 -> """{"name": "Norma", "age": 63, "weight": 65, "height": 1.54, "sex": 'f', "citizenship": ["Venezuelan"]}"""
)

// Build the DataFrame, naming the tuple fields after `columns`.
val df = data.toDF(columns: _*)

// Print without truncation so the whole JSON string is visible.
df.show(truncate = false)

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.36:4041
SparkContext available as 'sc' (version = 3.0.2, master = local[*], app id = local-1615756990306)
SparkSession available as 'spark'


+---+---------------------------------------------------------------------------------------------------------------+
|ID |JSON_value                                                                                                     |
+---+---------------------------------------------------------------------------------------------------------------+
|1  |{"name": "Jose", "age": 40, "weight": 74, "height": 1.63, "sex": 'm', "citizenship": ["Venezuelan", "Spanish"]}|
|2  |{"name": "María Isabel", "age": 38, "weight": 58, "height": 1.65, "sex": 'f', "citizenship": ["Venezuelan"]}   |
|3  |{"name": "Antonio", "age": 27, "weight": 60, "height": 1.68, "sex": 'm', "citizenship": ["Venezuelan"]}        |
|4  |{"name": "Norma", "age": 63, "weight": 65, "height": 1.54, "sex": 'f', "citizenship": ["Venezuelan"]}          |
+---+---------------------------------------------------------------------------------------------------------------+



import spark.implicits._
columns: Seq[String] = List(ID, JSON_value)
data: Seq[(Int, String)] = List((1,{"name": "Jose", "age": 40, "weight": 74, "height": 1.63, "sex": 'm', "citizenship": ["Venezuelan", "Spanish"]}), (2,{"name": "María Isabel", "age": 38, "weight": 58, "height": 1.65, "sex": 'f', "citizenship": ["Venezuelan"]}), (3,{"name": "Antonio", "age": 27, "weight": 60, "height": 1.68, "sex": 'm', "citizenship": ["Venezuelan"]}), (4,{"name": "Norma", "age": 63, "weight": 65, "height": 1.54, "sex": 'f', "citizenship": ["Venezuelan"]}))
df: org.apache.spark.sql.DataFrame = [ID: int, JSON_value: string]


In [2]:
import org.apache.spark.sql.functions._

// Pull individual attributes out of the JSON string with JSONPath expressions,
// keeping the row id next to them.
val extractedColumnJSONDF = df.select(
  col("ID"),
  get_json_object(col("JSON_value"), "$.name").as("name"),
  get_json_object(col("JSON_value"), "$.sex").as("sex")
)
extractedColumnJSONDF.show()

+---+------------+---+
| ID|        name|sex|
+---+------------+---+
|  1|        Jose|  m|
|  2|María Isabel|  f|
|  3|     Antonio|  m|
|  4|       Norma|  f|
+---+------------+---+



import org.apache.spark.sql.functions._
extractedColumnJSONDF: org.apache.spark.sql.DataFrame = [ID: int, name: string ... 1 more field]


In [3]:
// Count how many different values of "sex" were extracted (global aggregate, no grouping).
extractedColumnJSONDF.agg(countDistinct("sex")).show()

+-------------------+
|count(DISTINCT sex)|
+-------------------+
|                  2|
+-------------------+



In [4]:
import org.apache.spark.sql.types.{StringType, StructField, IntegerType, FloatType, StructType, ArrayType}

// If a field is left out of the schema the process does not fail; that field is simply ignored.
val schema = StructType(
  Seq(
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true),
    StructField("weight", IntegerType, nullable = true),
    StructField("height", FloatType, nullable = true),
    StructField("sex", StringType, nullable = true),
    StructField("citizenship", ArrayType(StringType, containsNull = true), nullable = true)
  )
)

// Parse the JSON string into a single struct column ("jsonData") following the schema above.
val from_jsonDF = df.select(col("ID"), from_json(col("JSON_value"), schema).alias("jsonData"))
from_jsonDF.show(truncate = false)

+---+----------------------------------------------+
|ID |jsonData                                      |
+---+----------------------------------------------+
|1  |[Jose, 40, 74, 1.63, m, [Venezuelan, Spanish]]|
|2  |[María Isabel, 38, 58, 1.65, f, [Venezuelan]] |
|3  |[Antonio, 27, 60, 1.68, m, [Venezuelan]]      |
|4  |[Norma, 63, 65, 1.54, f, [Venezuelan]]        |
+---+----------------------------------------------+



import org.apache.spark.sql.types.{StringType, StructField, IntegerType, FloatType, StructType, ArrayType}
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,IntegerType,true), StructField(weight,IntegerType,true), StructField(height,FloatType,true), StructField(sex,StringType,true), StructField(citizenship,ArrayType(StringType,true),true))
from_jsonDF: org.apache.spark.sql.DataFrame = [ID: int, jsonData: struct<name: string, age: int ... 4 more fields>]


In [5]:
// Expand the jsonData struct into one top-level column per attribute, next to the row id.
from_jsonDF.select(col("ID"), col("jsonData.*")).show(truncate = false)

+---+------------+---+------+------+---+---------------------+
|ID |name        |age|weight|height|sex|citizenship          |
+---+------------+---+------+------+---+---------------------+
|1  |Jose        |40 |74    |1.63  |m  |[Venezuelan, Spanish]|
|2  |María Isabel|38 |58    |1.65  |f  |[Venezuelan]         |
|3  |Antonio     |27 |60    |1.68  |m  |[Venezuelan]         |
|4  |Norma       |63 |65    |1.54  |f  |[Venezuelan]         |
+---+------------+---+------+------+---+---------------------+



In [6]:
// Infer the schema from the JSON string of the first row. schema_of_json receives that
// string as a plain value, so the same inferred schema is printed once per row of df.
df.select(
  schema_of_json(df.select(col("JSON_value")).head().getString(0)).alias("schema_of_json")
).show(truncate = false)

+-----------------------------------------------------------------------------------------------+
|schema_of_json                                                                                 |
+-----------------------------------------------------------------------------------------------+
|struct<age:bigint,citizenship:array<string>,height:double,name:string,sex:string,weight:bigint>|
|struct<age:bigint,citizenship:array<string>,height:double,name:string,sex:string,weight:bigint>|
|struct<age:bigint,citizenship:array<string>,height:double,name:string,sex:string,weight:bigint>|
|struct<age:bigint,citizenship:array<string>,height:double,name:string,sex:string,weight:bigint>|
+-----------------------------------------------------------------------------------------------+



In [11]:
// Passing the column itself did not work.
// NOTE(review): presumably schema_of_json needs a literal JSON string rather than a column
// reference — confirm against the Spark documentation.
// Instead, the first JSON string is read through a typed Dataset[String] and passed as a value.
df.select(
  schema_of_json(df.select(col("JSON_value")).as[String].head()).alias("schema_of_json")
).show(truncate = false)

+-----------------------------------------------------------------------------------------------+
|schema_of_json                                                                                 |
+-----------------------------------------------------------------------------------------------+
|struct<age:bigint,citizenship:array<string>,height:double,name:string,sex:string,weight:bigint>|
|struct<age:bigint,citizenship:array<string>,height:double,name:string,sex:string,weight:bigint>|
|struct<age:bigint,citizenship:array<string>,height:double,name:string,sex:string,weight:bigint>|
|struct<age:bigint,citizenship:array<string>,height:double,name:string,sex:string,weight:bigint>|
+-----------------------------------------------------------------------------------------------+



In [8]:
// json_tuple extracts several JSON attributes (the ones listed) as separate columns.
// Without aliases the new columns would be called c0, c1, c2 and so on, so the
// multi-alias overload of `as` is used to give them meaningful names.
df.select(
  json_tuple(col("JSON_value"), "name", "height", "sex").as(Seq("name", "height", "sex"))
).show(truncate = false)

+------------+------+---+
|name        |height|sex|
+------------+------+---+
|Jose        |1.63  |m  |
|María Isabel|1.65  |f  |
|Antonio     |1.68  |m  |
|Norma       |1.54  |f  |
+------------+------+---+



In [9]:
// to_csv needs a struct-typed column, which is why the parsed jsonData column is used here.
// NOTE(review): an earlier version of this note also listed ArrayType and MapType as accepted
// inputs; Spark's to_csv documentation mentions only StructType — confirm.
from_jsonDF.select(to_csv(col("jsonData"))).show(truncate = false)

+----------------------------------------+
|to_csv(jsonData)                        |
+----------------------------------------+
|Jose,40,74,1.63,m,"[Venezuelan,Spanish]"|
|María Isabel,38,58,1.65,f,[Venezuelan]  |
|Antonio,27,60,1.68,m,[Venezuelan]       |
|Norma,63,65,1.54,f,[Venezuelan]         |
+----------------------------------------+



In [10]:
// transform applies a lambda to every element of an array column; the lambda is built from
// functions of the sql package (e.g. upper) or native operators (e.g. x + 1).
from_jsonDF.select(transform(col("jsonData.citizenship"), c => upper(c))).show(truncate = false)

+------------------------------------------------------------+
|transform(jsonData.citizenship, lambdafunction(upper(x), x))|
+------------------------------------------------------------+
|[VENEZUELAN, SPANISH]                                       |
|[VENEZUELAN]                                                |
|[VENEZUELAN]                                                |
|[VENEZUELAN]                                                |
+------------------------------------------------------------+

