Notebook dedicado a explorar distintos métodos de parsing de datos en formato CSV

In [1]:
import spark.implicits._
// Column names for the demo DataFrame: a numeric key plus one raw CSV line per row.
val columns = Seq("ID", "CSV_value")

// Each tuple is (ID, comma-separated record); the records are kept as single strings
// so that later cells can parse them with Spark's CSV functions.
val data = Seq(
  (1, "Jose,40,74,1.63,M"),
  (2, "María Isabel,38,58,1.65,F"),
  (3, "Antonio,27,60,1.68,M"),
  (4, "Norma,63,65,1.54,F")
)

// toDF comes from spark.implicits._; the varargs expansion applies the column names.
val df = data.toDF(columns: _*)

// Print every row without truncating the CSV_value column.
df.show(truncate = false)

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.36:4045
SparkContext available as 'sc' (version = 3.0.2, master = local[*], app id = local-1615723338259)
SparkSession available as 'spark'


+---+-------------------------+
|ID |CSV_value                |
+---+-------------------------+
|1  |Jose,40,74,1.63,M        |
|2  |María Isabel,38,58,1.65,F|
|3  |Antonio,27,60,1.68,M     |
|4  |Norma,63,65,1.54,F       |
+---+-------------------------+



import spark.implicits._
columns: Seq[String] = List(ID, CSV_value)
data: Seq[(Int, String)] = List((1,Jose,40,74,1.63,M), (2,María Isabel,38,58,1.65,F), (3,Antonio,27,60,1.68,M), (4,Norma,63,65,1.54,F))
df: org.apache.spark.sql.DataFrame = [ID: int, CSV_value: string]


In [2]:
import org.apache.spark.sql.types.{StringType, StructField, IntegerType, FloatType, StructType}
import org.apache.spark.sql.functions._

// Target schema for the CSV payload; every field is nullable.
val schema = StructType(
  Seq(
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true),
    StructField("weight", IntegerType, nullable = true),
    StructField("height", FloatType, nullable = true),
    StructField("sex", StringType, nullable = true)
  )
)

// CSV parser options handed to from_csv; fields are comma-separated.
val options = Map("delimiter" -> ",")

// Keep ID and parse CSV_value into a single struct column named csvData.
val from_csvDF = df.select(
  col("ID"),
  from_csv(col("CSV_value"), schema, options).alias("csvData")
)
from_csvDF.show(truncate = false)

+---+-------------------------------+
|ID |csvData                        |
+---+-------------------------------+
|1  |[Jose, 40, 74, 1.63, M]        |
|2  |[María Isabel, 38, 58, 1.65, F]|
|3  |[Antonio, 27, 60, 1.68, M]     |
|4  |[Norma, 63, 65, 1.54, F]       |
+---+-------------------------------+



import org.apache.spark.sql.types.{StringType, StructField, IntegerType, FloatType, StructType}
import org.apache.spark.sql.functions._
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,IntegerType,true), StructField(weight,IntegerType,true), StructField(height,FloatType,true), StructField(sex,StringType,true))
options: scala.collection.immutable.Map[String,String] = Map(delimiter -> ,)
from_csvDF: org.apache.spark.sql.DataFrame = [ID: int, csvData: struct<name: string, age: int ... 3 more fields>]


In [3]:
// Expand the csvData struct into one top-level column per field, next to ID.
from_csvDF.select(col("ID"), col("csvData.*")).show(truncate = false)

+---+------------+---+------+------+---+
|ID |name        |age|weight|height|sex|
+---+------------+---+------+------+---+
|1  |Jose        |40 |74    |1.63  |M  |
|2  |María Isabel|38 |58    |1.65  |F  |
|3  |Antonio     |27 |60    |1.68  |M  |
|4  |Norma       |63 |65    |1.54  |F  |
+---+------------+---+------+------+---+



In [4]:
// Infer a CSV schema from the first row's raw value. The sample string is fetched
// before the select runs, so schema_of_csv receives a plain String and the same
// inferred schema is printed once for every row of df.
df.select(
  schema_of_csv(df.select(col("CSV_value")).head().getString(0)).alias("schema_of_csv")
).show(truncate = false)

+--------------------------------------------------------+
|schema_of_csv                                           |
+--------------------------------------------------------+
|struct<_c0:string,_c1:int,_c2:int,_c3:double,_c4:string>|
|struct<_c0:string,_c1:int,_c2:int,_c3:double,_c4:string>|
|struct<_c0:string,_c1:int,_c2:int,_c3:double,_c4:string>|
|struct<_c0:string,_c1:int,_c2:int,_c3:double,_c4:string>|
+--------------------------------------------------------+

