Notebook dedicado a mostrar distintas formas de crear un DataFrame (DF) a partir de datos definidos en línea, mediante:
- RDD
- Seq
- Row
- GenericRowWithSchema
- GenericRow

In [1]:
// The SparkSession (`spark`) is injected into the notebook, so there is no need to instantiate it here.

import spark.implicits._
// Column names applied later to the DataFrames built from `data`.
val columns: Seq[String] = Seq("name", "age", "weight", "height")

// In-line sample records, one tuple per person: (name, age, weight, height).
val data: Seq[(String, Int, Int, Double)] = Seq(
  ("Jose", 40, 74, 1.63),
  ("María Isabel", 38, 58, 1.65),
  ("Antonio", 27, 60, 1.68),
  ("Norma", 63, 65, 1.54)
)

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.36:4044
SparkContext available as 'sc' (version = 3.0.2, master = local[*], app id = local-1615723130945)
SparkSession available as 'spark'


import spark.implicits._
columns: Seq[String] = List(name, age, weight, height)
data: Seq[(String, Int, Int, Double)] = List((Jose,40,74,1.63), (María Isabel,38,58,1.65), (Antonio,27,60,1.68), (Norma,63,65,1.54))


In [2]:
// Build a DataFrame from an RDD through the implicit conversion rdd.toDF.
// The schema is inferred and the columns receive the default names: _1, _2, ...
val rdd: org.apache.spark.rdd.RDD[(String, Int, Int, Double)] =
  spark.sparkContext.parallelize(data)
val dfFromRDD1: org.apache.spark.sql.DataFrame = rdd.toDF()

dfFromRDD1.printSchema()
dfFromRDD1.show()

root
 |-- _1: string (nullable = true)
 |-- _2: integer (nullable = false)
 |-- _3: integer (nullable = false)
 |-- _4: double (nullable = false)

+------------+---+---+----+
|          _1| _2| _3|  _4|
+------------+---+---+----+
|        Jose| 40| 74|1.63|
|María Isabel| 38| 58|1.65|
|     Antonio| 27| 60|1.68|
|       Norma| 63| 65|1.54|
+------------+---+---+----+



rdd: org.apache.spark.rdd.RDD[(String, Int, Int, Double)] = ParallelCollectionRDD[0] at parallelize at <console>:31
dfFromRDD1: org.apache.spark.sql.DataFrame = [_1: string, _2: int ... 2 more fields]


In [3]:
// Build a DataFrame from an RDD through the implicit conversion rdd.toDF.
// The schema is inferred; the column names are passed as varargs to toDF.
val dfFromRDD1WithColumnNames: org.apache.spark.sql.DataFrame =
  rdd.toDF(columns: _*)

dfFromRDD1WithColumnNames.printSchema()
dfFromRDD1WithColumnNames.show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- weight: integer (nullable = false)
 |-- height: double (nullable = false)

+------------+---+------+------+
|        name|age|weight|height|
+------------+---+------+------+
|        Jose| 40|    74|  1.63|
|María Isabel| 38|    58|  1.65|
|     Antonio| 27|    60|  1.68|
|       Norma| 63|    65|  1.54|
+------------+---+------+------+



dfFromRDD1WithColumnNames: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]


In [4]:
// Build a DataFrame with SparkSession.createDataFrame, passing the RDD as its argument.
// The schema is inferred; the column names are then assigned through toDF.
val dfFromRDD2: org.apache.spark.sql.DataFrame = {
  val inferred = spark.createDataFrame(rdd)
  inferred.toDF(columns: _*)
}

dfFromRDD2.printSchema()
dfFromRDD2.show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- weight: integer (nullable = false)
 |-- height: double (nullable = false)

+------------+---+------+------+
|        name|age|weight|height|
+------------+---+------+------+
|        Jose| 40|    74|  1.63|
|María Isabel| 38|    58|  1.65|
|     Antonio| 27|    60|  1.68|
|       Norma| 63|    65|  1.54|
+------------+---+------+------+



dfFromRDD2: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]


In [5]:
// Build a DataFrame straight from the local Seq through the implicit conversion Seq.toDF.
// The schema is inferred and the columns receive the default names: _1, _2, ...
val dfFromData1: org.apache.spark.sql.DataFrame = data.toDF()

dfFromData1.printSchema()
dfFromData1.show()

root
 |-- _1: string (nullable = true)
 |-- _2: integer (nullable = false)
 |-- _3: integer (nullable = false)
 |-- _4: double (nullable = false)

+------------+---+---+----+
|          _1| _2| _3|  _4|
+------------+---+---+----+
|        Jose| 40| 74|1.63|
|María Isabel| 38| 58|1.65|
|     Antonio| 27| 60|1.68|
|       Norma| 63| 65|1.54|
+------------+---+---+----+



dfFromData1: org.apache.spark.sql.DataFrame = [_1: string, _2: int ... 2 more fields]


In [6]:
// Build a DataFrame with SparkSession.createDataFrame, passing the local Seq as its argument.
// The schema is inferred; the column names are then assigned through toDF.
// Declared as a val, like every other DataFrame in this notebook: the reference is never reassigned.
val dfFromData2 = spark.createDataFrame(data).toDF(columns:_*)
dfFromData2.printSchema()
dfFromData2.show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- weight: integer (nullable = false)
 |-- height: double (nullable = false)

+------------+---+------+------+
|        name|age|weight|height|
+------------+---+------+------+
|        Jose| 40|    74|  1.63|
|María Isabel| 38|    58|  1.65|
|     Antonio| 27|    60|  1.68|
|       Norma| 63|    65|  1.54|
+------------+---+------+------+



dfFromData2: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]


In [7]:
import org.apache.spark.sql.types.{StringType, IntegerType, DoubleType, StructField, StructType}
import org.apache.spark.sql.Row

// Explicit schema for the sample data: four columns, all declared nullable.
val schema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true),
  StructField("weight", IntegerType, nullable = true),
  StructField("height", DoubleType, nullable = true)
))

// The same sample records, expressed as Row instances.
val rowData: Seq[Row] = Seq(
  Row("Jose", 40, 74, 1.63),
  Row("María Isabel", 38, 58, 1.65),
  Row("Antonio", 27, 60, 1.68),
  Row("Norma", 63, 65, 1.54)
)

// Alternative, more dynamic way to generate rowData from the tuples in `data`:
// val rowData = data.map(dataElement => Row.fromTuple(dataElement))

// Build a DataFrame with SparkSession.createDataFrame, passing an RDD (sparkContext.parallelize)
// created from the Row instances. The schema is supplied explicitly via the StructType above
// instead of being inferred.
val dfFromData3: org.apache.spark.sql.DataFrame = {
  val rowRdd = spark.sparkContext.parallelize(rowData)
  spark.createDataFrame(rowRdd, schema)
}

dfFromData3.printSchema()
dfFromData3.show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- weight: integer (nullable = true)
 |-- height: double (nullable = true)

+------------+---+------+------+
|        name|age|weight|height|
+------------+---+------+------+
|        Jose| 40|    74|  1.63|
|María Isabel| 38|    58|  1.65|
|     Antonio| 27|    60|  1.68|
|       Norma| 63|    65|  1.54|
+------------+---+------+------+



import org.apache.spark.sql.types.{StringType, IntegerType, DoubleType, StructField, StructType}
import org.apache.spark.sql.Row
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,IntegerType,true), StructField(weight,IntegerType,true), StructField(height,DoubleType,true))
rowData: Seq[org.apache.spark.sql.Row] = List([Jose,40,74,1.63], [María Isabel,38,58,1.65], [Antonio,27,60,1.68], [Norma,63,65,1.54])
dfFromData3: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]


In [8]:
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema

// Sample records as GenericRowWithSchema instances, each one carrying `schema`.
val rowDataWithSchema: Seq[Row] = {
  // Local helper: wraps the given values in a GenericRowWithSchema bound to `schema`.
  def withSchema(values: Any*): Row = new GenericRowWithSchema(values.toArray, schema)

  Seq(
    withSchema("Jose", 40, 74, 1.63),
    withSchema("María Isabel", 38, 58, 1.65),
    withSchema("Antonio", 27, 60, 1.68),
    withSchema("Norma", 63, 65, 1.54)
  )
}

// Alternative, more dynamic way to generate rowDataWithSchema from the tuples in `data`:
// val rowDataWithSchema: Seq[Row] = data.map(dataElement => new GenericRowWithSchema(dataElement.productIterator.toArray, schema))

// Build a DataFrame with SparkSession.createDataFrame, passing an RDD (sparkContext.parallelize)
// created from the Seq of GenericRowWithSchema instances.
// The schema is also passed as a parameter when creating the DataFrame.
val dfFromData4: org.apache.spark.sql.DataFrame = {
  val rowRdd = spark.sparkContext.parallelize(rowDataWithSchema)
  spark.createDataFrame(rowRdd, schema)
}

dfFromData4.printSchema()
dfFromData4.show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- weight: integer (nullable = true)
 |-- height: double (nullable = true)

+------------+---+------+------+
|        name|age|weight|height|
+------------+---+------+------+
|        Jose| 40|    74|  1.63|
|María Isabel| 38|    58|  1.65|
|     Antonio| 27|    60|  1.68|
|       Norma| 63|    65|  1.54|
+------------+---+------+------+



import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
rowDataWithSchema: Seq[org.apache.spark.sql.Row] = List([Jose,40,74,1.63], [María Isabel,38,58,1.65], [Antonio,27,60,1.68], [Norma,63,65,1.54])
dfFromData4: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]


In [9]:
import org.apache.spark.sql.catalyst.expressions.GenericRow

// Convert every tuple in `data` into a GenericRow (a Row that carries no schema of its own).
val rowDataWithoutSchema: Seq[Row] =
  for (record <- data) yield new GenericRow(record.productIterator.toArray)

// Build a DataFrame with SparkSession.createDataFrame, passing an RDD (sparkContext.parallelize)
// created from the Seq of GenericRow instances.
// The schema is passed as a parameter when creating the DataFrame.
val dfFromData5: org.apache.spark.sql.DataFrame = {
  val rowRdd = spark.sparkContext.parallelize(rowDataWithoutSchema)
  spark.createDataFrame(rowRdd, schema)
}

dfFromData5.printSchema()
dfFromData5.show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- weight: integer (nullable = true)
 |-- height: double (nullable = true)

+------------+---+------+------+
|        name|age|weight|height|
+------------+---+------+------+
|        Jose| 40|    74|  1.63|
|María Isabel| 38|    58|  1.65|
|     Antonio| 27|    60|  1.68|
|       Norma| 63|    65|  1.54|
+------------+---+------+------+



import org.apache.spark.sql.catalyst.expressions.GenericRow
rowDataWithoutSchema: Seq[org.apache.spark.sql.Row] = List([Jose,40,74,1.63], [María Isabel,38,58,1.65], [Antonio,27,60,1.68], [Norma,63,65,1.54])
dfFromData5: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]
