In [9]:
// Create DF
// Load the 2015 flight summary JSON file through the session's reader,
// using the `json` shortcut on the reader.
val df = spark.read.json("guide/data/flight-data/json/2015-summary.json")

df = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [2]:
// Print df's schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [4]:
// Enforce a specific Schema


import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}
import org.apache.spark.sql.types.Metadata

In [9]:
// Hand-built schema for the flight data: two string columns declared
// nullable, plus a long `count` column declared non-nullable that carries a
// small JSON metadata object.
val myManualSchema = StructType(
    Array(
        StructField(name = "DEST_COUNTRY_NAME", dataType = StringType, nullable = true),
        StructField(name = "ORIGIN_COUNTRY_NAME", dataType = StringType, nullable = true),
        StructField(
            name = "count",
            dataType = LongType,
            nullable = false,
            metadata = Metadata.fromJson("""{"hello":"world"}""")
        )
    )
)

myManualSchema = StructType(StructField(DEST_COUNTRY_NAME,StringType,true), StructField(ORIGIN_COUNTRY_NAME,StringType,true), StructField(count,LongType,false))


StructType(StructField(DEST_COUNTRY_NAME,StringType,true), StructField(ORIGIN_COUNTRY_NAME,StringType,true), StructField(count,LongType,false))

In [10]:
// Read the same JSON file again, this time handing myManualSchema to the
// reader before loading.
val df = spark.read
    .schema(myManualSchema)
    .format("json")
    .load("guide/data/flight-data/json/2015-summary.json")

df = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [3]:
// Columns

import org.apache.spark.sql.functions.{col, column}

In [4]:
// Refer to a column by name with `col`; the cell echoes the name back.
col("someColumnName")

someColumnName

In [5]:
// Same reference built with `column`; the echoed result matches the `col` form.
column("someColumnName")

someColumnName

In [6]:
// Syntactic sugar: the `$"..."` string-interpolator shorthand for a column reference.
// NOTE(review): presumably relies on the session's implicits being in scope; no such import is visible in these cells -- confirm.

$"myColumn"

myColumn

In [8]:
// Symbol-literal form. NOTE(review): the echo keeps the leading quote, so this cell appears to
// yield a plain Scala Symbol rather than a Column -- confirm before relying on it.
'myColumn

'myColumn

In [10]:
// col refs: fetch the `count` column through the DataFrame itself instead of the
// free-standing `col` function.


df.col("count")

count

In [11]:
// Compose a column expression from `col` references using arithmetic and comparison operators.
(((col("someCol") + 5) * 200) - 6) < col("otherCol")

((((someCol + 5) * 200) - 6) < otherCol)

In [16]:
import org.apache.spark.sql.functions.expr

// The same expression written as a string and handed to `expr`; the echoed result is
// identical to the one produced by the operator-built version.
expr("(((someCol + 5) * 200) - 6) < otherCol")

((((someCol + 5) * 200) - 6) < otherCol)

In [19]:
// schemas
// Column names of the flight summary file, read without binding the
// DataFrame to a name.
spark.read
    .format("json")
    .load("guide/data/flight-data/json/2015-summary.json")
    .columns

Array(DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count)

In [20]:
// records: fetch the first record of df.

df.first()

[United States,Romania,15]

In [21]:
import org.apache.spark.sql.Row

// Build a Row by hand from four values of mixed kinds: a String, null, an Int and a Boolean.
val myRow = Row("Hello", null, 1, false)

myRow = [Hello,null,1,false]


[Hello,null,1,false]

In [22]:
// Positional access: index 0 is the first value given to Row(...).
myRow(0)

Hello

In [23]:
// Index 1 holds the null that was passed in; the kernel reports the result as type Any.
myRow(1)

res44: Any = null


In [24]:
// Index 2 is the integer value.
myRow(2)

1

In [25]:
// Index 3 is the boolean value, the last one in the row.
myRow(3)

false

In [26]:
// Deliberately out of range: myRow has four values (indices 0-3), so index 4 throws
// java.lang.ArrayIndexOutOfBoundsException.
myRow(4)

Name: java.lang.ArrayIndexOutOfBoundsException
Message: 4
StackTrace:   at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:174)
  at org.apache.spark.sql.Row$class.apply(Row.scala:163)
  at org.apache.spark.sql.catalyst.expressions.GenericRow.apply(rows.scala:166)

In [30]:
// creating dataframes
// Read the flight summary JSON with the reader's `json` shortcut, then
// register the result as the temp view "dfTable".
val df = spark.read.json("guide/data/flight-data/json/2015-summary.json")
df.createOrReplaceTempView("dfTable")


df = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [31]:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}

In [32]:
// Schema for the hand-made DataFrame: string columns `some` and `col`
// declared nullable, and a long column `names` declared non-nullable.
// Built through the companion `apply` methods rather than `new`.
val myManualSchema = StructType(
    Array(
        StructField("some", StringType, nullable = true),
        StructField("col", StringType, nullable = true),
        StructField("names", LongType, nullable = false)
    )
)


myManualSchema = StructType(StructField(some,StringType,true), StructField(col,StringType,true), StructField(names,LongType,false))


StructType(StructField(some,StringType,true), StructField(col,StringType,true), StructField(names,LongType,false))

In [33]:

// One hand-written Row with three values; the last is written as the Long literal 1L.
val myRows = Seq(Row("Hello", null, 1L))




myRows = List([Hello,null,1])


List([Hello,null,1])

In [34]:
// Turn the local Seq of Rows into an RDD via the session's SparkContext.
val myRDD = spark.sparkContext.parallelize(myRows)


myRDD = ParallelCollectionRDD[19] at parallelize at <console>:35


ParallelCollectionRDD[19] at parallelize at <console>:35

In [36]:
// Combine the RDD of Rows with myManualSchema to produce a DataFrame.
val myDF = spark.createDataFrame(myRDD, myManualSchema)

myDF = [some: string, col: string ... 1 more field]


[some: string, col: string ... 1 more field]

In [38]:
// Render myDF as a text table.
myDF.show()

+-----+----+-----+
| some| col|names|
+-----+----+-----+
|Hello|null|    1|
+-----+----+-----+

