### Structured API Overview

In [None]:
//DataFrame with one column containing 1000 rows with values from 0 to 999.
val myRange=spark.range(1000).toDF("number")
val divisBy2 = myRange.where("number % 2 = 0")

In [17]:
divisBy2.count()

500

In [None]:
//DataFrameReader (via spark.read) interface, specify the format and the path.
val flightData2015 = spark.read
.json("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/json/2015-summary.json")
val sortedFlightData2015 = flightData2015.sort("count")

In [18]:
sortedFlightData2015.take(2)

Array([United States,Singapore,1], [Moldova,United States,1])

In [None]:
//using "option" for our DataFrameReader. Options allow you to control how you read in a given file format and
//tell Spark to take advantage of some of the structures or information available in the files. 
//In this case we’re going to use two popular options "inferSchema and header".
val flightData2015 = spark.read
.option("inferSchema", "true")
.option("header", "true")
.csv("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/csv/2015-summary.csv")
flightData2015

In [None]:
//JSON and CSVs qualify as semi-structured data formats and
//json provides a bit more structure than CSVs because JSON has a notion of types.
val jsonSchema = spark
.read.format("json")
.load("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/json/2015-summary.json")
.schema

In [19]:
println(jsonSchema)

StructType(StructField(DEST_COUNTRY_NAME,StringType,true), StructField(ORIGIN_COUNTRY_NAME,StringType,true), StructField(count,LongType,true))


In [None]:
//The first row in our csv file is the header (column names) and because CSV files are not guaranteed to have this information we must specify it manually. 
//The inferSchema option might feel a bit more unfamiliar. JSON objects provides a bit more structure than csvs because JSON has a notion of types. We can get
//past this by infering the schema of the csv file we are reading in.
val csvSchema = spark
.read.format("csv")
.option("inferSchema", "true")
.option("header", "true")
.load("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/csv/2015-summary.csv")
.schema

In [20]:
println(csvSchema)

StructType(StructField(DEST_COUNTRY_NAME,StringType,true), StructField(ORIGIN_COUNTRY_NAME,StringType,true), StructField(count,IntegerType,true))


In [None]:
//Any DataFrame can be made into a table or view with one simple method call.
flightData2015.createOrReplaceTempView("flight_data_2015")

In [None]:
//we can query our data in SQL. To execute a SQL query, we’ll use the "spark.sql" function  that conveniently, returns a new DataFrame. (remember spark is our SparkSession variable?)
val sqlWay = spark.sql("""SELECT DEST_COUNTRY_NAME, count(1) FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME """)
val dataFrameWay = flightData2015.groupBy('DEST_COUNTRY_NAME).count()

In [21]:
sqlWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#36, 200)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#36] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/csv/2..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [22]:
dataFrameWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#36, 200)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#36] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/csv/2..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [23]:
spark.sql("SELECT max(count) from flight_data_2015").take(1)

Array([370002])

In [None]:
val maxSql = spark.sql("""SELECT DEST_COUNTRY_NAME, sum(count) as destination_total 
FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME ORDER BY sum(count) DESC LIMIT 5""")
maxSql.collect()

In [24]:
import org.apache.spark.sql.functions.desc
val maxDF=flightData2015
.groupBy("DEST_COUNTRY_NAME")
.sum("count")
.withColumnRenamed("sum(count)", "destination_total")
.sort(desc("destination_total"))
.limit(5)
.explain()

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#149L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#36,destination_total#149L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[sum(cast(count#38 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#36, 200)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#36], functions=[partial_sum(cast(count#38 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#36,count#38] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/csv/2..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


maxDF: Unit = ()
