# Using Spark SQL

### Create a SQL context from the SparkContext

In [1]:
// Build the SQLContext on top of the existing SparkContext (sc).
import org.apache.spark.sql.SQLContext
val sqlContext = new SQLContext(sc)

// Bring the context's implicit conversions into scope
// (they allow an RDD to be converted to a DataFrame implicitly).
import sqlContext.implicits._

### Create RDD, DataFrame and Table

In [2]:
// Row and the Spark SQL data types needed to describe the schema.
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Load the raw text file as an RDD with one element per line.
val people = sc.textFile("data/people.txt")

// Column names, space-separated, from which the schema is derived.
val schemaString = "name age"

// Build the schema: one nullable string column per name in schemaString.
val schema = {
  val fields = schemaString
    .split(" ")
    .map(columnName => StructField(columnName, StringType, nullable = true))
  StructType(fields)
}

// Turn each comma-separated line into a Row; the second column is trimmed.
val rowRDD = people.map { line =>
  val columns = line.split(",")
  Row(columns(0), columns(1).trim)
}

// Combine the rows with the schema to obtain a DataFrame.
val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)

// Expose the DataFrame to SQL queries under the table name "people".
peopleDataFrame.registerTempTable("people")



### Using SQL to query a table

In [3]:
// Queries are submitted through sqlContext.sql.
val results = sqlContext.sql("SELECT name FROM people")

// The query result can be mapped over row by row; here the first column
// of each row is read by position and printed with a "Name: " prefix.
results.map(row => s"Name: ${row(0)}").collect().foreach(println)

// Select every column and print each collected row as-is.
val results2 = sqlContext.sql("SELECT * FROM people")
for (row <- results2.collect()) println(row)


Name: Michael
Name: Andy
Name: Justin
Name: Michael
[Michael,29]
[Andy,30]
[Justin,19]
[Michael,29]


### Using DataFrame commands

In [4]:
// Print the DataFrame's schema as a tree: each column's name, type and nullability.
peopleDataFrame.printSchema()
// Print the DataFrame's rows as a text table with a header row of column names.
peopleDataFrame.show()

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
|Michael| 29|
+-------+---+



checked