In [1]:
from pyspark import SparkContext
from pyspark.sql import HiveContext
# Entry point for this notebook: a SparkContext whose master URL is 'local[4]'.
sc = SparkContext(master='local[4]')

- **HiveContext offers everything SQLContext does, plus additional features**
- **Therefore, in practice, always use HiveContext**

In [2]:
# Build a HiveContext on top of the SparkContext `sc`; later cells use it
# to load JSON (jsonRDD) and to run SQL queries (sql).
hive_contxt = HiveContext(sc)

- **Make an RDD where each item is a JSON object in string form**

In [3]:
# Sample records kept as raw JSON strings. Each has a nested "address" object;
# the first record has no "hobbies" key while the other two do.
people_json = [
    '{"name":"Yin", "address":{"city":"SF","state":"CA"}}',
    '{"name":"Mike", "address":{"city":"SE", "state":"WA"}, "hobbies":["coding", "fishing"]}',
    '{"name":"Mary", "address":{"city":"SE", "state":"WA"}, "hobbies":["playing chess"]}',
]

# Distribute the strings as an RDD with one JSON document per element.
rdd = sc.parallelize(people_json)

- **Make a SchemaRDD (can be queried with SQL) from the RDD of JSON strings**

In [4]:
# Hand the RDD of JSON strings to the HiveContext; the result (json_rdd)
# carries the schema printed in the next cell and can be registered as a table.
json_rdd = hive_contxt.jsonRDD(rdd)

- **Print the schema of the SchemaRDD**
- **Note that the first row does not have the field `hobbies` (the second row does), and yet the inferred schema includes it**
- **By default HiveContext jsonRDD infers the schema by looking at all the rows**

In [5]:
# Print the schema of json_rdd as an indented tree (fields, types, nullability).
json_rdd.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- name: string (nullable = true)



- **Now register the SchemaRDD as a table, so we can query it**

In [6]:
# Expose json_rdd to SQL under the table name 'people'; the queries below
# refer to it with `from people` / `FROM people`.
json_rdd.registerTempTable('people')

- **We can query the nested dictionary of the field `address` by specifying `address.city` for example**

In [7]:
# Dotted paths (address.city, address.state) select fields from inside the
# nested `address` struct of the 'people' table.
people_query = 'SELECT name, address.city, address.state from people'
everyone_rdd = hive_contxt.sql(people_query)

- **Let's see the result**

In [8]:
# Fetch the query result as a Python list of Row objects for display.
everyone_rdd.collect()

[Row(name=u'Yin', city=u'SF', state=u'CA'),
 Row(name=u'Mike', city=u'SE', state=u'WA'),
 Row(name=u'Mary', city=u'SE', state=u'WA')]

In [9]:
# Serialize each row of the query result to a JSON string,
# then collect those strings for display.
everyone_json_rdd = everyone_rdd.toJSON()
everyone_json_rdd.collect()

['{"name":"Yin","city":"SF","state":"CA"}',
 '{"name":"Mike","city":"SE","state":"WA"}',
 '{"name":"Mary","city":"SE","state":"WA"}']

In [10]:
# Convert each Row of the query result into a plain dictionary via asDict(),
# then collect the dictionaries for display.
everyone_dicts_rdd = everyone_rdd.map(lambda record: record.asDict())
everyone_dicts_rdd.collect()

[{u'city': u'SF', u'name': u'Yin', u'state': u'CA'},
 {u'city': u'SE', u'name': u'Mike', u'state': u'WA'},
 {u'city': u'SE', u'name': u'Mary', u'state': u'WA'}]

- **We can even query a field that is an Array by using `LATERAL VIEW explode` in Hive**

In [11]:
# LATERAL VIEW explode(hobbies) exposes the elements of the `hobbies` array
# as an extra column named `hobby`; the WHERE clause keeps only 'coding'.
coding_query = """SELECT * FROM people
                          LATERAL VIEW explode(hobbies) h as hobby
                          WHERE hobby = 'coding'"""
rdd2 = hive_contxt.sql(coding_query)

In [12]:
# Fetch the matching rows as a Python list of Row objects; each row keeps the
# original columns plus the `hobby` column added by the lateral view.
rdd2.collect()

[Row(address=Row(city=u'SE', state=u'WA'), hobbies=[u'coding', u'fishing'], name=u'Mike', hobby=u'coding')]