# Jonathan Halverson
# Thursday, September 29, 2016
# Review of Spark SQL in Spark 2

We begin by looking at what is available to us:

In [1]:
# List the names defined in the interactive session's namespace.
# The call form of print behaves the same on Python 2 and Python 3.
print(dir())

['In', 'Out', 'SQLContext', 'SparkContext', 'SparkSession', 'StorageLevel', '_', '__', '___', '__builtin__', '__builtins__', '__doc__', '__name__', '__package__', '_dh', '_i', '_i1', '_ih', '_ii', '_iii', '_oh', '_pythonstartup', '_sh', 'atexit', 'exit', 'get_ipython', 'os', 'platform', 'py4j', 'pyspark', 'quit', 'sc', 'spark', 'sqlContext', 'sqlCtx']


We notice from the above that a HiveContext is not available, which is different from Spark 1.6. Note that for the cell below to work, only one pyspark session can be running.

In [2]:
# Read the JSON file into a DataFrame and display its rows as a table.
people = sqlCtx.read.json('person.json')
people.show()

+-------------------+---------+--------+--------------------+
|            address|firstName|lastName|        phoneNumbers|
+-------------------+---------+--------+--------------------+
|[New York,10021,NY]|     John|   Smith|[[212 555-1234,ho...|
| [Buffalo,10541,NY]|    Jimmy|   Cutts|[[567 555-4991,ho...|
+-------------------+---------+--------+--------------------+



In [3]:
# Print the DataFrame's schema (column names, types, nullability) as a tree.
people.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- postalCode: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- phoneNumbers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- number: string (nullable = true)
 |    |    |-- type: string (nullable = true)



In [4]:
# Project the DataFrame down to its firstName column and display the result.
people.select(people.firstName).show()

+---------+
|firstName|
+---------+
|     John|
|    Jimmy|
+---------+



In [5]:
# Keep only the rows whose firstName equals 'John' and display them.
people.where(people['firstName'] == 'John').show()

+-------------------+---------+--------+--------------------+
|            address|firstName|lastName|        phoneNumbers|
+-------------------+---------+--------+--------------------+
|[New York,10021,NY]|     John|   Smith|[[212 555-1234,ho...|
+-------------------+---------+--------+--------------------+



### Run a SQL query

In [6]:
# Expose the DataFrame to Spark SQL under the name "Peoples", then query it.
# createOrReplaceTempView is the Spark 2 replacement for the deprecated
# registerTempTable.
people.createOrReplaceTempView('Peoples')
sqlCtx.sql("""select firstName, lastName from Peoples where firstName == 'John'""").show()

+---------+--------+
|firstName|lastName|
+---------+--------+
|     John|   Smith|
+---------+--------+



### Convert the DataFrame to an RDD

In [7]:
# Access the DataFrame's underlying RDD of Row objects, extract each row's
# firstName, and collect the values into a Python list.
people.rdd.map(lambda row: row.firstName).collect()

[u'John', u'Jimmy']

### Basic RDD operations applied to DataFrames

In [8]:
# Number of rows in the DataFrame.
people.count()

2

In [9]:
# First row of the DataFrame, returned as a single Row object.
people.first()

Row(address=Row(city=u'New York', postalCode=u'10021', state=u'NY'), firstName=u'John', lastName=u'Smith', phoneNumbers=[Row(number=u'212 555-1234', type=u'home')])

In [10]:
# First row again, but take() returns a list of Row objects rather than a bare Row.
people.take(1)

[Row(address=Row(city=u'New York', postalCode=u'10021', state=u'NY'), firstName=u'John', lastName=u'Smith', phoneNumbers=[Row(number=u'212 555-1234', type=u'home')])]

Note that not all basic RDD actions work on DataFrames:

In [11]:
# Both calls are left commented out, apparently because they do not run here.
# NOTE(review): a DataFrame presumably has no countByValue method -- confirm.
#people.countByValue()
# NOTE(review): likely fails because each Row holds a list (phoneNumbers),
# which is unhashable -- confirm.
#people.rdd.countByValue()

### Caching a table for later use

In [12]:
# Cache the table registered under the name "Peoples" for later queries.
sqlCtx.cacheTable("Peoples")

### Creating a DataFrame

In [13]:
from pyspark.sql import Row
# Build an RDD holding one Row per team.
teams = sc.parallelize([Row(name='Blackhawks', mascot='Blacky'), Row(name='Bears', mascot='Bucky')])
# createDataFrame derives the schema from the Rows; it supersedes the older
# inferSchema call.
teams_df = sqlCtx.createDataFrame(teams)
# Make the DataFrame queryable from SQL as "teams". createOrReplaceTempView is
# the Spark 2 replacement for the deprecated registerTempTable.
teams_df.createOrReplaceTempView("teams")
teams_df.show()

+------+----------+
|mascot|      name|
+------+----------+
|Blacky|Blackhawks|
| Bucky|     Bears|
+------+----------+



In [14]:
# Query the "teams" table through SQL and display every row.
sqlCtx.sql("""select * from teams""").show()

+------+----------+
|mascot|      name|
+------+----------+
|Blacky|Blackhawks|
| Bucky|     Bears|
+------+----------+

