Use of the `au.com.d2dcrc.carbon.spark.gsr` datasource without a `version` parameter is deprecated. 
Please specify one of

   * `"1.0"` - Old behaviour. This should be avoided as any potential mismatched records are not included
     in the dataset. That is, any event that is encoded multiple times with differing field values
     (for example, because multiple population groups legitimately participated in the event) is
     excluded from the dataset.

   * `"1.1"` - Same as 2.0, but with missing columns present in 1.0 populated using some row of the
     'records' array column. This is purely to maintain API compatibility but should be avoided
     as you are not getting the full picture (e.g. if an event had multiple population groups).

   * `"2.0"` - New behaviour. Multiple encodings of the same event are grouped by `"firstReportedLink"`
     and `"eventDate"`.

   * `"latest"` - an alias for the latest version
   
Default is `version="1.1"`

In [1]:
# Load the GSR events for July 2016 without pinning a datasource version.
# Omitting `version` falls back to the default ("1.1"); add
# .option("version", "1.1") to the chain to specify it explicitly.
dfDefault = (
    sqlContext.read
    .format("au.com.d2dcrc.carbon.spark.gsr")
    .option("startTime", "2016-07-01T00:00:00Z")
    .option("endTime", "2016-08-01T00:00:00Z")
    .load()
)
dfDefault.printSchema()

root
 |-- firstReportedLink: string (nullable = true)
 |-- key:eventDate: date (nullable = true)
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- id: long (nullable = false)
 |    |    |-- authorId: integer (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- timestampRevision: timestamp (nullable = true)
 |    |    |-- widespreadEventId: long (nullable = true)
 |    |    |-- eventType: string (nullable = true)
 |    |    |-- populationGroup: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- earliestReportedDate: date (nullable = true)
 |    |    |-- crowdSize: string (nullable = true)
 |    |    |-- violent: boolean (nullable = true)
 |    |    |-- newsSourceName: string (nullable = true)
 |    |    |-- headline: string (nullable = true)
 |    |    |-- englishHeadline: 

In [2]:
# Load the same July 2016 window with the old ("1.0") behaviour for comparison.
dfOld = (
    sqlContext.read
    .format("au.com.d2dcrc.carbon.spark.gsr")
    .option("version", "1.0")
    .option("startTime", "2016-07-01T00:00:00Z")
    .option("endTime", "2016-08-01T00:00:00Z")
    .load()
)
dfOld.printSchema()

root
 |-- key:id: long (nullable = true)
 |-- authorId: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampRevision: timestamp (nullable = true)
 |-- widespreadEventId: long (nullable = true)
 |-- eventType: string (nullable = true)
 |-- populationGroup: string (nullable = true)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)
 |-- key:eventDate: date (nullable = true)
 |-- earliestReportedDate: date (nullable = true)
 |-- crowdSize: string (nullable = true)
 |-- isViolent: boolean (nullable = true)
 |-- newsSourceName: string (nullable = true)
 |-- headline: string (nullable = true)
 |-- englishHeadline: string (nullable = true)
 |-- eventDescription: string (nullable = true)
 |-- firstReportedLink: string (nullable = true)
 |-- otherLinks: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- ingested: boolean (nullable = true)
 |-- currentValidationState: string (nullable = tr

In [3]:
# Load the same July 2016 window with the new ("2.0") behaviour.
dfNew = (
    sqlContext.read
    .format("au.com.d2dcrc.carbon.spark.gsr")
    .option("version", "2.0")
    .option("startTime", "2016-07-01T00:00:00Z")
    .option("endTime", "2016-08-01T00:00:00Z")
    .load()
)
dfNew.printSchema()

root
 |-- firstReportedLink: string (nullable = true)
 |-- eventDate: date (nullable = true)
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- id: long (nullable = false)
 |    |    |-- authorId: integer (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- timestampRevision: timestamp (nullable = true)
 |    |    |-- widespreadEventId: long (nullable = true)
 |    |    |-- eventType: string (nullable = true)
 |    |    |-- populationGroup: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- earliestReportedDate: date (nullable = true)
 |    |    |-- crowdSize: string (nullable = true)
 |    |    |-- violent: boolean (nullable = true)
 |    |    |-- newsSourceName: string (nullable = true)
 |    |    |-- headline: string (nullable = true)
 |    |    |-- englishHeadline: stri

In [4]:
# Report how many rows the version "2.0" DataFrame contains.
print("Total rows: {}".format(dfNew.count()))

Total rows: 375


In [5]:
# Keep only the rows whose `records` array holds more than one entry,
# i.e. events that were encoded more than once.
dfMismatched = dfNew.where("size(records) > 1")
print("Mismatched records: {}".format(dfMismatched.count()))

Mismatched records: 75


In [6]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [7]:
def populationGroup(rows):
    """Return the ``populationGroup`` value of every record in ``rows``.

    Written to be wrapped as a Spark UDF over the ``records`` array column.

    :param rows: iterable of records supporting ``row["populationGroup"]``
        lookup, or ``None`` when the array value is null.
    :return: list of population groups in the same order as ``rows``, or
        ``None`` when ``rows`` is ``None``.
    """
    # The `records` column is declared nullable; iterating over None would
    # raise TypeError, so propagate the null instead of failing.
    if rows is None:
        return None
    return [row["populationGroup"] for row in rows]
    
# Expose the plain-Python extractor as a Spark UDF that returns array<string>.
populationGroupsUdf = udf(populationGroup, returnType=ArrayType(StringType()))

# For each multi-record event keep its date and the list of population groups
# taken from its records.
dfSelect = dfMismatched.select(
    dfMismatched["eventDate"],
    populationGroupsUdf(dfMismatched["records"]).alias("populationGroups"),
)
    

In [8]:
# Show the schema of the projected DataFrame (eventDate + populationGroups).
dfSelect.printSchema()

root
 |-- eventDate: date (nullable = true)
 |-- populationGroups: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [9]:
# Preview the first 4 rows of the projected DataFrame.
dfSelect.show(4)

+----------+--------------------+
| eventDate|    populationGroups|
+----------+--------------------+
|2016-07-12|[General, Religious]|
|2016-07-05|[Religious, General]|
|2016-07-05|  [General, General]|
|2016-07-18|   [Ethnic, General]|
+----------+--------------------+
only showing top 4 rows



In [10]:
# Flatten the populationGroups array: one output row per (eventDate, populationGroup) pair.
dfExplode = dfSelect.select(
    dfSelect["eventDate"],
    explode(dfSelect["populationGroups"]).alias("populationGroup"),
)

In [11]:
# Show the schema of the exploded DataFrame (eventDate + populationGroup).
dfExplode.printSchema()

root
 |-- eventDate: date (nullable = true)
 |-- populationGroup: string (nullable = true)



In [12]:
# Preview the first 8 rows of the exploded DataFrame.
dfExplode.show(8)

+----------+---------------+
| eventDate|populationGroup|
+----------+---------------+
|2016-07-12|        General|
|2016-07-12|      Religious|
|2016-07-05|      Religious|
|2016-07-05|        General|
|2016-07-05|        General|
|2016-07-05|        General|
|2016-07-18|         Ethnic|
|2016-07-18|        General|
+----------+---------------+
only showing top 8 rows

