// Databricks notebook source exported at Wed, 17 Feb 2016 20:50:15 UTC
 #### Business question:

* Question # 1) How many Edits occur every 3 seconds to the English Wikipedia vs. another language?

In [1]:
import org.apache.spark._
import org.apache.spark.storage._
import org.apache.spark.streaming._
import org.apache.spark.sql.functions._

In [2]:
// The batch interval sets how we collect data for, before analyzing it in a batch
val BatchInterval = Seconds(3)

// We'll use a unique server for the English edit stream
val EnglishStreamingServerHost  = "52.89.53.194"
val EnglishStreamingServerPort  = 9002 //en

// We'll use a unique server for all the other language edit streams
val MiscLangStreamingServerHost  = "54.68.10.240"

val SpanishStreamingServerPort  = 9007 //es
val GermanStreamingServerPort  = 9003 //de
val FrenchStreamingServerPort  = 9004 //fr
val RussianStreamingServerPort  = 9005 //ru
val ItalianStreamingServerPort  = 9006 //it

In [3]:
sc

org.apache.spark.SparkContext = org.apache.spark.SparkContext@74babce8

 Create a new `StreamingContext`, using the SparkContext and batch interval:

In [4]:
val ssc = new StreamingContext(sc, BatchInterval)

 Create two DStreams, one for English and another for a language of your choosing:

In [5]:
val baseEnDSTREAM = ssc.socketTextStream(EnglishStreamingServerHost, EnglishStreamingServerPort)

In [6]:
val baseDeDSTREAM = ssc.socketTextStream(MiscLangStreamingServerHost, GermanStreamingServerPort)

 For each DStream, parse the incoming JSON and register a new temporary table every batch interval:

In [7]:
// Create an English temp table at every 3 sec batch interval
baseEnDSTREAM.foreachRDD { rdd =>
  if(! rdd.isEmpty) {
    sqlContext.read.json(rdd).registerTempTable("English_Edits")
  }
}

  baseDeDSTREAM.foreachRDD { rdd => 
    
    if (! rdd.isEmpty) {
      sqlContext.read.json(rdd).registerTempTable("German_Edits")
    }
  }

In [8]:
  ssc.remember(Minutes(1))  // To make sure data is not deleted by the time we query it interactively

In [9]:
ssc.start

In [16]:
sqlContext.sql("select * from English_Edits").count

Long = 16

In [32]:
sqlContext.sql("select page from English_Edits").collect foreach println

                                                                                [User:Masterknighted/sandbox]
[Category:Pages with reference errors]
[Category:CS1 maint: Explicit use of et al.]
[Category:Pages with reference errors]
[Category:Atchison, Topeka and Santa Fe Railway stations in San Bernardino County, California]
[Category:Atchison, Topeka and Santa Fe Railway stations]
[Special:Log/thanks]
[Citizenship in the United States]
[DaVarryl Williamson]


In [15]:
sqlContext.sql("select * from German_Edits").count

Long = 3

In [42]:
sqlContext.sql("select * from English_Edits").printSchema

root
 |-- anonymous: boolean (nullable = true)
 |-- channel: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- delta: long (nullable = true)
 |-- flag: string (nullable = true)
 |-- namespace: string (nullable = true)
 |-- newPage: boolean (nullable = true)
 |-- page: string (nullable = true)
 |-- pageUrl: string (nullable = true)
 |-- robot: boolean (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- unpatrolled: boolean (nullable = true)
 |-- url: string (nullable = true)
 |-- user: string (nullable = true)
 |-- userUrl: string (nullable = true)
 |-- wikipedia: string (nullable = true)
 |-- wikipediaLong: string (nullable = true)
 |-- wikipediaShort: string (nullable = true)
 |-- wikipediaUrl: string (nullable = true)



### Steve: not sure what this does.  there is a count(*) with no group by.

In [None]:
sqlContext.sql("""SELECT first(channel) AS Language, Count(*) AS Edit_Count FROM English_Edits
UNION
SELECT first(channel) AS Language, Count(*)  AS Edit_Count FROM German_Edits"""""").show

In [45]:
sqlContext.sql(""" 
select "english" AS language, substring(timestamp, 0, 19) as timestamp, count(*) AS count from English_Edits GROUP BY timestamp UNION ALL
select "german" AS language, substring(timestamp, 0, 19) as timestamp, count(*) AS count from German_Edits GROUP BY timestamp""").show 

+--------+-------------------+-----+
|language|          timestamp|count|
+--------+-------------------+-----+
| english|2016-02-17T23:35:05|    1|
| english|2016-02-17T23:35:05|    1|
| english|2016-02-17T23:35:05|    1|
| english|2016-02-17T23:35:05|    1|
|  german|2016-02-17T23:35:04|    1|
+--------+-------------------+-----+



In [46]:
// Optional just to stop
StreamingContext.getActive.foreach { _.stop(stopSparkContext = false) }