In [4]:
%showTypes on

Types will be printed.


In [5]:
sc

org.apache.spark.SparkContext = org.apache.spark.SparkContext@13a87f6f


In [3]:
// Report the Spark version, the master URL, and whether this context runs in local mode.
println(s"${sc.version}")
println(s"${sc.master}")
println(s"Running locally? ${sc.isLocal}")

2.4.3
local[*]
Running locally? true


In [13]:
/**
 * Prints an informational message to standard output.
 *
 * @param message the text to print
 * @return the same `message`, so the call can be used as an expression
 */
def info(message: String): String = {
    Console.println(message)
    message
}

info: (message: String)String


In [14]:
/**
 * Prints `message` inside a banner of asterisks so it stands out in the output.
 *
 * The banner is a triple-quoted, interpolated string; `stripMargin` removes the
 * leading whitespace and the `|` marker from each line.
 *
 * @param message the error description to embed in the banner
 * @return the complete banner text that was printed
 */
def error(message: String): String = {
    val banner = s"""
        |***********
        |
        | ERROR: $message
        |
        |***********
        """.stripMargin
    println(banner)
    banner
}

error: (message: String)String


In [7]:
val infoString = info("All is well.")

All is well.


infoString: String = All is well.


infoString: String = All is well.


In [8]:
val errorString = error("Uh oh...")


***********

 ERROR: Uh oh...

***********
        


errorString: String = 


"
***********
 ERROR: Uh oh...
***********
        "


errorString: String = 


In [9]:
errorString

"
***********
 ERROR: Uh oh...
***********
        "


String = 


In [10]:
// String interpolation: the `s` prefix enables `$name` and `${expression}` substitution.
// Triple quotes let the literal span several lines.
s"""Use braces for expressions: ${sc.version}.
You can omit the braces when just using a variable: $sc
However, watch for ambiguities like ${sc}andextrastuff"""

Use braces for expressions: 2.4.3.
You can omit the braces when just using a variable: org.apache.spark.SparkContext@32c3891d
However, watch for ambiguities like org.apache.spark.SparkContext@32c3891dandextrastuff


String = 


In [17]:
// `stripMargin` removes, on each line, the leading whitespace up to and including
// the FIRST `|`. A second `|` later on the line (as in "line 3") is ordinary text.
s"""
    |line 1
    |  line 2
    |  | line 3
    |""".stripMargin

"
line 1
  line 2
  | line 3
"


String = 


In [14]:
// Same as the previous cell, with one more space after the second `|`.
// Only the first `|` on each line is treated as the margin marker.
s"""
    |line 1
    |  line 2
    |  |  line 3
    |""".stripMargin

"
line 1
  line 2
  |  line 3
"


String = 


In [18]:
'/'

Char = /


In [19]:
"/"

String = /


In [2]:
import java.io.File

In [3]:
val shakespeare = new File("/home/jovyan/work/data-jesfs/shakespeare")

shakespeare = /home/jovyan/work/data-jesfs/shakespeare


/home/jovyan/work/data-jesfs/shakespeare

In [15]:
// `if` is an expression in Scala: each branch reports through info/error and then
// yields the Boolean that becomes the value of `success`.
val success = if (shakespeare.exists) {
    info(s"$shakespeare exists")
    true
} else {
    error(s"Data directory path doesn't exist! $shakespeare")
    false
}
println(s"success = $success")

/home/jovyan/work/data-jesfs/shakespeare exists
success = true


success: Boolean = true


success: Boolean = true


In [16]:
// Values reused by later cells: the platform path separator, the data directory
// as a string, and the names of the play files expected inside it.
val pathSeparator = File.separator
val targetDirName = shakespeare.toString
val plays = Seq(
    "tamingoftheshrew", "comedyoferrors", "loveslabourslost", "midsummersnightsdream",
    "merrywivesofwindsor", "muchadoaboutnothing", "asyoulikeit", "twelfthnight")

if (success) {
    println(s"Checking that the plays are in $shakespeare:")
    // Build each expected path, keep only those with no file on disk,
    // and turn every missing path into a message.
    val failures = plays
        .map(play => targetDirName + pathSeparator + play)
        .filterNot(playFileName => new File(playFileName).exists)
        .map(playFileName => s"$playFileName:\tNOT FOUND!!")

    println("Finished!")
    if (failures.isEmpty) {
        info("All plays found!")
    } else {
        println("The following expected plays were not found:")
        failures.foreach(error(_))
    }
}

Checking that the plays are in /home/jovyan/work/data-jesfs/shakespeare:
Finished!
All plays found!


pathSeparator: String = /
targetDirName: String = /home/jovyan/work/data-jesfs/shakespeare
plays: Seq[String] = List(tamingoftheshrew, comedyoferrors, loveslabourslost, midsummersnightsdream, merrywivesofwindsor, muchadoaboutnothing, asyoulikeit, twelfthnight)


Any = All plays found!


In [2]:
// `foreach` takes a function; here the `println` method itself is passed,
// so it is invoked once per element of `plays`.
println("Pass prinln as the function to use for each element:")

plays.foreach(println)

Pass prinln as the function to use for each element:
tamingoftheshrew
comedyoferrors
loveslabourslost
midsummersnightsdream
merrywivesofwindsor
muchadoaboutnothing
asyoulikeit
twelfthnight


In [3]:
// Same traversal, written with an explicit anonymous function `str => println(str)`.
println("Anonymous function   println `str => println(str)`")
plays.foreach(str => println(str))

Anonymous function   println `str => println(str)`
tamingoftheshrew
comedyoferrors
loveslabourslost
midsummersnightsdream
merrywivesofwindsor
muchadoaboutnothing
asyoulikeit
twelfthnight


In [4]:
// Same traversal again; `_` stands in for the single, unnamed function argument.
println("\nWhy do we need to name this argument? Scala lets us use _ as a placeholder.")
plays.foreach(println(_))



Why do we need to name this argument? Scala lets us use _ as a placeholder.
tamingoftheshrew
comedyoferrors
loveslabourslost
midsummersnightsdream
merrywivesofwindsor
muchadoaboutnothing
asyoulikeit
twelfthnight


In [5]:
// A function literal passed as the only argument may be wrapped in braces instead of
// parentheses. The parameter type is spelled out here as `(str: String)`.
println("\nFor longer functions, you can use {...} instead of (...).")
println("Why? Because it gives you the familiar multiline block syntax with {...}")
plays.foreach {
  (str: String) => println(str)
}


For longer functions, you can use {...} instead of (...).
Why? Because it gives you the familiar multiline block syntax with {...}
tamingoftheshrew
comedyoferrors
loveslabourslost
midsummersnightsdream
merrywivesofwindsor
muchadoaboutnothing
asyoulikeit
twelfthnight


In [8]:

println("\nThe _ placeholder can be used *once* for each argument in the list.")
println("As an assume, use `reduceLeft` to sum some integers.")

// `0 to 10` is an inclusive range (0 through 10).
val integers = 0 to 10
// Combine the elements pairwise from the left with a named two-argument function.
integers.reduceLeft((i,j) => i+j)



The _ placeholder can be used *once* for each argument in the list.
As an assume, use `reduceLeft` to sum some integers.


integers: scala.collection.immutable.Range.Inclusive = Range(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)


Int = 55


In [9]:
// Equivalent to `(i,j) => i+j`: the first `_` is the first argument, the second `_` the second.
integers.reduceLeft(_+_)

Int = 55


Inverted Index

In [14]:
// First pass at the inverted index, written as one chain:
//   (path, contents)  ->  ((word, fileName), 1)  ->  ((word, fileName), count)
//   ->  (word, (fileName, count))  ->  grouped and sorted by word
//   ->  (word, "(fileName,count),...") with locations ordered by descending count, then name.
// Empty tokens are not filtered out and case is not normalized in this version.
val iiFirstPass1 = sc.wholeTextFiles(shakespeare.toString)
                    .flatMap { location_contents_tuple2 =>
                        val words = location_contents_tuple2._2.split("""\W+""")
                        // Split on a literal "/": String.split takes a regular expression, so
                        // File.separator is not a safe argument (on Windows it is "\", an invalid
                        // regex), and the path keys use "/" as the separator.
                        val fileName = location_contents_tuple2._1.split("/").last
                        words.map(word => ((word, fileName), 1))
                        }
                    .reduceByKey((count1, count2) => count1 + count2)
                    .map { word_file_count_tup3 =>
                        (word_file_count_tup3._1._1, (word_file_count_tup3._1._2, word_file_count_tup3._2))
                        }
                    .groupByKey
                    .sortByKey(ascending = true)
                    .mapValues { iterable =>
                        // Negating the count sorts the most frequent locations first.
                        val vect = iterable.toVector.sortBy { file_count_tup2 =>
                            (-file_count_tup2._2, file_count_tup2._1)
                        }
                        vect.mkString(",")
                    }

iiFirstPass1: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[9] at mapValues at <console>:44


iiFirstPass1: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[9] at mapValues at <console>:44


In [15]:
iiFirstPass1.take(3)

Array[(String, String)] = Array(("",(asyoulikeit,1),(comedyoferrors,1),(loveslabourslost,1),(merrywivesofwindsor,1),(midsummersnightsdream,1),(muchadoaboutnothing,1),(tamingoftheshrew,1),(twelfthnight,1)), (A,(loveslabourslost,78),(tamingoftheshrew,59),(twelfthnight,47),(comedyoferrors,42),(midsummersnightsdream,39),(merrywivesofwindsor,38),(asyoulikeit,34),(muchadoaboutnothing,31)), (ABOUT,(muchadoaboutnothing,18)))


Step by step

In [19]:
val fileContents = sc.wholeTextFiles(shakespeare.toString)

fileContents: org.apache.spark.rdd.RDD[(String, String)] = /home/jovyan/work/data-jesfs/shakespeare MapPartitionsRDD[3] at wholeTextFiles at <console>:30


fileContents: org.apache.spark.rdd.RDD[(String, String)] = /home/jovyan/work/data-jesfs/shakespeare MapPartitionsRDD[3] at wholeTextFiles at <console>:30


In [7]:
fileContents

org.apache.spark.rdd.RDD[(String, String)] = /home/jovyan/work/data-jesfs/shakespeare MapPartitionsRDD[1] at wholeTextFiles at <console>:30


In [8]:
("foo", 101, 3.14159, ("bar", 202L))

(String, Int, Double, (String, Long)) = (foo,101,3.14159,(bar,202))


In [9]:
(1,2,3,(1,2,3), (1,2,3,(4,5,6)))

(Int, Int, Int, (Int, Int, Int), (Int, Int, Int, (Int, Int, Int))) = (1,2,3,(1,2,3),(1,2,3,(4,5,6)))


In [10]:
fileContents.count

Long = 8


In [11]:
Seq(1,2,3).size

Int = 3


In [45]:
// For every (path, contents) pair, emit ((lower-cased word, fileName), 1) for each
// non-empty token of the contents.
val wordFileNameOnes = fileContents.flatMap { location_contents_tuple2 => 
    val words = location_contents_tuple2._2.split("""\W+""")
        .filter(word => word.size > 0)
    // Split on a literal "/": String.split takes a regular expression, so File.separator
    // is not a safe argument (on Windows it is "\", an invalid regex), and the path keys
    // use "/" as the separator.
    val fileName = location_contents_tuple2._1.split("/").last
    words.map(word => ((word.toLowerCase, fileName), 1))
}


wordFileNameOnes: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[17] at flatMap at <console>:32


wordFileNameOnes: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[17] at flatMap at <console>:32


In [46]:
wordFileNameOnes.count

Long = 173328


In [47]:
wordFileNameOnes.take(10).foreach(println)

((the,merrywivesofwindsor),1)
((merry,merrywivesofwindsor),1)
((wives,merrywivesofwindsor),1)
((of,merrywivesofwindsor),1)
((windsor,merrywivesofwindsor),1)
((dramatis,merrywivesofwindsor),1)
((personae,merrywivesofwindsor),1)
((sir,merrywivesofwindsor),1)
((john,merrywivesofwindsor),1)
((falstaff,merrywivesofwindsor),1)


In [None]:
wordFileNameOnes.take(10).foreach(println)

In [23]:
// Add up the 1s for each distinct (word, fileName) key to get per-file word counts.
val uniques = wordFileNameOnes.reduceByKey(_ + _)

uniques: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[5] at reduceByKey at <console>:28


uniques: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[5] at reduceByKey at <console>:28


In [25]:
uniques.count

Long = 27276


In [26]:
uniques.take(30).foreach(println)

((dexterity,merrywivesofwindsor),1)
((force,muchadoaboutnothing),2)
((whole,comedyoferrors),2)
((lamb,muchadoaboutnothing),2)
((blunt,tamingoftheshrew),3)
((letter,merrywivesofwindsor),19)
((crest,asyoulikeit),1)
((bestow,asyoulikeit),1)
((rear,midsummersnightsdream),1)
((crossing,tamingoftheshrew),1)
((wronged,merrywivesofwindsor),4)
((S,tamingoftheshrew),10)
((HIPPOLYTA,midsummersnightsdream),19)
((revolve,twelfthnight),1)
((er,merrywivesofwindsor),11)
((renown,asyoulikeit),1)
((cubiculo,twelfthnight),1)
((All,twelfthnight),3)
((power,loveslabourslost),8)
((Albeit,asyoulikeit),1)
((lips,tamingoftheshrew),3)
((upshot,twelfthnight),1)
((approach,midsummersnightsdream),4)
((mean,muchadoaboutnothing),5)
((embossed,asyoulikeit),1)
((varnish,loveslabourslost),2)
((Apollo,midsummersnightsdream),1)
((spangled,midsummersnightsdream),1)
((gentlemen,comedyoferrors),1)
((Rebuke,loveslabourslost),1)


In [27]:
// Re-key each record by word alone: ((word, fileName), count) => (word, (fileName, count)).
val words = uniques.map { entry =>
    val wordAndFile = entry._1
    val count = entry._2
    (wordAndFile._1, (wordAndFile._2, count))
}

words: org.apache.spark.rdd.RDD[(String, (String, Int))] = MapPartitionsRDD[6] at map at <console>:28


words: org.apache.spark.rdd.RDD[(String, (String, Int))] = MapPartitionsRDD[6] at map at <console>:28


In [33]:
// Collect all (fileName, count) pairs for each word, then order the records by word.
val wordGroups = words.groupByKey().sortByKey(ascending = true)

wordGroups: org.apache.spark.rdd.RDD[(String, Iterable[(String, Int)])] = ShuffledRDD[14] at sortByKey at <console>:28


wordGroups: org.apache.spark.rdd.RDD[(String, Iterable[(String, Int)])] = ShuffledRDD[14] at sortByKey at <console>:28


In [34]:
wordGroups.count

Long = 11951


In [35]:
wordGroups.take(30).foreach(println)

(,CompactBuffer((tamingoftheshrew,1), (asyoulikeit,1), (merrywivesofwindsor,1), (comedyoferrors,1), (midsummersnightsdream,1), (twelfthnight,1), (loveslabourslost,1), (muchadoaboutnothing,1)))
(A,CompactBuffer((loveslabourslost,78), (midsummersnightsdream,39), (muchadoaboutnothing,31), (merrywivesofwindsor,38), (comedyoferrors,42), (asyoulikeit,34), (twelfthnight,47), (tamingoftheshrew,59)))
(ABOUT,CompactBuffer((muchadoaboutnothing,18)))
(ACT,CompactBuffer((asyoulikeit,22), (comedyoferrors,11), (tamingoftheshrew,12), (loveslabourslost,9), (muchadoaboutnothing,17), (twelfthnight,18), (merrywivesofwindsor,23), (midsummersnightsdream,9)))
(ADAM,CompactBuffer((asyoulikeit,16)))
(ADO,CompactBuffer((muchadoaboutnothing,18)))
(ADRIANA,CompactBuffer((comedyoferrors,85)))
(ADRIANO,CompactBuffer((loveslabourslost,111)))
(AEGEON,CompactBuffer((comedyoferrors,20)))
(AEMELIA,CompactBuffer((comedyoferrors,16)))
(AEMILIA,CompactBuffer((comedyoferrors,3)))
(AEacides,CompactBuffer((tamingoftheshrew,1)

In [37]:
// Turn each word's (fileName, count) pairs into one comma-separated string, ordered by
// descending count (hence the negation) and then by file name.
val iiFirstPass2 = wordGroups.mapValues { fileCounts =>
    fileCounts.toVector
        .sortBy(fileCount => (-fileCount._2, fileCount._1))
        .mkString(",")
}

iiFirstPass2: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[15] at mapValues at <console>:28


iiFirstPass2: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[15] at mapValues at <console>:28


In [38]:

iiFirstPass2.take(30).foreach(println)

(,(asyoulikeit,1),(comedyoferrors,1),(loveslabourslost,1),(merrywivesofwindsor,1),(midsummersnightsdream,1),(muchadoaboutnothing,1),(tamingoftheshrew,1),(twelfthnight,1))
(A,(loveslabourslost,78),(tamingoftheshrew,59),(twelfthnight,47),(comedyoferrors,42),(midsummersnightsdream,39),(merrywivesofwindsor,38),(asyoulikeit,34),(muchadoaboutnothing,31))
(ABOUT,(muchadoaboutnothing,18))
(ACT,(merrywivesofwindsor,23),(asyoulikeit,22),(twelfthnight,18),(muchadoaboutnothing,17),(tamingoftheshrew,12),(comedyoferrors,11),(loveslabourslost,9),(midsummersnightsdream,9))
(ADAM,(asyoulikeit,16))
(ADO,(muchadoaboutnothing,18))
(ADRIANA,(comedyoferrors,85))
(ADRIANO,(loveslabourslost,111))
(AEGEON,(comedyoferrors,20))
(AEMELIA,(comedyoferrors,16))
(AEMILIA,(comedyoferrors,3))
(AEacides,(tamingoftheshrew,1))
(AEgeon,(comedyoferrors,7))
(AEgle,(midsummersnightsdream,1))
(AEmilia,(comedyoferrors,4))
(AEsculapius,(merrywivesofwindsor,1))
(AGUECHEEK,(twelfthnight,2))
(ALL,(midsummersnightsdream,2),(tamingof

----




Pattern Matching







--------

In [49]:
// The inverted index as a single chain, using `case` patterns to give names to the
// parts of each tuple instead of `_1` / `_2` accessors:
//   (location, contents) -> ((word, fileName), 1) -> ((word, fileName), count)
//   -> (word, (fileName, count)) -> grouped and sorted by word
//   -> (word, "(fileName,count),...") ordered by descending count, then file name.
val ii1 = sc.wholeTextFiles(shakespeare.toString).
    flatMap {
        case (location, contents) => 
            val words = contents.split("""\W+""").
                filter(word => word.size > 0)                      // #1
            // Split on a literal "/": String.split takes a regular expression, so
            // File.separator is not a safe argument (on Windows it is "\", an invalid
            // regex), and the path keys use "/" as the separator.
            val fileName = location.split("/").last
            words.map(word => ((word.toLowerCase, fileName), 1))   // #2
    }.
    reduceByKey((count1, count2) => count1 + count2).
    map { 
        case ((word, fileName), count) => (word, (fileName, count)) 
    }.
    groupByKey.
    sortByKey(ascending = true).
    mapValues { iterable => 
        val vect = iterable.toVector.sortBy { 
            case (fileName, count) => (-count, fileName) 
        }
        vect.mkString(",")
    }

ii1: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[27] at mapValues at <console>:48


ii1: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[27] at mapValues at <console>:48


In [51]:
import org.apache.spark.sql.SQLContext
// Create a SQLContext on top of the existing SparkContext for DataFrame work.
// NOTE(review): this constructor appears to be deprecated in Spark 2.x in favour of
// SparkSession.builder — confirm before reusing this pattern outside the tutorial.
val sqlContext = new SQLContext(sc)

sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@bbf1695




sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@bbf1695


In [52]:
// Convert the (word, locations_counts) pair RDD into a DataFrame and name its two
// columns, matching the recorded schema [word: string, locations_counts: string].
val ii1DF = sqlContext.createDataFrame(ii1).toDF("word", "locations_counts")

ii1DF: org.apache.spark.sql.DataFrame = [word: string, locations_counts: string]


ii1DF: org.apache.spark.sql.DataFrame = [word: string, locations_counts: string]
