# Importing Libraries

In [1]:
import scala.collection.mutable.Map
import scala.collection.immutable.ListMap

Intitializing Scala interpreter ...

Spark Web UI available at http://b2943a752d84:4040
SparkContext available as 'sc' (version = 3.0.1, master = local[*], app id = local-1665705678241)
SparkSession available as 'spark'


import scala.collection.mutable.Map
import scala.collection.immutable.ListMap


# Reading Files

In [2]:
//Loading the three input text files; each value is an RDD[String]
//all_verbs.txt - the verb forms that are collected into verb_list below
val verbs_all = sc.textFile("all_verbs.txt")
//shakespeare.txt - the text whose verbs are counted
val shakespeare = sc.textFile("shakespeare.txt")
//verb_dict.txt - comma-separated verb forms per line; the first entry of a line is used as the base form below
val verb_dict = sc.textFile("verb_dict.txt")

verbs_all: org.apache.spark.rdd.RDD[String] = all_verbs.txt MapPartitionsRDD[1] at textFile at <console>:27
shakespeare: org.apache.spark.rdd.RDD[String] = shakespeare.txt MapPartitionsRDD[3] at textFile at <console>:28
verb_dict: org.apache.spark.rdd.RDD[String] = verb_dict.txt MapPartitionsRDD[5] at textFile at <console>:29


In [3]:
//Bringing every verb form to the driver as a local array, with surrounding whitespace stripped
val verb_list = verbs_all.map(line => line.trim).collect()

verb_list: Array[String] = Array(abash, abashed, abashed, abashes, abashing, abate, abated, abated, abates, abating, abide, abode, abode, abides, abiding, absorb, absorbed, absorbed, absorbs, absorbing, accept, accepted, accepted, accepts, accepting, accompany, accompanied, accompanied, accompanies, accompanying, ache, ached, ached, aches, aching, achieve, achieved, achieved, achieves, achieving, acquire, acquired, acquired, acquires, acquiring, act, acted, acted, acts, acting, add, added, added, adds, adding, address, addressed, addressed, addresses, addressing, adjust, adjusted, adjusted, adjusts, adjusting, admire, admired, admired, admires, admiring, admit, admitted, admitted, admits, admitting, advise, advised, advised, advises, advising, afford, afforded, afforded, affords, afford...


# Filtering Verbs

In [4]:
//Removing punctuation and square brackets, lower-casing and trimming each line.
//Empty lines are dropped AFTER cleaning, so a line made only of whitespace or
//punctuation cannot survive as "" and later be split into an empty token.
val shakespeare_pre = shakespeare.map(line => line.replaceAll("[,.!?:;'\\[\\]]", "").toLowerCase.trim).filter(line => line.nonEmpty)
//Splitting every cleaned line into words on runs of whitespace
val shakespeare_split = shakespeare_pre.flatMap(line => line.split("\\s+"))

shakespeare_pre: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[10] at map at <console>:29
shakespeare_split: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[11] at flatMap at <console>:30


In [5]:
//Getting only the words that are present in the verb list.
//verb_list is an Array (with repeated entries), so Array.contains would scan it
//linearly for every word of the text; a Set gives constant-time membership and
//broadcasting it ships the lookup table to each executor once.
val shakespeare_verb_filter = {
    val verb_lookup = sc.broadcast(verb_list.toSet)
    shakespeare_split.filter(word => verb_lookup.value.contains(word))
}
//Counting how often each verb form occurs (result is a local Map on the driver)
val shakespeare_verb_count = shakespeare_verb_filter.countByValue()

shakespeare_verb_filter: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[12] at filter at <console>:31
shakespeare_verb_count: scala.collection.Map[String,Long] = Map(purifies -> 1, breaks -> 29, forgotten -> 15, leer -> 5, respecting -> 4, lights -> 28, laughing -> 14, mew -> 5, beheld -> 26, looks -> 214, implored -> 1, blurs -> 1, scare -> 4, subscribes -> 3, shivers -> 2, coursing -> 2, tiring -> 4, used -> 19, eye -> 414, striking -> 10, allowed -> 4, sack -> 57, straining -> 4, murmur -> 3, severs -> 1, boil -> 6, writing -> 23, curbed -> 1, believing -> 5, lasting -> 13, conquers -> 3, cooled -> 2, misuses -> 1, warns -> 1, killed -> 7, concluding -> 1, cooks -> 3, rub -> 12, respects -> 18, regarding -> 1, beg -> 94, regarded -> 6, spelling -> 1, bow -> 47, succeed -> 13, ta...


# Mapping Verbs

In [6]:
//Making (verb form -> base form) pairs for mapping.
//Each line holds comma-separated forms; the first entry of the line is the base form.
//Repeated forms within a line are emitted once.
val verb_dict_map = verb_dict.map(line => line.split(",")).flatMap(forms => forms.distinct.map(form => (form, forms.head))).collect

verb_dict_map: Array[(String, String)] = Array((abash,abash), (abashed,abash), (abashes,abash), (abashing,abash), (abate,abate), (abated,abate), (abates,abate), (abating,abate), (abide,abide), (abode,abide), (abides,abide), (abiding,abide), (absorb,absorb), (absorbed,absorb), (absorbs,absorb), (absorbing,absorb), (accept,accept), (accepted,accept), (accepts,accept), (accepting,accept), (accompany,accompany), (accompanied,accompany), (accompanies,accompany), (accompanying,accompany), (ache,ache), (ached,ache), (aches,ache), (aching,ache), (achieve,achieve), (achieved,achieve), (achieves,achieve), (achieving,achieve), (acquire,acquire), (acquired,acquire), (acquires,acquire), (acquiring,acquire), (act,act), (acted,act), (acts,act), (acting,act), (add,add), (added,add), (adds,add), (adding...


In [7]:
//Keeping the first occurrence of each unique key in the key-value pairs.
//Map is scala.collection.mutable.Map here (see the import at the top of the notebook).
val final_verb_dict_map: Map[String,String] = Map.empty[String,String]

//getOrElseUpdate inserts only when the form is not present yet, so later duplicates are ignored
verb_dict_map.foreach { case (form, base) =>
    final_verb_dict_map.getOrElseUpdate(form, base)
}

final_verb_dict_map: scala.collection.mutable.Map[String,String] = Map(worried -> worry, follow -> follow, forlore -> forlese, digging -> dig, migrate -> migrate, preferred -> prefer, construed -> construe, forlorn -> forlese, inaugurate -> inaugurate, founding -> found, consorted -> consort, immersed -> immerse, handles -> handle, illustrating -> illustrate, detected -> detect, betraying -> betray, whipped -> whip, chooses -> choose, created -> create, forsakes -> forsake, shrink -> shrink, consults -> consult, satisfy -> satisfy, humiliates -> humiliate, hypnotize -> hypnotize, spent -> spend, gag -> gag, dances -> dance, contemned -> contemn, inscribes -> inscribe, questioned -> question, welcome -> welcome, absorbs -> absorb, costing -> cost, pasted -> paste, relates -> relate, seat...


In [8]:
//Getting occurrences of verbs keyed by base form.
//Each (verb form, count) pair is re-keyed to its base form; forms without a
//dictionary entry are dropped because the lookup yields None.
val new_shakespeare_verb_count = shakespeare_verb_count.toList.flatMap { case (form, count) =>
    final_verb_dict_map.get(form).map(base => (base, count))
}

new_shakespeare_verb_count: List[(String, Long)] = List((purify,1), (break,29), (forget,15), (leer,5), (respect,4), (light,28), (laugh,14), (mew,5), (behold,26), (look,214), (implore,1), (blur,1), (scare,4), (subscribe,3), (shiver,2), (course,2), (tire,4), (use,19), (eye,414), (strike,10), (allow,4), (sack,57), (strain,4), (murmur,3), (sever,1), (boil,6), (write,23), (curb,1), (believe,5), (last,13), (conquer,3), (cool,2), (misuse,1), (warn,1), (kill,7), (conclude,1), (cook,3), (rub,12), (respect,18), (regard,1), (beg,94), (regard,6), (spell,1), (bow,47), (succeed,13), (taste,7), (want,21), (dwell,2), (kill,29), (measure,95), (pull,4), (empty,2), (roll,4), (chide,49), (propose,1), (notify,2), (contract,1), (hatch,1), (pardon,2), (enjoy,5), (please,388), (thrust,4), (clutch,3), (mind,61)...


In [9]:
//Grouping by base form and summing the counts of all its verb forms.
//A strict map is used instead of mapValues: in Scala 2.12 mapValues returns a
//lazy view that re-runs the summation on every lookup or traversal.
val result = new_shakespeare_verb_count.groupBy(_._1).map { case (verb, pairs) => verb -> pairs.map(_._2).sum }

result: scala.collection.immutable.Map[String,Long] = Map(leer -> 6, mew -> 6, scare -> 5, eye -> 1098, sack -> 61, murmur -> 7, boil -> 13, rub -> 19, beg -> 109, bow -> 62, succeed -> 25, measure -> 111, chide -> 86, notify -> 2, please -> 438, clutch -> 3, read -> 318, hurry -> 10, drive -> 76, tire -> 38, slit -> 1, find -> 802, shoot -> 84, spit -> 35, announce -> 1, contend -> 21, nurse -> 210, support -> 14, produce -> 20, question -> 166, chip -> 1, satisfy -> 78, smother -> 10, consult -> 6, scream -> 2, test -> 5, moult -> 1, feel -> 177, inlay -> 2, dye -> 8, grind -> 169, vanish -> 20, paint -> 90, reach -> 30, begin -> 246, lead -> 264, win -> 224, review -> 1, fail -> 76, recollect -> 1, spread -> 31, approve -> 44, saddle -> 10, prove -> 282, rid -> 32, verify -> 8, chall...


# Final Results

In [10]:
//Ordering by count in descending order and keeping the 10 most frequent verbs.
//Only the first 10 sorted pairs are put into the ListMap (which preserves that order);
//building a ListMap of every verb first is wasteful because each ListMap insert is O(n).
val result_top10 = ListMap(result.toSeq.sortBy(_._2)(Ordering[Long].reverse).take(10):_*)

result_top10: scala.collection.immutable.ListMap[String,Long] = ListMap(be -> 26661, have -> 7840, do -> 6396, come -> 3596, make -> 2888, go -> 2566, love -> 2479, let -> 2384, say -> 2336, know -> 2239)


In [11]:
//Printing one (verb, count) pair per line, most frequent first
for (entry <- result_top10) println(entry)

(be,26661)
(have,7840)
(do,6396)
(come,3596)
(make,2888)
(go,2566)
(love,2479)
(let,2384)
(say,2336)
(know,2239)
