In [1]:
import numpy as np

In [2]:
import findspark
import os
# findspark.init() runs before the pyspark import so that the import can be resolved.
findspark.init()
import pyspark
# getOrCreate() reuses a context that is already running, so re-executing this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
# Keep Spark's console logging down to errors only.
sc.setLogLevel("ERROR")

In [3]:
def clean_lines(line):
    """Split ``line`` on whitespace and keep only the useful tokens.

    A token is dropped when it is unchanged by ``upper()`` (all-caps words,
    pure punctuation) or when it consists solely of digits.

    Returns:
        The kept tokens, in their original order.
    """
    return [
        token
        for token in line.split()
        if token != token.upper() and not token.isdigit()
    ]

### Build the model

In [41]:
# Load the corpus as an RDD of text lines (one record per line of the file).
wlist = sc.textFile('shakespeare.txt')

In [42]:
# Sanity-check the raw line RDD: repr, partition count, record count, first record.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(wlist)
print(wlist.getNumPartitions())
print(wlist.count())
print(wlist.take(1))

MapPartitionsRDD[75] at textFile at NativeMethodAccessorImpl.java:-2
2
124787
[u'The Project Gutenberg EBook of The Complete Works of William Shakespeare, by']


In [43]:
# Flatten every line into its cleaned tokens, giving one record per kept word.
words = wlist.flatMap(clean_lines)

In [44]:
# Sanity-check the word RDD: repr, partition count, record count, first record.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(words)
print(words.getNumPartitions())
print(words.count())
print(words.take(1))

PythonRDD[78] at RDD at PythonRDD.scala:43
2
822306
[u'The']


In [45]:
# Pair every word with its 0-based position in the corpus: (word, index).
words_index = words.zipWithIndex() 

In [46]:
# Sanity-check the (word, index) RDD: repr, partition count, record count, first record.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(words_index)
print(words_index.getNumPartitions())
print(words_index.count())
print(words_index.take(1))

PythonRDD[82] at RDD at PythonRDD.scala:43
2
822306
[(u'The', 0)]


In [47]:
# Swap each pair to (index, word) so the position becomes the key used for joining.
words_index_1 = words_index.map(lambda pair: (pair[1], pair[0]))

In [48]:
# Sanity-check the (index, word) RDD: repr, partition count, record count, first record.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(words_index_1)
print(words_index_1.getNumPartitions())
print(words_index_1.count())
print(words_index_1.take(1))

PythonRDD[85] at RDD at PythonRDD.scala:43
2
822306
[(0, u'The')]


In [49]:
# Same words with every key shifted by +1: key k now holds the word at position k-1.
words_index_2 = words_index_1.map(lambda kv: (kv[0] + 1, kv[1]))

In [50]:
# Sanity-check the +1 shifted RDD: repr, partition count, record count, first record.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(words_index_2)
print(words_index_2.getNumPartitions())
print(words_index_2.count())
print(words_index_2.take(1))

PythonRDD[88] at RDD at PythonRDD.scala:43
2
822306
[(1, u'The')]


In [51]:
# Shift the keys by one more: key k now holds the word at position k-2.
words_index_3 = words_index_2.map(lambda kv: (kv[0] + 1, kv[1]))

In [52]:
# Sanity-check the +2 shifted RDD: repr, partition count, record count, first record.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(words_index_3)
print(words_index_3.getNumPartitions())
print(words_index_3.count())
print(words_index_3.take(1))

PythonRDD[91] at RDD at PythonRDD.scala:43
2
822306
[(2, u'The')]


In [53]:
# Join the three offset copies on the position key. For key k this lines up the
# words at positions k-2, k-1 and k, i.e. three consecutive words, as
# (k, ((word1, word2), word3)). The key is then dropped and each occurrence is
# emitted as (((word1, word2), word3), 1) so occurrences can be counted.
words_grouped = (
    words_index_3
    .join(words_index_2)
    .join(words_index_1)
    .map(lambda rec: (rec[1], 1))
)

In [54]:
# Sanity-check the three-word records: repr, partition count, record count, first record.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(words_grouped)
print(words_grouped.getNumPartitions())
print(words_grouped.count())
print(words_grouped.take(1))

PythonRDD[108] at RDD at PythonRDD.scala:43
6
822304
[(((u'more', u'wealth'), u'than'), 1)]


In [55]:
# Sum the occurrences of every ((word1, word2), word3) key, then re-key each
# result by the leading word pair: ((word1, word2), (word3, count)).
words_grouped_count = (
    words_grouped
    .reduceByKey(lambda left, right: left + right)
    .map(lambda rec: (rec[0][0], (rec[0][1], rec[1])))
)

In [56]:
# Sanity-check the counted records: repr, partition count, record count, first record.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(words_grouped_count)
print(words_grouped_count.getNumPartitions())
print(words_grouped_count.count())
print(words_grouped_count.take(1))

PythonRDD[115] at RDD at PythonRDD.scala:43
6
746679
[((u'merit', u'do'), (u'in', 1))]


In [57]:
# Collect all (next_word, count) pairs that share a leading word pair into one
# list per pair: ((word1, word2), [(next_word, count), ...]).
words_grouped_result = (
    words_grouped_count
    .groupByKey()
    .mapValues(list)
)

In [59]:
# Sanity-check the finished model: repr, partition count, record count, first two records.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(words_grouped_result)
print(words_grouped_result.getNumPartitions())
print(words_grouped_result.count())
print(words_grouped_result.take(2))

PythonRDD[122] at RDD at PythonRDD.scala:43
6
452218
[((u'Shall', u'braying'), [(u'trumpets', 1)]), ((u'souls', u'Do'), [(u'through', 1)])]


In [72]:
# Test dictionary: list every recorded successor of the word pair ("Now", "is").
# NOTE(review): the identity map looks like it is there to drop the RDD's
# partitioner before lookup() -- confirm before removing it.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(words_grouped_result.map(lambda x: x).lookup((u'Now', u'is')))

[[(u'Mortimer', 1), (u'that', 1), (u'the', 9), (u'his', 1), (u'it', 3), (u'he', 1), (u'be', 1), (u'my', 2), (u'this', 1), (u'your', 1), (u'a', 1)]]


### Use the model to generate text

In [149]:
def getLastWord(wordList):
    """Draw one word at random, weighted by its count.

    Args:
        wordList: non-empty list of ``(word, count)`` pairs.

    Returns:
        A one-element list holding the chosen word, so the result can be
        concatenated directly onto a phrase list.

    Raises:
        ValueError: if ``wordList`` is empty.
        ZeroDivisionError: if the counts sum to zero.
    """
    if not wordList:
        raise ValueError("wordList must contain at least one (word, count) pair")

    words = [pair[0] for pair in wordList]
    counts = [pair[1] for pair in wordList]

    # Sum once instead of once per element; float() keeps the division a true
    # division under Python 2 as well.
    total = float(sum(counts))
    weights = [count / total for count in counts]

    # Sample a position rather than the words themselves, so the returned item
    # is the original object and not a NumPy string scalar.
    chosen = np.random.choice(len(words), p=weights)
    return [words[chosen]]

In [150]:
def getPhrase(rdd, noWords):
    """Generate one phrase by walking the three-word model stored in ``rdd``.

    Args:
        rdd: pair RDD whose records look like
            ``((word1, word2), [(next_word, count), ...])``.
        noWords: target number of words in the phrase.

    Returns:
        The generated words joined by single spaces. The phrase always holds
        the three seed words, and it is shorter than ``noWords`` when the walk
        reaches a word pair that has no recorded successor.
    """
    # Seed the phrase with one random record: its two key words plus one
    # successor drawn according to the recorded counts.
    seed = rdd.takeSample(True, 1)
    phrase = list(seed[0][0]) + getLastWord(seed[0][1])

    while len(phrase) < noWords:
        # NOTE(review): the identity map looks like it is there to drop the
        # RDD's partitioner before lookup() -- confirm before removing it.
        matches = rdd.map(lambda x: x).lookup(tuple(phrase[-2:]))
        if not matches:
            # Dead end: the last two words never occur with a following word
            # (e.g. the final pair of the corpus). Stop instead of raising
            # IndexError on matches[0].
            break
        phrase = phrase + getLastWord(matches[0])

    return ' '.join(phrase)

In [151]:
# Number of phrases to generate.
noPhrases = 10
# Target length of each phrase, in words.
noWords = 20

In [152]:
# Generate the phrases one after another, ending each with a newline.
text = ''.join(
    getPhrase(words_grouped_result, noWords) + '\n'
    for _ in range(noPhrases)
)

In [153]:
# Save the generated text; the with-block closes the file even if write() raises.
with open("P6.txt", "w") as text_file:
    text_file.write(text)

In [154]:
# Show the generated phrases; print(...) with a single argument prints the same
# on Python 2 and Python 3.
print(text)

refund and replacement provisions of this pleasant chase; 'Tis not a jot. I'faith, fear it not. And why rail on
Boy Prithee, honey-sweet husband, let us dine, and never two ladies loved as they stream forth thy weapon; we are
large confession To angle for me, And could command no more oppos'd Against acquaintance, kindred, and night-walking heralds That trudge
the deputy's wife of Antony Is valiant and dejected; and by some spies. Fear not. The villain is a good
therewithal the best; The worst is this, Sweet love? Thy love! Out, tawny Tartar, out! Out, loathed med'cine! hated potion,
with hands Made hard with poor Antonio. Is it possible Disdain should die the death. Why, that were enough To
Englishman, and swore by his death hath raught him. [Drums afar off by soldiers How say you will wonder at.
if The passage and whole carriage of his effects in this. Enter with the bell, Or else you do wrong
many regist'red in promise, Which you do not know; either it is posted with permission of t