In [1]:
import pyspark
import re
import random
# Create the Spark context used by every cell below; the application is
# registered under the name "P6".
sc = pyspark.SparkContext(appName="P6")

In [2]:
def splitLine(line):
    """Break one line of text into its whitespace-delimited tokens.

    Returns a list of the tokens; an empty or whitespace-only line yields
    an empty list.
    """
    tokens = line.split()
    return tokens
def filter_word(word):
    """Decide whether a token is kept in the corpus.

    A token is dropped (False) when removing every ASCII digit leaves
    nothing, or when removing every run of capital letters together with
    the periods that directly follow it leaves nothing (for example
    ``1599``, ``HAMLET`` and ``HAMLET.``).  Any other token is kept (True).
    """
    digits_removed = re.sub('[0-9]', "", word)
    if len(digits_removed) == 0:
        return False
    caps_removed = re.sub('[A-Z]+\.*', "", word)
    return len(caps_removed) > 0
def format_rdd(node, words_list):
    """Turn one indexed word into a (bigram, follower-count list) record.

    node is read as (word, position) and words_list is the word sequence
    that position indexes into.  The result is
    ((word, next_word), [(word_after_next, 1)]), or None when fewer than
    two words follow the given position in words_list.
    """
    position = node[1]
    has_two_followers = position < len(words_list) - 2
    if not has_two_followers:
        return None
    bigram = (node[0], words_list[position + 1])
    follower = (words_list[position + 2], 1)
    return (bigram, [follower])
def update_hash(word_hash, arr):
    """Add every (word, count) pair from arr into word_hash, in place.

    A word that is already a key has count added to its stored value; a new
    word is inserted with count as its value.  Nothing is returned.
    """
    for word, count in arr:
        if word in word_hash:
            word_hash[word] += count
        else:
            word_hash[word] = count


def reduce_tuples(x, y):
    """Merge two lists of (word, count) pairs, summing counts per word.

    Neither input is modified.  The result is always a real list of
    (word, total) tuples: dict.items() is only a view on Python 3, so it is
    materialised explicitly to keep the return type the same as the inputs
    (and usable as an input to a further merge) on both Python 2 and 3.
    """
    word_hash = {}
    update_hash(word_hash, x)
    update_hash(word_hash, y)
    return list(word_hash.items())
    

In [3]:
# Read the corpus file as an RDD of text lines, then flatten it into an RDD
# of individual words using splitLine.
lines = sc.textFile('Shakespeare.txt')
words = lines.flatMap(splitLine)
# Display the total number of words before any filtering.
words.count()

904061

In [4]:
# Keep only the words accepted by filter_word, then display how many remain.
filtered_words = words.filter(filter_word)
filtered_words.count()

829777

In [6]:
# Bring every filtered word back to the driver as a plain Python list.
words_list = filtered_words.collect()
# NOTE(review): this expression evaluates to the single word at index 1, but
# the output recorded for this cell is a five-word list -- it looks like it was
# produced by a different expression (possibly words_list[:5]); re-run to confirm.
words_list[1]

[u'The', u'Project', u'Gutenberg', u'EBook', u'of']

In [10]:
# Pair each filtered word with its position in the corpus.
word_positions = filtered_words.zipWithIndex()
# Look up the two words that follow each position in the driver-side
# words_list; format_rdd returns None for the positions near the end.
trigram_records = word_positions.map(lambda pair: format_rdd(pair, words_list))
# Drop the None placeholders so only complete records remain.
indexed_words = trigram_records.filter(lambda record: record is not None)
indexed_words.take(1)

[((u'The', u'Project'), [(u'Gutenberg', 1)])]

In [23]:
# Partition count used for every keyed RDD built in this cell.
NUM_PARTITIONS = 4

# (position, word) for every filtered word.
positioned_words = (
    filtered_words.zipWithIndex()
    .map(lambda x: (x[1], x[0]))
    .partitionBy(NUM_PARTITIONS)
)
# Shift the keys down by one / two so that key k carries the word found at
# position k+1 / k+2 respectively.
offset_words_1 = positioned_words.map(lambda x: (x[0] - 1, x[1])).partitionBy(NUM_PARTITIONS)
offset_words_2 = positioned_words.map(lambda x: (x[0] - 2, x[1])).partitionBy(NUM_PARTITIONS)
# After both joins each record is (k, ((word_k, word_k+1), word_k+2));
# reshape it to ((word_k, word_k+1), [(word_k+2, 1)]) for reduceByKey.
indexed_words = (
    positioned_words.join(offset_words_1)
    .join(offset_words_2)
    .map(lambda x: ((x[1][0][0], x[1][0][1]), [(x[1][1], 1)]))
)
indexed_words.take(1)

[((u'The', u'Project'), [(u'Gutenberg', 1)])]

In [18]:
# Sum the follower counts for each (word1, word2) key and order the result by
# key.  The model is cached because the following cells run many separate
# actions against it (take, takeSample, and one lookup per generated word);
# without persistence each of those actions recomputes the RDD from its lineage.
word_model = indexed_words.reduceByKey(reduce_tuples).sortByKey().cache()

In [19]:
# Display the first five (key, follower-count list) records of the key-sorted
# model.  Each record carries its complete follower list, so the output can be long.
word_model.take(5)

[((u'"', u'"'),
  [(u'DEMETRIUS,', 1),
   (u'SELEUCUS,', 1),
   (u'VOLUMNIUS,', 1),
   (u'BARDOLPH,', 1),
   (u'-', 1),
   (u'THYREUS,', 1),
   (u'DIOMEDES,', 1),
   (u'BEAU,', 1),
   (u'GRATIANO,', 1),
   (u'JAMY,', 1),
   (u'DION,', 1),
   (u'NYM,', 1),
   (u'ARMADO,', 1),
   (u'LORENZO,', 1),
   (u'CLAUDIO,', 1),
   (u'PROCULEIUS,', 1),
   (u'PINDARUS,', 1),
   (u'"', 110),
   (u'STRATO,', 1),
   (u'BATES,', 1),
   (u'PHILO,', 1),
   (u'CINNA,', 1),
   (u'SIXTH,', 1),
   (u'DUMAIN,', 1),
   (u'CIMBER,', 1),
   (u'CLEOMENES,', 1),
   (u'DERCETAS,', 1),
   (u'KATHARINE,', 1),
   (u'EROS,', 1),
   (u'BRUTUS,', 1),
   (u'TOUCHSTONE,', 1),
   (u'ANTONIO,', 2),
   (u'MACMORRIS,', 1),
   (u'EXETER,', 1),
   (u'MENAS,', 1),
   (u'DARDANIUS,', 1),
   (u'DOLABELLA,', 1),
   (u'ORLANDO,', 1),
   (u'WILLIAMS,', 1),
   (u'ERPINGHAM,', 1),
   (u'SCARUS,', 1),
   (u'ARTEMIDORUS,', 1),
   (u'MONTJOY,', 1),
   (u'FLUELLEN,', 1),
   (u'GALLUS,', 1),
   (u'MESSALA,', 1),
   (u'ADAM,', 1),
   (u'VARRIU

In [20]:
# Fetch the follower counts stored under the key (u'Now', u'is').
# NOTE(review): the identity map presumably exists to drop the partitioner that
# sortByKey leaves on word_model so that lookup scans every partition -- confirm
# whether it is still needed on the Spark version in use.
word_model.map(lambda x: x).lookup((u'Now', u'is'))

[[(u'a', 1),
  (u'be', 1),
  (u'his', 1),
  (u'that', 1),
  (u'this', 1),
  (u'Mortimer', 1),
  (u'it', 3),
  (u'the', 9),
  (u'my', 2),
  (u'your', 1),
  (u'he', 1)]]

In [21]:
def find_next(words):
    """Pick one follower word at random, weighted by its count.

    words is a sequence of (word, count) entries.  A threshold is drawn
    uniformly from 1 to the sum of all counts, and the first entry whose
    running count total reaches that threshold is returned.
    """
    total = sum(entry[1] for entry in words)
    threshold = random.randint(1, total)
    running = 0
    for entry in words:
        running += entry[1]
        if running < threshold:
            continue
        return entry[0]
def generate_phrase(num_words, rdd):
    """Generate a phrase of at most num_words words from the model.

    rdd holds ((word1, word2), [(next_word, count), ...]) records.  One
    record is sampled to supply the first two words; each further word is
    chosen by find_next from the followers stored under the last two words.

    Returns the words joined by single spaces.  The phrase is shorter than
    num_words when the last two words have no entry in the model, and it is
    the empty string when num_words is less than 1 or the model is empty.
    """
    if num_words <= 0:
        return ""

    sample = rdd.takeSample(True, 1)
    if not sample:
        # Empty model: there is no starting pair to build on.
        return ""

    start_key = sample[0][0]
    phrase = [start_key[0], start_key[1]]

    # The identity-mapped RDD is the one lookups are issued against; it does
    # not depend on the loop, so it is built once instead of per iteration.
    # NOTE(review): the identity map presumably works around lookup() on a
    # key-sorted RDD -- confirm whether it is still required.
    lookup_rdd = rdd.map(lambda x: x)

    while len(phrase) < num_words:
        followers = lookup_rdd.lookup((phrase[-2], phrase[-1]))
        if not followers:
            # Dead end: this pair was never followed by anything.
            break
        phrase.append(find_next(followers[0]))

    # The starting pair always contributes two words, so trim when fewer
    # were requested.
    return " ".join(phrase[:num_words])

In [22]:
# Generate two sample phrases of up to 20 words each from the model and show
# them.  print is called with a single parenthesised argument so the line
# behaves the same on Python 2 and is also valid on Python 3.
phrases = [generate_phrase(20, word_model) for i in range(0, 2)]
print(phrases)

[u'sport of it is my duty, As doth a sail, a sail, a sail, a sail!" Enter a King and', u'tower in hope, as in dumb shows and noise. would have done this? What, this, you gods? Why, this it']


In [24]:
# Ten phrases stored as literals (presumably pasted from an earlier
# generate_phrase run -- TODO confirm) so they can be printed one per line below.
output = [u"came. And him to his short banishment, He should have ask'd you. [Within] And you the more. am your master,", u"foul fault. [A parley sounded] The town will rise. God's will, thou wrong'st him, Somerset; His grandfather was Lionel Duke", u'in London, keeper of the fashion? Bora. Not honestly, my lord, trust not simple Henry nor his turkey-cocks. God pless', u'bind them, gentle Publius- Caius and Valentine, lay hands on them. How likes Gremio these quick-witted folks? Believe me, no;', u'Vows for thee to thy master. Exit May these add to it. My ring hold dear as my grief and', u"Must (like a whore) unpack my heart o'erweens too much, Unless my sins are writ, and that's as easy broke", u"controll'd in that it stands me much that this treason with thy yard As thou lov'st her. Farewell. Exit No", u"noverbs. Give me Bianca for my brother back again, And dare us with our woes And make our batt'ry next.", u"Buckingham, doth York intend no harm in their guiltiness. Go you and all the world oppos'd, he being drest to", u"cold, smile and take't for a looking-glass, And entertain a cheerful disposition. To please his fantasy. Re-enter Iago. How now,"]

In [29]:
# Show each saved phrase on its own line.  print is called with a single
# parenthesised argument so the line behaves the same on Python 2 and is also
# valid on Python 3.
for phrase in output:
    print(phrase)

came. And him to his short banishment, He should have ask'd you. [Within] And you the more. am your master,
foul fault. [A parley sounded] The town will rise. God's will, thou wrong'st him, Somerset; His grandfather was Lionel Duke
in London, keeper of the fashion? Bora. Not honestly, my lord, trust not simple Henry nor his turkey-cocks. God pless
bind them, gentle Publius- Caius and Valentine, lay hands on them. How likes Gremio these quick-witted folks? Believe me, no;
Vows for thee to thy master. Exit May these add to it. My ring hold dear as my grief and
Must (like a whore) unpack my heart o'erweens too much, Unless my sins are writ, and that's as easy broke
controll'd in that it stands me much that this treason with thy yard As thou lov'st her. Farewell. Exit No
noverbs. Give me Bianca for my brother back again, And dare us with our woes And make our batt'ry next.
Buckingham, doth York intend no harm in their guiltiness. Go you and all the world oppos'd, he being drest to
cold, sm