In [1]:
import pyspark
import re
import random
# Create the Spark context used by every later cell; the application is named "P6".
# NOTE(review): re-running this cell in a live kernel presumably fails because a
# context already exists - confirm, and restart the kernel before re-running.
sc = pyspark.SparkContext(appName="P6")

In [2]:
def splitLine(line):
    """Break one line of text into its whitespace-separated tokens.

    Returns a list of tokens; a blank or whitespace-only line gives ``[]``.
    """
    tokens = line.split()
    return tokens
def filter_word(word):
    """Decide whether ``word`` is kept in the corpus.

    A word is rejected (``False``) when nothing is left of it after either
    of these removals:

    * stripping every ASCII digit (so ``"1609"`` is rejected), or
    * stripping every run of upper-case ASCII letters together with any
      periods that directly follow the run (so ``"ACT"`` and ``"HAMLET."``
      are rejected).

    Any other leftover character keeps the word, e.g. the comma in
    ``"BRUTUS,"``. The empty string is rejected.

    Args:
        word: the token to test.

    Returns:
        bool: ``True`` if the word should be kept.
    """
    has_non_digit = len(re.sub(r'[0-9]', "", word)) > 0
    # Raw string: in a plain literal '\.' is an invalid escape sequence, which
    # newer Python versions warn about. The regex itself is unchanged.
    has_non_caps = len(re.sub(r'[A-Z]+\.*', "", word)) > 0
    return has_non_digit and has_non_caps
def format_rdd(node, words_list):
    """Build one trigram record from a ``(word, index)`` pair.

    Args:
        node: a pair whose first item is a word and whose second item is
            an index into ``words_list``.
        words_list: the sequence the index refers to.

    Returns:
        ``((word, next_word), [(word_after_next, 1)])`` when two more words
        follow ``index`` in ``words_list``; otherwise ``None``.
    """
    position = node[1]
    # The last two positions have no complete pair-plus-follower window.
    if position >= len(words_list) - 2:
        return None
    pair_key = (node[0], words_list[position + 1])
    follower = words_list[position + 2]
    return (pair_key, [(follower, 1)])
def update_hash(word_hash, arr):
    """Accumulate ``(word, count)`` pairs into ``word_hash`` in place.

    Args:
        word_hash: dict mapping a word to its running count; it is mutated.
        arr: iterable of ``(word, count)`` pairs to add.

    Returns:
        None. The result is the mutation of ``word_hash``.
    """
    for entry, amount in arr:
        if entry not in word_hash:
            # First sighting: store the count as-is.
            word_hash[entry] = amount
            continue
        word_hash[entry] += amount
    
def reduce_tuples(x, y):
    """Merge two follower-count lists into one.

    Both arguments are iterables of ``(word, count)`` pairs. Counts for the
    same word are summed through ``update_hash``.

    Args:
        x: first iterable of ``(word, count)`` pairs.
        y: second iterable of ``(word, count)`` pairs.

    Returns:
        list: ``(word, count)`` pairs with one entry per distinct word.
    """
    merged = {}
    update_hash(merged, x)
    update_hash(merged, y)
    # On Python 3 ``dict.items()`` is a view, not a list. Materialise it so
    # the value has the same list type as the inputs and can be serialised
    # when this function is used as a reducer. On Python 2 this is a no-op.
    return list(merged.items())
    

In [5]:
# Load the corpus as an RDD of text lines, then flatten it into an RDD of
# whitespace-separated tokens (see splitLine).
lines = sc.textFile('Shakespeare.txt')
words = lines.flatMap(splitLine)
# Displayed value: total number of tokens before filtering.
words.count()

904061

In [6]:
# Keep only the tokens accepted by filter_word.
filtered_words = words.filter(filter_word)
# Displayed value: number of tokens that survive the filter.
filtered_words.count()

829777

In [7]:
# Materialise every filtered token as a local Python list; the next cell
# indexes into it to look up the words that follow each position.
words_list = filtered_words.collect()
# Sanity check: show the second token.
words_list[1]

u'Project'

In [8]:
# Share the collected word list with the executors as a broadcast variable.
# Referencing ``words_list`` directly inside the lambda would serialise the
# whole list into the closure that is shipped with every task.
words_broadcast = sc.broadcast(words_list)
# Pair each token with its position, then turn each position into a
# ((word, next_word), [(word_after_next, 1)]) record. format_rdd returns None
# for the final two positions, which are dropped.
indexed_words = filtered_words.zipWithIndex()
indexed_words = indexed_words.map(lambda x: format_rdd(x, words_broadcast.value)).filter(lambda x: x is not None)
# Displayed value: number of trigram records.
indexed_words.count()

829775

In [9]:
# Combine all records that share a (word, next_word) key, merging their
# follower counts with reduce_tuples.
word_model = indexed_words.reduceByKey(reduce_tuples)
# Order the model by its (word, next_word) key.
word_model = word_model.sortByKey()

In [10]:
# Peek at the first five entries of the sorted model.
word_model.take(5)

[((u'"', u'"'),
  [(u'DEMETRIUS,', 1),
   (u'DION,', 1),
   (u'SELEUCUS,', 1),
   (u'BARDOLPH,', 1),
   (u'-', 1),
   (u'THYREUS,', 1),
   (u'DIOMEDES,', 1),
   (u'BEAU,', 1),
   (u'LUCIUS,', 1),
   (u'GREY,', 1),
   (u'LIGARIUS,', 1),
   (u'NYM,', 1),
   (u'CLEOPATRA,', 1),
   (u'LORENZO,', 1),
   (u'CLAUDIO,', 1),
   (u'PROCULEIUS,', 1),
   (u'PINDARUS,', 1),
   (u'"', 110),
   (u'STRATO,', 1),
   (u'BATES,', 1),
   (u'PHILO,', 1),
   (u'CINNA,', 1),
   (u'SIXTH,', 1),
   (u'DUMAIN,', 1),
   (u'CLEOMENES,', 1),
   (u'DERCETAS,', 1),
   (u'KATHARINE,', 1),
   (u'EROS,', 1),
   (u'BRUTUS,', 1),
   (u'TOUCHSTONE,', 1),
   (u'ANTONIO,', 2),
   (u'PERCY,', 1),
   (u'EXETER,', 1),
   (u'MENAS,', 1),
   (u'DARDANIUS,', 1),
   (u'DOLABELLA,', 1),
   (u'ORLANDO,', 1),
   (u'WILLIAMS,', 1),
   (u'ERPINGHAM,', 1),
   (u'SCARUS,', 1),
   (u'ARMADO,', 1),
   (u'ARTEMIDORUS,', 1),
   (u'MONTJOY,', 1),
   (u'FLUELLEN,', 1),
   (u'GALLUS,', 1),
   (u'VOLUMNIUS,', 1),
   (u'VARRIUS,', 1),
   (u'GRATI

In [11]:
# Show the follower counts recorded for the word pair ("Now", "is").
# NOTE(review): the identity map before lookup looks like a workaround,
# presumably to drop the partitioner left by sortByKey - confirm it is still needed.
word_model.map(lambda x: x).lookup((u'Now', u'is'))

[[(u'a', 1),
  (u'be', 1),
  (u'his', 1),
  (u'that', 1),
  (u'this', 1),
  (u'Mortimer', 1),
  (u'it', 3),
  (u'the', 9),
  (u'my', 2),
  (u'your', 1),
  (u'he', 1)]]

In [14]:
def find_next(words):
    """Pick one word at random, weighted by its count.

    Args:
        words: sequence of ``(word, count)`` entries.

    Returns:
        The chosen word. A single integer is drawn uniformly from
        ``1..sum(counts)`` and the first entry whose cumulative count
        reaches that integer is returned.
    """
    total_weight = sum(entry[1] for entry in words)
    target = random.randint(1, total_weight)
    running = 0
    for entry in words:
        running += entry[1]
        if running >= target:
            return entry[0]
def generate_phrase(num_words, rdd):
    """Generate a random phrase of at most ``num_words`` words.

    The phrase starts from the key of one randomly sampled model entry and
    is extended one word at a time: the last two words are looked up in the
    model and the next word is drawn with ``find_next`` from the follower
    counts found there.

    Args:
        num_words: maximum number of words in the phrase. Values of zero or
            less give an empty string.
        rdd: the model, an RDD of ``((word, next_word), followers)`` entries
            where ``followers`` is a list of ``(word, count)`` pairs.

    Returns:
        str: the words joined by single spaces. The phrase is shorter than
        ``num_words`` when the last two words have no entry in the model,
        and empty when the model itself has no entries.
    """
    if num_words <= 0:
        return ""

    seed = rdd.takeSample(True, 1)
    if not seed:
        # Empty model: there is no starting pair to build from.
        return ""
    seed_key = seed[0][0]
    phrase = [seed_key[0], seed_key[1]]

    # The identity-mapped RDD is the same for every lookup, so build it once
    # instead of on every loop iteration.
    # NOTE(review): the identity map is kept from the original code,
    # presumably as a workaround for lookup on the sorted RDD - confirm it
    # is still needed.
    lookup_rdd = rdd.map(lambda x: x)
    while len(phrase) < num_words:
        followers = lookup_rdd.lookup((phrase[-2], phrase[-1]))
        if not followers:
            # Dead end: this word pair never occurs as a key.
            break
        phrase.append(find_next(followers[0]))

    # The seed contributes two words, so trim when fewer were requested.
    return " ".join(phrase[:num_words])

In [15]:
# How many sample phrases to generate, and the maximum words in each.
NUM_PHRASES = 2
PHRASE_LENGTH = 20
phrases = [generate_phrase(PHRASE_LENGTH, word_model) for _ in range(NUM_PHRASES)]
# print(...) with a single argument gives the same output on Python 2 and 3;
# the bare ``print phrases`` statement is a syntax error on Python 3.
print(phrases)

[u"sight, poor mortal living ghost, Woe's scene, world's shame, grave's due by promise, For which we will hear anon. Egeus,", u"lost. Then you have fed upon this quiet life! want work.' 'O my sweet son. Dead life, blind sight, poor"]
