In [16]:
import findspark
# For home PC
#findspark.init('/home/nick/spark')
# For macbook
findspark.init('/Users/nick/Documents/spark')

In [17]:
import pyspark
sc = pyspark.SparkContext(appName='Shakespeare')

In [18]:
shakes = sc.textFile('shakespeare.txt')

In [53]:
import re
import random

In [21]:
def word_match(line):
    """Matches all words in a line.
    To be used with flatMap, is it returns a tuple of all the words in a given line.
    """
    # Just one or more letters
    a_word = re.compile(r"\w+")
    # Match all the words in a given line
    matches = a_word.findall(line)
    
    # findall returns a list, so...
    return tuple(matches)

In [22]:
words = shakes.flatMap(word_match)

In [23]:
def filter_words(word):
    """Function that uses regular expressions to filter out words that are made of
    entirely capital letters, or entirely capital letters followed by a period, as per
    the problem statement.
    Note that the way we matched words, \w+, has already filtered out words made of only numbers.
    """
    
    length_of_word = len(word)
    
    # Matches one or more capital letters in a row.
    only_capitals = re.compile(r"[A-Z]{" + str(length_of_word) + "}")
    
    # Matches one or more capital letters in a row followed by a period.
    only_capitals_period = re.compile(r"[A-Z]{" + str(length_of_word-1) + "}\.+")
    
    # Only numbers
    only_numbers = re.compile(r"\d{" + str(length_of_word) + "}")
    
    # if it's a capital...
    caps = only_capitals.match(word)
    
    # if it's a capital followed by a period...
    cap_per = only_capitals_period.match(word)
    
    # if it's a number...
    numbs = only_numbers.match(word)
        
    result = True
    if (caps) or (cap_per) or (numbs):
        result = False
        
    return result

In [24]:
words = words.filter(filter_words)

In [25]:
numbered_words = words.zipWithIndex()
numbered_words.take(2)

[(u'The', 0), (u'Project', 1)]

In [26]:
shifted_words = numbered_words.map(lambda (x, y): (y-1, x))
index_words = numbered_words.map(lambda (x, y): (y, x))
print shifted_words.take(10)
print index_words.take(10)

[(-1, u'The'), (0, u'Project'), (1, u'Gutenberg'), (2, u'EBook'), (3, u'of'), (4, u'The'), (5, u'Complete'), (6, u'Works'), (7, u'of'), (8, u'William')]
[(0, u'The'), (1, u'Project'), (2, u'Gutenberg'), (3, u'EBook'), (4, u'of'), (5, u'The'), (6, u'Complete'), (7, u'Works'), (8, u'of'), (9, u'William')]


In [29]:
test = index_words.join(shifted_words)
indices_and_pairs = test

In [30]:
double_shifted_words = numbered_words.map(lambda (x, y): (y-2, x))
almost_there = indices_and_pairs.join(double_shifted_words)
almost_there.take(2)

[(0, ((u'The', u'Project'), u'Gutenberg')),
 (786432, ((u'song', u'of'), u'good'))]

In [32]:
all_pairs_and_followed = almost_there.map(lambda (x, y): (y[0], y[1]))
all_pairs_and_followed.take(10)

[((u'The', u'Project'), u'Gutenberg'),
 ((u'song', u'of'), u'good'),
 ((u'shoe', u'trod'), u'upon'),
 ((u'cue', u'and'), u'my'),
 ((u'Complete', u'Works'), u'of'),
 ((u'and', u'his'), u'earth'),
 ((u'this', u'caparison'), u'and'),
 ((u'search', u'there'), u'shall'),
 ((u'William', u'Shakespeare'), u'This'),
 ((u'la', u'Then'), u'keep')]

In [33]:
def combine(list_of_word_count_tup1, list_of_word_count_tup2):
    """Function to combine two "tuple lists" of the form [(third_word_1, count_1), ...]
    This is to be used with reduceByKey on an RDD of ALL possible occurences of the form (word1, word2), (word3, n).
    We want to allow for the case where we encounter new words, and also the case where the lists overlap.
    If we have a new word, then we just append it to the list now with count one.
    If a word in the second list already exists in the first, we must add their counts.
    """
    
    # First get the list of words that already occur
    list_of_words_1 = map(lambda (x, y): x, list_of_word_count_tup1)
    
    # Now go over every (word, count) pair in the second tuple
    for word_count_tup in list_of_word_count_tup2:
        
        # Access the word for clarity
        word = word_count_tup[0]
        
        # Test membership in the first list
        if word in list_of_words_1:
            # Find out where it occurred so we can get its count
            ind = list_of_words_1.index(word)
            # Tuples are immutable, so we recreate the tuple in Python with the updated count
            list_of_word_count_tup1[ind] = (word, list_of_word_count_tup1[ind][1]+word_count_tup[1])
        else:
            # Otherwise, just add away
            list_of_word_count_tup1.append(word_count_tup)
            
    # Return that bad boy
    return list_of_word_count_tup1

In [34]:
start = all_pairs_and_followed.map(lambda (x, y): (x, [(y, 1)]))
start.take(2)

[((u'The', u'Project'), [(u'Gutenberg', 1)]),
 ((u'song', u'of'), [(u'good', 1)])]

In [45]:
end = start.reduceByKey(combine)
end.take(1)

[((u'thought', u'Suppose'), [(u'that', 1)])]

In [48]:
test = end.takeSample(True, 1)
test1 = test[0]

print test1

((u'apply', u'yourself'), [(u'to', 1)])


In [64]:
def generate_phrase(markov, num_words):
    """Takes a markov chain, assumed to be an RDD of the form described in the problem
    and returns a randomly generated phrase with num_words words. The phrase is generated by
    choosing a pair of words randomly from the RDD, weighting the next word by the relative
    frequencies of occurrences of other words after that pair in Shakespeare, and then
    continues to do so until we have num_words.
    """
    # Our counter
    curr_num_words = 0
    
    # Start us off with a blank phrase
    phrase = []
    
    # Keep going until we are done
    while (curr_num_words < num_words):
        # First flag is for with replacement... doesn't matter because we just do one at a time
        # Note that takeSample returns a list, so we just really want one element from it
        next_three = markov.takeSample(True, 1)[0]
        
        # Get the first two words
        word_1 = next_three[0][0]
        word_2 = next_three[0][1]
        
        # Now get the list of possible words
        poss_words = next_three[1]
        
        # And choose accordingly...
        word_3 = weighted_choice(poss_words)
        
        # Add our words to the phrase...
        phrase.append(word_1)
        phrase.append(word_2)
        phrase.append(word_3)
        
        # Make sure we stop at some point
        curr_num_words += 3
        
    # Now we get it to be a string
    phrase = ' '.join(phrase)
    
    # And we are done!
    return phrase

def weighted_choice(choices):
    """ Inspired by http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
    Given a list of tuples, [(x1, c1), (x2, c2), ...]
    Returns some xn from the list chosen randomly but weighted by the counts cn.
    """
    
    # First get the total count
    total = sum(w for c, w in choices)
    
    # Generate some random number between 0 and the total, uniformly
    r = random.uniform(0, total)
    
    # Variable that stores what "range" we are currently in
    upto = 0
    
    # Variable that we will ultimately return
    result = choices[-1][0]
    
    # Loop over all possible choices and weights
    for c, w in choices:
        
        # If our randomly generated number is in the correct range, take it
        if upto + w > r:
            result = c
            
        # Otherwise check the next range
        upto += w
        
    return result

In [65]:
# Now generate our ten phrases using the Markov Chain
for ii in range(10):
    print generate_phrase(end, 20)

late king s country matters Oph infect to the self confounded to forget am in beguile The tedious gait And speaking
force Their scanted hounds are bred lament or fear no my nephew gratulate his safe persuade the King of wheat hid
indifferent eye You Blame not this of stay to some glorious day of Darius Transported my tale Right love shall render
face That every servant charge thee My master sues loves Caesar best Haven Which is slain as thou possess All the
will choose it duty now am grace Well there And fame in and convey what Semiramis this nymph wonders Went you
er yet beaten strength even at has forsook him lays she to room comes me shall see If See Buckingham Somerset
matters heavy matters Damnable both sides assured Whether yond worms And stop Being a thing daughter Tis he stretch thy chest
office And will lord has paid strength my valour stream And in contestation Was theme prince so wild true hearts cannot
perpetual honour Dar narrow The throng ll rest us reward thee the Perkes