# Problem 6

In [1]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.ticker as ticker   

import urllib
import numpy as np
from scipy import stats
import re

# setup spark
conf = SparkConf().setAppName('MarkovShakespeare')
sc = SparkContext(conf=conf)

In [2]:
# load textlist from net
filename = "pg100.txt"

url = "http://www.gutenberg.org/cache/epub/100/pg100.txt"

wordfile = urllib.URLopener()
wordfile.retrieve(url, filename)

('pg100.txt', <httplib.HTTPMessage instance at 0x110a990e0>)

In [3]:
# the texts from shakespeare are contained in lines 182 - 124367
# create the file Shakespeare.txt using the line range from above
# if all is ok, this should return True, True
with open(filename, 'r') as f:
    lines = f.readlines()
    lines = lines[181:124367]
    with open('Shakespeare.txt', 'w') as f2:
        f2.writelines(lines)
    print f2.closed
f.closed

True


True

In [28]:
# Pardon the naming, but I couldn't resist!
def loadShakespearsBrain(filename):
    words = sc.textFile(filename)
    
    # split lines into words (after whitespace)
    # before splitting replace first some escape characters with white space
    rdd = words.flatMap(lambda line: line.replace('\n', ' ').replace('\r', ' ').split(' ')).cache()
    
    # now perform filtering on the words
    # to develop the regular expression this awesome tool was used https://regex101.com

    # (1) filter out words that only contain numbers
    # (2) filter out words for which all letters are capitalized
    # (3) filter out words that contain letters only and end with a period

    pattern1 = re.compile(ur'([0-9]+\.?)|\.')  # use only a simple RE for numbers here (numbers in the shakespeare text are formateed as 1., 2., ...)
    pattern2 = re.compile(ur'(\b[A-Z]+\.?\b)|\.') # ==> do (2) & (3) together in one regex!

    # filter does not change the order
    rdd = rdd.filter(lambda x: pattern1.match(x) is None).filter(lambda x: pattern2.match(x) is None).filter(lambda x: len(x) > 0)
    
    # use index to identify the neighbored words!
    rdd = rdd.zipWithIndex()
    rdd = rdd.flatMap(lambda x: [(x[1], (0, x[0])), (x[1] - 1, (1, x[0])), (x[1] - 2, (2, x[0]))])
    rdd = rdd.groupByKey()

    # map (keyA, (keyA, wordA), (keyA + 1, wordA+1), (keyA + 2, wordA+2)) to (wordA, wordA+1, wordA+2)
    fun = lambda x: list(x[1])
    rdd = rdd.map(fun)

    # previous mapping created some tuples, that do not have length 3 at the beginning / end of the text
    # ==> get rid of them!
    rdd = rdd.filter(lambda x: len(x) == 3)

    # count number of combinations (key is the 3-word combination)
    rdd = rdd.map(lambda x: ((x[0][1], x[1][1], x[2][1]), 1))
    rdd = rdd.reduceByKey(lambda x, y: x + y)

    # remap form
    rdd = rdd.map(lambda x: ((x[0][0], x[0][1]), (x[0][2], x[1])))

    # final group by key
    rdd = rdd.groupByKey().map(lambda x: (x[0], sorted(list(x[1])) )) \
             .sortBy(lambda x: x[0])
    
    return rdd

In [29]:
# test the code
rdd = loadShakespearsBrain('Shakespeare.txt')
test = rdd.map(lambda x: x).lookup((u'Now', u'is'))
test

[[(u'Mortimer', 1),
  (u'a', 1),
  (u'be', 1),
  (u'he', 1),
  (u'his', 1),
  (u'it', 3),
  (u'my', 2),
  (u'that', 1),
  (u'the', 9),
  (u'this', 1),
  (u'your', 1)]]

In [30]:
# phrase generator
def generatePhrase(num_of_words, rdd):
    phrase = ''
    for i in range(0, num_of_words - 1):

        if i == 0:
            cur_sample = rdd.takeSample(False, 1)[0]
            phrase = cur_sample[0][0] + ' ' + cur_sample[0][1]
        else:
            cur_sample = (cur_phrase, rdd.map(lambda x:x).lookup(cur_phrase)[0])   

        cur_phrase = cur_sample[0]
        nextwordlist = cur_sample[1]

        # to choose the next word an next tuple, we have to draw from the random distribution that is described by the word lost
        # this can be done manually (like here) or using scipy.stats i.e.

        xk = np.arange(0, len(nextwordlist))
        pk = np.array([w[1] for w in nextwordlist])
        # normalize probabilities
        pk = np.divide(pk, np.sum(pk) * 1.0)
        word_distribution = stats.rv_discrete(name='word_distribution', values=(xk, pk))

        # draw sample
        index = word_distribution.rvs(size=1)[0]

        # the next word is now at position index of the list
        next_word = nextwordlist[index][0]

        phrase += ' ' + next_word
        cur_phrase = (cur_phrase[1], next_word)
        
    return phrase

In [31]:
# test code
generatePhrase(20, rdd)

u'effuse of blood and made his peace and gentle as a ducat for delivering your letter; for it is thy business'

## Old Code

In [4]:
# filtering is done by spark
words = sc.textFile('Shakespeare.txt')

In [5]:
# split lines into words (after whitespace)
# before splitting replace first some escape characters with white space
rdd = words.flatMap(lambda line: line.replace('\n', ' ').replace('\r', ' ').split(' '))

In [6]:
# now perform filtering on the words
# to develop the regular expression this awesome tool was used https://regex101.com

# (1) filter out words that only contain numbers
# (2) filter out words for which all letters are capitalized
# (3) filter out words that contain letters only and end with a period

pattern1 = re.compile(ur'([0-9]+\.?)|\.')  # use only a simple RE for numbers here (numbers in the shakespeare text are formateed as 1., 2., ...)
pattern2 = re.compile(ur'(\b[A-Z]+\.?\b)|\.') # ==> do (2) & (3) together in one regex!

# filter does not change the order

# FILTERING DOES NOT WORK YET GOOD!

# first pattern works, second has troubles!
rdd = rdd.filter(lambda x: pattern1.match(x) is None).filter(lambda x: pattern2.match(x) is None).filter(lambda x: len(x) > 0)

In [7]:
rdd.take(5)

[u'From', u'fairest', u'creatures', u'we', u'desire']

In [8]:
# zip with index
rdd = rdd.zipWithIndex()

In [9]:
rdd.take(5)

[(u'From', 0), (u'fairest', 1), (u'creatures', 2), (u'we', 3), (u'desire', 4)]

In [10]:
#rdd.map(lambda x:x).lookup(u'Now')

In [11]:
# map to list
rdd = rdd.flatMap(lambda x: [(x[1], (0, x[0])), (x[1] - 1, (1, x[0])), (x[1] - 2, (2, x[0]))])

In [12]:
rdd = rdd.groupByKey()

In [13]:
# map (keyA, (keyA, wordA), (keyA + 1, wordA+1), (keyA + 2, wordA+2)) to (wordA, wordA+1, wordA+2)
fun = lambda x: list(x[1])
rdd = rdd.map(fun)

In [14]:
# HIER NOCH UNBEDINGT ALLE TUPEL RAUSSCHMEISSEN, DEREN LISTE != 3 Elemente enthaelt
rdd = rdd.filter(lambda x: len(x) == 3)

In [15]:
# map such that the three words form the key with value 1, we use that then to sum up everything
#gfun = lambda x: ((x[0][1], x[1][1], x[2][1]), 1)

# try to combine
gfun = lambda x: ((x[0][1], x[1][1], x[2][1]), 1)
#gfun = lambda x: (x[0][1], 1)
#rdd = rdd.map(gfun)

In [16]:
rdd = rdd.map(gfun)

In [17]:
rdd = rdd.reduceByKey(lambda x,y:x+y)
#temp.take(5)

In [18]:
# remap form
rdd = rdd.map(lambda x: ((x[0][0], x[0][1]), (x[0][2], x[1])))

In [19]:
# final group by key
# maybe sort list after it!!!
rdd = rdd.groupByKey().map(lambda x: (x[0], sorted(list(x[1])) ))

In [20]:
test = rdd.map(lambda x: x).lookup((u'Now', u'is'))

In [21]:
test

[[(u'Mortimer', 1),
  (u'a', 1),
  (u'be', 1),
  (u'he', 1),
  (u'his', 1),
  (u'it', 3),
  (u'my', 2),
  (u'that', 1),
  (u'the', 9),
  (u'this', 1),
  (u'your', 1)]]

In [22]:
# now, it is time to start the Markov chain!
cur_sample = [((u'Now', u'is'), test[0])][0]
#cur_phrase = rdd.takeSample(False, 1)[0]
cur_phrase = cur_sample[0]
cur_phrase

(u'Now', u'is')

In [23]:
# how long shall the phrase be?
num_words = 20

In [24]:
phrase = ''
for i in range(0, num_words - 1):
    
    if i == 0:
        cur_sample = rdd.takeSample(False, 1)[0]
        phrase = cur_sample[0][0] + ' ' + cur_sample[0][1]
    else:
        cur_sample = (cur_phrase, rdd.map(lambda x:x).lookup(cur_phrase)[0])   
    
    cur_phrase = cur_sample[0]
    nextwordlist = cur_sample[1]

    print('%dth sample is %s' % (i, str(cur_sample)))
    
    # to choose the next word an next tuple, we have to draw from the random distribution that is described by the word lost
    # this can be done manually (like here) or using scipy.stats i.e.

    xk = np.arange(0, len(nextwordlist))
    pk = np.array([w[1] for w in nextwordlist])
    # normalize probabilities
    pk = np.divide(pk, np.sum(pk) * 1.0)
    word_distribution = stats.rv_discrete(name='word_distribution', values=(xk, pk))

    # draw sample
    index = word_distribution.rvs(size=1)[0]

    # the next word is now at position index of the list
    next_word = nextwordlist[index][0]
    print('next word is   %s' % next_word)

    phrase += ' ' + next_word
    cur_phrase = (cur_phrase[1], next_word)
    print(cur_phrase)

0th sample is ((u'parting', u'all'), [(u'the', 1)])
next word is   the
(u'all', u'the')
1th sample is ((u'all', u'the'), [(u"'orld:", 1), (u'Andronici', 1), (u'Duke', 1), (u'English', 1), (u'Fairies,', 1), (u'Greekish', 3), (u'Greeks', 1), (u'Guests', 1), (u'Indies', 1), (u'Inns', 1), (u'Muses', 1), (u'Parthian', 1), (u'Roman', 1), (u'Romans,', 1), (u'Volsces,', 1), (u'Welshmen,', 1), (u'accoutrement,', 1), (u'admired', 1), (u'age', 1), (u'air,', 1), (u'alehouses', 1), (u'all', 1), (u'and', 1), (u'art', 1), (u'battlements', 1), (u'battles', 1), (u'beauty', 1), (u'beholders', 1), (u'benefits', 1), (u'better', 2), (u'better;', 1), (u'bitterest', 1), (u'blessings', 1), (u'blood', 1), (u"body's", 1), (u'books', 1), (u'boys', 1), (u'breathers', 1), (u'brothers', 1), (u'budding', 1), (u'building', 1), (u'business', 1), (u'cabileros', 1), (u'care', 1), (u'ceremony', 1), (u'chamber', 1), (u'characters', 1), (u'charms', 1), (u'chests', 1), (u'chivalry', 1), (u'choicest', 1), (u'church', 1), (u'

In [25]:
cur_sample

((u'lord,', u'or'), [(u'any', 1), (u'else', 1), (u'you,', 1)])

In [26]:
phrase

u'parting all the world, Let me see what good turns now unto the point of death. Rescue, fair lord, or you,'