In [57]:
import numpy as np

# Layout the loaders below assume for every sonnet they read:
# NUM_QUATRAINS stanzas of QUATRAIN_LINES lines, then one stanza of
# COUPLET_LINES lines (3 * 4 + 2 = 14 = SONNET_LINES).
SONNET_LINES = 14
NUM_QUATRAINS = 3
QUATRAIN_LINES = 4
COUPLET_LINES = 2
# Moved Sonnet 99 to bottom because it has 15 lines.
# Number of sonnets the loaders read, counted from the top of the data file.
# NOTE(review): the recorded output of the quatrain/couplet cell further down
# shows 153 couplets, which does not match 152 -- this constant may have been
# edited without re-running that cell; confirm with a fresh run.
NUM_SHKSP_SONNETS = 152

# Characters that simple_token1 detaches from the end of a line.
PUNCTUATION = [',', ':', '.', ';', '?']

############################################################
# Read in raw sonnets.
############################################################

def shksp_raw(filename='data/shakespeare.txt'):
    """Read the raw sonnet file and return its non-empty lines.

    The file is read as plain text instead of going through ``np.loadtxt``:
    newer NumPy releases reject a newline ``delimiter``, and ``loadtxt``
    additionally treats ``#`` as a comment marker, which would silently
    truncate any line of verse containing that character.

    Args:
        filename: Path of the text file to read.

    Returns:
        A 1-D NumPy array of strings, one entry per non-empty line. Line
        terminators are removed; leading and trailing spaces are kept, so
        callers still see each line's original indentation.
    """
    with open(filename) as handle:
        # Strip only the line terminator so indentation survives.
        rows = [row.strip('\r\n') for row in handle]
    # Drop lines that are completely empty (the separators between sonnets).
    return np.array([row for row in rows if row], dtype='str')

############################################################
# Different tokenizers per line.
############################################################

# simple_token1:
#   Checks back of each line for punctuation and if found, 
#   replaces with a single token of the punctuation and a 
#   newline character. Punctuation within line is attached
#   to word on left.
def simple_token1(line):
    """Tokenize one line, splitting trailing punctuation into its own token.

    The line is lowercased, stripped of surrounding whitespace and split on
    single spaces. If the final word ends with a character from PUNCTUATION,
    that character is removed from the word and appended as a separate token
    with a newline attached (e.g. ``',\\n'``). Punctuation elsewhere in the
    line stays attached to the word on its left.

    Args:
        line: One line of text.

    Returns:
        A list of string tokens. A blank or whitespace-only line yields
        ``['']``, the same result simple_token2 gives for such input.
    """
    words = line.lower().strip().split(' ')
    # str.split(' ') always returns at least one element, so pop() is safe.
    final = words.pop()
    # `final` is empty for a blank line; without this guard final[-1]
    # raised IndexError.
    if final and final[-1] in PUNCTUATION:
        words.append(final[:-1])
        words.append(final[-1] + '\n')
    else:
        words.append(final)
    return words

# simple_token2:
#   All punctuation are attached to word on left, no 
#   newline characters.
def simple_token2(line):
    """Lowercase a line, trim its ends and split it on single spaces.

    Punctuation stays attached to the word on its left and no newline
    token is added.
    """
    return line.lower().strip().split(' ')

# simple_token3:
#   All punctuation attached to word on left. Newline
#   characters for each line.
def simple_token3(line):
    """Lowercase a line, trim its ends, split on single spaces, then
    terminate the token list with a newline token.

    Punctuation stays attached to the word on its left.
    """
    tokens = line.lower().strip().split(' ')
    tokens.append('\n')
    return tokens

############################################################
# Preprocess Shakespeare sonnets.
############################################################

def shksp_per_sonnet(tokenizer, filename='data/shakespeare.txt'):
    """Build one token sequence per sonnet.

    The file is treated as consecutive records of one header line (the
    sonnet number) followed by SONNET_LINES lines of verse. Only the first
    NUM_SHKSP_SONNETS records are read.

    Args:
        tokenizer: Callable mapping one raw line to a list of tokens.
        filename: Path of the sonnet text file.

    Returns:
        A list with one entry per sonnet; each entry is the concatenation
        of the tokens of that sonnet's lines, in order.
    """
    lines = shksp_raw(filename)
    # Each record is the header line plus the verse lines.
    stride = SONNET_LINES + 1
    sonnets = []
    for sonnet_idx in range(NUM_SHKSP_SONNETS):
        # +1 steps over the header line holding the sonnet number.
        first = sonnet_idx * stride + 1
        tokens = []
        for offset in range(SONNET_LINES):
            tokens.extend(tokenizer(lines[first + offset]))
        sonnets.append(tokens)
    return sonnets

def shksp_per_line(tokenizer, filename='data/shakespeare.txt'):
    """Build one token sequence per line of verse.

    The file is treated as consecutive records of one header line (the
    sonnet number) followed by SONNET_LINES lines of verse. Only the first
    NUM_SHKSP_SONNETS records are read; header lines are not tokenized.

    Args:
        tokenizer: Callable mapping one raw line to a list of tokens.
        filename: Path of the sonnet text file.

    Returns:
        A flat list holding the tokenizer's result for every verse line,
        in file order.
    """
    lines = shksp_raw(filename)
    # Each record is the header line plus the verse lines.
    stride = SONNET_LINES + 1
    tokenized = []
    for sonnet_idx in range(NUM_SHKSP_SONNETS):
        # +1 steps over the header line holding the sonnet number.
        first = sonnet_idx * stride + 1
        for offset in range(SONNET_LINES):
            tokenized.append(tokenizer(lines[first + offset]))
    return tokenized

def shksp_quatrains_and_couplets(tokenizer, filename='data/shakespeare.txt'):
    """Split each sonnet into quatrain and couplet token sequences.

    The file is treated as consecutive records of one header line followed
    by NUM_QUATRAINS blocks of QUATRAIN_LINES lines and then COUPLET_LINES
    closing lines. Only the first NUM_SHKSP_SONNETS records are read.

    Args:
        tokenizer: Callable mapping one raw line to a list of tokens.
        filename: Path of the sonnet text file.

    Returns:
        A ``(quatrains, couplets)`` tuple. ``quatrains`` has one token list
        per quatrain (NUM_QUATRAINS per sonnet); ``couplets`` has one token
        list per sonnet.
    """
    lines = shksp_raw(filename)
    quatrains, couplets = [], []
    quatrain_span = NUM_QUATRAINS * QUATRAIN_LINES
    # Header line + all quatrain lines + couplet lines.
    stride = 1 + quatrain_span + COUPLET_LINES
    for sonnet_idx in range(NUM_SHKSP_SONNETS):
        # +1 steps over the header line of this record.
        base = sonnet_idx * stride + 1
        for stanza_idx in range(NUM_QUATRAINS):
            stanza_start = base + stanza_idx * QUATRAIN_LINES
            stanza = []
            for offset in range(QUATRAIN_LINES):
                stanza.extend(tokenizer(lines[stanza_start + offset]))
            quatrains.append(stanza)
        closing_start = base + quatrain_span
        closing = []
        for offset in range(COUPLET_LINES):
            closing.extend(tokenizer(lines[closing_start + offset]))
        couplets.append(closing)
    return quatrains, couplets

In [72]:
############################################################
# Generate from trained models.
############################################################
def gen_txt(trans, emiss, init, word_map):
    """Sanity-check the dimensions of a trained model.

    Only the validation step is present in this definition; `init` and
    `word_map` are accepted but not used, and nothing is returned.

    Args:
        trans: Transition matrix, indexable as rows of equal length.
        emiss: Emission matrix, one row per state.
        init: Unused here.
        word_map: Unused here.

    Raises:
        AssertionError: If `trans` is not square or `emiss` does not have
            one row per state.
    """
    state_count = len(trans)
    # Evaluated before the checks, so an empty `emiss` fails here first.
    vocab_size = len(emiss[0])
    assert (len(trans[0]) == state_count), 'Transition matrix is not square.'
    assert (len(emiss) == state_count), 'Emission matrix not correct dimensions.'
    

In [71]:
# Scratch check: load the raw lines, then show how many there are and what
# the last one looks like (leading spaces are preserved by the loader).
tmp = shksp_raw()
print len(tmp)
print tmp[-1]

2309
  But sweet, or colour it had stol'n from thee.


In [59]:
# Tokenize with simple_token3 and count the resulting quatrains and couplets.
# NOTE(review): the recorded counts (459 / 153) correspond to 153 sonnets,
# not the NUM_SHKSP_SONNETS = 152 shown in the first cell -- re-run on a
# fresh kernel to confirm which value is intended.
q, c = shksp_quatrains_and_couplets(simple_token3)
print len(q)
print len(c)

459
153


In [61]:
# Dump the first NUM_SHKSP_SONNETS - 1 couplets for eyeballing; the extra
# '\n' argument leaves blank space between consecutive couplets.
for i in range(NUM_SHKSP_SONNETS - 1):
    print c[i], '\n'

['pity', 'the', 'world,', 'or', 'else', 'this', 'glutton', 'be,', '\n', 'to', 'eat', 'the', "world's", 'due,', 'by', 'the', 'grave', 'and', 'thee.', '\n'] 

['this', 'were', 'to', 'be', 'new', 'made', 'when', 'thou', 'art', 'old,', '\n', 'and', 'see', 'thy', 'blood', 'warm', 'when', 'thou', "feel'st", 'it', 'cold.', '\n'] 

['but', 'if', 'thou', 'live', 'remembered', 'not', 'to', 'be,', '\n', 'die', 'single', 'and', 'thine', 'image', 'dies', 'with', 'thee.', '\n'] 

['thy', 'unused', 'beauty', 'must', 'be', 'tombed', 'with', 'thee,', '\n', 'which', 'used', 'lives', "th'", 'executor', 'to', 'be.', '\n'] 

['but', 'flowers', 'distilled', 'though', 'they', 'with', 'winter', 'meet,', '\n', 'leese', 'but', 'their', 'show,', 'their', 'substance', 'still', 'lives', 'sweet.', '\n'] 

['be', 'not', 'self-willed', 'for', 'thou', 'art', 'much', 'too', 'fair,', '\n', 'to', 'be', "death's", 'conquest', 'and', 'make', 'worms', 'thine', 'heir.', '\n'] 

['so', 'thou,', 'thy', 'self', 'out-going', 'in

In [62]:
# Scratch check of np.random.choice with an integer population of size 1:
# every one of the 10 draws is 0, as the recorded output shows.
np.random.choice(1, 10)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [65]:
a = (4 == 4)