In [20]:
import numpy as np

SONNET_LINES = 14
NUM_QUATRAINS = 3
QUATRAIN_LINES = 4
COUPLET_LINES = 2
NUM_SHKSP_SONNETS = 151

PUNCTUATION = [',', ':', '.', ';', '?', '!', '(', ')']

SYLL_DICT = 'syll_dict.p'

############################################################
# Read in raw sonnets.
############################################################

def shksp_raw(filename='shakespeare.txt'):
    """Read the raw sonnet file and return its non-empty lines.

    The file is read directly instead of through ``np.loadtxt``: newer NumPy
    releases reject a newline ``delimiter``, and ``loadtxt`` additionally
    treats ``#`` as a comment marker, which would silently truncate text.

    Args:
        filename: Path of the text file to read.

    Returns:
        A 1-D NumPy array of strings, one entry per non-empty line. Line
        terminators are removed; leading whitespace is kept (the tokenizers
        strip it themselves).
    """
    with open(filename) as text_file:
        stripped = [raw_line.rstrip('\r\n') for raw_line in text_file]
    # Blank lines only separate sonnets; callers index lines positionally and
    # rely on them being absent.
    return np.array([text for text in stripped if text], dtype=str)

############################################################
# Different tokenizers per line.
############################################################

# simple_token1:
#   Checks back of each line for punctuation and if found, 
#   replaces with a single token of the punctuation and a 
#   newline character. Punctuation within line is attached
#   to word on left.
def simple_token1(line):
    """Tokenize a line, splitting trailing punctuation into its own token.

    The line is lower-cased, stripped of surrounding whitespace and split on
    single spaces. If the final word ends with a character from
    ``PUNCTUATION``, that character is removed from the word and appended as
    a separate token with a newline attached (e.g. ``',\\n'``). Punctuation
    inside the line stays attached to the word on its left.

    Args:
        line: One line of text.

    Returns:
        List of string tokens. An empty or whitespace-only line yields
        ``['']``, matching the other ``simple_token*`` tokenizers.
    """
    tokens = line.lower().strip().split(' ')
    # str.split(' ') always returns at least one element, so pop() is safe.
    last_word = tokens.pop()
    # The truthiness check covers an empty final word (blank line), which
    # previously raised IndexError on last_word[-1].
    if last_word and last_word[-1] in PUNCTUATION:
        tokens.append(last_word[:-1])
        tokens.append(last_word[-1] + '\n')
    else:
        tokens.append(last_word)
    return tokens

# simple_token2:
#   All punctuation are attached to word on left, no 
#   newline characters.
def simple_token2(line):
    """Lower-case and trim ``line``, then split it on single spaces.

    Punctuation stays attached to the word on its left and no newline token
    is added.
    """
    normalized = line.lower().strip()
    return normalized.split(' ')

# simple_token3:
#   All punctuation attached to word on left. Newline
#   characters for each line.
def simple_token3(line):
    """Lower-case and trim ``line``, split on single spaces, end with '\\n'.

    Punctuation stays attached to the word on its left; a newline token is
    appended after the last word of every line.
    """
    tokens = line.lower().strip().split(' ')
    tokens.append('\n')
    return tokens

# simple_token4:
#   Remove all punctuation, no newline character.
def simple_token4(line):
    """Lower-case and trim ``line``, delete punctuation, split on spaces.

    Every character sequence listed in ``PUNCTUATION`` is removed before the
    split; no newline token is added.
    """
    cleaned = line.lower().strip()
    for mark in PUNCTUATION:
        cleaned = cleaned.replace(mark, '')
    return cleaned.split(' ')

############################################################
# Preprocess Shakespeare sonnets.
############################################################

def shksp_per_sonnet(tokenizer, filename='shakespeare.txt'):
    """Return one flat token sequence per sonnet.

    The file is expected to hold ``NUM_SHKSP_SONNETS`` sonnets, each made of
    a number line followed by ``SONNET_LINES`` lines of verse.

    Args:
        tokenizer: Callable turning one line of text into a list of tokens.
        filename: Path of the sonnet file, passed to ``shksp_raw``.

    Returns:
        List of token lists, one per sonnet.
    """
    lines = shksp_raw(filename)
    stride = SONNET_LINES + 1  # number line + verse lines
    sonnets = []
    for sonnet_idx in range(NUM_SHKSP_SONNETS):
        # +1 skips the number line that opens each sonnet.
        body_start = sonnet_idx * stride + 1
        tokens = []
        for offset in range(SONNET_LINES):
            tokens.extend(tokenizer(lines[body_start + offset]))
        sonnets.append(tokens)
    return sonnets

def shksp_per_line(tokenizer, filename='shakespeare.txt'):
    """Return one token sequence per line of verse.

    Args:
        tokenizer: Callable turning one line of text into a list of tokens.
        filename: Path of the sonnet file, passed to ``shksp_raw``.

    Returns:
        List of token lists in file order, number lines excluded.
    """
    lines = shksp_raw(filename)
    stride = SONNET_LINES + 1  # number line + verse lines
    tokenized = []
    for sonnet_idx in range(NUM_SHKSP_SONNETS):
        # +1 skips the number line that opens each sonnet.
        body_start = sonnet_idx * stride + 1
        for offset in range(SONNET_LINES):
            tokenized.append(tokenizer(lines[body_start + offset]))
    return tokenized

def shksp_quatrains_and_couplets(tokenizer, filename='shakespeare.txt'):
    """Split every sonnet into quatrain and couplet token sequences.

    Each sonnet contributes ``NUM_QUATRAINS`` sequences of
    ``QUATRAIN_LINES`` lines each, followed by one sequence covering its
    ``COUPLET_LINES`` closing lines.

    Args:
        tokenizer: Callable turning one line of text into a list of tokens.
        filename: Path of the sonnet file, passed to ``shksp_raw``.

    Returns:
        Tuple ``(quatrains, couplets)`` of lists of flat token lists.
    """
    lines = shksp_raw(filename)
    all_quatrains = []
    all_couplets = []
    pos = 0
    for _ in range(NUM_SHKSP_SONNETS):
        pos += 1  # skip the number line
        for _stanza in range(NUM_QUATRAINS):
            stanza_tokens = []
            for offset in range(QUATRAIN_LINES):
                stanza_tokens.extend(tokenizer(lines[pos + offset]))
            pos += QUATRAIN_LINES
            all_quatrains.append(stanza_tokens)
        closing_tokens = []
        for offset in range(COUPLET_LINES):
            closing_tokens.extend(tokenizer(lines[pos + offset]))
        pos += COUPLET_LINES
        all_couplets.append(closing_tokens)
    return all_quatrains, all_couplets

def shksp_quatrain_couplets_line(tokenizer, filename='shakespeare.txt'):
    """Tokenize line by line, separating quatrain lines from couplet lines.

    Args:
        tokenizer: Callable turning one line of text into a list of tokens.
        filename: Path of the sonnet file, passed to ``shksp_raw``.

    Returns:
        Tuple ``(quatrains, couplets)``; each element is a list holding one
        token list per line of the respective section.
    """
    lines = shksp_raw(filename)
    quatrain_rows = []
    couplet_rows = []
    quatrain_span = NUM_QUATRAINS * QUATRAIN_LINES
    pos = 0
    for _ in range(NUM_SHKSP_SONNETS):
        pos += 1  # skip the number line
        for offset in range(quatrain_span):
            quatrain_rows.append(tokenizer(lines[pos + offset]))
        pos += quatrain_span
        for offset in range(COUPLET_LINES):
            couplet_rows.append(tokenizer(lines[pos + offset]))
        pos += COUPLET_LINES
    return quatrain_rows, couplet_rows

In [None]:
############################################################
# Generate from trained models.
############################################################
def gen_txt(trans, emiss, init, word_map, length, space_symb=' '):
    """Sample a token string from a trained HMM.

    Args:
        trans: Square transition matrix, indexed ``[from_state][to_state]``.
        emiss: Emission matrix, indexed ``[state][token_id]``.
        init: Initial state distribution.
        word_map: Lookup from token id to the text of that token.
        length: Number of tokens to emit.
        space_symb: Text appended after every emitted token.

    Returns:
        The emitted tokens concatenated, each followed by ``space_symb``.

    Raises:
        AssertionError: If the matrix dimensions are inconsistent.
    """
    n_states = len(trans)
    n_words = len(emiss[0])
    assert (n_states == len(trans[0])), 'Transition matrix is not square.'
    assert (n_states == len(emiss)), 'Emission matrix not correct dimensions.'

    state = np.random.choice(n_states, p=init)
    pieces = []
    for _ in range(length):
        # Emit from the current state first, then move to the next state.
        token_id = np.random.choice(n_words, p=emiss[state])
        pieces.append(word_map[token_id] + space_symb)
        state = np.random.choice(n_states, p=trans[state])
    return ''.join(pieces)

In [None]:
class HMM:
    """Discrete-observation hidden Markov model trained with EM (Baum-Welch).

    Tokens are mapped to integer ids by ``registerObs``; ``train`` then
    alternates forward-backward passes (E step) with re-estimation of the
    initial, transition and emission distributions (M step).
    """

    def __init__(self, num_states):
        """Create an untrained model with ``num_states`` hidden states."""
        self.D = 0  # number of unique observations (set by registerObs)
        self.L = num_states  # number of hidden states

        self.token_dict = {}  # maps each token to its integer id

        self.A = None  # transition (row: from; col: to), 0-indexed
        self.PI = None  # initial state distribution, 0-indexed
        self.O = None  # observation (row: state; col: observation), 0-indexed

    def train(self, data, epsilon=0.001, scaling=True, max_iters=None):
        """Fit the model to ``data`` with expectation-maximisation.

        Parameters are initialised randomly (row-normalised uniform draws)
        and updated until the parameter change of the latest iteration is at
        most ``epsilon`` times the change of the first iteration. At least
        two iterations are always run. The iteration count and the final
        ``PI``, ``A`` and ``O`` are printed.

        Args:
            data: Iterable of token sequences. Empty sequences are ignored.
            epsilon: Relative convergence threshold on the parameter change.
            scaling: Whether forward/backward values are rescaled to sum to
                one at every position (guards against underflow).
            max_iters: Optional cap on the number of EM iterations; ``None``
                keeps iterating until convergence.

        Raises:
            ValueError: If ``data`` contains no observations.
            FloatingPointError: If an update yields non-finite parameters
                (e.g. underflow with ``scaling=False``); without this check
                the convergence test could never succeed.
        """
        # Empty sequences carry no information and have no first position.
        X = [seq for seq in self.registerObs(data) if seq]
        if not X:
            raise ValueError('Training data contains no observations.')
        L = self.L
        D = self.D

        # Initialize Matrices
        self.A = self.normalize(np.random.rand(L, L))
        self.PI = self.normalize(np.random.rand(L))
        self.O = self.normalize(np.random.rand(L, D))

        norm_arr = []
        iterations = 0

        while max_iters is None or iterations < max_iters:
            iterations += 1
            # E Step
            alphas_arr = []
            betas_arr = []
            for seq in X:
                alphas, betas = self.forwardBackward(seq, scaling)
                alphas_arr.append(alphas)
                betas_arr.append(betas)

            # M step (Computes marginals + Updates)
            change_norm = self.update(X, alphas_arr, betas_arr)
            if not np.isfinite(change_norm):
                raise FloatingPointError(
                    'EM update produced non-finite parameters.')
            norm_arr.append(change_norm)

            # Stopping Condition (written without a division so that a
            # zero first-iteration change counts as converged).
            if len(norm_arr) > 1 and norm_arr[-1] <= epsilon * norm_arr[0]:
                break

        print(iterations)
        print(self.PI)
        print(self.A)
        print(self.O)

    def registerObs(self, data):
        """Assign integer ids to tokens and return ``data`` as id sequences.

        Resets ``self.D`` and ``self.token_dict``; ids are handed out in
        order of first appearance.
        """
        # Reset Variables
        self.D = 0
        self.token_dict = {}

        X = []  # data transformed into integers corresponding to tokens
        for seq in data:
            X_i = []  # this sequence transformed into integers
            for token in seq:
                if token not in self.token_dict:
                    self.token_dict[token] = self.D
                    self.D += 1
                X_i.append(self.token_dict[token])
            X.append(X_i)
        return X

    @staticmethod
    def normalize(matrix):
        """Return ``matrix`` scaled so a vector, or each row, sums to 1."""
        if matrix.ndim == 1:
            return matrix / matrix.sum()
        return matrix / matrix.sum(axis=1, keepdims=True)

    def forwardBackward(self, seq, scaling=True):
        """Compute alpha and beta values for one sequence of token ids.

        Args:
            seq: Sequence of integer token ids.
            scaling: If true, each row of alphas and of betas is divided by
                its own sum after it is computed.

        Returns:
            Tuple ``(alphas, betas)`` of arrays shaped
            ``(len(seq), num_states)`` (row: position; col: state).
        """
        M = len(seq)  # length of given sequence
        L = self.L  # num of states

        alphas = np.zeros((M, L))  # row: position; col: state
        betas = np.zeros((M, L))  # row: position; col: state
        if M == 0:
            return (alphas, betas)

        obs = np.asarray(seq, dtype=int)
        emit = self.O[:, obs].T  # emit[i, s] = O[s, seq[i]]

        # FORWARD ALGORITHM
        alphas[0] = self.PI * emit[0]
        if scaling:
            alphas[0] /= alphas[0].sum()
        for i in range(1, M):
            # Sum over previous states, then weight by the emission.
            alphas[i] = alphas[i - 1].dot(self.A) * emit[i]
            if scaling:
                alphas[i] /= alphas[i].sum()

        # BACKWARD ALGORITHM
        betas[M - 1] = 1.0
        if scaling:
            betas[M - 1] /= betas[M - 1].sum()
        for i in range(M - 2, -1, -1):
            # Sum over next states of transition * emission * beta.
            betas[i] = self.A.dot(emit[i + 1] * betas[i + 1])
            if scaling:
                betas[i] /= betas[i].sum()

        return (alphas, betas)

    def update(self, X, alphas_arr, betas_arr):
        """Run the M step and return the size of the parameter change.

        Args:
            X: List of integer token-id sequences.
            alphas_arr: Forward values per sequence, as returned by
                ``forwardBackward``.
            betas_arr: Backward values per sequence.

        Returns:
            Sum of the norms of the differences between the old and new
            ``A``, ``O`` and ``PI``.

        Raises:
            ValueError: If ``X`` contains no non-empty sequence.
        """
        L = self.L  # num states
        D = self.D  # num unique tokens

        emit_num = np.zeros((D, L))  # expected emissions, token x state
        state_mass = np.zeros(L)  # expected visits per state
        start_mass = np.zeros(L)  # expected visits at position 0
        trans_num = np.zeros((L, L))  # expected transitions
        trans_den = np.zeros(L)  # expected visits excluding last position
        num_seqs = 0

        for seq, alphas, betas in zip(X, alphas_arr, betas_arr):
            if len(seq) == 0:
                continue
            num_seqs += 1
            obs = np.asarray(seq, dtype=int)
            alphas = np.asarray(alphas)
            betas = np.asarray(betas)

            # gamma[i, s] = P(y_i = s | seq); normalising per position
            # cancels whatever scaling was applied to alphas and betas.
            gamma = alphas * betas
            gamma /= gamma.sum(axis=1, keepdims=True)

            # add.at accumulates correctly when a token id repeats in obs.
            np.add.at(emit_num, obs, gamma)
            state_mass += gamma.sum(axis=0)
            start_mass += gamma[0]

            if len(seq) > 1:
                # xi[i, a, b] = P(y_i = a, y_i+1 = b | seq), built from the
                # current (pre-update) A and O.
                weighted_next = self.O[:, obs[1:]].T * betas[1:]
                xi = (alphas[:-1, :, None] * self.A[None, :, :]
                      * weighted_next[:, None, :])
                xi /= xi.sum(axis=(1, 2), keepdims=True)
                trans_num += xi.sum(axis=0)
                trans_den += gamma[:-1].sum(axis=0)

        if num_seqs == 0:
            raise ValueError('No non-empty sequences to update from.')

        # update O (emission matrix); states with no expected mass keep
        # their previous row instead of dividing by zero.
        O = self.O.copy()
        visited = state_mass != 0
        O[visited] = emit_num.T[visited] / state_mass[visited, None]
        # Make sure numbers add up to 1
        O = self.normalize(O)

        # update PI (initial distribution matrix)
        PI = self.normalize(start_mass / num_seqs)

        # Update A (transition matrix); rows with no observed outgoing
        # transitions (e.g. only length-1 sequences) keep their old values.
        A = self.A.copy()
        left = trans_den != 0
        A[left] = trans_num[left] / trans_den[left, None]
        # Make sure numbers add up to 1
        A = self.normalize(A)

        # Calculate norm of change
        # Frobenius norm of the differences between update and previous matrices
        change_norm = np.linalg.norm(self.A - A) + np.linalg.norm(self.O - O) \
                      + np.linalg.norm(self.PI - PI)

        # update matrices
        self.O = O
        self.PI = PI
        self.A = A

        return change_norm

In [None]:
# Test training.
# q: one token list per quatrain line; c: one token list per couplet line
# (tokens keep their punctuation, no newline tokens).
q, c = shksp_quatrain_couplets_line(simple_token2)
# 20 hidden states; this cell only constructs the model, it does not train it.
q_hmm = HMM(20)

In [14]:
from wordnik import *
# Wordnik client setup, used for hyphenation (syllable) lookups.
# NOTE(review): the endpoint is plain HTTP and the API key is committed in
# source; consider loading the key from the environment and rotating it.
apiUrl = 'http://api.wordnik.com/v4'
apiKey = '552b2562693245ea105020d08c904c58324d0d2b793995895'
client = swagger.ApiClient(apiKey, apiUrl)

wordApi = WordApi.WordApi(client)
# Probe calls with what look like made-up words; presumably checks what
# getHyphenation returns for unknown input (TODO confirm).
example1 = wordApi.getHyphenation('dsad')
example2 = wordApi.getHyphenation('dafe')

In [35]:
# Set up pickle dictionary for syllable processing. 
# DO NOT RUN AFTER COMPLETING DICTIONARY!
import pickle
# Seed the syllable dictionary file with a single entry. This overwrites any
# existing SYLL_DICT file, hence the warning above.
syll_dict = {'from' : 1}
# The with-block closes the file even if pickling fails.
with open(SYLL_DICT, 'wb') as save_file:
    pickle.dump(syll_dict, save_file)

In [36]:
# Method for assisting in creating syllables dict.
def _prompt_syllable_count(word, line):
    """Show ``word`` with its line and read an integer count from stdin."""
    try:
        read_line = raw_input  # Python 2: read text without evaluating it
    except NameError:
        read_line = input  # Python 3
    print('{0} : {1}'.format(word, line))
    while True:
        try:
            return int(read_line().strip())
        except ValueError:
            print('Please enter a whole number of syllables.')


def make_syll_dict(lines):
    """Extend the pickled syllable dictionary with every word in ``lines``.

    Words missing from the dictionary are looked up through the Wordnik
    hyphenation API; the number of returned parts is stored as the syllable
    count. When the API returns nothing, the word is printed together with
    its line and the count is read from standard input. The dictionary is
    written back to ``SYLL_DICT`` after every line so that progress survives
    an interruption.

    Args:
        lines: Iterable of token lists (one list of words per line).

    Returns:
        The updated dictionary mapping word -> syllable count.
    """
    # Setup API connections, reusing the notebook-level Wordnik settings
    # rather than keeping a second copy of the URL and key here.
    client = swagger.ApiClient(apiKey, apiUrl)
    word_api = WordApi.WordApi(client)

    # Read in old dictionary.
    # NOTE: pickle can execute arbitrary code on load; only open a SYLL_DICT
    # file produced by this notebook.
    with open(SYLL_DICT, 'rb') as save_file:
        syll_dict = pickle.load(save_file)

    for line in lines:
        for word in line:
            if syll_dict.get(word) is not None:
                continue
            hyphenation = word_api.getHyphenation(word)
            if hyphenation is not None:
                syll_dict[word] = len(hyphenation)
            else:
                syll_dict[word] = _prompt_syllable_count(word, line)
        # Checkpoint after each line.
        with open(SYLL_DICT, 'wb') as save_file:
            pickle.dump(syll_dict, save_file)

    return syll_dict

In [26]:
# Every verse line as a list of lower-cased words with punctuation removed.
all_lines = shksp_per_line(simple_token4)
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(len(all_lines))

2114


In [41]:
# Build/extend the syllable dictionary; prompts on stdin for every word the
# API lookup cannot resolve. The returned dict is echoed as the cell output.
make_syll_dict(all_lines)

bones : ['when', 'that', 'churl', 'death', 'my', 'bones', 'with', 'dust', 'shall', 'cover']
1
re-survey : ['and', 'shalt', 'by', 'fortune', 'once', 'more', 're-survey']
3
bett'ring : ['compare', 'them', 'with', 'the', "bett'ring", 'of', 'the', 'time']
2
outstripped : ['and', 'though', 'they', 'be', 'outstripped', 'by', 'every', 'pen']
2
exceeded : ['exceeded', 'by', 'the', 'height', 'of', 'happier', 'men']
3
'had : ["'had", 'my', "friend's", 'muse', 'grown', 'with', 'this', 'growing', 'age']
1
friend's : ["'had", 'my', "friend's", 'muse', 'grown', 'with', 'this', 'growing', 'age']
1
dearer : ['a', 'dearer', 'birth', 'than', 'this', 'his', 'love', 'had', 'brought']
2
brought : ['a', 'dearer', 'birth', 'than', 'this', 'his', 'love', 'had', 'brought']
1
ranks : ['to', 'march', 'in', 'ranks', 'of', 'better', 'equipage']
1
died : ['but', 'since', 'he', 'died', 'and', 'poets', 'better', 'prove']
1
poets : ['but', 'since', 'he', 'died', 'and', 'poets', 'better', 'prove']
2
i'll : ['theirs', '

{'fawn': 1,
 'pardon': 2,
 'yellow': 2,
 'four': 1,
 'hath': 1,
 'sleep': 1,
 "friend's": 1,
 'hanging': 2,
 'mansion': 2,
 'appetite': 3,
 'evermore': 3,
 'hate': 1,
 'forget': 2,
 'whose': 1,
 'feeding': 2,
 'vile': 1,
 'granting': 2,
 'sweetest': 2,
 'presents': 2,
 'walks': 1,
 "there's": 1,
 'whatsoever': 4,
 'under': 2,
 'lord': 1,
 'sorry': 2,
 'pride': 1,
 'sway': 1,
 'worth': 1,
 'wondrous': 2,
 'tune': 1,
 'discased': 2,
 'dispense': 2,
 'hadst': 1,
 'inhearse': 2,
 "women's": 2,
 'shoot': 1,
 'every': 3,
 'foul': 1,
 'nourished': 2,
 "o'er-read": 2,
 'special': 2,
 'believed': 2,
 'uttering': 3,
 'prize': 1,
 'unrest': 2,
 'graced': 1,
 'succession': 3,
 'graces': 1,
 'triumph': 2,
 'enjoy': 2,
 'charter': 2,
 'force': 1,
 'tired': 1,
 'awake': 2,
 'razed': 1,
 'out-going': 3,
 'assure': 2,
 'tires': 1,
 'crave': 1,
 'persuade': 2,
 'quill': 1,
 'even': 2,
 'beated': 2,
 'captain': 2,
 'hide': 1,
 "ne'er": 1,
 'solemn': 2,
 'thunder': 2,
 'fingers': 2,
 'liberty': 3,
 'child

In [43]:
# Reload the saved syllable dictionary and report how many words it holds.
# The with-block closes the file even if unpickling fails.
with open(SYLL_DICT, 'rb') as save_file:
    syll_dict = pickle.load(save_file)
print(len(syll_dict))

3190


In [34]:
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(syll_dict)

{'test': 1}
