In [415]:
import matplotlib.pyplot as plt
import numpy as np
import re
from nltk.corpus import cmudict
import csv
import os
import string

In [136]:
# Seed the random number generator:
np.random.seed(2019)
# Punctuation marks that find_word compares against the last character of a token.
# NOTE(review): the list contains an empty string, which can never equal a single
# character -- possibly a quote mark that was lost in editing; confirm.
PUNC = [',', ':', '.', '?', '!', ';', '', '(', ')']
# Module-level dict written to by every load_data_dic* function; it pairs the
# lower-cased raw token with its punctuation-stripped form.
REP = {}
# NOTE(review): neither constant is referenced by the functions visible in this
# notebook, which hard-code 14 instead -- presumably lines per input/output poem.
INPOEM_LINES = 14
OUTPOEM_LINES = 14

def roman_int(rom):
    """
    Convert a roman numeral to an integer.

    Inputs:
        rom: the numeral as a string of upper-case symbols (I, V, X, L, C, D, M).

    Outputs:
        The value as an int ('' gives 0). A symbol is subtracted when it is
        strictly smaller than the symbol that follows it (the I in IV) and
        added otherwise, so repeated symbols such as II or XXX accumulate.

    Raises:
        KeyError if rom contains a character that is not a roman symbol.
    """
    r_dict = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    arab = 0
    for i, c in enumerate(rom):
        # `>=` rather than `>`: an equal neighbour (the first I of II) must add.
        if i == len(rom) - 1 or r_dict[c] >= r_dict[rom[i + 1]]:
            arab += r_dict[c]
        else:
            arab -= r_dict[c]
    return arab

In [137]:
def load_data_dic(filename, roman=False):
    """
    SIMPLE loader: encodes every word of a sonnet file as an integer id.

    The first non-blank line of the file is dropped; the remaining non-blank
    lines are consumed in groups of 15, where 14 lines are encoded and the
    15th is skipped (presumably the heading of the next sonnet -- this assumes
    every sonnet has exactly 14 lines).

    Side effect: writes REP[stripped_word] = raw_lowercased_token into the
    module-level REP dict for every token read.

    Inputs:
        filename: path of the text file, given as a string.
        roman: not used by this loader.

    Outputs:
        obs: list of encoded lines, each a list of integer word ids.
        obs_map: dict mapping each stripped, lower-cased word to its id.
    """
    # Context manager so the handle is closed as soon as the text is in memory.
    with open(filename, 'r') as f:
        text = f.read()

    lines = [line.split() for line in text.split('\n') if line.split()]
    # Drop the first line; a slice delete also tolerates an empty file.
    del lines[:1]

    obs_counter = 0
    obs = []
    obs_map = {}
    line_counter = 0

    for line in lines:
        if line_counter < 14:
            line_counter += 1
            obs_elem = []
            for word in line:
                temp = word.lower()
                word = re.sub(r'[^\w]', '', word).lower()
                # When nothing was stripped, word == temp, so this single
                # assignment covers both cases.
                REP[word] = temp
                if word not in obs_map:
                    # Add unique words to the observations map.
                    obs_map[word] = obs_counter
                    obs_counter += 1

                # Add the encoded word.
                obs_elem.append(obs_map[word])

            # Add the encoded sequence.
            obs.append(obs_elem)
        else:
            # 15th line of the group: skip it and start counting again.
            line_counter = 0

    return obs, obs_map

In [138]:
# Smoke test of the SIMPLE loader on the Spenser file; the cell output is the (obs, obs_map) tuple.
# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
load_data_dic('/Users/neymikajain/Downloads/project3/data/spenser.txt')

([[0, 1, 2, 3, 4, 5, 6, 7],
  [8, 9, 10, 11, 12, 13, 14, 15],
  [16, 17, 18, 19, 9, 12, 20, 21, 22],
  [23, 24, 25, 26, 27, 28, 29],
  [19, 0, 30, 31, 8, 32, 33, 34],
  [5, 35, 36, 37, 38, 39, 40, 41],
  [19, 42, 27, 43, 44, 10, 45, 46],
  [47, 32, 48, 12, 49, 50, 51],
  [19, 0, 52, 53, 12, 27, 54, 55],
  [44, 56, 57, 58, 59, 60],
  [3, 1, 61, 62, 63, 64, 41],
  [10, 65, 66, 67, 10, 68, 69],
  [2, 30, 19, 52, 70, 71, 40, 72, 73],
  [74, 75, 1, 72, 76, 77, 78, 79, 80],
  [81, 82, 74, 26, 27, 83, 76, 84],
  [44, 85, 86, 87, 44, 10, 88, 89],
  [19, 90, 91, 32, 92, 19, 43, 93],
  [94, 95, 96, 10, 97, 98, 99, 100],
  [101, 102, 26, 103, 104, 44, 27, 105, 106],
  [12, 8, 98, 107, 23, 40, 108, 109],
  [19, 70, 110, 111, 112, 40, 113, 10, 114],
  [19, 115, 40, 116, 117, 118, 32, 67],
  [119, 75, 12, 120, 44, 62, 121, 122],
  [98, 123, 40, 124, 125, 126, 26, 71, 127],
  [19, 32, 128, 129, 19, 130, 131],
  [132, 78, 133, 19, 134, 78, 135, 136],
  [8, 75, 58, 137, 96, 138, 19, 10, 139, 140],
  [7

In [139]:
def read_syllable(filename='/Users/neymikajain/Downloads/project3/data/Syllable_dictionary.txt'):
    """
    Read the syllable dictionary into a dict of word -> syllable-count options.

    Every non-blank line is expected to look like `word count [count ...]`.
    A count may carry an 'E' prefix (e.g. E1); the prefix is dropped and the
    number is kept as one more option, so the returned lists do not record
    which counts were prefixed.

    Inputs:
        filename: path of the dictionary file. Defaults to the location this
            notebook was written against.

    Outputs:
        dict mapping each word (non-word characters removed, lower-cased) to
        the list of its syllable counts as ints, in file order.

    Raises:
        ValueError if a count is not an integer once the 'E' prefix is removed.
    """
    # Context manager so the handle is closed as soon as the text is in memory.
    with open(filename, 'r') as f:
        rows = [row.split() for row in f.read().split('\n') if row.split()]

    syl_dict = {}
    for row in rows:
        word = re.sub(r'[^\w]', '', row[0]).lower()
        # tok[1:] (not tok[1]) so a multi-digit prefixed count is read whole.
        syl_dict[word] = [
            int(tok[1:]) if tok.startswith('E') else int(tok)
            for tok in row[1:]
        ]
    return syl_dict

In [148]:
# Scratch check: first CMU pronunciation of 'quiet'. The phonemes ending in a
# digit (AY1, AH0 in the output) are the ones find_word counts as syllables.
cmudict.dict()['quiet'][0]

['K', 'W', 'AY1', 'AH0', 'T']

In [161]:
def syllable_count(word):
    """
    Estimate the number of syllables in a word by counting vowel groups.

    A syllable is counted wherever a vowel (a, e, i, o, u, y) starts the word
    or follows a non-vowel. One syllable is then discounted when the word ends
    in 'e', and any non-empty word is reported as at least one syllable.

    Inputs:
        word: the word as a string; it is lower-cased before counting.

    Outputs:
        The estimated syllable count as an int (0 for an empty string).
    """
    word = word.lower()
    if not word:
        return 0

    vowels = "aeiouy"
    count = 1 if word[0] in vowels else 0
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    # Applied once, after the scan. Inside the loop it would cancel every
    # vowel group of a word ending in 'e'.
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

In [163]:
# Scratch check of the vowel-group heuristic on a single word.
syllable_count("succour")

2

In [177]:
def find_word(word):
    """
    Count the syllables of a raw token using cmudict, with a heuristic fallback.

    Characters listed in PUNC are removed from both ends of the token, then
    the token is split on apostrophes and hyphens. Each non-empty part is
    looked up in cmudict: a known part contributes the number of phonemes in
    its first pronunciation that end in a digit; an unknown part is printed
    and estimated with syllable_count.

    Inputs:
        word: the token as a string (callers in this notebook pass it lower-cased).

    Outputs:
        The total syllable count over all parts, as an int.
    """
    # cmudict.dict() builds the whole dictionary on every call, so build it
    # once and keep it on the function object for later calls.
    cmu_dict = getattr(find_word, '_cmu_cache', None)
    if cmu_dict is None:
        cmu_dict = find_word._cmu_cache = cmudict.dict()

    # Strip punctuation before splitting so it cannot survive inside a part
    # (otherwise "renew'th." yields the part "th.", which can never be found).
    while word and word[-1] in PUNC:
        word = word[:-1]
    while word and word[0] in PUNC:
        word = word[1:]
    words = re.split(r"['-]", word)

    syllable = 0
    for part in words:
        if part == '':
            continue
        if part in cmu_dict:
            # Vowel phonemes end in a stress digit (e.g. AY1), one per syllable.
            syllable += sum(1 for phoneme in cmu_dict[part][0] if phoneme[-1].isdigit())
        else:
            # Report words missing from cmudict, then fall back to the heuristic.
            print(part, words)
            syllable += syllable_count(part)

    return syllable

In [171]:
# Scratch check of one entry of the syllable dictionary (keys have non-word characters removed).
read_syllable()["selfsubstantial"]

[4]

In [178]:
def load_data_dic2(filename, roman=False):
    """
    SYLLABLE loader: word ids plus the syllable-count options of every word.

    The first non-blank line of the file is dropped; the remaining non-blank
    lines are consumed in groups of 15, where 14 lines are encoded and the
    15th is skipped (presumably the heading of the next sonnet -- this assumes
    every sonnet has exactly 14 lines).

    Side effect: writes REP[raw_lowercased_token] = stripped_word into the
    module-level REP dict for every token read.

    Inputs:
        filename: path of the text file, given as a string.
        roman: not used by this loader.

    Outputs:
        obs: list of encoded lines, each a list of integer word ids.
        obs_map: dict mapping each stripped, lower-cased word to its id.
        syllables: dict mapping each word id to a list of syllable counts,
            taken from read_syllable() when the word is listed there and
            otherwise a one-element list produced by find_word.
    """
    # Context manager so the handle is closed as soon as the text is in memory.
    with open(filename, 'r') as f:
        text = f.read()

    lines = [line.split() for line in text.split('\n') if line.split()]
    # Drop the first line; a slice delete also tolerates an empty file.
    del lines[:1]

    obs_counter = 0
    obs = []
    obs_map = {}
    syllables = {}
    syl_dict = read_syllable()
    line_counter = 0

    for line in lines:
        if line_counter < 14:
            line_counter += 1
            obs_elem = []
            for word in line:
                temp = word.lower()
                word = re.sub(r'[^\w]', '', word).lower()
                # When nothing was stripped, word == temp, so this single
                # assignment covers both cases.
                REP[temp] = word
                if word not in obs_map:
                    # Add unique words to the observations map.
                    obs_map[word] = obs_counter
                    if word in syl_dict:
                        syllables[obs_counter] = syl_dict[word]
                    else:
                        # Not in the syllable dictionary: estimate from the raw token.
                        syllables[obs_counter] = [find_word(temp)]
                    obs_counter += 1

                # Add the encoded word.
                obs_elem.append(obs_map[word])

            # Add the encoded sequence.
            obs.append(obs_elem)
        else:
            # 15th line of the group: skip it and start counting again.
            line_counter = 0

    return obs, obs_map, syllables

In [180]:
# Smoke test of the SYLLABLE loader on the Shakespeare file; the cell output is the returned tuple.
# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
load_data_dic2('/Users/neymikajain/Downloads/project3/data/shakespeare.txt')

([[0, 1, 2, 3, 4, 5],
  [6, 7, 8, 9, 10, 11, 12],
  [13, 14, 15, 16, 17, 18, 19, 20],
  [21, 22, 23, 10, 24, 21, 25],
  [13, 26, 27, 28, 29, 30, 31, 32],
  [33, 34, 35, 36, 37, 38, 39],
  [40, 41, 42, 43, 44, 45],
  [34, 46, 34, 47, 28, 34, 48, 46, 49, 50],
  [26, 6, 51, 52, 15, 53, 54, 55],
  [56, 57, 58, 28, 15, 59, 60],
  [61, 29, 30, 62, 63, 34, 64],
  [56, 22, 65, 66, 67, 68, 69],
  [70, 15, 71, 72, 73, 74, 75, 76],
  [28, 77, 15, 53, 78, 18, 15, 79, 56, 80],
  [81, 82, 83, 84, 85, 34, 86],
  [56, 87, 88, 89, 68, 34, 8, 90],
  [34, 91, 92, 93, 94, 95, 96, 52],
  [97, 76, 41, 98, 99, 100, 101, 102, 103],
  [104, 105, 106, 43, 107, 34, 108, 45],
  [43, 107, 15, 109, 100, 34, 110, 111],
  [28, 112, 61, 29, 30, 88, 113, 32],
  [114, 115, 116, 117, 56, 118, 119],
  [120, 121, 122, 119, 123, 34, 8, 124],
  [125, 26, 126, 127, 74, 128, 129, 100, 130],
  [84, 131, 132, 133, 56, 134, 132, 135, 136],
  [137, 21, 108, 18, 138, 29],
  [74, 114, 28, 76, 139, 140, 81, 26, 51, 135],
  [56, 141, 

In [349]:
def partial_sum(combo, rem):
    """
    Pick one syllable count per word so that the picks add up to rem.

    Inputs:
        combo: list with one entry per word, each entry a list of the
            syllable counts that word may take.
        rem: the total the chosen counts must reach.

    Outputs:
        A sequence holding one chosen count per entry of combo:
          * [rem] when combo has a single entry (its options are not checked);
          * otherwise the first row of the candidate grid whose sum is rem,
            as a numpy array;
          * [] after printing a warning when combo is empty or no row matches.

    Note: the grid holds every combination of options, so its size is the
    product of the option counts of all words.
    """
    num_var = len(combo)

    if num_var == 1:
        return [rem]

    if num_var >= 2:
        # Unpacking combo handles any number of words. The transpose/reshape
        # gives one candidate per row, in the same order for every count.
        c = np.array(np.meshgrid(*combo)).T.reshape(-1, num_var)
        for arr in c:
            if np.sum(arr) == rem:
                return arr

    print("SHOULD NOT PRINT")
    return []

In [350]:
def sum_method(line, obs_map, syllables):
    """
    Assign every word of a line the parity of the syllable offset it starts at.

    The line is assumed to hold 10 syllables. Words at the start of the line
    that have a single syllable option are placed directly. From the first
    word with several options onwards, all words are resolved together by
    partial_sum so that the line total is 10.

    Inputs:
        line: list of raw tokens of one line.
        obs_map: dict mapping stripped, lower-cased words to word ids.
        syllables: dict mapping word ids to lists of syllable-count options.

    Outputs:
        List with one entry per token: the number of syllables preceding the
        token, modulo 2 (0 for the first token).

    If no assignment reaches exactly 10 syllables, the first option of each
    unresolved word is used, so the result always has one entry per token.
    """
    rem = 10            # syllables still unassigned
    combo = []          # option lists of the words resolved jointly
    stressed = []
    in_prefix = True    # no multi-option word has been seen yet
    prev_check = 0      # syllables consumed so far

    for token in line:
        temp = re.sub(r'[^\w]', '', token).lower()
        check = syllables[obs_map[temp]]

        if in_prefix and len(check) == 1:
            stressed.append(prev_check % 2)
            prev_check += check[0]
            rem -= check[0]
        else:
            # Once one word is ambiguous, every later word is resolved with it.
            combo.append(check)
            in_prefix = False

    if combo:
        new_syllables = partial_sum(combo, rem) if rem > 0 else []
        if len(new_syllables) != len(combo):
            # No exact fit: fall back to the first option of each word so the
            # caller can still index one entry per token.
            new_syllables = [options[0] for options in combo]

        for count in new_syllables:
            stressed.append(prev_check % 2)
            prev_check += count

    return stressed

In [373]:
def load_data_dic3(filename, roman=False):
    """
    STRESSED loader: word ids, syllable options and stress options.
    DOES NOT WORK WITH SPENSER

    The first non-blank line of the file is dropped; the remaining non-blank
    lines are consumed in groups of 15, where 14 lines are encoded and the
    15th is skipped (presumably the heading of the next sonnet -- this assumes
    every sonnet has exactly 14 lines).

    Side effect: writes REP[raw_lowercased_token] = stripped_word into the
    module-level REP dict for every token read.

    Inputs:
        filename: path of the text file, given as a string.
        roman: not used by this loader.

    Outputs:
        obs: list of encoded lines, each a list of integer word ids.
        obs_map: dict mapping each stripped, lower-cased word to its id.
        syllables: dict mapping each word id to a list of syllable counts.
        stressed: dict mapping each word id to the distinct values that
            sum_method assigned to the word over all lines.

    Raises:
        IndexError if sum_method returns fewer entries than the line has words.
    """
    # Context manager so the handle is closed as soon as the text is in memory.
    with open(filename, 'r') as f:
        text = f.read()

    lines = [line.split() for line in text.split('\n') if line.split()]
    # Drop the first line; a slice delete also tolerates an empty file.
    del lines[:1]

    obs_counter = 0
    obs = []
    obs_map = {}
    syllables = {}
    stressed = {}
    syl_dict = read_syllable()
    line_counter = 0

    for line in lines:
        if line_counter < 14:
            line_counter += 1
            obs_elem = []
            for word in line:
                temp = word.lower()
                word = re.sub(r'[^\w]', '', word).lower()
                # When nothing was stripped, word == temp, so this single
                # assignment covers both cases.
                REP[temp] = word
                if word not in obs_map:
                    # Add unique words to the observations map.
                    obs_map[word] = obs_counter
                    if word in syl_dict:
                        syllables[obs_counter] = syl_dict[word]
                    else:
                        # Not in the syllable dictionary: estimate from the raw token.
                        syllables[obs_counter] = [find_word(temp)]
                    obs_counter += 1

                # Add the encoded word.
                obs_elem.append(obs_map[word])

            line_stress = sum_method(line, obs_map, syllables)
            # obs_elem already holds the id of every word of the line, in
            # order, so the words need not be normalised a second time.
            for i, word_id in enumerate(obs_elem):
                stress = line_stress[i]
                options = stressed.setdefault(word_id, [])
                if stress not in options:
                    options.append(stress)

            # Add the encoded sequence.
            obs.append(obs_elem)
        else:
            # 15th line of the group: skip it and start counting again.
            line_counter = 0

    return obs, obs_map, syllables, stressed

In [374]:
# Scratch check of the array layout np.meshgrid returns for three inputs (partial_sum relies on it).
np.meshgrid([1, 2, 3], [4, 5], [6, 7])

[array([[[1, 1],
         [2, 2],
         [3, 3]],
 
        [[1, 1],
         [2, 2],
         [3, 3]]]), array([[[4, 4],
         [4, 4],
         [4, 4]],
 
        [[5, 5],
         [5, 5],
         [5, 5]]]), array([[[6, 7],
         [6, 7],
         [6, 7]],
 
        [[6, 7],
         [6, 7],
         [6, 7]]])]

In [375]:
# Load the Shakespeare file with syllable and stress options.
# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
obs, obs_map, syllables, stressed = load_data_dic3('/Users/neymikajain/Downloads/project3/data/shakespeare.txt')

In [385]:
def load_data_dic4(filename, roman=False):
    """
    RHYME loader: word ids plus, for every word, the ids of words it rhymes with.

    The first non-blank line of the file is dropped; the remaining non-blank
    lines are consumed in groups of 15, where 14 lines are encoded and the
    15th is skipped (presumably the heading of the next sonnet -- this assumes
    every sonnet has exactly 14 lines).

    Rhymes are taken from the last words of lines within each group of 14:
      * always: lines 1-3, 2-4, 5-7, 6-8, 9-11, 10-12 and 13-14;
      * with roman=True also, at lines 4 and 8, every remaining pair within
        lines {2, 4, 5, 7} and {6, 8, 9, 11}.

    Side effect: writes REP[stripped_word] = raw_lowercased_token into the
    module-level REP dict for every token read.

    Inputs:
        filename: path of the text file, given as a string.
        roman: enables the extra links at lines 4 and 8 described above.

    Outputs:
        obs: list of encoded lines, each a list of integer word ids.
        obs_map: dict mapping each stripped, lower-cased word to its id.
        rhyme: dict mapping each word id to a list of rhyming word ids;
            words for which no rhyme was recorded keep the placeholder [-1].

    Raises:
        IndexError if the file ends before the partner line of a rhyme.
    """
    # Context manager so the handle is closed as soon as the text is in memory.
    with open(filename, 'r') as f:
        text = f.read()

    lines = [line.split() for line in text.split('\n') if line.split()]
    # Drop the first line; a slice delete also tolerates an empty file.
    del lines[:1]

    obs_counter = 0
    obs = []
    obs_map = {}
    line_counter = 0
    rhyme = {}

    for line in lines:
        if line_counter < 14:
            line_counter += 1
            obs_elem = []
            for word in line:
                temp = word.lower()
                word = re.sub(r'[^\w]', '', word).lower()
                # When nothing was stripped, word == temp, so this single
                # assignment covers both cases.
                REP[word] = temp
                if word not in obs_map:
                    # Add unique words to the observations map.
                    obs_map[word] = obs_counter
                    rhyme[obs_counter] = [-1]
                    obs_counter += 1

                # Add the encoded word.
                obs_elem.append(obs_map[word])

            # Add the encoded sequence.
            obs.append(obs_elem)
        else:
            # 15th line of the group: skip it and start counting again.
            line_counter = 0

    def last_id(index):
        """Return the id of the last word of lines[index]."""
        return obs_map[re.sub(r'[^\w]', '', lines[index][-1]).lower()]

    def link(first, second):
        """Record that two word ids rhyme, in both directions, without duplicates."""
        for src, dst in ((first, second), (second, first)):
            if rhyme[src] == [-1]:
                # Replace the placeholder instead of appending next to it, so
                # -1 never ends up beside real ids.
                rhyme[src] = [dst]
            elif dst not in rhyme[src]:
                rhyme[src].append(dst)

    line_counter = 0
    for line in range(len(lines)):
        if line_counter < 14:
            line_counter += 1
            if line_counter in (1, 2, 5, 6, 9, 10):
                link(last_id(line), last_id(line + 2))
            elif line_counter == 13:
                link(last_id(line), last_id(line + 1))
            elif roman and line_counter in (4, 8):
                current = last_id(line)
                following = last_id(line + 1)
                link(current, following)
                earlier = last_id(line - 2)
                link(following, earlier)
                later = last_id(line + 3)
                link(earlier, later)
                link(current, later)
        else:
            line_counter = 0

    return obs, obs_map, rhyme

In [388]:
# Load the Spenser file with rhyme information.
# NOTE(review): roman is left at its default (False), so the extra `roman` links at
# lines 4 and 8 are not recorded for this file -- confirm that is intended.
obs, obs_map, rhyme = load_data_dic4('/Users/neymikajain/Downloads/project3/data/spenser.txt')

In [399]:
def load_data_dic5(filename, roman=False):
    """
    RHYME + SYLLABLE loader: word ids, syllable options and rhyme partners.

    The first non-blank line of the file is dropped; the remaining non-blank
    lines are consumed in groups of 15, where 14 lines are encoded and the
    15th is skipped (presumably the heading of the next sonnet -- this assumes
    every sonnet has exactly 14 lines).

    Rhymes are taken from the last words of lines within each group of 14:
      * always: lines 1-3, 2-4, 5-7, 6-8, 9-11, 10-12 and 13-14;
      * with roman=True also, at lines 4 and 8, every remaining pair within
        lines {2, 4, 5, 7} and {6, 8, 9, 11}.

    Side effect: writes REP[raw_lowercased_token] = stripped_word into the
    module-level REP dict for every token read.

    Inputs:
        filename: path of the text file, given as a string.
        roman: enables the extra links at lines 4 and 8 described above.

    Outputs:
        obs: list of encoded lines, each a list of integer word ids.
        obs_map: dict mapping each stripped, lower-cased word to its id.
        syllables: dict mapping each word id to a list of syllable counts,
            taken from read_syllable() when the word is listed there and
            otherwise a one-element list produced by find_word.
        rhyme: dict mapping each word id to a list of rhyming word ids;
            words for which no rhyme was recorded keep the placeholder [-1].

    Raises:
        IndexError if the file ends before the partner line of a rhyme.
    """
    # Context manager so the handle is closed as soon as the text is in memory.
    with open(filename, 'r') as f:
        text = f.read()

    lines = [line.split() for line in text.split('\n') if line.split()]
    # Drop the first line; a slice delete also tolerates an empty file.
    del lines[:1]

    obs_counter = 0
    obs = []
    obs_map = {}
    rhyme = {}
    syllables = {}
    syl_dict = read_syllable()
    line_counter = 0

    for line in lines:
        if line_counter < 14:
            line_counter += 1
            obs_elem = []
            for word in line:
                temp = word.lower()
                word = re.sub(r'[^\w]', '', word).lower()
                # When nothing was stripped, word == temp, so this single
                # assignment covers both cases.
                REP[temp] = word
                if word not in obs_map:
                    # Add unique words to the observations map.
                    obs_map[word] = obs_counter
                    rhyme[obs_counter] = [-1]
                    if word in syl_dict:
                        syllables[obs_counter] = syl_dict[word]
                    else:
                        # Not in the syllable dictionary: estimate from the raw token.
                        syllables[obs_counter] = [find_word(temp)]
                    obs_counter += 1

                # Add the encoded word.
                obs_elem.append(obs_map[word])

            # Add the encoded sequence.
            obs.append(obs_elem)
        else:
            # 15th line of the group: skip it and start counting again.
            line_counter = 0

    def last_id(index):
        """Return the id of the last word of lines[index]."""
        return obs_map[re.sub(r'[^\w]', '', lines[index][-1]).lower()]

    def link(first, second):
        """Record that two word ids rhyme, in both directions, without duplicates."""
        for src, dst in ((first, second), (second, first)):
            if rhyme[src] == [-1]:
                # Replace the placeholder instead of appending next to it, so
                # -1 never ends up beside real ids.
                rhyme[src] = [dst]
            elif dst not in rhyme[src]:
                rhyme[src].append(dst)

    line_counter = 0
    for line in range(len(lines)):
        if line_counter < 14:
            line_counter += 1
            if line_counter in (1, 2, 5, 6, 9, 10):
                link(last_id(line), last_id(line + 2))
            elif line_counter == 13:
                link(last_id(line), last_id(line + 1))
            elif roman and line_counter in (4, 8):
                current = last_id(line)
                following = last_id(line + 1)
                link(current, following)
                earlier = last_id(line - 2)
                link(following, earlier)
                later = last_id(line + 3)
                link(earlier, later)
                link(current, later)
        else:
            line_counter = 0

    return obs, obs_map, syllables, rhyme

In [400]:
# Load the Shakespeare file with syllable and rhyme information.
# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
obs, obs_map, syllables, rhyme = load_data_dic5('/Users/neymikajain/Downloads/project3/data/shakespeare.txt')

In [402]:
def load_data_dic6(filename, roman=False):
    """
    EVERYTHING (NO SPENSER) loader: word ids, syllable options, stress options
    and rhyme partners.

    The first non-blank line of the file is dropped; the remaining non-blank
    lines are consumed in groups of 15, where 14 lines are encoded and the
    15th is skipped (presumably the heading of the next sonnet -- this assumes
    every sonnet has exactly 14 lines).

    Rhymes are taken from the last words of lines within each group of 14:
      * always: lines 1-3, 2-4, 5-7, 6-8, 9-11, 10-12 and 13-14;
      * with roman=True also, at lines 4 and 8, every remaining pair within
        lines {2, 4, 5, 7} and {6, 8, 9, 11}.

    Side effect: writes REP[raw_lowercased_token] = stripped_word into the
    module-level REP dict for every token read.

    Inputs:
        filename: path of the text file, given as a string.
        roman: enables the extra links at lines 4 and 8 described above.

    Outputs:
        obs: list of encoded lines, each a list of integer word ids.
        obs_map: dict mapping each stripped, lower-cased word to its id.
        syllables: dict mapping each word id to a list of syllable counts.
        stressed: dict mapping each word id to the distinct values that
            sum_method assigned to the word over all lines.
        rhyme: dict mapping each word id to a list of rhyming word ids;
            words for which no rhyme was recorded keep the placeholder [-1].

    Raises:
        IndexError if sum_method returns fewer entries than a line has words,
        or if the file ends before the partner line of a rhyme.
    """
    # Context manager so the handle is closed as soon as the text is in memory.
    with open(filename, 'r') as f:
        text = f.read()

    lines = [line.split() for line in text.split('\n') if line.split()]
    # Drop the first line; a slice delete also tolerates an empty file.
    del lines[:1]

    obs_counter = 0
    obs = []
    obs_map = {}
    rhyme = {}
    syllables = {}
    stressed = {}
    syl_dict = read_syllable()
    line_counter = 0

    for line in lines:
        if line_counter < 14:
            line_counter += 1
            obs_elem = []
            for word in line:
                temp = word.lower()
                word = re.sub(r'[^\w]', '', word).lower()
                # When nothing was stripped, word == temp, so this single
                # assignment covers both cases.
                REP[temp] = word
                if word not in obs_map:
                    # Add unique words to the observations map.
                    obs_map[word] = obs_counter
                    rhyme[obs_counter] = [-1]
                    if word in syl_dict:
                        syllables[obs_counter] = syl_dict[word]
                    else:
                        # Not in the syllable dictionary: estimate from the raw token.
                        syllables[obs_counter] = [find_word(temp)]
                    obs_counter += 1

                # Add the encoded word.
                obs_elem.append(obs_map[word])

            line_stress = sum_method(line, obs_map, syllables)
            # obs_elem already holds the id of every word of the line, in
            # order, so the words need not be normalised a second time.
            for i, word_id in enumerate(obs_elem):
                stress = line_stress[i]
                options = stressed.setdefault(word_id, [])
                if stress not in options:
                    options.append(stress)

            # Add the encoded sequence.
            obs.append(obs_elem)
        else:
            # 15th line of the group: skip it and start counting again.
            line_counter = 0

    def last_id(index):
        """Return the id of the last word of lines[index]."""
        return obs_map[re.sub(r'[^\w]', '', lines[index][-1]).lower()]

    def link(first, second):
        """Record that two word ids rhyme, in both directions, without duplicates."""
        for src, dst in ((first, second), (second, first)):
            if rhyme[src] == [-1]:
                # Replace the placeholder instead of appending next to it, so
                # -1 never ends up beside real ids.
                rhyme[src] = [dst]
            elif dst not in rhyme[src]:
                rhyme[src].append(dst)

    line_counter = 0
    for line in range(len(lines)):
        if line_counter < 14:
            line_counter += 1
            if line_counter in (1, 2, 5, 6, 9, 10):
                link(last_id(line), last_id(line + 2))
            elif line_counter == 13:
                link(last_id(line), last_id(line + 1))
            elif roman and line_counter in (4, 8):
                current = last_id(line)
                following = last_id(line + 1)
                link(current, following)
                earlier = last_id(line - 2)
                link(following, earlier)
                later = last_id(line + 3)
                link(earlier, later)
                link(current, later)
        else:
            line_counter = 0

    return obs, obs_map, syllables, stressed, rhyme

In [404]:
# Load the Shakespeare file with every annotation; the CSV cell below reads these five names.
# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
obs, obs_map, syllables, stressed, rhyme = load_data_dic6('/Users/neymikajain/Downloads/project3/data/shakespeare.txt')

In [421]:
# Dump the per-word tables to a CSV file: one header row, then one row per word.
col = ['Word','Index', 'Syllable Options', 'Stress Options', 'Rhyme Options']
csv_file = "test_shakespeare.csv"
row_format = "%s,%s,%s,%s,%s\n"
try:
    with open(csv_file, 'w') as f:
        f.write(row_format % tuple(col))
        for key, index in obs_map.items():
            # Render each option list without brackets or commas so it fits in one CSV field.
            syl, stress, rhy = (
                str(table[index]).replace(",", " ").replace("[", "").replace("]", "")
                for table in (syllables, stressed, rhyme)
            )
            f.write(row_format % (key, index, syl, stress, rhy))
except IOError:
    print("I/O error")

In [422]:
# THIS TAKES A WHILE TO RUN
# NOTE(review): roman is left at its default (False), so the extra `roman` links at
# lines 4 and 8 are not recorded for this Spenser file -- confirm that is intended.
# The text printed below comes from find_word, which prints every word part it cannot find in cmudict.
obss, obs_maps, syllabless, rhymes = load_data_dic5('/Users/neymikajain/Downloads/project3/data/spenser.txt')

unquiet ['unquiet']
sithens ['sithens']
woxen ['woxen']
lurkest ['lurkest']
succour ['succour']
humbless ['humbless']
entreat ['entreat']
baseness ['baseness']
ravished ['ravished']
endite ['endite']
dumpish ['dumpish']
cheerless ['cheerless']
dight ['dight']
wrongest ['wrongest']
sdeigne ['sdeigne']
portliness ['portliness']
boldened ['boldened']
lusts ['lusts']
durefull ['durefull']
dints ['dints']
mazed ['mazed']
ensample ['ensample']
persever ['persever']
mought ['mought']
likest ['likest']
unrighteous ['unrighteous']
makest ['makest']
lordeth ['lordeth']
freewill ['freewill']
scorning ['scorning']
tyraness ['tyraness']
comptroll ['comptroll']
th. ['renew', 'th.']
persueth ['persueth']
unpityed ['unpityed']
assoyle ['assoyle']
surcease ['surcease']
guileful ['guileful']
eyen ['eyen']
captiving ['captiving']
graceth ['graceth']
embaseth ['embaseth']
descry ['descry']
remembreth ['remembreth']
lothsome ['lothsome']
drossy ['drossy']
lowliness ['lowliness']
abondon ['abondon', 'd']
ne

In [423]:
# Dump the Spenser per-word tables to a CSV file: one header row, then one row per word.
# The rows come from the Spenser results (obs_maps, syllabless, rhymes), not from the
# Shakespeare variables obs_map / syllables / stressed / rhyme.
col = ['Word','Index', 'Syllable Options', 'Stress Options', 'Rhyme Options']
csv_file = "test_spenser.csv"
try:
    with open(csv_file, 'w') as f:
        f.write("%s,%s,%s,%s,%s\n"%(col[0], col[1], col[2], col[3], col[4]))
        for key in obs_maps.keys():
            index = obs_maps[key]
            syl = str(syllabless[index]).replace(",", " ").replace("[", "").replace("]", "")
            # load_data_dic5 returns no stress table, so the column stays empty
            # and the file keeps the same five columns as the Shakespeare CSV.
            stress = ""
            rhy = str(rhymes[index]).replace(",", " ").replace("[", "").replace("]", "")
            f.write("%s,%s,%s,%s,%s\n"%(key, index, syl, stress, rhy))
except IOError:
    print("I/O error")

In [424]:
# Repeats the earlier load_data_dic6 call on the Shakespeare file, rebinding the same five names.
obs, obs_map, syllables, stressed, rhyme = load_data_dic6('/Users/neymikajain/Downloads/project3/data/shakespeare.txt')

In [427]:
# Scratch check: appending ints to an empty array yields a flat float array (see the output).
np.append(np.array([]), [1, 2])

array([1., 2.])

In [433]:
def load_data(filename='/Users/neymikajain/Downloads/project3/data/shakespeare.txt'):
    '''
    NUMPY FORMAT (IDK WHAT ELSE YOU GUYS WANT)

    Load a sonnet file with load_data_dic6 and flatten the per-word tables
    into a single 1-D numpy object array.

    Inputs:
        filename: path of the text file. Defaults to the Shakespeare file this
            notebook was written against.

    Outputs:
        1-D object array holding four consecutive entries per word, in
        obs_map order: the word id (int), then numpy arrays of its syllable
        options, stress options and rhyme partners.
    '''
    obs, obs_map, syllables, stressed, rhyme = load_data_dic6(filename)

    # Collect everything in a list first: np.append copies the whole array on
    # every call, and np.array() on a ragged list raises on current NumPy.
    fields = []
    for count in obs_map.values():
        fields.extend([
            count,
            np.array(syllables[count]),
            np.array(stressed[count]),
            np.array(rhyme[count]),
        ])

    ag = np.empty(len(fields), dtype=object)
    # Element-wise assignment stores each array as one object instead of
    # letting numpy try to broadcast it.
    for i, field in enumerate(fields):
        ag[i] = field

    return ag
    