In [57]:
import pronouncing as p
import nltk as n
from typing import List

In [58]:
# Sanity check: look up the phoneme string for one word (vowels carry a stress digit).
p.phones_for_word("snappiest")

['S N AE1 P IH0 EH2 S T']

In [59]:
# One-time setup: uncomment to fetch the NLTK data (the Gutenberg corpus is used below).
# n.download()

In [60]:
from nltk.corpus import gutenberg

In [61]:
# Peek at the tokenized Gutenberg corpus (the repr is truncated).
gutenberg.words()

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [62]:
# Total number of tokens in the corpus, punctuation tokens included.
len(gutenberg.words())

2621613

In [63]:
# First ten tokens, to see what the raw stream looks like.
gutenberg.words()[0:10]

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']

In [64]:
# stress_string = "0101010101"
# markov model

# need a function to generate possible next words
def get_next_word(current_word, model):
    """
    Ask the Markov model which words may follow ``current_word``.

    Thin wrapper: the lookup is delegated to ``model.get_next_word`` and its
    result is returned unchanged.

        :type current_word: str
        :type model: markov_model
        :rtype: List[str]
    """
    return model.get_next_word(current_word)

In [65]:
# Widen the notebook's main container to the full browser width so long
# outputs are easier to read.
# https://stackoverflow.com/questions/21971449/how-do-i-increase-the-cell-width-of-the-jupyter-ipython-notebook-in-my-browser
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [66]:
def make_one_zero_str(length: int, start_stress: int) -> str:
    """
    Build a string of alternating "0"/"1" digits.

    :param length: number of digits to produce
    :param start_stress: parity seed; the first digit is ``start_stress % 2``
    :return: the alternating digits, e.g. ``make_one_zero_str(4, 1) == "1010"``
    """
    digits = (
        str(position % 2)
        for position in range(start_stress, start_stress + length)
    )
    return "".join(digits)

def get_possible_stresses(current_stresses: str, line_syllables: int = 10) -> List[str]:
    """
    List the stress patterns the next word may have on an iambic line.

    The line alternates unstressed ("0") and stressed ("1") syllables,
    beginning with "0".  The next word may fill anywhere from one syllable up
    to every syllable still open on the line, so one candidate pattern is
    returned per possible length, shortest first.

    can assume that current_stresses is 0[1,2] repeated
    eg: current_stresses = "010101" -> ["0", "01", "010", "0101"]

    :param current_stresses: stresses already placed on the line
    :param line_syllables: total syllables per line (10 = iambic pentameter)
    :return: candidate patterns; empty when no syllables remain
    """
    syllables = len(current_stresses)
    remaining_stresses = line_syllables - syllables
    # The next syllable's stress follows from its position in the line: even
    # positions are unstressed, odd ones stressed.  Using the position rather
    # than the parity of what remains stays correct for odd line lengths.
    start_stress = syllables % 2
    return [
        make_one_zero_str(length, start_stress)
        for length in range(1, remaining_stresses + 1)
    ]
    
# Example: six syllables are placed, so up to four remain and the next one is unstressed.
get_possible_stresses("010101")
    

['0', '01', '010', '0101']

In [67]:
def word_matches_stress(word: str, stress_pattern_match: str) -> bool:
    '''
    Tell whether any pronunciation of ``word`` has the given stress pattern.

    eg: stress_pattern_match = "010"

    :param word: word to look up in the pronouncing dictionary
    :param stress_pattern_match: target pattern made only of "0" and "1"
    :return: True if at least one pronunciation matches; False otherwise,
        including when the word has no known pronunciation
    '''
    # word can have more than 1 pronunciation. eg: lead of a pencil, someone lead someone
    for pronunciation in p.phones_for_word(word):
        # we consider both 1 and 2 as a stressed syllable
        # our generated pattern match is only ever 1s and 0s
        stress_pattern = p.stresses(pronunciation).replace("2", "1")

        # in case 1 pronunciation matches but the other one doesn't
        if stress_pattern == stress_pattern_match:
            return True

    # Give up only after every pronunciation has been tried; returning from
    # inside the loop would reject words whose first pronunciation differs.
    return False
        
# Spot checks on known words; each assert looks the word up in the pronouncing dictionary.
assert word_matches_stress("adam", "10")
assert word_matches_stress("janice", "10")
assert word_matches_stress("compare", "01")
assert word_matches_stress("snappiest", "101") # snappiest's stresses are "102", so this should still match since we consider 2 as a stressed syllable

In [94]:
def filter_possible_words(possible_stresses: List[str], words: List[str]) -> List[str]:
    """
    Keep the words whose stress pattern is one of ``possible_stresses``.

    Results are grouped by pattern, in the order the patterns are given;
    within a pattern the words keep their input order.  Duplicates in
    ``words`` are preserved.

        :type possible_stresses: List[str]
        :type words: List[str]
        :rtype: List[str]
    """
    return [
        candidate
        for pattern in possible_stresses
        for candidate in words
        if word_matches_stress(candidate, pattern)
    ]

In [72]:
# Keep only purely alphabetic tokens, then lower-case them.
words = gutenberg.words()
filtered = [token for token in words if token.isalpha()]
filtered_lower = [token.lower() for token in filtered]
filtered_lower[10:30]

['handsome',
 'clever',
 'and',
 'rich',
 'with',
 'a',
 'comfortable',
 'home',
 'and',
 'happy',
 'disposition',
 'seemed',
 'to',
 'unite',
 'some',
 'of',
 'the',
 'best',
 'blessings',
 'of']

In [92]:
def get_possible_words(current_line:List[str], current_stress_str: str, ngram_dict) -> List[str]:
    """
    Suggest words that can follow the current line and still fit the meter.

    Candidates are the successors recorded in ``ngram_dict`` for the last word
    of ``current_line``; they are then narrowed to those whose stress pattern
    is one of the patterns still possible on the line.

    :param current_line: words chosen so far; must be non-empty because its
        last element is used as the lookup key
    :param current_stress_str: stress digits already used on the line
    :param ngram_dict: mapping of word -> list of words seen after it
    :return: the filtered successors; no candidates are considered when the
        last word has no entry in ``ngram_dict``
    """
    allowed_patterns = get_possible_stresses(current_stress_str)
    successors = ngram_dict.get(current_line[-1], [])
    return filter_possible_words(allowed_patterns, successors)

In [73]:
from collections import defaultdict
# Successor table: each word maps to the list of words that immediately follow
# it in the corpus.  Repeats are kept, so a frequent successor appears several
# times in its list.
ngram_dict = defaultdict(list)
# Pair every token with the token right after it.
ngrams = zip(filtered_lower, filtered_lower[1:])
for w1, w2 in ngrams:
    ngram_dict[w1].append(w2)

In [97]:
# "in" has two pronunciations that differ only in stress, so a stress check has to look at every pronunciation.
p.phones_for_word("in")

['IH0 N', 'IH1 N']

In [100]:
# Try the suggestion step on a sample line.
# NOTE(review): the [-20:-1] slice leaves out the final element; use [-20:] if the last 20 are wanted.
# NOTE(review): "01" covers two syllables, which looks shorter than the sample line — confirm.
get_possible_words(['shall', 'i', "apologize"], "01", ngram_dict)[-20:-1]

[]

In [77]:
# from collections import Counter
# Counter(ngram_dict['emma']).most_common()