In [1]:
with open('cmuDict.txt', 'r', encoding='utf-8') as f:
    rawDict = f.read()

cmu = rawDict.split('\n')[56:]
print(cmu[0])

!EXCLAMATION-POINT  EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T


In [2]:
vowels = [    # General American English specifically
    'AA',     # balm, bot
    'AE',     # bat
    'AH',     # butt
    'AO',     # stOry      this gave me a heart attack when Wikipedia gave another example as 'cAUGHt' which is not at all the same sound to me
    'AW',     # bout
    'AX',     # commA (schwa)
    'AY',     # bite
    'EH',     # bet
    'ER',     # bIRd, forewORd
    'EY',     # bait
    'IH',     # bit
    'IX',     # rosEs, rabbIt
    'IY',     # beat
    'OW',     # boat
    'OY',     # boy
    'UH',     # book
    'UW'      # boot
]
# source: https://en.wikipedia.org/wiki/ARPABET
consonants = [
    'B',      # buy
    'CH',     # China
    'D',      # die
    'DH',     # thy
    'DX',     # buTTer
    'EL',     # bottLE
    'EM',     # rhythM
    'EN',     # buttON
    'F',      # fight
    'G',      # guy
    'HH',     # High
    'JH',     # jive
    'K',      # kite
    'L',      # lie
    'M',      # my
    'N',      # nigh
    'NG',     # siNG
    'P',      # pie
    'Q',      # uh-oh (glottal stop)
    'R',      # rye
    'S',      # sigh
    'SH',     # shy
    'T',      # tie
    'TH',     # thigh
    'V',      # vie
    'W',      # wise
    'WH',     # why (for fancy people)
    'Y',      # yacht
    'Z',      # zoo
    'ZH'      # pleaSure
]

In [3]:
class coolWord:
    def __init__(self, spelling, pronounciation):
        symbols = "~!@#$%^&*()-_=+[]{}\\|;:\'\",<.>/?1234567890"
        self.word = spelling.strip(symbols).lower()
        ps = pronounciation.split()
        ps2 = [p.strip(symbols) for p in ps]
        self.p = ' '.join(ps2)        
        self.x = False
        self.xx = False
        self.xIndex = 0
        if 'x' in self.word:
            self.x = True
            if 'xx' in self.word:
                self.xx = True
                self.s = self.word.replace('xx', 'ks')
            else:
                self.xIndex = self.word.index('x')
                if self.xIndex == 0:
                    self.s = self.word.replace('x', 'z')
                elif self.xIndex == len(self.word) - 1 and ps2[-1] == 'OW':
                    self.s = self.word.replace('x', '')
                elif self.word[self.xIndex - 1] in 'aeiou' and self.xIndex != len(self.word) - 1 and self.word[self.xIndex + 1] in 'aeiou':
                    self.s = self.word.replace('x', 'gz')
                else:
                    self.s = self.word.replace('x', 'ks')
        else:
            self.s = self.word
    def __str__(self):
        return self.word
    def __repr__(self):
        return self.word
    def atomize(self, v, c):
        current_letters = []
        current_sounds = []
        final = {}
        vows = []
        cons = []
        letter_counter = 0
        while letter_counter < len(self.s):
            current_letter = letter_counter
            while letter_counter < len(self.s) and self.s[letter_counter] in "qwrtypsdfghjklzxcvbnm":
                current_letters.append(self.s[letter_counter])
                letter_counter += 1
            final[current_letter] = ''.join(current_letters)
            current_letters = []
            if letter_counter >= len(self.s):
                break
            current_letter = letter_counter
            while letter_counter < len(self.s) and self.s[letter_counter] in "aeiou":
                current_letters.append(self.s[letter_counter])
                letter_counter += 1
            final[current_letter] = ''.join(current_letters)
            current_letters = []
        self.c = cons
        self.v = vows
        self.f = final
#        for i in self.p:
#            if i in v:
#                pass
#            elif i in c:
#                current_sounds.append(i)
#                self.c[letter_counter].append(current_letters)
#                letter_counter += 1
#            else:
#                print("weirdo ->", i)

In [4]:
## from time import sleep
from copy import copy

def sideSpell(word, phones, spells, polarity):
    currentSpell = []
    for p in phones:
        oldLen = len(currentSpell)
        for s in spells[p]:
            newSpell = copy(currentSpell)
            if not polarity:
                newSpell.append(s.g)
                if word.s.startswith(''.join([a if not isinstance(a, graphExample) else a.g for a in newSpell])):
                    currentSpell.append(s)
                    break
            else:
                newSpell.insert(0, s.g)
                if word.s.endswith(''.join([a if not isinstance(a, graphExample) else a.g for a in newSpell])):
                    currentSpell.insert(0, s)
                    break                
        if len(currentSpell) == oldLen:
            break
    return currentSpell

def spell3(word, spellings, exLen=4, debug=False):
    word.s = word.s.lower()
    if not word.s.isalnum():
        if debug:
            print("bad characters", word.s)
        return False
    if debug:
        print(word)
    phones = word.p.split()
    if len(word.s) < len(phones):
        if debug:
            print("too long!", word.s, phones)
        return False
    if not set(word.s) & set('aeiouy'):
        if debug:
            print("no vowels -> not a word!", word.s, phones)
        return False
    if word.s[:-2] == 'le' and phones[:-2] == ['AH', 'L'] and word.s[-3] not in ['aeiou']:
        phones.append(phones.pop[-2])
        if debug:
            print("special case: L")
    rphones = copy(phones)
    rphones.reverse()
    if debug:
        print("phones", phones, rphones)
    frontSpell = sideSpell(word, phones, spellings, 0)
    frontStr = [a.g for a in frontSpell]
    backSpell = sideSpell(word, rphones, spellings, 1)
    backStr = [a.g for a in backSpell]
    if debug:
        print("front & back", frontSpell, backSpell)
    if '' in frontSpell:
        frontSpell.remove('')
    if '' in backSpell:
        backSpell.remove('')
    if frontSpell == backSpell and ''.join(frontStr) == word.s:
        if debug:
            print('yay!')
            print(word, frontSpell, len(frontSpell))
            print(backSpell, len(backSpell))
            print(phones)
        exPhone = choice(phones)
        g = frontSpell[phones.index(exPhone)]
        p = spellings[exPhone]
        exGraph = p[p.index(g)]
        if len(word.s) == exLen and not exGraph.set:
            exGraph.setExample(word)
        return True
    missingG = graphExample(word.s.removeprefix(''.join(frontStr)).removesuffix(''.join(backStr)), isX=word.x)
    #if not missingG.g:
    #    missingG = backSpell[0]
    if debug:
        print("missing grapheme", missingG)
    if len(frontSpell + backSpell) > len(phones):
            #print(word, frontSpell, len(frontSpell))
            #print(backSpell, len(backSpell))
            #print(phones)
            if debug:
                print("too long")
            del backSpell[0]
    if len(frontSpell + backSpell) < len(phones):
        try:
            missingP = phones[len(frontSpell)]
            if debug:
                print("missing phoneme (clean)", missingP)
            if len(word.s) == exLen and not missingG.set:
                missingG.setExample(word)        
            spellings[missingP].append(missingG)    
        except IndexError as e:
            if debug:
                print(word, frontSpell, len(frontSpell))
                print(backSpell, len(backSpell))
                print(phones)
            raise e
    else:
        missingP = phones[len(frontSpell) - 1]
        if debug:
            print("missing phoneme (overlap)", missingP)
        try:
            newG = graphExample(''.join((frontStr[-1], missingG.g) if frontStr else (missingG.g, backStr[0])), isX = missingG.g.count('x') or (frontStr[-1].count('x') if frontStr else backStr[0].count('x')))
            if len(word.s) == exLen and not newG.set:
                newG.setExample(word)  
            spellings[missingP].append(newG)
            if debug:
                print(newG)
        except IndexError as e:
            if debug:
                print(word, frontSpell, len(frontSpell))
                print(backSpell, len(backSpell))
                print(phones)
                print(missingP)
            raise e

def learn3(spellings, words, exLen=4, debug=False):
    newSpells = {s: copy(spellings[s]) for s in spellings}
    for word in words:
        spell3(word, newSpells, exLen=exLen, debug=debug)
        if debug:
            print(word, word.p)
        #break
    return newSpells

In [5]:
wordObjs2 = {line.split(maxsplit=1)[0]: coolWord(*line.split(maxsplit=1)) for line in cmu}

In [6]:
spellings = {
    'AA': ['o'],
    'AE': ['a'],
    'AH': ['u'],
    'AO': ['o'],
    'AW': ['ou'],
    'AX': ['a'],
    'AXR': ['er'],
    'AY': ['i'],
    'EH': ['e'],
    'ER': ['ir'],
    'EY': ['ai'],
    'IH': ['i'],
    'IX': ['e'],
    'IY': ['ea'],
    'OW': ['oa'],
    'OY': ['oy'],
    'UH': ['oo'],
    'UW': ['oo'],
    'UX': ['u'],
    'B': ['b'],
    'CH': ['ch'],
    'D': ['d'],
    'DX': ['tt'],
    'EL': ['le'],
    'EM': ['m'],
    'EN': ['on'],
    'F': ['f'],
    'G': ['g'],
    'H': ['h'],
    'HH': ['h'],
    'JH': ['j'],
    'K': ['k'],
    'L': ['l'],
    'M': ['m'],
    'N': ['n'],
    'NX': ['ng'],
    'NG': ['ng'],
    'P': ['p'],
    'Q': ['-'],
    'R': ['r'],
    'S': ['s'],
    'SH': ['sh'],
    'T': ['t'],
    'TH': ['th'],
    'V': ['v'],
    'W': ['w'],
    'WH': ['wh'],
    'Y': ['y'],
    'Z': ['z'],
    'ZH': ['s']
}

In [7]:
threeLetter = [wordObjs2[word] for word in wordObjs2 if len(wordObjs2[word].s) == 3]

In [8]:
len(threeLetter)

1733

In [9]:
twoLetter = [wordObjs2[word] for word in wordObjs2 if len(wordObjs2[word].s) == 2]

In [10]:
## from time import sleep
from copy import copy

def sideSpell4(word, phones, spells, polarity):
    currentSpell = []
    for p in phones:
        oldLen = len(currentSpell)
        for s in spells[p]:
            newSpell = copy(currentSpell)
            if not polarity:
                newSpell.append(s.g)
                if word.s.startswith(''.join([a if not isinstance(a, graphExample) else a.g for a in newSpell])):
                    currentSpell.append(s)
                    break
            else:
                newSpell.insert(0, s.g)
                if word.s.endswith(''.join([a if not isinstance(a, graphExample) else a.g for a in newSpell])):
                    currentSpell.insert(0, s)
                    break                
        if len(currentSpell) == oldLen:
            break
    return currentSpell

def spell4(word, spellings, exLen=4, debug=False):
    word.s = word.s.lower()
    if not word.s.isalnum():
        if debug:
            print("bad characters", word.s)
        return False
    if debug:
        print(word)
    phones = word.p.split()
    if len(word.s) < len(phones):
        if debug:
            print("too long!", word.s, phones)
        return False
    if not set(word.s) & set('aeiouy'):
        if debug:
            print("no vowels -> not a word!", word.s, phones)
        return False
    if word.s[:-2] == 'le' and phones[:-2] == ['AH', 'L'] and word.s[-3] not in ['aeiou']:
        phones.append(phones.pop[-2])
        if debug:
            print("special case: L")
    rphones = copy(phones)
    rphones.reverse()
    if debug:
        print("phones", phones, rphones)
    frontSpell = sideSpell4(word, phones, spellings, 0)
    frontStr = [a.g for a in frontSpell]
    backSpell = sideSpell4(word, rphones, spellings, 1)
    backStr = [a.g for a in backSpell]
    if debug:
        print("front & back", frontSpell, backSpell)
    if '' in frontSpell:
        frontSpell.remove('')
    if '' in backSpell:
        backSpell.remove('')
    if len(phones) > (len(frontSpell) + len(backSpell) + 1):
        if debug:
            print("failure to spell")
        return False
    if frontSpell == backSpell and ''.join(frontStr) == word.s:
        if debug:
            print('yay!')
            print(word, frontSpell, len(frontSpell))
            print(backSpell, len(backSpell))
            print(phones)
        exPhone = choice(phones)
        g = frontSpell[phones.index(exPhone)]
        p = spellings[exPhone]
        exGraph = p[p.index(g)]
        if len(word.s) == exLen and not exGraph.set:
            exGraph.setExample(word)
        return True
    missingG = graphExample(word.s.removeprefix(''.join(frontStr)).removesuffix(''.join(backStr)), isX=word.x)
    #if not missingG.g:
    #    missingG = backSpell[0]
    if debug:
        print("missing grapheme", missingG)
    if len(frontSpell + backSpell) > len(phones):
            #print(word, frontSpell, len(frontSpell))
            #print(backSpell, len(backSpell))
            #print(phones)
            if debug:
                print("too long")
            del backSpell[0]
    if len(frontSpell + backSpell) < len(phones):
        try:
            missingP = phones[len(frontSpell)]
            if debug:
                print("missing phoneme (clean)", missingP)
            if len(word.s) == exLen and not missingG.set:
                missingG.setExample(word)        
            spellings[missingP].append(missingG)    
        except IndexError as e:
            if debug:
                print(word, frontSpell, len(frontSpell))
                print(backSpell, len(backSpell))
                print(phones)
            raise e
    else:
        missingP = phones[len(frontSpell) - 1]
        if debug:
            print("missing phoneme (overlap)", missingP)
        try:
            newG = graphExample(''.join((frontStr[-1], missingG.g) if frontStr else (missingG.g, backStr[0])), isX = missingG.g.count('x') or (frontStr[-1].count('x') if frontStr else backStr[0].count('x')))
            if len(word.s) == exLen and not newG.set:
                newG.setExample(word)  
            spellings[missingP].append(newG)
            if debug:
                print(newG)
        except IndexError as e:
            if debug:
                print(word, frontSpell, len(frontSpell))
                print(backSpell, len(backSpell))
                print(phones)
                print(missingP)
            raise e

def learn3(spellings, words, exLen=4, debug=False):
    newSpells = {s: copy(spellings[s]) for s in spellings}
    for word in words:
        spell3(word, newSpells, exLen=exLen, debug=debug)
        if debug:
            print(word, word.p)
        #break
    return newSpells

In [11]:
def clean(aDict):
    for key in aDict:
        while '' in aDict[key]:
            aDict[key].remove('')

In [12]:
baseSpellings = {
    'AA': ['o'],
    'AE': ['a'],
    'AH': ['u', 'o'],
    'AO': ['o'],
    'AW': ['ou'],
    'AX': ['a'],
    'AXR': ['er'],
    'AY': ['i'],
    'EH': ['e'],
    'ER': ['ir'],
    'EY': ['ai'],
    'IH': ['i'],
    'IX': ['e'],
    'IY': ['ea'],
    'OW': ['oa'],
    'OY': ['oy'],
    'UH': ['oo'],
    'UW': ['oo'],
    'UX': ['u'],
    'B': ['b'],
    'CH': ['ch'],
    'D': ['d'],
    'DX': ['tt'],
    'EL': ['le'],
    'EM': ['m'],
    'EN': ['on'],
    'F': ['f'],
    'G': ['g'],
    'H': ['h'],
    'HH': ['h'],
    'JH': ['j'],
    'K': ['k'],
    'L': ['l'],
    'M': ['m'],
    'N': ['n'],
    'NX': ['ng'],
    'NG': ['ng'],
    'P': ['p'],
    'Q': ['-'],
    'R': ['r'],
    'S': ['s'],
    'SH': ['sh'],
    'T': ['t'],
    'TH': ['th'],
    'V': ['v'],
    'W': ['w'],
    'WH': ['wh'],
    'Y': ['y'],
    'Z': ['z'],
    'ZH': ['s']
}

In [13]:
class graphExample:
    def __init__(self, grapheme, isX=False):
        self.g = grapheme
        self.set = False
        self.isx = isX
        self.x = 'x' if isX else None

    def __str__(self):
        try:
            return f"'{self.g if not self.isx else self.x}' in '{self.w}'"
        except AttributeError as e:
            return self.g if not self.isx else self.x

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other):
        if isinstance(other, str):
            return self.g == other
        elif isinstance(other, graphExample):
            return self.g == other.g
        return False

    def __hash__(self):
        return self.g.__hash__() if not self.isx else self.x.__hash__()

    def setExample(self, ex):
        self.w = ex
        self.set = True

In [14]:
def initSpellings(plain):
    ans = {p: [graphExample(a, isX=bool(a.count('x'))) for a in plain[p]] for p in plain}
    return ans

In [15]:
coolSpellings = initSpellings(baseSpellings)

In [16]:
coolSpellings

{'AA': [o],
 'AE': [a],
 'AH': [u, o],
 'AO': [o],
 'AW': [ou],
 'AX': [a],
 'AXR': [er],
 'AY': [i],
 'EH': [e],
 'ER': [ir],
 'EY': [ai],
 'IH': [i],
 'IX': [e],
 'IY': [ea],
 'OW': [oa],
 'OY': [oy],
 'UH': [oo],
 'UW': [oo],
 'UX': [u],
 'B': [b],
 'CH': [ch],
 'D': [d],
 'DX': [tt],
 'EL': [le],
 'EM': [m],
 'EN': [on],
 'F': [f],
 'G': [g],
 'H': [h],
 'HH': [h],
 'JH': [j],
 'K': [k],
 'L': [l],
 'M': [m],
 'N': [n],
 'NX': [ng],
 'NG': [ng],
 'P': [p],
 'Q': [-],
 'R': [r],
 'S': [s],
 'SH': [sh],
 'T': [t],
 'TH': [th],
 'V': [v],
 'W': [w],
 'WH': [wh],
 'Y': [y],
 'Z': [z],
 'ZH': [s]}

In [17]:
def getGLen(word):
    pNum = len(word.p.split())
    lNum = len(word.s)
    minG = (lNum / pNum)
    if minG > 4:
        raise ValueError("too many letters")
    if minG < 1:
        raise ValueError("too many phonemes")
    minG = int(minG)
    maxG = min(lNum - pNum + 1, 4)
    return (maxG, minG)

In [18]:
def atomize(word):
    maxGraph, minGraph = getGLen(word)
    lIndex = 0
    potential = []
    while lIndex < len(word.s):
        for i in range(minGraph, maxGraph + 1):
            atom = "*" * lIndex
            atom += word.s[lIndex:lIndex + i]
            atom += "*" * (len(word.s) - len(atom))
            if atom not in potential:
                potential.append(atom)
        lIndex += 1
    return potential

In [19]:
import re
from math import ceil

def match_phones2(word):
    graphs = atomize(word)
    phones = word.p.split()
    #print(phones)
    matches = {}
    grange = range(len(graphs))
    for p in phones:
        matches[p] = []
    for gInd in grange:
        g = graphs[gInd]
        if g.lstrip('*') == g:
            matches[phones[0]].append(g)
        elif g.rstrip('*') == g:
            matches[phones[len(phones) - 1]].append(g)
        else:
            a = re.split('\w', g)
            endGaps = len(a[len(a) - 1])
            absMin = ceil(len(a[0]) / getGLen(word)[0])
            endPhones = len(phones[absMin + 1:])
            pmin = absMin + (endPhones - endGaps if endGaps < endPhones else 0)
            #pmax = max(len(a[0]) - ceil(endGaps / getGLen(word)[0]), 1)
            absMax = ceil(endGaps / getGLen(word)[0])
            pmax = min(len(phones) - 2, len(a[0]))
            #pmax = min(len(a[0]), len(phones))
            #print(g, pmin, pmax)
            for p1 in phones[pmin: pmax + 1]:
                matches[p1].append(g)
    return matches

In [20]:
def count_spaces(word, reverse=False):
    letters = list(word)
    count = 0
    #print(word)
    #print(letters)
    if reverse:
        while letters.pop() == '*':
            count += 1
    else:
        while letters.pop(0) == '*':
            count += 1
    return count

def count_letters(word):
    letters = list(word)
    count = 0
    while len(letters) != 0 and letters.pop(0) != '*':
        count += 1
    return count
    
def stitch(matches):
    stitched = matches.pop(0)
    mapping = [0] * (len(stitched) - count_spaces(stitched, reverse=True))
    num = 1
    for grapheme in matches:
        start = count_spaces(grapheme)
        end = count_spaces(grapheme, reverse=True)
        if start != len(stitched.strip('*')):
            raise ValueError('bad match')
        stitched = stitched.strip('*') + grapheme.lstrip('*')
        newLetterNum = count_letters(grapheme.lstrip('*'))
        mapping += [num] * newLetterNum
        num += 1
    return stitched, mapping

def fancyPrint(stitchObj):
    print(stitchObj[0])
    print(''.join([str(i) for i in stitchObj[1]]))

def recursionPractice(listOfLists):
    return stitch(['something', recursionPractice('something else')])

def combinatory(length, bases):
    combs = []
    

def getIndices(matchDict):
    lengths = [len(matchDict[i]) for i in matchDict]
    indices = []
    pass

def stitchAll(matchDict, phoneInds, graphInds):
    phones = matchDict.keys()
    for graph in matchDict[phone]:
        pass
        

11/12

Copied over everything I remember being important. Now it's time to start my newest attempt: combining both major attempts so far.

I intend to write functions that provide every possible spelling of a word, and every potential grapheme-to-phoneme mapping. Once I can visualize that, hopefully I will have a better idea of what to do.

In [21]:
#class customBase(list):
#    def __init__(self, baseList):
#       self.zero = [0] * len(baseList)
#        self.current = [0] * len(baseList)
#        self.max = baseList
#        super().__init__(self, i for i in baseList)


    
#    def __iadd__(self, other):
#        if isinstance(other, (int, float)):
#            
#        else:
#            return super().__iadd__(self, other)

In [22]:
from math import prod

class customBase():
    def __init__(self, baseInt):
        self.baseRaw = baseInt
        baseList = list(str(baseInt))
        self.bases = [int(i) for i in baseList]
        self.baseMax = prod(self.bases)
        self.baseReprs = []
        k = range(len(self.bases))
        for i in k:
            self.baseReprs.append(prod(self.bases[:i:-1]))
        self.brc = reversed(self.baseReprs)

    def __str__(self):
        return f"base {self.bases}"

    def __repr__(self):
        return self.__str__()

class customBaseInt():
    def __init__(self, customBase, n):
        if n > customBase.baseMax:
            raise ValueError(f"{n} larger than custom base's maximum value of {customBase.baseMax}")
        self.dec = n
        self.base = customBase
        self.nBasedList = []
        for b in self.base.baseReprs:
            a, n = divmod(n, b)
            self.nBasedList.append(a)
        self.nBased = int(''.join([str(i) for i in self.nBasedList]))

    def __str__(self):
        return self.nBased

    def __repr__(self):
        return f"{self.nBased} in base {self.base.bases}, {self.dec} in decimal"

    def __iadd__(self, other):
        if isinstance(other, (int, float)):
            return customBaseInt(self.base, self.dec + other)
        else:
            raise TypeError(f"assignment addition not supported between objects of type {type(self)} and {type(other)}")

In [23]:
newBase = customBase(2734)
newInt = customBaseInt(newBase, 77)
newInt

611 in base [2, 7, 3, 4], 77 in decimal

In [24]:
newInt += 1
newInt

612 in base [2, 7, 3, 4], 78 in decimal

In [25]:
from math import prod

a = [2,3,4,5,6]
prod(a[:0:-1])

360

In [26]:
from math import prod

def multiIndex(l, indices):
    ans = []
    for i in indices:
        ans.append(l[i])
    return ans

def getAllSpells(word, spellings):
    allspells = []
    phones = word.p.split()
    phoneLens = [len(spellings[p]) for p in phones]
    phoneNumBase = customBase(int(''.join([str(i) for i in phoneLens])))
    phoneNum = customBaseInt(phoneNumBase, 0)
    while True:
        try:
            tempDict = {}
            for i in range(len(phones)):
                tempDict[phones[i]] = phoneNum.nBasedList[i]
                
            allspells.append([spellings[phones[i]] for i in phoneNum.nBasedList])
        except ValueError:
            return allSpells

11/14

I don't exactly remember what the above ^^^ cell was about, so I'm going to try again. I have a way to generate numbers how I want, I just need to iterate through each possible spelling using those numbers.

In [27]:
def getSpell(graphNums, phoneDict):
    """ 
        Arguments:
            graphNums (list of tuples): a list of two-tuples. the first item of each tuple is the phoneme, and the second is an index number.
            phoneDict (dict): the dictionary of phonemes and graphemes to index.
    """
    ans = []
    for phone, gNum in graphNums:
        ans.append(phoneDict[phone][gNum])
    return ans

def getAllSpells(word, phoneDict):
    ans = []
    phones = word.p.split()
    #print(phones)
    phoneLens = [len(phoneDict[p]) for p in phones]
    #print(phoneLens)
    phoneNumBase = customBase(int(''.join([str(i) for i in phoneLens])))
    for i in range(phoneNumBase.baseMax):
        inum = customBaseInt(phoneNumBase, i)
        zippedPhones = zip(phones, inum.nBasedList)
        #print(zippedPhones)
        ans.append((inum.nBasedList, getSpell(zippedPhones, phoneDict)))
    return ans

In [28]:
newBase = customBase(1234)
customBaseInt(newBase, 0).nBasedList

[0, 0, 0, 0]

In [29]:
from random import choice

In [30]:
learnedSpells = learn3(coolSpellings, twoLetter, exLen=2)

In [31]:
learnedSpells

{'AA': ['o' in 'og', 'ah' in 'ah', 'a' in 'ar'],
 'AE': ['a' in 'ad'],
 'AH': ['u' in 'du', 'o' in 'of', 'e' in 'em', 'a' in 'an', 'uh' in 'uh'],
 'AO': ['o' in 'om', 'aw' in 'aw'],
 'AW': [ou, 'ow' in 'ow'],
 'AX': [a],
 'AXR': [er],
 'AY': ['i' in 'fi', 'ai' in 'ai', 'ay' in 'ay', 'y' in 'by'],
 'EH': ['e' in 'ed', 'a' in 'as', 'eh' in 'eh'],
 'ER': [ir, 'er' in 'er', 'or' in 'or', 'ur' in 'ur'],
 'EY': [ai,
  'aa' in 'aa',
  'ae' in 'ae',
  'ai' in 'ai',
  'ay' in 'ay',
  'e' in 'de',
  'y' in 'wy'],
 'IH': ['i' in 'ib', 'o' in 'to'],
 'IX': [e],
 'IY': [ea, 'e' in 'be', 'i' in 'di', 'y' in 'uy'],
 'OW': [oa, 'au' in 'au', 'o' in 'bo', 'oh' in 'oh', 'ow' in 'ow'],
 'OY': ['oy' in 'oy', 'oi' in 'oi'],
 'UH': [oo],
 'UW': [oo, 'o' in 'do', 'u' in 'du', 'o' in 'ou'],
 'UX': [u],
 'B': ['b' in 'ab'],
 'CH': [ch],
 'D': ['d' in 'di'],
 'DX': [tt],
 'EL': [le],
 'EM': [m],
 'EN': [on],
 'F': ['f' in 'fe'],
 'G': ['g' in 'go'],
 'H': [h],
 'HH': ['h' in 'ha'],
 'JH': ['j' in 'ji'],
 'K': [

In [32]:
breath = wordObjs2['BREATH']

In [33]:
allSpells = getAllSpells(breath, learnedSpells)

In [34]:
allSpells

[([0, 0, 0, 0], ['b' in 'ab', 'r' in 'ra', 'e' in 'ed', th]),
 ([0, 0, 1, 0], ['b' in 'ab', 'r' in 'ra', 'a' in 'as', th]),
 ([0, 0, 2, 0], ['b' in 'ab', 'r' in 'ra', 'eh' in 'eh', th])]

In [35]:
def prettyPrintAllSpells(allSpellTuples):
    for p, n in allSpellTuples:
        print(p)
        print(n)
        print('\n')

In [36]:
prettyPrintAllSpells(allSpells)

[0, 0, 0, 0]
['b' in 'ab', 'r' in 'ra', 'e' in 'ed', th]


[0, 0, 1, 0]
['b' in 'ab', 'r' in 'ra', 'a' in 'as', th]


[0, 0, 2, 0]
['b' in 'ab', 'r' in 'ra', 'eh' in 'eh', th]




nice!! it seems to work. now to mess with the match_phones stuff

In [37]:
breathMatched = match_phones2(breath)

In [38]:
breathMatched

{'B': ['b*****', 'br****', 'bre***'],
 'R': ['*r****', '*re***', '*rea**', '**e***', '**ea**', '***a**'],
 'EH': ['**e***', '**ea**', '**eat*', '***a**', '***at*', '****t*'],
 'TH': ['***ath', '****th', '*****h']}

In [39]:
def stitchAll(matchDict):
    ans = []
    phoneLens = [len(matchDict[i]) for i in matchDict]
    matchBase = customBase(int(''.join([str(i) for i in phoneLens])))
    for i in range(matchBase.baseMax):
        inum = customBaseInt(matchBase, i)
        toStitch = []
        for j, k in zip(matchDict.values(), inum.nBasedList):
            #print(j, k)
            toStitch.append(j[k])
        try:
            ans.append((inum.nBasedList, stitch(toStitch)))
        except ValueError:
            continue
    return ans

In [40]:
breathStitched = stitchAll(breathMatched)

In [41]:
breathStitched

[([0, 0, 0, 0], ('breath', [0, 1, 2, 3, 3, 3])),
 ([0, 0, 1, 1], ('breath', [0, 1, 2, 2, 3, 3])),
 ([0, 0, 2, 2], ('breath', [0, 1, 2, 2, 2, 3])),
 ([0, 1, 3, 1], ('breath', [0, 1, 1, 2, 3, 3])),
 ([0, 1, 4, 2], ('breath', [0, 1, 1, 2, 2, 3])),
 ([0, 2, 5, 2], ('breath', [0, 1, 1, 1, 2, 3])),
 ([1, 3, 3, 1], ('breath', [0, 0, 1, 2, 3, 3])),
 ([1, 3, 4, 2], ('breath', [0, 0, 1, 2, 2, 3])),
 ([1, 4, 5, 2], ('breath', [0, 0, 1, 1, 2, 3])),
 ([2, 5, 5, 2], ('breath', [0, 0, 0, 1, 2, 3]))]

In [42]:
for b in breathStitched:
    fancyPrint(b[1])
    print('\n')

breath
012333


breath
012233


breath
012223


breath
011233


breath
011223


breath
011123


breath
001233


breath
001223


breath
001123


breath
000123




beautiful. didn't take me long to get this working.

now for the hard part. I need to figure out how to combine both techniques to ensure that my code only learns correct spellings.

vvv copied from above, for easier viewing

In [43]:
## from time import sleep
from copy import copy

def sideSpell4(word, phones, spells, polarity):
    currentSpell = []
    for p in phones:
        oldLen = len(currentSpell)
        for s in spells[p]:
            newSpell = copy(currentSpell)
            if not polarity:
                newSpell.append(s.g)
                if word.s.startswith(''.join([a if not isinstance(a, graphExample) else a.g for a in newSpell])):
                    currentSpell.append(s)
                    break
            else:
                newSpell.insert(0, s.g)
                if word.s.endswith(''.join([a if not isinstance(a, graphExample) else a.g for a in newSpell])):
                    currentSpell.insert(0, s)
                    break                
        if len(currentSpell) == oldLen:
            break
    return currentSpell

def spell4(word, spellings, exLen=4, debug=False):
    word.s = word.s.lower()
    if not word.s.isalnum():
        if debug:
            print("bad characters", word.s)
        return False
    if debug:
        print(word)
    phones = word.p.split()
    if len(word.s) < len(phones):
        if debug:
            print("too long!", word.s, phones)
        return False
    if not set(word.s) & set('aeiouy'):
        if debug:
            print("no vowels -> not a word!", word.s, phones)
        return False
    if word.s[:-2] == 'le' and phones[:-2] == ['AH', 'L'] and word.s[-3] not in ['aeiou']:
        phones.append(phones.pop[-2])
        if debug:
            print("special case: L")
    rphones = copy(phones)
    rphones.reverse()
    if debug:
        print("phones", phones, rphones)
    frontSpell = sideSpell4(word, phones, spellings, 0)
    frontStr = [a.g for a in frontSpell]
    backSpell = sideSpell4(word, rphones, spellings, 1)
    backStr = [a.g for a in backSpell]
    if debug:
        print("front & back", frontSpell, backSpell)
    if '' in frontSpell:
        frontSpell.remove('')
    if '' in backSpell:
        backSpell.remove('')
    if len(phones) > (len(frontSpell) + len(backSpell) + 1):
        if debug:
            print("failure to spell")
        return False
    if frontSpell == backSpell and ''.join(frontStr) == word.s:
        if debug:
            print('yay!')
            print(word, frontSpell, len(frontSpell))
            print(backSpell, len(backSpell))
            print(phones)
        exPhone = choice(phones)
        g = frontSpell[phones.index(exPhone)]
        p = spellings[exPhone]
        exGraph = p[p.index(g)]
        if len(word.s) == exLen and not exGraph.set:
            exGraph.setExample(word)
        return True
    missingG = graphExample(word.s.removeprefix(''.join(frontStr)).removesuffix(''.join(backStr)), isX=word.x)
    #if not missingG.g:
    #    missingG = backSpell[0]
    if debug:
        print("missing grapheme", missingG)
    if len(frontSpell + backSpell) > len(phones):
            #print(word, frontSpell, len(frontSpell))
            #print(backSpell, len(backSpell))
            #print(phones)
            if debug:
                print("too long")
            del backSpell[0]
    if len(frontSpell + backSpell) < len(phones):
        try:
            missingP = phones[len(frontSpell)]
            if debug:
                print("missing phoneme (clean)", missingP)
            if len(word.s) == exLen and not missingG.set:
                missingG.setExample(word)        
            spellings[missingP].append(missingG)    
        except IndexError as e:
            if debug:
                print(word, frontSpell, len(frontSpell))
                print(backSpell, len(backSpell))
                print(phones)
            raise e
    else:
        missingP = phones[len(frontSpell) - 1]
        if debug:
            print("missing phoneme (overlap)", missingP)
        try:
            newG = graphExample(''.join((frontStr[-1], missingG.g) if frontStr else (missingG.g, backStr[0])), isX = missingG.g.count('x') or (frontStr[-1].count('x') if frontStr else backStr[0].count('x')))
            if len(word.s) == exLen and not newG.set:
                newG.setExample(word)  
            spellings[missingP].append(newG)
            if debug:
                print(newG)
        except IndexError as e:
            if debug:
                print(word, frontSpell, len(frontSpell))
                print(backSpell, len(backSpell))
                print(phones)
                print(missingP)
            raise e

def learn3(spellings, words, exLen=4, debug=False):
    newSpells = {s: copy(spellings[s]) for s in spellings}
    for word in words:
        spell3(word, newSpells, exLen=exLen, debug=debug)
        if debug:
            print(word, word.p)
        #break
    return newSpells

In [44]:
def newSpell(word, phoneDict, exLen=4):
    phones = word.p.split()
    allGraphs = getAllSpells(word, phoneDict)
    matches = match_phones2(word)
    allStitches = stitchAll(matches)
    print(allStitches)
    print(allGraphs)

12/3

Experimentation & Refamiliarization (it's been a while)

In [45]:
newSpell(wordObjs2['BREATH'], coolSpellings)

[([0, 0, 0, 0], ('breath', [0, 1, 2, 3, 3, 3])), ([0, 0, 1, 1], ('breath', [0, 1, 2, 2, 3, 3])), ([0, 0, 2, 2], ('breath', [0, 1, 2, 2, 2, 3])), ([0, 1, 3, 1], ('breath', [0, 1, 1, 2, 3, 3])), ([0, 1, 4, 2], ('breath', [0, 1, 1, 2, 2, 3])), ([0, 2, 5, 2], ('breath', [0, 1, 1, 1, 2, 3])), ([1, 3, 3, 1], ('breath', [0, 0, 1, 2, 3, 3])), ([1, 3, 4, 2], ('breath', [0, 0, 1, 2, 2, 3])), ([1, 4, 5, 2], ('breath', [0, 0, 1, 1, 2, 3])), ([2, 5, 5, 2], ('breath', [0, 0, 0, 1, 2, 3]))]
[([0, 0, 0, 0], ['b' in 'ab', 'r' in 'ra', 'e' in 'ed', th])]


In [46]:
angry = wordObjs2['ANGRY']

In [47]:
angry.p

'AE NG G R IY'

In [48]:
len(angry.p.split())

5

In [49]:
angry.s

'angry'

In [50]:
len(angry.s)

5

In [51]:
easyWords = [wordObjs2[i] for i in wordObjs2 if len(wordObjs2[i].p.split()) == len(wordObjs2[i].s) and "'" not in wordObjs2[i].s]

In [52]:
len(easyWords)

34517

In [53]:
len(wordObjs2)

134373

In [54]:
from random import sample

print([(i.s, i.p) for i in sample(easyWords, 15)])

[('hobart', 'HH OW B AA R T'), ('brunken', 'B R AH NG K AH N'), ('seidl', 'S AY D AH L'), ('betzold', 'B EH T Z OW L D'), ('deblasio', 'D IH B L AA S IY OW'), ('park', 'P AA R K'), ('danahy', 'D AE N AH HH IY'), ('nagata', 'N AA G AA T AH'), ('materialistic', 'M AH T IH R IY AH L IH S T IH K'), ('relondo', 'R IH L AO N D OW'), ('lukens', 'L UW K AH N Z'), ('resembled', 'R IY Z EH M B AH L D'), ('siksty', 'S IH K S T IY'), ('davida', 'D AA V IY D AH'), ('kirov', 'K IH R AA V')]


In [55]:
simpleTest = copy(coolSpellings)

I thought that I could start teaching my dictionary with the assumption that words with equivalent amounts of sounds and letters meant it provided 

given a word, find another word that differs by one phoneme. the letters that change between words represent the varied phoneme.

<i>can it really be that simple?</i>

I don't think so. 'fair' and 'care' differ by one sound, but sounds shared between them are spelled differently as well.

could I use phoneme sequence analysis to determine which changed letters belong to the variable sound?
>Probably not. PSA assumed no silent letters, which is just not true here.

what about consonant-vowel sequence analysis? if non-loaned English words have the repeating pattern C^m V C^n, 

In [56]:
def getOneOff(word, wordDict, testWord = None):
    ans = []
    if testWord != None:
        wordDict = {0: testWord}
    for i in wordDict:
        w = wordDict[i]
        ind = 0
        mismatchCount = 0
        if len(w.p.split()) == len(word.p.split()): 
            #print(w)
            for sound in w.p.split():
                #print(sound, word.p.split()[ind])
                if sound != word.p.split()[ind]:
                    #print('not a match')
                    mismatchCount += 1
                ind += 1
            if mismatchCount == 1:
                ans.append(w)
            #break
    return ans

In [57]:
newBreath = wordObjs2['BREATH']
newBreath1 = getOneOff(newBreath, wordObjs2)

In [58]:
len(newBreath1)

16

In [59]:
newBreath1

[b'rith,
 bread,
 breck,
 bred,
 brehm,
 brekke,
 brem,
 bren,
 brenn,
 bress,
 bresse,
 bret,
 brett,
 broth,
 creath,
 greth]

In [60]:
def getOffInd(word, other):
    ind = 0
    for sound in word.p.split():
        if other.p.split()[ind] != sound:
            return ind
        ind += 1

def organizeOneOffs(word, offs):
    ans = {i: [] for i in range(len(word.p.split()))}
    for o in offs:
        ans[getOffInd(word, o)].append(o)
    return ans

In [61]:
organizeOneOffs(newBreath, newBreath1)

{0: [creath, greth],
 1: [],
 2: [b'rith, broth],
 3: [bread,
  breck,
  bred,
  brehm,
  brekke,
  brem,
  bren,
  brenn,
  bress,
  bresse,
  bret,
  brett]}

Given a word, I can find all words that differ by exactly one sound, and group them by the variable sound.

Now, I need a way to 'subtract' two words, in order to find 

In [62]:
anger = wordObjs2['ANGER']
anger1 = getOneOff(anger, wordObjs2)

In [63]:
len(anger1)

7

In [64]:
organizeOneOffs(anger, anger1)

{0: [enger, ingar, ungar, unger], 1: [apgar], 2: [anchor, anker], 3: []}

In [65]:
anger.p

'AE NG G ER'

NEW APPROACH: given some input like the dictionaries above, what should a smart program learn?

aka: what patterns do I notice when I look at them?

From the anger dict:

1. \[vowel\] + 'ng' + junk => /vowel sound/ + /NG/ + /G/ + junk
2. junk + 'ng' + vowel + 'r' => junk + /NG/ + /G/ + /ER/
3. 'a' => /AE/

From the breath dict:

1. 'b' at beginning of word => /B/
2. 'r' as second letter => /R/
3. 'br' => /BR/
4. junk + vowels + 'th' => junk + /vowel sound/ + /TH/
5. /EH/ => 'ea' or 'e'

In [66]:
newBreath.p

'B R EH TH'

12/5

So I realized that the above Markdown cell (along with some paper sketches) was essentially reinventing learning models, but manual. I don't like using models, as it's a black box that cannot be understood. Ideally, my program would know how to discover new patterns in some format, and use those to learn new spellings. It could work, but I'm not sure. So far, it seems very difficult.

In [67]:
creation = wordObjs2['CREATION']
creation1 = getOneOff(creation, wordObjs2)

In [68]:
creation.p

'K R IY EY SH AH N'

In [69]:
organizeOneOffs(creation, creation1)

{0: [], 1: [], 2: [croatian], 3: [], 4: [], 5: [], 6: []}

In [70]:
blinking = wordObjs2['BLINKING']
blinking1 = getOneOff(blinking, wordObjs2)

In [71]:
blinking.p

'B L IH NG K IH NG'

In [72]:
organizeOneOffs(blinking, blinking1)

{0: [plinking], 1: [], 2: [blanking], 3: [], 4: [], 5: [], 6: []}

In [73]:
def getSomeOffs(word, wordDict, offset:int, testWord = None):
    ans = []
    if testWord != None:
        wordDict = {0: testWord}
    for i in wordDict:
        w = wordDict[i]
        ind = 0
        mismatchCount = 0
        if len(w.p.split()) == len(word.p.split()): 
            #print(w)
            for sound in w.p.split():
                #print(sound, word.p.split()[ind])
                if sound != word.p.split()[ind]:
                    #print('not a match')
                    mismatchCount += 1
                ind += 1
            if mismatchCount == offset:
                ans.append(w)
            #break
    return ans

In [80]:
getSomeOffs(wordObjs2[choice(wordObjs2)], wordObjs2, 2)

KeyError: 84681

In [76]:
len(getSomeOffs(breath, wordObjs2, 2))

270