In [1]:
def is_schwa(phone, is_timit=True):
  """Tell whether `phone` is a schwa symbol.

  With `is_timit` true the phone is compared against "ax", "axr" and "ix";
  otherwise the only accepted symbol is "AH0".
  """
  if not is_timit:
    return phone == "AH0"
  return phone in ("ax", "axr", "ix")

# CMUdict doesn't have syllabic consonants
def is_syllabic_consonant(phone, is_timit=True):
  """Tell whether `phone` is one of the syllabic-consonant symbols.

  The recognised symbols are "el", "em", "en", "er", "er1" and "er2".
  The answer is always False when `is_timit` is false.
  """
  if not is_timit:
    return False
  return phone in ("el", "em", "en", "er", "er1", "er2")

# http://web.archive.org/web/20100614180508/http://semarch.linguistics.fas.nyu.edu/barker/Syllables/syllabify.pl
def sonority(phone):
  """Return the sonority rank (1 = lowest, 7 = highest) of a phone symbol.

  Ranks: 1 stops and 's', 2 affricates, 3 the remaining fricatives,
  4 nasals, 5 liquids, 6 'hh' and glides.  Any symbol not listed below
  gets rank 7.
  """
  # Classes are tried top to bottom; 's' sits with the stops on purpose, so
  # it never reaches the fricative row.
  ranked_classes = (
    (("s", "p", "b", "t", "d", "k", "g"), 1),
    (("ch", "jh"), 2),
    (("th", "dh", "f", "v", "z", "sh", "zh"), 3),
    (("m", "n", "ng"), 4),
    (("l", "r"), 5),
    (("hh", "w", "y"), 6),
  )
  for members, rank in ranked_classes:
    if phone in members:
      return rank
  return 7

In [2]:
def last_phoneme(graphone):
  """Return the final phoneme of a 'graphemes}phonemes' graphone string.

  The phoneme side is '|'-separated.  Raises ValueError unless the string
  contains exactly one '}'.
  """
  _, phoneme_side = graphone.split('}')
  return phoneme_side.rpartition('|')[2]
def first_phoneme(graphone):
  """Return the leading phoneme of a 'graphemes}phonemes' graphone string.

  The phoneme side is '|'-separated.  Raises ValueError unless the string
  contains exactly one '}'.
  """
  _, phoneme_side = graphone.split('}')
  return phoneme_side.partition('|')[0]

In [7]:
# Sanity checks: in 'x}e|k|s' the phonemes after the '}' separator are e, k, s.
assert last_phoneme('x}e|k|s') == 's'
assert first_phoneme('x}e|k|s') == 'e'

In [16]:
def merge_graphones(graphones):
  """Fuse several graphones into a single 'graphemes}phonemes' string.

  All grapheme sides are chained with '|' in front of the '}', and all
  phoneme sides are chained with '|' behind it, keeping input order.
  Each item must contain exactly one '}' (ValueError otherwise).
  """
  grapheme_sides = []
  phoneme_sides = []
  for item in graphones:
    left, right = item.split('}')
    grapheme_sides.append(left)
    phoneme_sides.append(right)
  # Each side is already '|'-separated, so joining the sides with '|' gives
  # the same string as splitting every side and joining all the pieces.
  return '|'.join(grapheme_sides) + '}' + '|'.join(phoneme_sides)


In [18]:
# Graphemes and phonemes of every graphone end up chained on their own side of the '}'.
assert merge_graphones("a}a t|h}th x}k|s".split(' ')) == 'a|t|h|x}a|th|k|s'

In [35]:
def syllabify(graphones):
    """Split a sequence of graphones into syllables using sonority contours.

    The graphones are scanned from last to first.  Each graphone is ranked by
    the sonority() of its *first* phoneme only.  A syllable boundary is placed
    whenever the sonority trend, read right to left, switches from falling to
    rising; a handful of phoneme-specific rules override the plain numeric
    comparison.

    Args:
        graphones: sequence of 'graphemes}phonemes' strings, each side
            '|'-separated (e.g. 'o|u}aw1').

    Returns:
        A list with one merged graphone string per syllable, in the original
        left-to-right order.  Empty input yields an empty list.
    """
    if not graphones:
        # Without this guard the final flush below would emit a bogus '}'
        # syllable for empty input.
        return []

    labials = ("p", "b", "m", "f", "v")
    s_sh = ("s", "sh")

    last_sonority_up = True
    last_sonority = 0
    # Named prev_phoneme so the module-level last_phoneme() helper is not
    # shadowed inside this function.
    prev_phoneme = ""
    stack = []   # graphones of the syllable being built, right-to-left
    output = []  # finished syllables, also right-to-left

    for graphone in reversed(graphones):
        phoneme = first_phoneme(graphone)
        phone_sonority = sonority(phoneme)

        sonority_up = last_sonority < phone_sonority

        # A rank-1 phone (stop or 's') directly before a rank-3 fricative
        # counts as rising even though its rank is numerically lower.
        if last_sonority == 3 and phone_sonority == 1:
            sonority_up = True

        # Each rule below forces a boundary in front of the phone to the
        # right by pretending the trend was falling and is now rising.
        # Order matters: once one fires, the `not sonority_up` guards of the
        # later ones are false.

        # A labial is never grouped with a following 'w'.
        if prev_phoneme == "w" and phoneme in labials:
            last_sonority_up = False
            sonority_up = True

        # Only 's'/'sh' may precede 'm' inside the same falling run.
        if prev_phoneme == "m" and not sonority_up and phoneme not in s_sh:
            last_sonority_up = False
            sonority_up = True

        # 'm' only joins what follows when that phone has rank 7.
        if phoneme == "m" and not sonority_up and last_sonority < 7:
            last_sonority_up = False
            sonority_up = True

        # 'n' only joins what follows when that phone has rank 6 or 7.
        if phoneme == "n" and not sonority_up and last_sonority < 6:
            last_sonority_up = False
            sonority_up = True

        # 'ng' never joins a following phone of equal or higher sonority.
        if not sonority_up and phoneme == "ng":
            last_sonority_up = False
            sonority_up = True

        # Two adjacent rank-7 phones (sonority()'s fallback rank): mark both
        # trends as rising so that no boundary is inserted between them.
        if last_sonority == 7 and phone_sonority == 7:
            last_sonority_up = True
            sonority_up = True

        # Trend flipped from falling to rising: everything collected so far
        # forms one complete syllable.
        if not last_sonority_up and sonority_up:
            output.append(merge_graphones(stack[::-1]))
            stack = []

        stack.append(graphone)
        last_sonority_up = sonority_up
        prev_phoneme = phoneme
        last_sonority = phone_sonority

    # Flush the left-most syllable, then restore left-to-right order.
    output.append(merge_graphones(stack[::-1]))
    return output[::-1]


In [38]:
# The boundary falls before 'b', so the leading 'a}ax' graphone forms its own syllable.
assert syllabify('a}ax b}b o|u}aw1 t}t'.split(' ')) == ['a}ax', 'b|o|u|t}b|aw1|t']


In [None]:
# Print every graphone of every corpus line next to the full line it came from.
with open('TIMIT.clean.corpus', 'r') as f:
  # Iterate the file lazily instead of loading all lines with readlines().
  for line in f:
    # Strip the line terminator first; otherwise the last graphone of each
    # line would carry a trailing "\n".
    graphones = line.rstrip('\n').split(' ')
    for graphone in graphones:
      print(f'{graphones}: {graphone}')