# SEQ2SEQ

## Step1: Create Letter List and Letter2Index, Index2Letter

In [26]:
import string
import codecs
import os

In [18]:
letters = ['_PAD', '_GO', '_EOS', 'UNK', "'",
          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
          'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
          'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

In [19]:
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [20]:
letter2index = {}
index2letter = {}
for i, letter in enumerate(letters):
    letter2index[letter] = i
    index2letter[i] = letter
print(letter2index)
print(index2letter)

{'l': 16, "'": 4, '_PAD': 0, 'h': 12, 'v': 26, 'd': 8, 's': 23, 't': 24, 'i': 13, 'a': 5, 'j': 14, 'e': 9, 'q': 21, 'p': 20, 'm': 17, 'r': 22, '_GO': 1, 'o': 19, 'b': 6, '_EOS': 2, 'n': 18, 'f': 10, 'z': 30, 'u': 25, 'g': 11, 'y': 29, 'c': 7, 'w': 27, 'k': 15, 'UNK': 3, 'x': 28}
{0: '_PAD', 1: '_GO', 2: '_EOS', 3: 'UNK', 4: "'", 5: 'a', 6: 'b', 7: 'c', 8: 'd', 9: 'e', 10: 'f', 11: 'g', 12: 'h', 13: 'i', 14: 'j', 15: 'k', 16: 'l', 17: 'm', 18: 'n', 19: 'o', 20: 'p', 21: 'q', 22: 'r', 23: 's', 24: 't', 25: 'u', 26: 'v', 27: 'w', 28: 'x', 29: 'y', 30: 'z'}


## Step 2: Phoneme list, Phone2Index, Index2Phone

In [17]:
%cat data/cmudict.symbols | awk '{printf "\""$1"\","}'

"AA","AA0","AA1","AA2","AE","AE0","AE1","AE2","AH","AH0","AH1","AH2","AO","AO0","AO1","AO2","AW","AW0","AW1","AW2","AY","AY0","AY1","AY2","B","CH","D","DH","EH","EH0","EH1","EH2","ER","ER0","ER1","ER2","EY","EY0","EY1","EY2","F","G","HH","IH","IH0","IH1","IH2","IY","IY0","IY1","IY2","JH","K","L","M","N","NG","OW","OW0","OW1","OW2","OY","OY0","OY1","OY2","P","R","S","SH","T","TH","UH","UH0","UH1","UH2","UW","UW0","UW1","UW2","V","W","Y","Z","ZH",

In [14]:
phonemes = ['_PAD', '_GO', '_EOS', 'UNK',
            "AA","AA0","AA1","AA2","AE",
            "AE0","AE1","AE2","AH","AH0","AH1","AH2","AO","AO0","AO1","AO2","AW","AW0","AW1",
            "AW2","AY","AY0","AY1","AY2","B","CH","D","DH","EH","EH0","EH1","EH2","ER","ER0",
            "ER1","ER2","EY","EY0","EY1","EY2","F","G","HH","IH","IH0","IH1","IH2","IY","IY0",
            "IY1","IY2","JH","K","L","M","N","NG","OW","OW0","OW1","OW2","OY","OY0","OY1","OY2",
            "P","R","S","SH","T","TH","UH","UH0","UH1","UH2","UW","UW0","UW1","UW2","V","W","Y","Z","ZH"]

In [15]:
phone2index = {}
index2phone = {}
for i, phone in enumerate(phonemes):
    phone2index[phone] = i
    index2phone[phone] = i


## Step 3: Create Dictionary

In [39]:
def collect_pronunciations(dic_lines):
  '''Create dictionary mapping word to its different pronounciations.
  '''
  dic = {}
  for line in dic_lines:
    lst = line.strip().split()
    if len(lst) > 1 and "(" not in lst[0]:
      dic[lst[0]] = " ".join(lst[1:])
    elif len(lst) == 1:
      print("WARNING: No phonemes for word '%s' line ignored" % (lst[0]))
  return dic


In [40]:
def split_dictionary(train_path):
  """Split source dictionary to train, validation and test sets.
  """
  with codecs.open(train_path, "r", "utf-8") as f:
    source_dic = f.readlines()
  dic = collect_pronunciations(source_dic)
  return dic

In [41]:
dic = split_dictionary('data/cmudict.dict')

In [42]:
dic

{'sardonically': 'S AA0 R D AA1 N IH0 K AH0 L IY0',
 'orix': 'AO1 R IH0 K S',
 'gastrovascular': 'G AE2 S T R OW0 V AE1 S K Y AH0 L ER0',
 'wallingford': 'W AO1 L IH0 NG F ER0 D',
 'disallowing': 'D IH0 S AH0 L AW1 IH0 NG',
 'powwows': 'P AW1 W AW2 Z',
 'culpa': 'K AH1 L P AH0',
 'frey': 'F R EY1',
 'foxglove': 'F AA1 K S G L AH2 V',
 'alatas': 'AH0 L AA1 T AH0 S',
 'jaworowski': 'Y AH0 W ER0 AO1 F S K IY0',
 'behaviorist': 'B IH0 HH EY1 V Y ER0 IH0 S T',
 'aficionados': 'AH0 F IH2 SH AH0 N AA1 D OW0 Z',
 'clementson': 'K L EH1 M IH0 N T S AH0 N',
 'sherer': 'SH IH1 R ER0',
 'stipulates': 'S T IH1 P Y AH0 L EY2 T S',
 'rescuers': 'R EH1 S K Y UW2 ER0 Z',
 'particulate': 'P ER0 T IH1 K Y AH0 L AH0 T',
 'patronize': 'P EY1 T R AH0 N AY2 Z',
 'repression': 'R IY0 P R EH1 SH AH0 N',
 'blanchards': 'B L AE1 N CH ER0 D Z',
 'sgro': 'S K R OW1',
 'woodyard': 'W UH1 D Y AA2 R D',
 'ageratums': 'AH0 JH EH1 R AH0 T AH0 M Z',
 'nigerians': 'N AY0 JH IH1 R IY0 AH0 N Z',
 'kasagic': 'K AH0 S AA1 G 

In [38]:
for line in dic:
    print(line)
    break

sardonically


In [44]:
dic['sardonically']

'S AA0 R D AA1 N IH0 K AH0 L IY0'