In [1]:
import string
import os
import codecs
import pickle

# Prepare mapping dictionaries

In [2]:
# Grapheme vocabulary: the two sequence markers, the apostrophe, then a-z.
# The position of an entry in this list is its integer id.
letter_list = ['_GO', '_EOS', "'"] + list(string.ascii_lowercase)

In [3]:
# Forward (letter -> id) and reverse (id -> letter) lookup tables.
# An entry's id is its position in letter_list.
letter2index = dict(zip(letter_list, range(len(letter_list))))
index2letter = dict(enumerate(letter_list))

#print('LETTER TO INDEX: {}\n'.format(letter2index))
#print('INDEX TO LETTER: {}\n'.format(index2letter))

In [4]:
# Phoneme vocabulary without stress digits: the two sequence markers followed
# by the base phoneme symbols.  The position of an entry is its integer id.
phoneme_list = ['_GO', '_EOS'] + (
    'AA AE AH AO AW AY B CH D DH EH ER EY F G HH IH IY JH K L M N '
    'NG OW OY P R S SH T TH UH UW V W Y Z ZH'
).split()

In [5]:
# Forward (phoneme -> id) and reverse (id -> phoneme) lookup tables.
# An entry's id is its position in phoneme_list.
phoneme2index = dict(zip(phoneme_list, range(len(phoneme_list))))
index2phoneme = dict(enumerate(phoneme_list))

#print('PHONEME TO INDEX: {}\n'.format(phoneme2index))
#print('INDEX TO PHONEME: {}\n'.format(index2phoneme))

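To regenerate the symbol list used in the next cell, run the following command in a notebook cell; it prints every entry of `cmudict/cmudict.symbols` as a double-quoted string followed by a comma: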

```%cat cmudict/cmudict.symbols | awk '{printf "\""$1"\","}'```

In [6]:
# Phoneme vocabulary with stress digits: the two sequence markers followed by
# the symbols in the order they are listed below (vowels appear bare and with
# the suffixes 0, 1 and 2).  The position of an entry is its integer id.
symbol_list = ['_GO', '_EOS'] + (
    'AA AA0 AA1 AA2 AE AE0 AE1 AE2 AH AH0 AH1 AH2 AO AO0 AO1 AO2 '
    'AW AW0 AW1 AW2 AY AY0 AY1 AY2 B CH D DH EH EH0 EH1 EH2 '
    'ER ER0 ER1 ER2 EY EY0 EY1 EY2 F G HH IH IH0 IH1 IH2 '
    'IY IY0 IY1 IY2 JH K L M N NG OW OW0 OW1 OW2 OY OY0 OY1 OY2 '
    'P R S SH T TH UH UH0 UH1 UH2 UW UW0 UW1 UW2 V W Y Z ZH'
).split()

In [7]:
# Forward (symbol -> id) and reverse (id -> symbol) lookup tables.
# An entry's id is its position in symbol_list.
symbol2index = dict(zip(symbol_list, range(len(symbol_list))))
index2symbol = dict(enumerate(symbol_list))

#print('SYMBOL TO INDEX: {}\n'.format(symbol2index))
#print('INDEX TO SYMBOL: {}\n'.format(index2symbol))

# Split dictionary

In [8]:
def split_to_grapheme_phoneme(source_dic):
    """
    Split dictionary lines into parallel lists of grapheme and phoneme sequences.

    Each usable line has the form ``word PH1 PH2 ...``, optionally followed by
    a ``#`` comment.  The word is exploded into single characters and the
    remaining tokens are taken as its phonemes; both sequences are wrapped in
    the ``_GO`` / ``_EOS`` markers.

    A line is skipped when
      * it has fewer than two tokens once the comment is removed (blank or
        word-only lines), or
      * the entry itself (comment excluded) contains ``.``, ``(`` or ``-``.

    Args:
        source_dic: iterable of dictionary lines (strings).

    Returns:
        A ``(graphemes, phonemes)`` tuple of two equally long lists; element
        ``i`` of each list belongs to the same dictionary entry.
    """
    special_char = ('.', '(', '-')
    graphemes, phonemes = [], []
    for line in source_dic:
        # Everything after the first '#' is a comment and not part of the entry.
        entry = line.split('#', 1)[0]
        split_line = entry.split()
        if len(split_line) < 2:
            continue

        # Filter on the entry only: punctuation that appears solely in the
        # trailing comment must not cause a valid word to be discarded.
        if any(char in entry for char in special_char):
            continue

        graphemes.append(['_GO'] + list(split_line[0]) + ['_EOS'])
        phonemes.append(['_GO'] + split_line[1:] + ['_EOS'])
    return graphemes, phonemes

In [9]:
def split_dictionary(train_path):
    """
    Read a dictionary file and split it into grapheme and phoneme sequences.

    Args:
        train_path: path of the dictionary file; it is decoded as UTF-8.

    Returns:
        The result of ``split_to_grapheme_phoneme`` applied to the file's lines.
    """
    with codecs.open(train_path, mode="r", encoding="utf-8") as dict_file:
        return split_to_grapheme_phoneme(dict_file.readlines())

In [10]:
# Parse cmudict/cmudict.dict into two parallel lists of token sequences.
# The path is relative to the notebook's working directory.
graphemes, phonemes = split_dictionary('cmudict/cmudict.dict')

```python
# Use this to understand how to split dictionary into a list of graphemes and a list of phonemes
with codecs.open('cmudict/cmudict.dict', "r", "utf-8") as f:
    source_dic = f.readlines()
    line = source_dic[1]
    split_line = line.strip().split()
    first = list(split_line[0])
    print(line)
    print(split_line)
    print(first)
```

In [11]:
# Show the first ten grapheme sequences as a sanity check.
graphemes[0:10]

[['_GO', "'", 'b', 'o', 'u', 't', '_EOS'],
 ['_GO', "'", 'c', 'a', 'u', 's', 'e', '_EOS'],
 ['_GO', "'", 'c', 'o', 'u', 'r', 's', 'e', '_EOS'],
 ['_GO', "'", 'c', 'u', 's', 'e', '_EOS'],
 ['_GO', "'", 'e', 'm', '_EOS'],
 ['_GO', "'", 'f', 'r', 'i', 's', 'c', 'o', '_EOS'],
 ['_GO', "'", 'g', 'a', 'i', 'n', '_EOS'],
 ['_GO', "'", 'k', 'a', 'y', '_EOS'],
 ['_GO', "'", 'm', '_EOS'],
 ['_GO', "'", 'n', '_EOS']]

In [12]:
# Show the first ten phoneme sequences as a sanity check.
phonemes[:10]

[['_GO', 'B', 'AW1', 'T', '_EOS'],
 ['_GO', 'K', 'AH0', 'Z', '_EOS'],
 ['_GO', 'K', 'AO1', 'R', 'S', '_EOS'],
 ['_GO', 'K', 'Y', 'UW1', 'Z', '_EOS'],
 ['_GO', 'AH0', 'M', '_EOS'],
 ['_GO', 'F', 'R', 'IH1', 'S', 'K', 'OW0', '_EOS'],
 ['_GO', 'G', 'EH1', 'N', '_EOS'],
 ['_GO', 'K', 'EY1', '_EOS'],
 ['_GO', 'AH0', 'M', '_EOS'],
 ['_GO', 'AH0', 'N', '_EOS']]

# Convert to index

In [13]:
def convert2Index(source_list, mappingFunction):
    """
    Translate every token of every sequence through a lookup table.

    Args:
        source_list: iterable of token sequences.
        mappingFunction: subscriptable lookup (e.g. a dict) from token to index.

    Returns:
        A new list of lists holding ``mappingFunction[token]`` for each token,
        in the original order.  A token that is missing from the lookup
        propagates the lookup's own error (``KeyError`` for a dict).
    """
    return [[mappingFunction[token] for token in sequence]
            for sequence in source_list]

In [14]:
# Encode every grapheme sequence as a list of letter ids.
graphemes_index = convert2Index(graphemes, letter2index)

In [15]:
# Encode every phoneme sequence with symbol2index (not phoneme2index): the
# parsed phonemes carry stress digits such as 'AW1', which only symbol_list contains.
phonemes_index = convert2Index(phonemes, symbol2index)

# Split data into train, val, test

In [16]:
def dataSplit(data, train_port=17/20, val_port=1/20):
    """
    Slice a sequence into consecutive train, validation and test parts.

    The split is purely positional: the first ``train_port`` share of ``data``
    becomes the training part, the next ``val_port`` share the validation part
    and whatever remains the test part.  Nothing is shuffled, so two parallel
    sequences split with the same arguments stay aligned element by element.

    Args:
        data: sliceable sequence supporting ``len()``.
        train_port: fraction of ``data`` used for training (default 17/20).
        val_port: fraction of ``data`` used for validation (default 1/20).

    Returns:
        A ``(train, val, test)`` tuple of slices of ``data``.

    Raises:
        ValueError: if a fraction is negative or the two fractions sum to
            more than 1.
    """
    # Small tolerance so that sums such as 0.9 + 0.1 are not rejected because
    # of floating-point rounding.
    if train_port < 0 or val_port < 0 or train_port + val_port > 1 + 1e-9:
        raise ValueError(
            "train_port and val_port must be non-negative and sum to at most 1"
        )

    # Part sizes are truncated towards zero; the test part absorbs the remainder.
    train_shape = int(len(data) * train_port)
    val_shape = int(len(data) * val_port)
    val_end = train_shape + val_shape
    return data[:train_shape], data[train_shape:val_end], data[val_end:]

In [17]:
# Positional train / validation / test split of the encoded phoneme sequences.
p_train, p_val, p_test = dataSplit(phonemes_index)

In [18]:
# Same positional split for the encoded grapheme sequences.
g_train, g_val, g_test = dataSplit(graphemes_index)

In [20]:
# Sizes of the train, validation and test parts.
len(p_train), len(p_val), len(p_test)

(106082, 6240, 12481)

In [19]:
# Create the output directory first: open() does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs("input", exist_ok=True)

# The six splits are written as six consecutive pickles in a single file.
# A reader has to call pickle.load() six times on the same file handle, in
# this order: p_train, p_val, p_test, g_train, g_val, g_test.
with open("input/input.pkl", "wb") as f:
    for split in (p_train, p_val, p_test, g_train, g_val, g_test):
        # protocol=-1 selects the highest pickle protocol available.
        pickle.dump(split, f, protocol=-1)

In [None]:
class data():
    """
    Letter vocabulary made of 'START', the lowercase ASCII letters and 'END'.

    Attributes set by the constructor:
        i2l: list mapping an index to its token.
        l2i: dict mapping a token to its index (the inverse of ``i2l``).
    """
    def __init__(self, ):
        # Build the ordered vocabulary once, then derive the inverse lookup.
        vocabulary = ['START']
        vocabulary.extend(string.ascii_lowercase)
        vocabulary.append('END')

        self.i2l = vocabulary
        self.l2i = {token: position for position, token in enumerate(vocabulary)}