In [1]:
# Task: Come up with alternative spellings for all orthographic words in a database
# Tool 1: IPA -> Graphem Konverter Wiki
# Tool 2: SAMPA -> IPA Konverter Wiki

# Idea: Take the phonetic sequence (SAMPA) of every word, convert it to an IPA sequence and
        # then convert that to all grapheme sequences
    
# Needs: A dictionary for SAMPA -> IPA (ideally non-ambiguous). A dictionary for IPA -> Text (ambigu.)

import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import os

  from ._conv import register_converters as _register_converters


In [17]:
data = np.load('data/celex.npz')
sampa_dict = {key:data['phon_dict'].item().get(key) for key in data['phon_dict'].item()}
sampa_keys = list(sampa_dict.keys())
print(len(sampa_dict), sampa_keys)


# Step 1: Make a sampa_ipa dict. How to: Go to SAMPA -> IPA tabelle, for every SAMPA char, check whether it 
# is in the CELEX korpus. If yes, look up example word from wiki in korpus and check whether it is the right sampa
# sign. If yes, look up on wiktionary example word in IPA and check whether output sign is correct.

sampa_ipa = dict()
sampa_ipa['i'] = 'i'
sampa_ipa[':'] = 'ː'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['e'] = 'e'
sampa_ipa['E'] = 'ɛ'
sampa_ipa['{'] = 'ɛ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['I'] = 'ɪ'



43 ['y', 'f', 'O', '/', 'N', 'e', 'k', '~', 'j', 'Z', 'd', 'i', 'b', 'r', 'S', 'v', 'm', 'o', 'l', 'E', 'Y', 's', '@', 'z', 'x', '{', 'A', ':', 'h', 'g', '+', 'U', 'I', 'n', 'p', ' ', 'u', 'a', '#', '|', 't', '<GO>', '<PAD>']


In [19]:
print('{' in sampa_keys)

True


In [25]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import os

def BAS_P2G_retrieve():
    """
    Shortcut method for quickly retrieving numerical dataset of BAS-Sprecher corpus
    In case whole dataset is not copied on remote machine
    """
    data = np.load('data/BAS_P2G.npz')
    input_dict = np_dict_to_dict(data['inp_dict'])
    target_dict = np_dict_to_dict(data['tar_dict'])

    return ( (data['inputs'], data['targets']) , (input_dict, target_dict) )

def batch_data(x, y, BATCH_SIZE):
    """
    Receives a batch_size and the entire training data [i.e inputs (x) and labels (y)]
    Returns a data iterator
    """
    shuffle = np.random.permutation(len(x))
    start = 0
    x = x[shuffle]
    y = y[shuffle]
    while start + BATCH_SIZE <= len(x):
        yield x[start:start+BATCH_SIZE], y[start:start+BATCH_SIZE]
        start += BATCH_SIZE

def np_dict_to_dict(np_dict):
    """
    Converts a dictionary saved via np.save (as structured np array) into an object of type dict

    Parameters:
    --------------
    NP_DICT        : {np.array} structured np.array with dict keys and items

    Returns:
    --------------
    DICT            : {dict} converted NP_DICT

    """

    return {key:np_dict.item().get(key) for key in np_dict.item()}

  from ._conv import register_converters as _register_converters


In [17]:



def extract_celex(path):
    """
    Reads in data from the CELEX corpus
    
    Parameters:
    -----------
    PATH        {str} the path to the desired celex file, i.e. gpl.cd 
                    (contains orthography and phonology)
    
    Returns:
    -----------
    WORDS       {list} of words (length 51728) for gpl.cd
    PHONS       {list} of phoneme sequences (length 51728) for gpl.cd
    
    
    
    """
    
    
    with open(path, 'r') as file:

        raw_data = file.read().splitlines()
        words = []
        phons = []
        
        for ind,raw_line in enumerate(raw_data):
            
            
            line = raw_line.split("\\")
            words.append(line[1])
            phons.append(line[-2]) # Using SAMPA notation
                
    return words, phons
                               



In [27]:
path = "/Users/jannisborn/Desktop/LDS_Data/celex2/german/gpl/gpl.cd"
words, phons = extract_celex(path)

print(words[30], phons[30])

((w,p) , (word_dict, phon_dict)) = str_to_num_dataset(words,phons)

print(w.shape, p.shape, len(word_dict), len(phon_dict))

abbeizen ap#baits@
(51728, 31) (51728, 36) 55 43


In [35]:
np.savez('celex.npz', words=w, phons=p, word_dict=word_dict, phon_dict=phon_dict)
data = np.load('data/celex.npz')
print(data['phons'].shape)
print(data['words'].shape)
print(data['phon_dict'])
print(data['word_dict'])

(51728, 36)
(51728, 31)
{'U': 1, 'm': 2, 'd': 3, 'Y': 4, 'k': 5, ':': 6, 'N': 7, 'g': 8, 'i': 9, 'y': 10, 'E': 11, 'Z': 12, 'n': 13, 'S': 14, 'a': 15, '#': 16, '+': 17, '/': 18, 'o': 19, 'z': 20, 'j': 21, 'A': 22, 'f': 23, 'b': 24, '~': 25, '{': 26, 'e': 27, 's': 28, 'I': 29, 'v': 30, 'l': 31, 'O': 32, 't': 33, 'p': 34, '|': 35, 'u': 36, ' ': 37, '@': 38, 'r': 39, 'h': 40, 'x': 41, '<GO>': 41, '<PAD>': 42}
{'G': 0, 'U': 1, 'm': 2, 'd': 3, 'Y': 4, 'k': 5, 'C': 6, 'N': 7, 'H': 8, 'B': 9, 'g': 10, 'x': 11, 'i': 12, 'y': 13, 'Q': 14, 'E': 15, 'M': 16, 'X': 17, 'Z': 18, 'n': 19, 'S': 20, 'a': 21, 'T': 22, 'D': 23, 'q': 24, 'K': 25, 'o': 26, 'R': 27, 'z': 28, 'V': 29, 'w': 30, 'j': 31, 'A': 32, 'W': 33, 'f': 34, 'b': 35, 'e': 36, 's': 37, 'F': 38, 'I': 39, 'v': 40, 'l': 41, 'L': 42, 'O': 43, 'J': 44, 't': 45, 'p': 46, 'u': 47, ' ': 48, 'r': 49, 'h': 50, 'P': 51, 'c': 52, '<PAD>': 53, '<GO>': 54}


In [33]:


def BAS_json(path):
    """
    This method receives a path for the BAS-SprecherInnen corpus and iterates through all JSON files in all subfolders.
    It creates and returns a list of words and a list of pronounciations
    """
    
    import json, os

    words = []
    prons = []
    ind = 0
    # Read in filenames
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in [f for f in filenames if f.endswith(".json")]:

            if filename == 'SprecherInnen_DBconfig.json':
                continue

            # Open the json
            with open(os.path.join(dirpath,filename)) as json_file:
                data = json.load(json_file)

                for item in data['levels'][1]['items']:
                    words.append(item['labels'][0]['value'])
                    prons.append(item['labels'][1]['value'])

    return words,prons


def clean_corpus_BAS_Sprecherinnen(words,prons):
    """
    This method receives a list of words and a list of pronunciations of the BAS-Sprech. corpus and returns a cleaned dataset.
    Clearning means:    1) Removing multiple occurrences of words       2) Remove misspellings and ambiguities
                        3) Remove capitalization at begin of sentence   
    Homophon words (Meer, mehr) are kept!

    This method required manual inspection (once for each corpus).
    Returns a condensed list of words and pronounciations (strings) that can be converted in numerical values next.

    """

    # First, we remove multiple occurrences.
    # We cannot use set(words), set(prons) since some words are homophon 8(results in diff. lengths)
    all_tups = []
    for (w,p) in zip(words,prons):
        all_tups.append(tuple((w,p)))
    set_tup = set(all_tups)
    print('Amount of non-unique words in corpus is ', len(all_tups))
    unique_tups = dict(set_tup)
    print('Amount of unique words in corpus is ', len(set_tup))

    # Now we have removed multiple occurrences and we have a dict of tuples (word, pron)

    def find_poss_mistakes(unique_tups):
        """
        Receives a list of hopefully unique tupels (word,pron) and collect the tupels
        which may have incorrect spelling/pronounciations.
        """
        possible_mistakes = []
        for key, val in unique_tups.items():
            for keyy,vall in unique_tups.items():
                if key != keyy and val == vall:
                    # Detect multiple spellings of same pronounciation
                    possible_mistakes.append((key,val, keyy, vall))
                if key == keyy and val != vall:
                    # Detect multiple pronounciations of same spelling
                    possible_mistakes.append((key,val, keyy, vall))
                    
        return possible_mistakes
        
    poss_mist = find_poss_mistakes(unique_tups)
    """
    print("+++ Possible mistakes are +++")
    for k in range(len(poss_mist)):
        print(poss_mist[k][0],' -> ',poss_mist[k][1], 
              poss_mist[k][2],' -> ',poss_mist[k][2])
    """
        
    # Remove mistakes (after manual inspection)
    unique_tups.pop('BäckerInnen') # removing as a duplicate of Bäckerinnen
    unique_tups.pop('nu') # Duplicate of Nu
    unique_tups.pop('Abonentinnen') # Misspelled
    unique_tups.pop('Mit') # Duplicate of mit
    unique_tups.pop('A') # Duplicate of ah
    unique_tups.pop('Bei') # Duplicate of bei
    unique_tups.pop('backwaren') # Duplicate of Backwaren
    unique_tups.pop('-vertreterinnen') # Duplicate of Vertreterinnen
    unique_tups.pop('leu') # Duplicate of Leu
    unique_tups.pop('teil') # Duplicate of Teilt
    unique_tups.pop('Un') # Duplicate of un
    unique_tups.pop('Ver') # Duplicate of ver
    unique_tups.pop('AutorInnen') # Duplicate of Autorinnen
    unique_tups.pop('FreundInnen') # Duplicate of Freundinnen
    unique_tups.pop('-pflegerin') # Duplicate of Pflegerin
    unique_tups.pop('Neu') # Duplicate of neu
    unique_tups.pop('re') # Duplicate of Re
    unique_tups.pop('-kolleginnen') # Duplicate of Koleginnen
    unique_tups.pop('-trinkerinnen') # Duplicate of Trinkerinnen
    unique_tups.pop('Twitter-NutzerInnen') # Duplicate of Twitter-Nutzerinnen
    unique_tups.pop('kommissionen') # Duplicate of Koleginnen
    # Remaining: (Ihnen, ihnen), (dass,das), (Meer, mehr), (Ihres, ihres), (mal, Mal)

    wordss = list(unique_tups.keys())
    pronss = list(unique_tups.values())
    print('After clearning ', len(wordss), ' different words remain')

    return wordss, pronss




In [22]:
def str_to_num_dataset(X,Y):
    """
    This method receives 2 lists of strings (input X and output Y) and converts it to padded, numerical arrays.
    It returns the numerical dataset as well as the dictionaries to retrieve the strings.
    """

    # 1. Define dictionaries 
    # Dictionary assignining a unique integer to each input character
    try:
        u_characters = set(' '.join(X)) 
    except TypeError:
        # Exception for TIMIT dataset (one phoneme is repr. by seq. of chars)
        print("TypeError occurred.")
        u_characters = set([quant for seq in X for quant in seq])

    char2numX = dict(zip(u_characters, range(len(u_characters))))

    # Dictionary assignining a unique integer to each phoneme
    try:
        v_characters = set(' '.join(Y)) 
    except TypeError:
        print("TypeError occurred.")
        v_characters = set([quant for seq in Y for quant in seq])
    char2numY = dict(zip(v_characters, range(1,len(v_characters)+1))) # Using 0 causes trouble for tf.edit_distance
    
    # 2. Padding
    # Pad inputs
    char2numX['<PAD>'] = len(char2numX) 
    char2numX['<GO>']  = len(char2numX) 
    mx_l_X = max([len(word) for word in X]) # longest input sequence
    # Padd all X for the final form for the LSTM
    x = [[char2numX['<PAD>']]*(mx_l_X - len(word)) +[char2numX[char] for char in word] for word in X]
    x = np.array(x) 

    # Pad targets
    char2numY['<GO>'] = len(char2numY) # Define number denoting the response onset
    char2numY['<PAD>'] = len(char2numY)  
    mx_l_Y = max([len(phon_seq) for phon_seq in Y]) # longest output sequence

    y = [[char2numY['<GO>']] + [char2numY['<PAD>']]*(mx_l_Y - len(ph_sq)) + [char2numY[phon] for phon in ph_sq] for ph_sq in Y]
    y = np.array(y)

    return ((x,y) , (char2numX,char2numY))




In [36]:
path = '/Users/jannisborn/Desktop/LDS_Data/BAS_SprecherInnen'
words, prons = BAS_json(path)
words, prons = clean_corpus_BAS_Sprecherinnen(words,prons)

((x,y) , (char2numX, char2numY)) = str_to_num_dataset(words,prons)
np.savez('BAS_G2P.npz', inputs=x,targets=y, inp_dict=char2numX,
        tar_dict=char2numY)

((x,y) , (char2numX, char2numY)) = str_to_num_dataset(prons,words)
np.savez('BAS_P2G.npz', inputs=x,targets=y, inp_dict=char2numX,
        tar_dict=char2numY)

print(type(x), type(y), type(char2numX),type(char2numY))

Amount of non-unique words in corpus is  15698
Amount of unique words in corpus is  854
After clearning  833  different words remain
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'dict'> <class 'dict'>


In [None]:
import tensorflow as tf
x = 'x'
b = tf.equal(x,'x')
print(type(b))
y = tf.cond(tf.equal(x, 'x'), lambda: 1, lambda: 0)
o = tf.cond(tf.equal(x, 'x'), lambda: print(2), lambda: print(43))

with tf.Session() as sess:
    print(sess.run(y))

In [None]:
from tensorflow.python.ops import array_ops, gen_math_ops
from time import time

In [None]:
targets = targs
batch_size = array_ops.shape(targets)[0]

print(batch_size.eval(), targets.get_shape().as_list(), targets.shape[0])

In [None]:
import tensorflow as tf
import numpy as np
from time import time
batch_size = 2
seq_len = 3
num_class = 4 # 0=A, 4=D -> Targets are> ABD and CAD
nan = num_class + 1
max_alt_spellings = 6
tf.InteractiveSession()
log = tf.reshape(np.array([[[4.73, 0.3, 2.1, 0.8], [1.0, 2.8, 0.1, 0.2], [0.1, 0.3, 0.1, 8.1]],
                           [[0.14, 0.9, 0.1, 0.2], [0.3, 4.5, 0.9, 0.2], [8.1, 1.0, 1.1, 1.9]]]),
                 [batch_size, seq_len, num_class])
log = tf.cast(log, tf.float32)
targs = tf.reshape(np.array([[0,1,3],[0,1,0]]),[batch_size,seq_len])
alt_targets = tf.reshape(np.array([[[0, 1,nan,nan,nan,nan], [1, 1,nan,nan,nan,nan],
                                        [3,3,nan,nan,nan,nan]],  
                                   [[1, 0, 1, 0,1,0], [2,2, 3,3, 1,1], [0,0,0,0,0,0]]]), 
                                  [batch_size, seq_len, max_alt_spellings])
alt_targets = tf.cast(alt_targets, tf.int64)






print("Targets ", targs.shape, targs.eval())
print("Alternative targets", alt_targets.shape, alt_targets.eval())

tim = time()
writings = tf.argmax(log,axis=-1)
print("Writings shape is ", writings.get_shape().as_list())
print("Writings are ",writings.eval())

# For every output word of the batch, check whether it matches to any alternative writing
# If so, replace the target
fin_targets = []
for wo_ind in range(alt_targets.shape[0]):
    wrote_alternative = False
    for tar_ind in range(alt_targets.shape[2]):
        
        if tf.reduce_all(tf.equal(writings[wo_ind,:],alt_targets[wo_ind,:,tar_ind])).eval():
            fin_targets.append(alt_targets[wo_ind,:,tar_ind])
            wrote_alternative = True
            continue
                     
    if not wrote_alternative:
        fin_targets.append(targs[wo_ind,:])
        
fin_targets = tf.stack(fin_targets)

print()
print()

a = tf.contrib.seq2seq.sequence_loss(log, targs, tf.ones([batch_size, seq_len]))
print("Loss according to regular learning", a.eval())

a = tf.contrib.seq2seq.sequence_loss(log, fin_targets, tf.ones([batch_size, seq_len]))
print("Loss according to LdS learning with seq.loss", a.eval())

a = tf.contrib.seq2seq.sequence_loss_lds(log, targs, alt_targets, tf.ones([batch_size, seq_len]),
                                        True)
print("Loss according to LdS learning with NEW LOSS", a.eval())


In [29]:
p2g = np.load('/Users/jannisborn/Dropbox/GitHub/LSTM/Code/data/BAS_P2G.npz')
bd = np.load('/Users/jannisborn/Dropbox/GitHub/LSTM/Code/data/BAS_G2P.npz')

In [32]:
print(p2g['inp_dict'])


inp_dict = np_dict_to_dict(p2g['inp_dict'])
print(len(inp_dict))
inp_dict['<GO>'] = len(inp_dict)
print(len(inp_dict), inp_dict)

np.save

print(np.array_equal(a['inputs'],ad['inputs']))
print(np.array_equal(a['targets'],ad['targets']))
print(np.array_equal(a['inp_dict'],ad['inp_dict']))
print(np.array_equal(a['tar_dict'],ad['tar_dict']))


{'9': 0, ':': 1, 'Z': 2, 'y': 3, ' ': 4, 'Y': 5, 'p': 6, '2': 7, '>': 8, 'r': 9, '@': 10, 'C': 11, 'd': 12, '6': 13, 'e': 14, 'l': 15, 'S': 16, 'z': 17, 'j': 18, 'k': 19, 'N': 20, 'U': 21, 'E': 22, 'u': 23, 'h': 24, 'o': 25, 'O': 26, 'a': 27, 'g': 28, 'm': 29, 't': 30, 'b': 31, '<': 32, 'I': 33, 'n': 34, 'x': 35, 'f': 36, 's': 37, 'i': 38, '?': 39, 'v': 40, '<PAD>': 41}
42
43 {'9': 0, ':': 1, 'Z': 2, 'y': 3, ' ': 4, 'Y': 5, 'p': 6, '2': 7, '>': 8, 'r': 9, '@': 10, 'C': 11, 'd': 12, '6': 13, 'e': 14, 'l': 15, 'S': 16, 'z': 17, 'j': 18, 'k': 19, 'N': 20, 'U': 21, 'E': 22, 'u': 23, 'h': 24, 'o': 25, 'O': 26, 'a': 27, 'g': 28, 'm': 29, 't': 30, 'b': 31, '<': 32, 'I': 33, 'n': 34, 'x': 35, 'f': 36, 's': 37, 'i': 38, '?': 39, 'v': 40, '<PAD>': 41, '<GO>': 42}


NameError: name 'a' is not defined

In [None]:
%%timeit
writings = tf.argmax(log,axis=-1)
#print("Writings shape is ", writings.get_shape().as_list())
#print("Writings are ",writings.eval())

# For every output word of the batch, check whether it matches to any alternative writing
# If so, replace the target
fin_targets = []
for wo_ind in range(alt_targets.shape[0]):
    wrote_alternative = False
    for tar_ind in range(alt_targets.shape[2]):
        
        if tf.reduce_all(tf.equal(writings[wo_ind,:],alt_targets[wo_ind,:,tar_ind])).eval():
            fin_targets.append(alt_targets[wo_ind,:,tar_ind])
            wrote_alternative = True
            continue
                     
    if not wrote_alternative:
        fin_targets.append(targs[wo_ind,:])
        
fin_targets = tf.stack(fin_targets)


In [None]:
# Insight 1: If you have k different classes, then k is the third 
#    dimension of the logits and then the target matrix must not 
#    contain values higher than k-1 (i.e. labels are 0, ..., k)

# Insight 2: sequence_loss expects unnormalized logits (BEFORE softmax!)
# -> This makes testing for me harder, since I cannot simply use values
# with a sum over 1 (will get normalized)