In [17]:
# Task: Come up with alternative spellings for all orthographic words in a database
# Tool 1: IPA -> Graphem Konverter Wiki
# Tool 2: SAMPA -> IPA Konverter Wiki

# Idea: Take the phonetic sequence (SAMPA) of every word, convert it to an IPA sequence and
        # then convert that to all grapheme sequences
    
# Needs: A dictionary for SAMPA -> IPA (ideally non-ambiguous). A dictionary for IPA -> Text (ambigu.)

import tensorflow as tf
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
import os, sys, pickle

In [58]:
path = '/Users/jannisborn/Desktop/LDS_Data/celex2/german/gpl/gpl.cd'

def extract_celex(path):
    """
    Reads in data from the CELEX corpus
    
    Parameters:
    -----------
    PATH        {str} the path to the desired celex file, i.e. gpl.cd 
                    (contains orthography and phonology)

    Returns:
    -----------
    2 Tuples, each with 2 variables. 
        First tuple:
    W           {np.array} of words (length 51728) for gpl.cd
    P           {np.array} of phoneme sequences (length 51728) for gpl.cd
        Second tuple:
    WORD_DICT   {dict} allowing to map the numerical array W back to strings
    PHON_DICT   {dict} doing the same for the phonetical arrays P

    
    Call via:
    path = "/Users/jannisborn/Desktop/LDS_Data/celex2/german/gpl/gpl.cd"
    ((w,p) , (word_dict, phon_dict)) = extract_celex(path)
    
    """
    
    
    with open(path, 'r') as file:

        raw_data = file.read().splitlines()
        words = []
        phons = []
        m = 0
        t = 0
        for ind,raw_line in enumerate(raw_data):
            
            line = raw_line.split("\\")
            
            

            if line[-2]: # Use only words that HAVE a SAMPA transcript (reduces from 51k to 37345)

            # exclude foreign words that have the 'æ' tone (SAMPA '{' ) like in PoINte   - 18 words
            # exclude foreign words that have the 'ɑ' tone (SAMPA 'A' ) like in NuANce   - 28 words
            # exclude foreign words that have a nasal vowel (SAMPA '~' ) like in Jargon  - 22 words
                if not 'A' in line[-2] and not '{' in line[-2] and not '~' in line[-2]: 

                    if not ('tS' in line[-2] and not 'tsch' in line[1]): # exclude 9 foreign words like 'Image', 'Match', 'Punch', 'Sketch'
                        
                        if not ('e' in line[-2] and not 'e:' in line[-2]) and not 'Z' in line[-2]: # exclude aerosol and Z laut (Garage, Jury)
                            
                            if not 'aero' in line[1].lower(): # exclude words with aero

                                if len(line[-2]) < 15 : # exclude words with more than 10 phons

                                    if len(line[-2]) > m:
                                        m = len(line[-2])
                                        print(line[1],line[-2])


                                    words.append(line[1].lower()) # All words are lowercase only
                                    phons.append(line[-2]) # Using SAMPA notation
                            
                        else:
                            t+=1
                            
    print("Excluded",t, "words because they were too long (more than 15 phons)" )
    print("Size of dataset is", len(words), "samples")

    return words,phons





def str_to_num_dataset(X,Y,pads=10):
    """
    This method receives 2 lists of strings (input X and output Y) and converts it to padded, numerical arrays.
    It returns the numerical dataset as well as the dictionaries to retrieve the strings.
    
    PADS    {int} specifiying how many additional fields should be padded (to allow long words to have longer alt. writings)
    """

    # 1. Define dictionaries 
    # Dictionary assignining a unique integer to each input character
    try:
        u_characters = set(' '.join(X)) 
    except TypeError:
        # Exception for TIMIT dataset (one phoneme is repr. by seq. of chars)
        print("TypeError occurred.")
        u_characters = set([quant for seq in X for quant in seq])

    char2numX = dict(zip(u_characters, range(1,len(u_characters)+1)))

    # Dictionary assignining a unique integer to each phoneme
    try:
        v_characters = set(' '.join(Y)) 
    except TypeError:
        print("TypeError occurred.")
        v_characters = set([quant for seq in Y for quant in seq])
    char2numY = dict(zip(v_characters, range(1,len(v_characters)+1))) # Using 0 causes trouble for tf.edit_distance
    
    # 2. Padding
    # Pad inputs
    char2numX['<GO>'] = len(char2numX) + 1
    char2numX['<PAD>'] = len(char2numX) + 1
    mx_l_X = max([len(word) for word in X]) # longest input sequence
    # Padd all X for the final form for the LSTM
    x = [[char2numX['<GO>']] + [char2numX['<PAD>']]*(mx_l_X - len(word)) +[char2numX[char] for char in word] for word in X]
    x = np.array(x) 

    # Pad targets
    char2numY['<GO>'] = len(char2numY) + 1 # Define number denoting the response onset
    char2numY['<PAD>'] = len(char2numY) + 1 
    mx_l_Y = max([len(phon_seq) for phon_seq in Y]) # longest output sequence

    y = [[char2numY['<GO>']] + pads*[char2numY['<PAD>']] + [char2numY['<PAD>']]*(mx_l_Y - len(ph_sq)) + [char2numY[phon] for phon in ph_sq] for ph_sq in Y]
    y = np.array(y)

    return ((x,y) , (char2numX,char2numY))


Save the dataset

In [72]:
words,phons = extract_celex(path)
((phons_num, words_num), (phon_dict, word_dict)) = str_to_num_dataset(phons,words)
    
print(len(word_dict),word_dict)
print(len(phon_dict),phon_dict)
print(words_num.shape, phons_num.shape)
print(words_num[321,:], phons_num[321,:])
np.savez("data/celex.npz", words=words_num, phons=phons_num, word_dict=word_dict, phon_dict=phon_dict)

A a:
aalen a:l
aalglatt a:l#glat
Aasgeier a:z#gai@r
abaenderlich ap#End@r#lIx
abbauwuerdig ap#bau#vYrdIx
Abbrucharbeit ap#brEx#arbait
Excluded 87 words because they were too long (more than 15 phons)
Size of dataset is 31877 samples
29 {'w': 1, 'h': 2, 's': 3, 'e': 4, 'y': 5, 'b': 6, 'u': 7, 'f': 8, 'z': 9, 'g': 10, ' ': 11, 'k': 12, 'o': 13, 'n': 14, 'i': 15, 'v': 16, 'x': 17, 'q': 18, 'p': 19, 'm': 20, 'l': 21, 'j': 22, 'c': 23, 'd': 24, 'r': 25, 'a': 26, 't': 27, '<GO>': 28, '<PAD>': 29}
39 {'N': 1, 'h': 2, 's': 3, 'e': 4, 'y': 5, 'b': 6, 'Y': 7, 'u': 8, 'f': 9, 'z': 10, 'g': 11, ' ': 12, 'E': 13, 'k': 14, 'o': 15, 'n': 16, 'i': 17, 'v': 18, 'x': 19, ':': 20, 't': 21, 'I': 22, '#': 23, 'U': 24, 'p': 25, 'S': 26, 'm': 27, '|': 28, 'l': 29, 'j': 30, '+': 31, 'O': 32, 'd': 33, 'r': 34, 'a': 35, '/': 36, '@': 37, '<GO>': 38, '<PAD>': 39}
(31877, 28) (31877, 15)
[28 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 26  6 21  7 23
  2  3  4 14] [38 39 39 39 39 39 39 39 35 25 23 29 24 

In [135]:
# Save small subset of corpus
print(len(alt_writings_num))

31877


In [137]:
words,phons = extract_celex(path)
(phons_num, words_num) = str_to_num_with_dict(phons,words,phon_dict, word_dict)

np.savez("/Users/jannisborn/Desktop/LDS_Data/celex_small.npz", words=words_num[:2000], phons=phons_num[:2000], word_dict=word_dict, phon_dict=phon_dict)
np.save("/Users/jannisborn/Desktop/LDS_Data/celex_alt_targets_small.npy", alt_writings_num[:2000])


A a:
aalen a:l
aalglatt a:l#glat
Aasgeier a:z#gai@r
abaenderlich ap#End@r#lIx
abbauwuerdig ap#bau#vYrdIx
Abbrucharbeit ap#brEx#arbait
Excluded 87 words because they were too long (more than 15 phons)
Size of dataset is 31877 samples


In [4]:
#data = np.load('data/celex_no_alt.npz')
#sampa_dict = {key:data['phon_dict'].item().get(key) for key in data['phon_dict'].item()}
sampa_dict = phon_dict
sampa_keys = list(sampa_dict.keys()) # Has 43 keys originally, 40 after excluding {, ~, A
print("Length of phonetic dict is ", len(sampa_dict), " and the keys are: \n", sampa_keys) # Has 43 keys


# Step 1: Make a sampa_ipa dict. How to: Go to SAMPA -> IPA tabelle, for every SAMPA char, check whether it 
# is in the CELEX korpus. If yes, look up example word from wiki in korpus and check whether it is the right sampa
# sign. If yes, look up on wiktionary example word in IPA and check whether output sign is correct.

sampa_ipa = dict()

# Vowels
sampa_ipa['i'] = 'i'
sampa_ipa[':'] = 'ː'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['e'] = 'e'
sampa_ipa['E'] = 'ɛ'
sampa_ipa['y'] = 'y'   # meaning a real ü like in kühl
sampa_ipa['@'] = 'ə'
sampa_ipa['a'] = 'a'
sampa_ipa['u'] = 'u'
sampa_ipa['U'] = 'ʊ'
sampa_ipa['o'] = 'o'
sampa_ipa['O'] = 'ɔ'


# consonants
sampa_ipa['p'] = 'p'
sampa_ipa['b'] = 'b'
sampa_ipa['t'] = 't'
sampa_ipa['d'] = 'd'
sampa_ipa['k'] = 'k'
sampa_ipa['g'] = 'g'
sampa_ipa['f'] = 'f'
sampa_ipa['v'] = 'v'
sampa_ipa['s'] = 's'
sampa_ipa['z'] = 'z'
sampa_ipa['S'] = 'ʃ'
sampa_ipa['x'] = 'x'
sampa_ipa['h'] = 'h'
sampa_ipa['m'] = 'm'
sampa_ipa['n'] = 'n'
sampa_ipa['N'] = 'ŋ'
sampa_ipa['l'] = 'l'
sampa_ipa['r'] = 'r'
sampa_ipa['j'] = 'j'
sampa_ipa['Z'] = 'ʒ'
sampa_ipa['+'] = ''  # meaning a bit unclear
sampa_ipa['#'] = 'ˈ' # following syllabus carries primary intonation
sampa_ipa['|'] = 'ø' # meaning a bit unclear
sampa_ipa['/'] = 'œ' # usually SAMPA uses 9 instead of / for this 
sampa_ipa['Y'] = 'ʏ' # meaning more a 'oü' like in Müll

# These are 37 keys only, so 6 are missing. Remaining ones are:

# <GO>          not needed for alt. writing creation
# <PAD>         not needed for alt. writing creation
#    (SPACE)    not needed
# {             excluded some foreign words
# A             excluded some foreign words
# ~             excluded some foreign words


Length of phonetic dict is  39  and the keys are: 
 ['N', 'h', 's', 'e', 'y', 'b', 'Y', 'u', 'f', 'z', 'g', ' ', 'E', 'k', 'o', 'n', 'i', 'v', 'x', ':', 't', 'I', '#', 'U', 'p', 'S', 'm', '|', 'l', 'j', '+', 'O', 'd', 'r', 'a', '/', '@', '<GO>', '<PAD>']


# Now convert the SAMPA words into IPA words:


In [5]:
ipa = []
for samp in phons:
    s = []
    for char in samp:
        s.append(sampa_ipa[char])
    ipa.append(''.join(s))
print("Amount of IPA samples is", len(words), ". Some samples are: \n")
print(" WORD         ===>        SAMPA        ===>        IPA")
for k in range(10000, 10010):
    print(words[k]," => ", phons[k]," => ", ipa[k])

Amount of IPA samples is 31877 . Some samples are: 

 WORD         ===>        SAMPA        ===>        IPA
glueckselig  =>  glYk#ze:lIg  =>  glʏkˈzeːlɪg
gluecksfall  =>  glYk+s#fal  =>  glʏksˈfal
glueckskind  =>  glYk+s#kInd  =>  glʏksˈkɪnd
gluecksspiel  =>  glYk+s#Spi:l  =>  glʏksˈʃpiːl
glueckstreffer  =>  glYk+s#trEf+@r  =>  glʏksˈtrɛfər
glueckwunsch  =>  glYk#vUnS  =>  glʏkˈvʊnʃ
gluehen  =>  gly:  =>  glyː
gluehbirne  =>  gly:#bIrn@  =>  glyːˈbɪrnə
gluehheiss  =>  gly:#hais  =>  glyːˈhais
gluehlampe  =>  gly:#lamp@  =>  glyːˈlampə


## Now define the second dictionary, mapping IPA signs to graphemes (according to [here](https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Phoneme_und_Grapheme) ) 

Go to every phon and write down all graphemes (copying table, excluding very weird graphemes)

In [6]:
print(sampa_ipa.values())

dict_values(['i', 'ː', 'ɪ', 'e', 'ɛ', 'y', 'ə', 'a', 'u', 'ʊ', 'o', 'ɔ', 'p', 'b', 't', 'd', 'k', 'g', 'f', 'v', 's', 'z', 'ʃ', 'x', 'h', 'm', 'n', 'ŋ', 'l', 'r', 'j', 'ʒ', '', 'ˈ', 'ø', 'œ', 'ʏ'])


In [7]:
ipa_graph = dict()
ipa_graph['t'] = ['t', 'd', 'tt'] # excluded dt (mostly in Stadt) and th (Methode) -> hard but not do more than 3 opts.
ipa_graph['ə'] = ['e']
ipa_graph['n'] = ['n', 'nn']
ipa_graph['s'] = ['s', 'ss'] # excluded t for Patience, ce for Farce, zz for Jazz, ß (no occ.), c for Sauce, z for Quiz
ipa_graph['a'] = ['a', 'ah']
ipa_graph['r'] = ['r', 'rr'] # excluded rrh for Zirrhose/Myrrhe, rh for Rhythmus
ipa_graph['l'] = ['l', 'll']
ipa_graph['ɛ'] = ['e', 'ae']
ipa_graph['f'] = ['f', 'v', 'ff'] # excluded ph for Physik
ipa_graph['g'] = ['g', 'gh'] # excluded gg for Bagger
ipa_graph['ɪ'] = ['i']
ipa_graph['k'] = ['k', 'ck', 'c'] # Excluded cch for Zucchini, gg for Flaggschiff, qu for Boutique, kk for Akkordeon
                  #  qu chars are usually kv ipa (tracked below), g for Krieg, ch for Chor
ipa_graph['m'] = ['m', 'mm']
ipa_graph['b'] = ['b', 'bb']
ipa_graph['ʃ'] = ['sch', 's'] # excluded sk for Ski, sh for Sheriff, Show and ch for Recherche 
ipa_graph['d'] = ['d','dd']
ipa_graph['p'] = ['p', 'b', 'pp'] # excluded bb for abebben or schrubben
ipa_graph['ŋ'] = ['ng','n']
ipa_graph['ɔ'] = ['o'] # excluded ch for Chauffeur (very rare exception)
ipa_graph['v'] = ['w', 'v']
ipa_graph['ʊ'] = ['u']
ipa_graph['z'] = ['s'] # excluded zz for Blizzard, Puzzle and z for zoomen, bulldozer (since only in foreign words)
ipa_graph['h'] = ['h']
ipa_graph['i'] = ['i'] # excluding y (Baby/Party/Hockey) only 10 words in corpus
ipa_graph['ʏ'] = ['ue', 'u'] # The corpus is weird here and writes Druck as drʏk, i.e. "Drück" rather than drʊk
ipa_graph['x'] = ['ch']
ipa_graph['e'] = ['e'] # excluded ee for Kaffee since IPA would be eː
ipa_graph['j'] = ['j', 'y']
ipa_graph['u'] = ['u'] # excluded ou like in Boutique
ipa_graph['o'] = ['o'] # not needed anyways since o always followed by ː
ipa_graph['œ'] = ['oe']
ipa_graph['y'] = ['y']
ipa_graph['ʒ'] = ['g', 'j'] # no wiki entry, self generated. For Garage or Jury


# 2 character keys:
ipa_graph['ts'] = ['z', 'ts', 'tz'] # excluded zz for Pizza/Skizze, c for circa, Penicillin, tts for trittst
            # t for Aktion, Negation, Infektion, Proportion, ...
ipa_graph['aː'] = ['a', 'ah', 'aa']
ipa_graph['ai'] = ['ei', 'ai'] # excluded ail for Detail, aill for Medaillon, aille for Medaille and y for Nylon
ipa_graph['iː'] = ['ie', 'i', 'ih'] # excluded ieh for Entziehung
ipa_graph['eː'] = ['e', 'ee', 'eh'] # excluded et like in Bidet
ipa_graph['ɛː'] = ['ae', 'aeh']
ipa_graph['uː'] = ['u', 'uh'] # excluded ou like in Ragout, Limousine and oo like on zoomen/Cartoon
ipa_graph['oː'] = ['o', 'oh', 'oo'] # excluded au for aubergine/sauce and eau for plateau, Niveau
ipa_graph['yː'] = ['ue', 'ueh', 'y'] # excluded uet like in Debüt and u like in deja-vu
ipa_graph['ɔy'] = ['eu', 'aeu'] # instead of what wiki calls ɔɪ̯, excluded oi for Boiler and oy for Boykott
ipa_graph['ks'] = ['chs', 'x', 'ks'] #excluded gs like in legst/bugsieren, ggs like in eggst (?), cks like in zwecks, gs (legst)
ipa_graph['øː'] = ['oe', 'oeh'] # excluded eu like in Ingenieur and eue like in Queue (?)
ipa_graph['kv'] = ['qu']


print(len(sampa_ipa), len(ipa_graph))
# We had 36 keys in sampa_ipa dict, now we have 46 already in ipa_graph due to 2-phoneme-groups
# But still there are one-char-values in sampa_ipa which are not keys in ipa_graph. Let us print them:
for key in sampa_ipa.values():
    if key not in ipa_graph.keys():
        print(key)
        
# Okay 4 are missing:
ipa_graph['ː'] = [''] # not needed anyways since ː always occurs after vowel
ipa_graph['ˈ'] = [''] # Just a pronounciation symbol, does not carry meaning for spelling
# Then the empty string '' is not needed as key
# Then ø only occurs followed by a ː


37 46
ː

ˈ
ø


#### DO again with Jäger source

In [8]:
ipa_graph2 = dict()
ipa_graph2['t'] = ['t', 'd', 'tt','dt','th'] 
ipa_graph2['n'] = ['n', 'nn']
ipa_graph2['s'] = ['s', 'ss'] 
ipa_graph2['a'] = ['a']
ipa_graph2['r'] = ['r', 'rr','rh'] # our IPA uses r instead of ʁ
ipa_graph2['l'] = ['l', 'll']
ipa_graph2['ɛ'] = ['e', 'ae']
ipa_graph2['f'] = ['f', 'v', 'ff','ph']
ipa_graph2['g'] = ['g', 'gg'] 
ipa_graph2['ɪ'] = ['i','ie']
ipa_graph2['k'] = ['k', 'ck', 'c','g','ch'] # Excluded cch for Zucchini, gg for Flaggschiff, qu for Boutique, kk for Akkordeon
ipa_graph2['m'] = ['m', 'mm']
ipa_graph2['b'] = ['b', 'bb']
ipa_graph2['ʃ'] = ['sch', 's'] # excluded sk for Ski, sh for Sheriff, Show and ch for Recherche 
ipa_graph2['d'] = ['d','dd']
ipa_graph2['p'] = ['p', 'b', 'pp'] # excluded bb for abebben or schrubben
ipa_graph2['ŋ'] = ['ng','n']
ipa_graph2['ɔ'] = ['o'] # excluded au for Chauffeur (very rare exception)
ipa_graph2['v'] = ['w', 'v']
ipa_graph2['ʊ'] = ['u']
ipa_graph2['z'] = ['s'] 
ipa_graph2['h'] = ['h']
ipa_graph2['ʏ'] = ['ue', 'u','y'] # The corpus is weird here and writes Druck as drʏk, i.e. "Drück" rather than drʊk
ipa_graph2['x'] = ['ch']
ipa_graph2['j'] = ['j']
ipa_graph2['œ'] = ['oe']


# 2 character keys:
ipa_graph2['ts'] = ['z', 'tz'] # excluded zz for Pizza/Skizze, c for circa, Penicillin, tts for trittst
            # t for Aktion, Negation, Infektion, Proportion, ...
ipa_graph2['aː'] = ['a', 'ah', 'aa']

ipa_graph2['aɪ'] = ['ei', 'ai'] # excluded ail for Detail, aill for Medaillon, aille for Medaille and y for Nylon
ipa_graph2['iː'] = ['ie', 'i', 'ih','ieh'] # excluded ieh for Entziehung
ipa_graph2['eː'] = ['e', 'ee', 'eh'] # excluded et like in Bidet
ipa_graph2['ɛː'] = ['ae', 'aeh']
ipa_graph2['uː'] = ['u', 'uh'] # excluded ou like in Ragout, Limousine and oo like on zoomen/Cartoon
ipa_graph2['oː'] = ['o', 'oh', 'oo'] # excluded au for aubergine/sauce and eau for plateau, Niveau
ipa_graph2['yː'] = ['ue', 'ueh'] # excluded uet like in Debüt and u like in deja-vu
ipa_graph2['ɔy'] = ['eu', 'aeu'] # instead of what table calls 'ɔɪ'
ipa_graph2['ks'] = ['chs', 'x', 'ks','cks','gs']
ipa_graph2['øː'] = ['oe', 'oeh'] # excluded eu like in Ingenieur and eue like in Queue (?)
ipa_graph2['aʊ'] = ['au']
ipa_graph2['pf'] = ['pf']

# The Jäger table itself is not sufficient. E.g. there are occurrences of y without yː and 
# since only the latter has a dict entry, we would get a key error. 
# Thus we add some dict entries manually
ipa_graph2['y'] = ['y']
ipa_graph2['ə'] = ['e']
ipa_graph2['i'] = ['i'] 
ipa_graph2['u'] = ['u'] 
ipa_graph2['kv'] = ['qu']
ipa_graph2['ˈ'] = [''] # Just a pronounciation symbol, does not carry meaning for spelling
ipa_graph2['ː'] = ['']


print(len(sampa_ipa), len(ipa_graph))
# We had 36 keys in sampa_ipa dict, now we have 46 already in ipa_graph due to 2-phoneme-groups
# But still there are one-char-values in sampa_ipa which are not keys in ipa_graph. Let us print them:
for key in sampa_ipa.values():
    if key not in ipa_graph2.keys():
        print(key)
        
# Okay 4 are missing:
#ipa_graph['ː'] = [''] # not needed anyways since ː always occurs after vowel
# Then the empty string '' is not needed as key
# Then ø only occurs followed by a ː


37 48
e
o
ʒ

ø


{'w': 1, 'h': 2, 's': 3, 'e': 4, 'y': 5, 'b': 6, 'u': 7, 'f': 8, 'z': 9, 'g': 10, ' ': 11, 'k': 12, 'o': 13, 'n': 14, 'i': 15, 'v': 16, 'x': 17, 'q': 18, 'p': 19, 'm': 20, 'l': 21, 'j': 22, 'c': 23, 'd': 24, 'r': 25, 'a': 26, 't': 27, '<GO>': 28, '<PAD>': 29}


#### Pretty tough to draw the boundary between which phons rarely but "regularly" translate to grapheme strings ([k] to kk like in Akkordeon, Mokka is still regular?) whereas [ʃ] (sch) to ch like in Champagner, Recherche or Lunch is irregular?

## Use the IPA->Graphem dict to create alternative writings

In [9]:
def split_word(word, ipa_graph2):
    """
    Splits up an IPA word into a list of lists each with the possible replacement grapheme for each phoneme
    
    Parameters:
    ----------
    WORD       {list} in IPA notation
    IPA_GRAPH  {dict} mapping IPA symbols to possible grapheme sequences
    
    Returns:
    ---------
    CHARS      {list} containing lists with possible grapheme sequences
    """
    
    
    chars = []
    single_key = True
    for ind in range(len(word)-1):
        
        if single_key: 
            if word[ind:ind+2] in ipa_graph2:
                chars.append(ipa_graph2[word[ind:ind+2]])
                single_key = False
            else:
                chars.append(ipa_graph2[word[ind]])
        else:
            single_key = True
            
    chars.append(ipa_graph2[word[ind+1]])
    
    return chars
    


# The method below generates the alternative writings (takes some time...)

In [10]:
def generate_writings():
    all_writings = []
    m = 0

    for ind,ip in enumerate(ipa):
        if ind % 5000 == 0:
            print("Currently examining word ", ind)

        word_lists = split_word(ip, ipa_graph2)
        alt_write_raw = list(itertools.product(*word_lists))
        alt_write = [''.join(a) for a in alt_write_raw]
        try:
            alt_write.remove(words[ind])
        except ValueError:
            _ = 1

        all_writings.append(alt_write)

        """        
        if len(alt_write) > m:
            print(len(alt_write),ind)
            m = len(alt_write)"""
    print("DONE! Alternative writings generated. Resulting list has", len(all_writings), 'entries.')    
    return all_writings
        
        


#### Now convert the list of alternative writings into a format that can be saved to disk. Tried A LOT of things here. We could use int8 as datatype since the values are in the range of [0,num_dec_symbols], but the arrays have varying size which numpy cannot handle effciently. Low storage solution: Use np.save to save a list that contains for each word of the corpus a list with num_alt_writings lists, each containing a possible target sequence

In [11]:
def convert(all_writings):

    print(words_num.shape)
    seq_len = words_num.shape[1]
    max_alt_spellings = max(len(l) for l in all_writings)
    num_alt_writings = []

    m = 0
    for wo_ind in range(len(all_writings)):

        if wo_ind % 1000 == 0:
            print("Currently converting word", wo_ind)
        tmp = []
        
        for tar_ind in range(len(all_writings[wo_ind])):
            l = seq_len - len(all_writings[wo_ind][tar_ind])
            num = [word_dict['<PAD>']]*l + [word_dict[k] for k in all_writings[wo_ind][tar_ind]]
            tmp.append(num)

        num_alt_writings.append(tmp)
            
    return num_alt_writings

# Alternatives:

#num_alt_writings = dict()
#num_alt_writings = np.array([])
#num_alt_writings = np.zeros((words_num.shape[0],seq_len),dtype=np.int32)
#num_alt_writings = np.zeros((words_num.shape[0],seq_len,max_alt_spellings),dtype=np.int8)

    #for wo_ind in range(num_alt_writings.shape[0]):

        #tmp = np.zeros((seq_len, len(all_writings[wo_ind])),dtype=np.int8)
        
            #num = np.array([word_dict['<PAD>']]*l + [word_dict[k] for k in all_writings[wo_ind][tar_ind]])
            #tmp[:,tar_ind] = num.astype(np.int8)
            
        #num_alt_writings = np.array([num_alt_writings,num])
        #num_alt_writings[wo_ind,:,tar_ind] = num.astype(np.int8)
        
    #num_alt_writings = np.array([num_alt_writings,tmp])
    #num_alt_writings[wo_ind] = tmp
    #num_alt_writings = np.append(num_alt_writings,tmp)



### Finally execute the whole pipeline for CELEX dataset

In [12]:
words,phons = extract_celex(path)
((phons_num, words_num), (phon_dict, word_dict)) = str_to_num_dataset(phons,words)
alt_writings_str = generate_writings()


A a:
aalen a:l
aalglatt a:l#glat
Aasgeier a:z#gai@r
abaenderlich ap#End@r#lIx
abbauwuerdig ap#bau#vYrdIx
Abbrucharbeit ap#brEx#arbait
Excluded 87 words because they were too long (more than 15 phons)
Size of dataset is 31877 samples
Currently examining word  0
Currently examining word  5000
Currently examining word  10000
Currently examining word  15000
Currently examining word  20000
Currently examining word  25000
Currently examining word  30000
DONE! Alternative writings generated. Resulting list has 31877 entries.


In [13]:
alt_writings_num = convert(alt_writings_str)

(31877, 28)
Currently converting word 0
Currently converting word 1000
Currently converting word 2000
Currently converting word 3000
Currently converting word 4000
Currently converting word 5000
Currently converting word 6000
Currently converting word 7000
Currently converting word 8000
Currently converting word 9000
Currently converting word 10000
Currently converting word 11000
Currently converting word 12000
Currently converting word 13000
Currently converting word 14000
Currently converting word 15000
Currently converting word 16000
Currently converting word 17000
Currently converting word 18000
Currently converting word 19000
Currently converting word 20000
Currently converting word 21000
Currently converting word 22000
Currently converting word 23000
Currently converting word 24000
Currently converting word 25000
Currently converting word 26000
Currently converting word 27000
Currently converting word 28000
Currently converting word 29000
Currently converting word 30000
Currently

In [22]:
# Save data
import pickle, sys
path = "/Users/jannisborn/Desktop/LDS_Data/celex_alt_targets.npy"
#pickle.dump(alt_writings_num, open(path, "wb"))

max_bytes = 2**31 - 1
bytes_out = pickle.dumps(alt_writings_num)
n_bytes = sys.getsizeof(bytes_out)
with open(path, 'wb') as f_out:
    for idx in range(0, n_bytes, max_bytes):
        f_out.write(bytes_out[idx:idx+max_bytes])
#np.save(path,alt_writings_num)

In [18]:
def try_to_load_as_pickled_object_or_None(filepath):
    """
    This is a defensive way to write pickle.load, allowing for very large files on all platforms
    """
    max_bytes = 2**31 - 1
    try:
        input_size = os.path.getsize(filepath)
        bytes_in = bytearray(0)
        with open(filepath, 'rb') as f_in:
            for _ in range(0, input_size, max_bytes):
                bytes_in += f_in.read(max_bytes)
        obj = pickle.loads(bytes_in)
    except:
        return None
    return obj

In [24]:
%%timeit
a = try_to_load_as_pickled_object_or_None(path)
print(type(a))

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
27.2 s ± 354 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%%timeit
a = np.load("/Users/jannisborn/Desktop/LDS_Data/celex_alt_targets.npy")
print(type(a))

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
25.1 s ± 229 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Extract childlex data from downloaded CSV and clean it up

In [26]:
import csv
path = 'data/childlex_6-8_lemmata.csv'
forbidden = "_,.!?*:=&%- /\\–•»«…›‹()[]{}’'"


def rem_um(word):
    """
    Converts a word with German umlauts (ü,ä,ö) into a word without
    """
        
    umlautfree = str()
    for char in word:
        if char == 'ä':
            umlautfree += 'ae'
        elif char == 'ü':
            umlautfree += 'ue'
        elif char == 'ö':
            umlautfree += 'oe'
        elif char == 'ß':
            umlautfree += 'ss'
        else:
            umlautfree += char
    return umlautfree


with open(path, 'r') as csvfile:
    
    raw_file = csv.reader(csvfile,delimiter=';')
    raw_data = []
    
    
    for line in raw_file:
        umlautfree = rem_um(line[0].lower())
        
       # data cleanup
        has_digit = any(char.isdigit() for char in umlautfree)
        has_sonderz = any(char in forbidden for char in umlautfree)
        
        if not has_digit and not has_sonderz:
        
            raw_data.append(umlautfree)
        
        
    
childlex_words_all = raw_data
print("Amount of words in childlex", len(childlex_words_all))
    
chars = []
for word in childlex_words_all:
    for char in word:
        if char not in chars:
            chars.append(char)
    
print("Set of characters in childlex words is", chars)

Amount of words in childlex 9769
Set of characters in childlex words is ['a', 'e', 'c', 'h', 'z', 'n', 'd', 'f', 'm', 'l', 'i', 'r', 'g', 's', 't', 'u', 'o', 'b', 'k', 'p', 'q', 'w', 'v', 'x', 'j', 'y']


#### Look up phonetic sequences in CELEX Database. 
#### Problem: We do not have phonetic sequences yet. Idea: Retrieve as many phonetic sequences as possible from the CELEX database, convert this database into numerical values and save it.
#### Also retrieve the alternative writings for these words

In [28]:
#celex_alt_targs = np.load("/Users/jannisborn/Desktop/LDS_Data/celex_alt_targets.npy")
print(len(a))

31877


In [29]:
childlex_phons = []
childlex_words = []
childlex_alt_writings = []
for word in childlex_words_all:
    if word in words:
        ind = words.index(word)
        childlex_alt_writings.append(a[ind])
        childlex_phons.append(phons[ind])
        childlex_words.append(words[ind])

In [30]:
print("From the", len(childlex_words_all),"words in CHILDLEX database", len(childlex_words),"were succesfully retrieved")

From the 9769 words in CHILDLEX database 5891 were succesfully retrieved


In [55]:
def str_to_num_with_dict(X,Y,dic_X,dic_Y,pads=10):
    """
    This method receives 2 lists of strings (input X and output Y) and converts it to padded, numerical arrays - 
    based on a dictionary given as third argument.
    It returns the numerical dataset .
    PADS    {int} specifiying how many additional fields should be padded (to allow long words to have longer alt. writings)
    """

    mx_l_X = max([len(word) for word in X]) # longest input sequence
    # Padd all X for the final form for the LSTM
    x = [[dic_X['<GO>']] + [dic_X['<PAD>']]*(mx_l_X - len(word)) +[dic_X[char] for char in word] for word in X]
    x = np.array(x) 

    # Pad targets

    mx_l_Y = max([len(phon_seq) for phon_seq in Y]) # longest output sequence

    y = [[dic_Y['<GO>']] + pads*[dic_Y['<PAD>']] + [dic_Y['<PAD>']]*(mx_l_Y - len(ph_sq)) + [dic_Y[phon] for phon in ph_sq] for ph_sq in Y]
    y = np.array(y)

    return (x,y)


#### Important: Use the same dictionary like for CELEX (some writings are retrived from there)

In [63]:
(phons_num, words_num) = str_to_num_with_dict(childlex_phons, childlex_words, phon_dict, word_dict)
print(len(word_dict),word_dict)
print(len(phon_dict),phon_dict)
print(words_num.shape, phons_num.shape)
np.savez("data/childlex.npz", words=words_num, phons=phons_num, word_dict=word_dict, phon_dict=phon_dict)
np.savez("/Users/jannisborn/Desktop/LDS_Data/childlex.npz", words=words_num, phons=phons_num, word_dict=word_dict, phon_dict=phon_dict)

29 {'w': 1, 'h': 2, 's': 3, 'e': 4, 'y': 5, 'b': 6, 'u': 7, 'f': 8, 'z': 9, 'g': 10, ' ': 11, 'k': 12, 'o': 13, 'n': 14, 'i': 15, 'v': 16, 'x': 17, 'q': 18, 'p': 19, 'm': 20, 'l': 21, 'j': 22, 'c': 23, 'd': 24, 'r': 25, 'a': 26, 't': 27, '<GO>': 28, '<PAD>': 29}
39 {'N': 1, 'h': 2, 's': 3, 'e': 4, 'y': 5, 'b': 6, 'Y': 7, 'u': 8, 'f': 9, 'z': 10, 'g': 11, ' ': 12, 'E': 13, 'k': 14, 'o': 15, 'n': 16, 'i': 17, 'v': 18, 'x': 19, ':': 20, 't': 21, 'I': 22, '#': 23, 'U': 24, 'p': 25, 'S': 26, 'm': 27, '|': 28, 'l': 29, 'j': 30, '+': 31, 'O': 32, 'd': 33, 'r': 34, 'a': 35, '/': 36, '@': 37, '<GO>': 38, '<PAD>': 39}
(5891, 27) (5891, 15)


In [129]:
# Save the alternative writings
np.save("/Users/jannisborn/Desktop/LDS_Data/childlex_alt_targets2.npy", childlex_alt_writings)

### Extract data from the FIBEL dataset
##### Focus on the "Mia and Mo" Fibel

In [124]:
fibel_path = "/Users/jannisborn/Dropbox/GitHub/LSTM/LdS_bLSTM/Code/data/Fibelwörter.txt"
celex_alt_targs = a

with open(fibel_path, 'r') as txtfile:
    fibel_words = []
    for line in txtfile.read().split(','):
        # Use lowercase letters only, remove leading whitespace and take care of line breaks.
        if '\n' in line:    
            for item in line.split('\n'):
                if item.lstrip().lower() not in fibel_words:
                    fibel_words.append(rem_um(item.lstrip().lower()))
        elif line.lstrip().lower() not in fibel_words:
            fibel_words.append(rem_um(line.lstrip().lower()))
        
lektions_inds = [9,14,20,28,36,46,58,77,98,120,153,173]

# Retrieve some phonetic transcripts and alternative writings from CELEX
fibel_phons = []
fibel_alt_writings = []
k=0
for w in fibel_words:
    if w in words:
        ind = words.index(w)
        fibel_phons.append(phons[ind])
        fibel_alt_writings.append(celex_alt_targs[ind])
        
    else:
        fibel_phons.append('NO')
        fibel_alt_writings.append('NO')
        k +=1
print("The dataset has "+str(len(fibel_words))+" words, from which "+str(len(fibel_words)-k)+" could be retrieved from celex")

The dataset has 174 words, from which 115 could be retrieved from celex


#### Generate the SAMPA spellings for the missing words. First step: Copy the IPA values from wiktionary + convert IPA into SAMPA 

In [125]:
ipa_sampa = dict(zip(sampa_ipa.values(), sampa_ipa.keys()))

fibel_phons[fibel_words.index('mia')] = ''.join([ipa_sampa[w] for w in 'miːa'])
fibel_phons[fibel_words.index('mo')] = ''.join([ipa_sampa[w] for w in 'moː'])
fibel_phons[fibel_words.index('mimi')] = ''.join([ipa_sampa[w] for w in 'mimi'])
fibel_phons[fibel_words.index('im')] = ''.join([ipa_sampa[w] for w in 'ɪm'])
fibel_phons[fibel_words.index('am')] = ''.join([ipa_sampa[w] for w in 'am'])
fibel_phons[fibel_words.index('momo')] = ''.join([ipa_sampa[w] for w in 'moːmoː'])
fibel_phons[fibel_words.index('omi')] = ''.join([ipa_sampa[w] for w in 'oːmiː'])
fibel_phons[fibel_words.index('radio')] = ''.join([ipa_sampa[w] for w in 'raːdioː'])
fibel_phons[fibel_words.index('sissi')] = ''.join([ipa_sampa[w] for w in 'zɪsɪː'])
fibel_phons[fibel_words.index('susi')] = ''.join([ipa_sampa[w] for w in 'susi'])
fibel_phons[fibel_words.index('oli')] = ''.join([ipa_sampa[w] for w in 'ɔliː'])
fibel_phons[fibel_words.index('salami')] = ''.join([ipa_sampa[w] for w in 'zaˈlaːmi'])
fibel_phons[fibel_words.index('ist')] = ''.join([ipa_sampa[w] for w in 'ɪst'])
fibel_phons[fibel_words.index('tim')] = ''.join([ipa_sampa[w] for w in 'tɪm'])
fibel_phons[fibel_words.index('tom')] = ''.join([ipa_sampa[w] for w in 'tɔm'])
fibel_phons[fibel_words.index('mario')] = ''.join([ipa_sampa[w] for w in 'ˈmaːrioː'])
fibel_phons[fibel_words.index('isa')] = ''.join([ipa_sampa[w] for w in 'iːza'])
fibel_phons[fibel_words.index('maria')] = ''.join([ipa_sampa[w] for w in 'maˈriːa'])
fibel_phons[fibel_words.index('rosarot')] = ''.join([ipa_sampa[w] for w in 'ˈroːzaˈroːt'])
fibel_phons[fibel_words.index('nimm')] = ''.join([ipa_sampa[w] for w in 'nɪm'])
fibel_phons[fibel_words.index('nimmt')] = ''.join([ipa_sampa[w] for w in 'nɪmt'])
fibel_phons[fibel_words.index('nina')] = ''.join([ipa_sampa[w] for w in 'niːna'])
fibel_phons[fibel_words.index('rosinen')] = ''.join([ipa_sampa[w] for w in 'roːziːnən'])
fibel_phons[fibel_words.index('anna')] = ''.join([ipa_sampa[w] for w in 'ˈana'])
fibel_phons[fibel_words.index('bananen')] = ''.join([ipa_sampa[w] for w in 'baˈnaːnən'])
fibel_phons[fibel_words.index('birnen')] = ''.join([ipa_sampa[w] for w in 'bɪrnən'])
fibel_phons[fibel_words.index('nuesse')] = ''.join([ipa_sampa[w] for w in 'nʏsə'])
fibel_phons[fibel_words.index('weintrauben')] = ''.join([ipa_sampa[w] for w in 'ˈvaintraʊbn'])
fibel_phons[fibel_words.index('maroni')] = ''.join([ipa_sampa[w] for w in 'maˈroːni'])
fibel_phons[fibel_words.index('erna')] = ''.join([ipa_sampa[w] for w in 'ˈɛrna'])
fibel_phons[fibel_words.index('die')] = ''.join([ipa_sampa[w] for w in 'diː'])
fibel_phons[fibel_words.index('das')] = ''.join([ipa_sampa[w] for w in 'das'])
fibel_phons[fibel_words.index('sind')] = ''.join([ipa_sampa[w] for w in 'zɪnt'])
fibel_phons[fibel_words.index('domino')] = ''.join([ipa_sampa[w] for w in 'ˈdoːminoː'])
fibel_phons[fibel_words.index('indianer')] = ''.join([ipa_sampa[w] for w in 'ɪnˈdiaːnər'])
fibel_phons[fibel_words.index('tonio')] = ''.join([ipa_sampa[w] for w in 'tɔnioː'])
fibel_phons[fibel_words.index('sagt')] = ''.join([ipa_sampa[w] for w in 'zaːkt'])
fibel_phons[fibel_words.index('italien')] = ''.join([ipa_sampa[w] for w in 'iˈtaːliən'])
fibel_phons[fibel_words.index('ute')] = ''.join([ipa_sampa[w] for w in 'uːtə'])
fibel_phons[fibel_words.index('umarmt')] = ''.join([ipa_sampa[w] for w in 'ʊmˈarmt'])
fibel_phons[fibel_words.index('runter')] = ''.join([ipa_sampa[w] for w in 'rʊntər'])
fibel_phons[fibel_words.index('turnen')] = ''.join([ipa_sampa[w] for w in 'tʊrnən'])
fibel_phons[fibel_words.index('dem')] = ''.join([ipa_sampa[w] for w in 'deːm'])
fibel_phons[fibel_words.index('rennt')] = ''.join([ipa_sampa[w] for w in 'rɛnt'])
fibel_phons[fibel_words.index('ene')] = ''.join([ipa_sampa[w] for w in 'eːnə'])
fibel_phons[fibel_words.index('mene')] = ''.join([ipa_sampa[w] for w in 'meːnə'])
fibel_phons[fibel_words.index('mu')] = ''.join([ipa_sampa[w] for w in 'muː'])
fibel_phons[fibel_words.index('turnt')] = ''.join([ipa_sampa[w] for w in 'ˈtʊrnt'])
fibel_phons[fibel_words.index('miauen')] = ''.join([ipa_sampa[w] for w in 'miˈaʊən'])
fibel_phons[fibel_words.index('raus')] = ''.join([ipa_sampa[w] for w in 'raʊs'])
fibel_phons[fibel_words.index('mauert')] = ''.join([ipa_sampa[w] for w in 'tɔm'])
fibel_phons[fibel_words.index('saust')] = ''.join([ipa_sampa[w] for w in 'zaʊst'])
fibel_phons[fibel_words.index('rosaroten')] = ''.join([ipa_sampa[w] for w in 'roːzaˈroːtn'])
fibel_phons[fibel_words.index('martin')] = ''.join([ipa_sampa[w] for w in 'ˈmartiːn'])
fibel_phons[fibel_words.index('martins')] = ''.join([ipa_sampa[w] for w in 'ˈmartiːns'])
fibel_phons[fibel_words.index('eine')] = ''.join([ipa_sampa[w] for w in 'ˈaɪnə'])
fibel_phons[fibel_words.index('miaut')] = ''.join([ipa_sampa[w] for w in 'miˈaʊt'])
fibel_phons[fibel_words.index('otto')] = ''.join([ipa_sampa[w] for w in 'ˈɔtoː'])
fibel_phons[fibel_words.index('isst')] = ''.join([ipa_sampa[w] for w in 'ɪst'])



try:
    fibel_phons.index('NO')
except ValueError:
    print("YAY, all words have phonetic transcript")

YAY, all words have phonetic transcript


### Save the Fibel dataset

In [126]:
(phons_num, words_num) = str_to_num_with_dict(fibel_phons,fibel_words, phon_dict, word_dict)
np.savez("data/fibel.npz", words=words_num, phons=phons_num, word_dict=word_dict, phon_dict=phon_dict)
np.savez("/Users/jannisborn/Desktop/LDS_Data/fibel.npz", words=words_num, phons=phons_num, word_dict=word_dict, phon_dict=phon_dict)

In [128]:
fibel_alt_writings = []
for ind,word in enumerate(fibel_phons):
    
    # If word is in CELEX, just retrieve alternative writings from there
    if word in words:
        
        ind = words.index(word)
        fibel_alt_writings.append(a[ind])
        
    # Otherwise generate alternative writings
    else:        
        # Convert SAMPA to IPA
        ipa = ''.join([sampa_ipa[w] for w in word])
        # generate string writings
        word_lists = split_word(ipa, ipa_graph2)
        alt_write_raw = list(itertools.product(*word_lists))
        alt_write = [''.join(a) for a in alt_write_raw]
        try:
            alt_write.remove(words[ind])
        except ValueError:
            _ = 1
       
        # Convert to numerical
        tmp = []
        for tar_ind in range(len(alt_write)):
            l = 28 - len(alt_write[tar_ind])
            num = [word_dict['<PAD>']]*l + [word_dict[k] for k in alt_write[tar_ind]]
            tmp.append(num)

        fibel_alt_writings.append(tmp)

#### Save the alternative writings

In [132]:
np.save("/Users/jannisborn/Desktop/LDS_Data/fibel_alt_targets.npy", fibel_alt_writings)

In [139]:
a = np.load('/Users/jannisborn/Desktop/LDS_Data/data/celex_alt_targets_small.npy')

In [147]:
print(a[1])
print(a[1].shape)

[[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 26, 4, 2]]


AttributeError: 'list' object has no attribute 'shape'

In [148]:
ar = [np.array(d,dtype=np.int8) for d in a]

In [156]:
from tensorflow.python.ops import array_ops
from tensorflow.python.framework import ops

help(ops)

Help on module tensorflow.python.framework.ops in tensorflow.python.framework:

tensorflow.python.framework.ops = <module 'tensorflow.python.framework.ops' from '...ite-packages/tensorflow/python/framework/ops.py'>


In [170]:
array_ops.constant([2], ops.dtypes.float32)

<tf.Tensor 'Const_3:0' shape=(1,) dtype=float32>

<tf.Tensor 'Const_1:0' shape=(1,) dtype=float32>

In [169]:
help(tf.float32)

Help on DType in module tensorflow.python.framework.dtypes object:

class DType(builtins.object)
 |  Represents the type of the elements in a `Tensor`.
 |  
 |  The following `DType` objects are defined:
 |  
 |  * `tf.float16`: 16-bit half-precision floating-point.
 |  * `tf.float32`: 32-bit single-precision floating-point.
 |  * `tf.float64`: 64-bit double-precision floating-point.
 |  * `tf.bfloat16`: 16-bit truncated floating-point.
 |  * `tf.complex64`: 64-bit single-precision complex.
 |  * `tf.complex128`: 128-bit double-precision complex.
 |  * `tf.int8`: 8-bit signed integer.
 |  * `tf.uint8`: 8-bit unsigned integer.
 |  * `tf.uint16`: 16-bit unsigned integer.
 |  * `tf.uint32`: 32-bit unsigned integer.
 |  * `tf.uint64`: 64-bit unsigned integer.
 |  * `tf.int16`: 16-bit signed integer.
 |  * `tf.int32`: 32-bit signed integer.
 |  * `tf.int64`: 64-bit signed integer.
 |  * `tf.bool`: Boolean.
 |  * `tf.string`: String.
 |  * `tf.qint8`: Quantized 8-bit signed integer.
 |  * `t

In [None]:
tf.InteractiveSession()

a = tf.zeros((3,),dtype=tf.int32)

b = tf.constant([[[1,2,3],[0,0,0]],[[1,2,3],[0,0,0]]],dtype=tf.int64)
print(b[1,1,:].shape)
if tf.reduce_all(tf.equal(a,b[1,1,:])).eval():
    print(True)

In [None]:
a = np.zeros((10,1))
b = a
for k in range(10):
    
    b = np.array([b,np.ones((k,1))])
    print(b.shape)
    

In [None]:
# Insight 1: If you have k different classes, then k is the third 
#    dimension of the logits and then the target matrix must not 
#    contain values higher than k-1 (i.e. labels are 0, ..., k)

# Insight 2: sequence_loss expects unnormalized logits (BEFORE softmax!)
# -> This makes testing for me harder, since I cannot simply use values
# with a sum over 1 (will get normalized)