* CELEX: Nimm die SAMPA ground truth, konvertiere nach IPA mit dem selbstgebauten dict basierend auf der WIKI Tabelle. Dann nimm das IPA -> Buchstaben dict um die alternativen Schreibweisen zu generieren und gehe die manuell durch und sortiere aus
* ChildLex: Nimm alle Wörter aus dem Korpus und schaue nach welche Wörter mit SAMPA im CELEX Korpus existieren. Diejenigen die es gibt: Easy, da SAMPA existiert und sogar schon in IPA umgewandelt und sogar alternative writings da. Diejenigen die es nicht gibt (vermutlich wenige): Nimm den Online Konverter um Text in SAMPA umzuwandeln (das ist die ground truth! Achte darauf, dass Alphabet so wie beim CELEX Korpus). Wandel diese Wörter dann in IPA um und generiere alternative Schreibweisen
* Fibelwörter: Gleicher Ansatz wie bei ChildLex

In [1]:
# Task: Come up with alternative spellings for all orthographic words in a database
# Tool 1: IPA -> Graphem Konverter Wiki
# Tool 2: SAMPA -> IPA Konverter Wiki

# Idea: Take the phonetic sequence (SAMPA) of every word, convert it to an IPA sequence and
        # then convert that to all grapheme sequences
    
# Needs: A dictionary for SAMPA -> IPA (ideally non-ambiguous). A dictionary for IPA -> Text (ambigu.)

import tensorflow as tf
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
import os

  from ._conv import register_converters as _register_converters


In [2]:
path = '/Users/jannisborn/Desktop/LDS_Data/celex2/german/gpl/gpl.cd'
t=0
with open(path, 'r') as file:

    raw_data = file.read().splitlines()
    words = []
    phons = []

    for ind,raw_line in enumerate(raw_data):

        line = raw_line.split("\\")

        
        if line[-2]: # Use only words that HAVE a SAMPA transcript (reduces from 51k to 37345)

            # exclude foreign words that have the 'æ' tone (SAMPA '{' ) like in
            # exclude foreign words that have the 'ɑ' tone (SAMPA 'A' )
            if not 'A' in line[-2] and not '{' in line[-2] and not '~' in line[-2]: 
                if not ('tS' in line[-2] and not 'tsch' in line[1]):
                    if len(line[-2]) <= 15:
                        words.append(line[1].lower()) # Make spellings lowercase only
                        phons.append(line[-2]) # Using SAMPA notation
                    else:
                        t+=1
print("Excluded",t, "words because they were too long (more than 15 phons)" )
print("Size of dataset is", len(words), "samples")


## Helper Method:

def str_to_num_dataset(X,Y):
    """
    This method receives 2 lists of strings (input X and output Y) and converts it to padded, numerical arrays.
    It returns the numerical dataset as well as the dictionaries to retrieve the strings.
    """

    # 1. Define dictionaries 
    # Dictionary assignining a unique integer to each input character
    try:
        u_characters = set(' '.join(X)) 
    except TypeError:
        # Exception for TIMIT dataset (one phoneme is repr. by seq. of chars)
        print("TypeError occurred.")
        u_characters = set([quant for seq in X for quant in seq])

    char2numX = dict(zip(u_characters, range(len(u_characters))))

    # Dictionary assignining a unique integer to each phoneme
    try:
        v_characters = set(' '.join(Y)) 
    except TypeError:
        print("TypeError occurred.")
        v_characters = set([quant for seq in Y for quant in seq])
    char2numY = dict(zip(v_characters, range(1,len(v_characters)+1))) # Using 0 causes trouble for tf.edit_distance
    
    # 2. Padding
    # Pad inputs
    char2numX['<GO>'] = len(char2numX) 
    char2numX['<PAD>']  = len(char2numX) 
    mx_l_X = max([len(word) for word in X]) # longest input sequence
    # Padd all X for the final form for the LSTM
    x = [[char2numX['<PAD>']]*(mx_l_X - len(word)) +[char2numX[char] for char in word] for word in X]
    x = np.array(x) 

    # Pad targets
    char2numY['<GO>'] = len(char2numY) # Define number denoting the response onset
    char2numY['<PAD>'] = len(char2numY)  
    mx_l_Y = max([len(phon_seq) for phon_seq in Y]) # longest output sequence

    y = [[char2numY['<GO>']] + [char2numY['<PAD>']]*(mx_l_Y - len(ph_sq)) + [char2numY[phon] for phon in ph_sq] for ph_sq in Y]
    y = np.array(y)

    return ((x,y) , (char2numX,char2numY))




Excluded 3873 words because they were too long (more than 15 phons)
Size of dataset is 33395 samples


Save the dataset

In [3]:
((phons_num, words_num), (phon_dict, word_dict)) = str_to_num_dataset(phons,words)
#print(len(word_dict),word_dict)
#print(len(phon_dict),phon_dict)
#print(words_num.shape, phons_num.shape)
#print(words_num[321,:], phons_num[321,:])

np.savez("data/celex.npz", words=words_num, phons=phons_num, word_dict=word_dict, phon_dict=phon_dict)

In [4]:
data = np.load('data/celex.npz')
sampa_dict = {key:data['phon_dict'].item().get(key) for key in data['phon_dict'].item()}
sampa_keys = list(sampa_dict.keys()) # Has 43 keys originally, 40 after excluding {, ~, A
print("Length of phonetic dict is ", len(sampa_dict), " and the keys are: \n", sampa_keys) # Has 43 keys


# Step 1: Make a sampa_ipa dict. How to: Go to SAMPA -> IPA tabelle, for every SAMPA char, check whether it 
# is in the CELEX korpus. If yes, look up example word from wiki in korpus and check whether it is the right sampa
# sign. If yes, look up on wiktionary example word in IPA and check whether output sign is correct.

sampa_ipa = dict()

# Vowels
sampa_ipa['i'] = 'i'
sampa_ipa[':'] = 'ː'
sampa_ipa['I'] = 'ɪ'
sampa_ipa['e'] = 'e'
sampa_ipa['E'] = 'ɛ'
sampa_ipa['y'] = 'y'   # meaning a real ü like in kühl
sampa_ipa['@'] = 'ə'
sampa_ipa['a'] = 'a'
sampa_ipa['u'] = 'u'
sampa_ipa['U'] = 'ʊ'
sampa_ipa['o'] = 'o'
sampa_ipa['O'] = 'ɔ'


# consonants
sampa_ipa['p'] = 'p'
sampa_ipa['b'] = 'b'
sampa_ipa['t'] = 't'
sampa_ipa['d'] = 'd'
sampa_ipa['k'] = 'k'
sampa_ipa['g'] = 'g'
sampa_ipa['f'] = 'f'
sampa_ipa['v'] = 'v'
sampa_ipa['s'] = 's'
sampa_ipa['z'] = 'z'
sampa_ipa['S'] = 'ʃ'
sampa_ipa['x'] = 'x'
sampa_ipa['h'] = 'h'
sampa_ipa['m'] = 'm'
sampa_ipa['n'] = 'n'
sampa_ipa['N'] = 'ŋ'
sampa_ipa['l'] = 'l'
sampa_ipa['r'] = 'r'
sampa_ipa['j'] = 'j'
sampa_ipa['Z'] = 'ʒ'
sampa_ipa['+'] = ''  # meaning a bit unclear
sampa_ipa['#'] = 'ˈ' # following syllabus carries primary intonation
sampa_ipa['|'] = 'ø' # meaning a bit unclear
sampa_ipa['/'] = 'œ' # usually SAMPA uses 9 instead of / for this 
sampa_ipa['Y'] = 'ʏ' # meaning more a 'oü' like in Müll

# These are 37 keys only, so 6 are missing. Remaining ones are:

# <GO>          not needed for alt. writing creation
# <PAD>         not needed for alt. writing creation
#    (SPACE)    not needed
# {             excluded some foreign words
# A             excluded some foreign words
# ~             excluded some foreign words


Length of phonetic dict is  40  and the keys are: 
 ['S', ':', 'U', 'Z', '|', '+', 'a', 'o', '#', 'Y', '/', 'r', 'v', 'N', 'O', 'j', 'I', 'e', 'g', 't', 'k', 'f', 'p', 'n', 'E', 'u', 'x', 'b', 'd', 'i', ' ', 'h', 'm', 'z', 's', '@', 'y', 'l', '<GO>', '<PAD>']


# Now convert the SAMPA words into IPA words:


In [5]:
ipa = []
for samp in phons:
    s = []
    for char in samp:
        s.append(sampa_ipa[char])
    ipa.append(''.join(s))
print("Amount of IPA samples is", len(ipa), ". Some samples are: \n")
print(" WORD         ===>        SAMPA        ===>        IPA")
for k in range(25100, 25110):
    print(words[k]," => ", phons[k]," => ", ipa[k])

Amount of IPA samples is 33395 . Some samples are: 

 WORD         ===>        SAMPA        ===>        IPA
strafen  =>  Stra:f  =>  ʃtraːf
strafanstalt  =>  Stra:f#anStalt  =>  ʃtraːfˈanʃtalt
strafarbeit  =>  Stra:f+@#arbait  =>  ʃtraːfəˈarbait
strafbar  =>  Stra:f#ba:r  =>  ʃtraːfˈbaːr
strafbefehl  =>  Stra:f+@#b@fe:l  =>  ʃtraːfəˈbəfeːl
strafe  =>  Stra:f+@  =>  ʃtraːfə
straferlass  =>  Stra:f+@#Er#las  =>  ʃtraːfəˈɛrˈlas
straff  =>  Straf  =>  ʃtraf
straffen  =>  Straf  =>  ʃtraf
straffaellig  =>  Stra:f+@#fal+Ix  =>  ʃtraːfəˈfalɪx


## Now define the second dictionary, mapping IPA signs to graphemes (according to [here](https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Phoneme_und_Grapheme) ) 

Go to every phon and write down all graphemes (copying table, excluding very weird graphemes)

In [6]:
print(sampa_ipa.values())

dict_values(['i', 'ː', 'ɪ', 'e', 'ɛ', 'y', 'ə', 'a', 'u', 'ʊ', 'o', 'ɔ', 'p', 'b', 't', 'd', 'k', 'g', 'f', 'v', 's', 'z', 'ʃ', 'x', 'h', 'm', 'n', 'ŋ', 'l', 'r', 'j', 'ʒ', '', 'ˈ', 'ø', 'œ', 'ʏ'])


In [7]:
ipa_graph = dict()
ipa_graph['t'] = ['t', 'd', 'tt'] # excluded dt (mostly in Stadt) and th (Methode) -> hard but not do more than 3 opts.
ipa_graph['ə'] = ['e']
ipa_graph['n'] = ['n', 'nn']
ipa_graph['s'] = ['s', 'ss'] # excluded t for Patience, ce for Farce, zz for Jazz, ß (no occ.), c for Sauce, z for Quiz
ipa_graph['a'] = ['a', 'ah']
ipa_graph['r'] = ['r', 'rr'] # excluded rrh for Zirrhose/Myrrhe, rh for Rhythmus
ipa_graph['l'] = ['l', 'll']
ipa_graph['ɛ'] = ['e', 'ae']
ipa_graph['f'] = ['f', 'v', 'ff'] # excluded ph for Physik
ipa_graph['g'] = ['g', 'gh'] # excluded gg for Bagger
ipa_graph['ɪ'] = ['i']
ipa_graph['k'] = ['k', 'ck', 'c'] # Excluded cch for Zucchini, gg for Flaggschiff, qu for Boutique, kk for Akkordeon
                  #  qu chars are usually kv ipa (tracked below), g for Krieg, ch for Chor
ipa_graph['m'] = ['m', 'mm']
ipa_graph['b'] = ['b', 'bb']
ipa_graph['ʃ'] = ['sch', 's'] # excluded sk for Ski, sh for Sheriff, Show and ch for Recherche 
ipa_graph['d'] = ['d','dd']
ipa_graph['p'] = ['p', 'b', 'pp'] # excluded bb for abebben or schrubben
ipa_graph['ŋ'] = ['ng','n']
ipa_graph['ɔ'] = ['o'] # excluded ch for Chauffeur (very rare exception)
ipa_graph['v'] = ['w', 'v']
ipa_graph['ʊ'] = ['u']
ipa_graph['z'] = ['s'] # excluded zz for Blizzard, Puzzle and z for zoomen, bulldozer (since only in foreign words)
ipa_graph['h'] = ['h']
ipa_graph['i'] = ['i'] # excluding y (Baby/Party/Hockey) only 10 words in corpus
ipa_graph['ʏ'] = ['ue', 'u'] # The corpus is weird here and writes Druck as drʏk, i.e. "Drück" rather than drʊk
ipa_graph['x'] = ['ch']
ipa_graph['e'] = ['e'] # excluded ee for Kaffee since IPA would be eː
ipa_graph['j'] = ['j', 'y']
ipa_graph['u'] = ['u'] # excluded ou like in Boutique
ipa_graph['o'] = ['o'] # not needed anyways since o always followed by ː
ipa_graph['œ'] = ['oe']
ipa_graph['y'] = ['y']
ipa_graph['ʒ'] = ['g', 'j'] # no wiki entry, self generated. For Garage or Jury


# 2 character keys:
ipa_graph['ts'] = ['z', 'ts', 'tz'] # excluded zz for Pizza/Skizze, c for circa, Penicillin, tts for trittst
            # t for Aktion, Negation, Infektion, Proportion, ...
ipa_graph['aː'] = ['a', 'ah', 'aa']
ipa_graph['ai'] = ['ei', 'ai'] # excluded ail for Detail, aill for Medaillon, aille for Medaille and y for Nylon
ipa_graph['iː'] = ['ie', 'i', 'ih'] # excluded ieh for Entziehung
ipa_graph['eː'] = ['e', 'ee', 'eh'] # excluded et like in Bidet
ipa_graph['ɛː'] = ['ae', 'aeh']
ipa_graph['uː'] = ['u', 'uh'] # excluded ou like in Ragout, Limousine and oo like on zoomen/Cartoon
ipa_graph['oː'] = ['o', 'oh', 'oo'] # excluded au for aubergine/sauce and eau for plateau, Niveau
ipa_graph['yː'] = ['ue', 'ueh', 'y'] # excluded uet like in Debüt and u like in deja-vu
ipa_graph['ɔy'] = ['eu', 'aeu'] # instead of what wiki calls ɔɪ̯, excluded oi for Boiler and oy for Boykott
ipa_graph['ks'] = ['chs', 'x', 'ks'] #excluded gs like in legst/bugsieren, ggs like in eggst (?), cks like in zwecks, gs (legst)
ipa_graph['øː'] = ['oe', 'oeh'] # excluded eu like in Ingenieur and eue like in Queue (?)
ipa_graph['kv'] = ['qu']


print(len(sampa_ipa), len(ipa_graph))
# We had 36 keys in sampa_ipa dict, now we have 46 already in ipa_graph due to 2-phoneme-groups
# But still there are one-char-values in sampa_ipa which are not keys in ipa_graph. Let us print them:
for key in sampa_ipa.values():
    if key not in ipa_graph.keys():
        print(key)
        
# Okay 4 are missing:
ipa_graph['ː'] = [''] # not needed anyways since ː always occurs after vowel
ipa_graph['ˈ'] = [''] # Just a pronounciation symbol, does not carry meaning for spelling
# Then the empty string '' is not needed as key
# Then ø only occurs followed by a ː


37 46
ː

ˈ
ø


#### Pretty tough to draw the boundary between which phons rarely but "regularly" translate to grapheme strings ([k] to kk like in Akkordeon, Mokka is still regular?) whereas [ʃ] (sch) to ch like in Champagner, Recherche or Lunch is irregular?

## Use the IPA->Graphem dict to create alternative writings

In [8]:
def split_word(word, ipa_graph):
    """
    Splits up an IPA word into a list of lists each with the possible replacement grapheme for each phoneme
    
    Parameters:
    ----------
    WORD       {list} in IPA notation
    IPA_GRAPH  {dict} mapping IPA symbols to possible grapheme sequences
    
    Returns:
    ---------
    CHARS      {list} containing lists with possible grapheme sequences
    """
    
    
    chars = []
    single_key = True
    for ind in range(len(word)-1):
        
        if single_key: 
            if word[ind:ind+2] in ipa_graph:
                chars.append(ipa_graph[word[ind:ind+2]])
                single_key = False
            else:
                chars.append(ipa_graph[word[ind]])
        else:
            single_key = True
            
    chars.append(ipa_graph[word[ind+1]])
    
    return chars
    


# The cell below generates the alternative writings (takes some time...)

In [9]:
a = np.array([1,0,0,1,0])
d = dict()
d[str(a)] = 12

e = np.random.random((2,2,2,2,2))
f = [np.asscalar(aa) for aa in a]
print(f)
d[tuple(a)] = 121
print(d)


[1, 0, 0, 1, 0]
{'[1 0 0 1 0]': 12, (1, 0, 0, 1, 0): 121}


In [105]:
%%timeit
d[np.str(a)]

31.9 µs ± 1.93 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [104]:
%%timeit
e[a[0],a[1],a[2],a[3], a[4]]

633 ns ± 28.2 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [118]:
%%timeit
d[tuple(a)]

1.04 µs ± 37 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [62]:
all_writings = []
m = 0

for ind,ip in enumerate(ipa):
    if ind % 200 == 0:
        print("Currently examining word ", ind)
        
    word_lists = split_word(ip, ipa_graph)
    alt_write_raw = list(itertools.product(*word_lists))
    alt_write = [''.join(a) for a in alt_write_raw]
    try:
        alt_write.remove(words[ind])
    except ValueError:
        _ = 1
        
    all_writings.append(alt_write)
    
    if len(alt_write) > m:
        print(m,ind)
        m = len(alt_write)
        
        
print("DONE! Alternative writings generated for ", ind, "words. Resulting list has", len(all_writings), 'entries.')
np.save('celex_all_writings', all_writings)

Currently examining word  0
0 0
2 2
6 4
143 11
191 13
288 21
575 44
576 56
2304 180
Currently examining word  200
5183 282
Currently examining word  400
5184 478
Currently examining word  600
Currently examining word  800
10367 970
Currently examining word  1000
Currently examining word  1200
Currently examining word  1400
Currently examining word  1600
Currently examining word  1800
11663 1900
Currently examining word  2000
Currently examining word  2200
Currently examining word  2400
Currently examining word  2600
Currently examining word  2800
Currently examining word  3000
Currently examining word  3200
Currently examining word  3400
Currently examining word  3600
Currently examining word  3800
Currently examining word  4000
Currently examining word  4200
Currently examining word  4400
Currently examining word  4600
Currently examining word  4800
Currently examining word  5000
Currently examining word  5200
Currently examining word  5400
Currently examining word  5600
Currently exa

# Extract childlex data from downloaded CSV and clean it up

In [10]:
import csv
path = 'data/childlex_6-8_lemmata.csv'
forbidden = "_,.!?*:=&%- /\\–•»«…›‹()[]{}’'"


def rem_um(word):
    """
    Converts a word with German umlauts (ü,ä,ö) into a word without
    """
        
    umlautfree = str()
    for char in word:
        if char == 'ä':
            umlautfree += 'ae'
        elif char == 'ü':
            umlautfree += 'ue'
        elif char == 'ö':
            umlautfree += 'oe'
        elif char == 'ß':
            umlautfree += 'ss'
        else:
            umlautfree += char
    return umlautfree


with open(path, 'r') as csvfile:
    
    raw_file = csv.reader(csvfile,delimiter=';')
    raw_data = []
    
    
    for line in raw_file:
        umlautfree = rem_um(line[0].lower())
        
       # data cleanup
        has_digit = any(char.isdigit() for char in umlautfree)
        has_sonderz = any(char in forbidden for char in umlautfree)
        
        if not has_digit and not has_sonderz:
        
            raw_data.append(umlautfree)
        
        
    
childlex_words = raw_data
print("Amount of words in childlex", len(childlex_words))
    
chars = []
for word in childlex_words:
    for char in word:
        if char not in chars:
            chars.append(char)
    
print("Set of characters in childlex words is", chars)

Amount of words in childlex 9769
Set of characters in childlex words is ['a', 'e', 'c', 'h', 'z', 'n', 'd', 'f', 'm', 'l', 'i', 'r', 'g', 's', 't', 'u', 'o', 'b', 'k', 'p', 'q', 'w', 'v', 'x', 'j', 'y']


#### Look up phonetic sequences in CELEX Database. 
#### Problem: We do not have phonetic sequences yet. Idea: Retrieve as many phonetic sequences as possible from the CELEX databsae

In [13]:
childlex_phons = []
for ind,word in enumerate(childlex_words):
    if word in words:
        childlex_phons.append(phons[ind])
    else:
        childlex_phons.append('')

In [19]:
a =  childlex_phons.count('')
print("From the", len(childlex_phons),"CHILDLEX database", a,"were not retrieved")

From the 9769 CHILDLEX database 3821 were not retrieved


In [None]:



def extract_celex(path):
    """
    Reads in data from the CELEX corpus
    
    Parameters:
    -----------
    PATH        {str} the path to the desired celex file, i.e. gpl.cd 
                    (contains orthography and phonology)
    
    Returns:
    -----------
    WORDS       {list} of words (length 51728) for gpl.cd
    PHONS       {list} of phoneme sequences (length 51728) for gpl.cd
    
    
    
    """
    
    
    with open(path, 'r') as file:

        raw_data = file.read().splitlines()
        words = []
        phons = []
        
        for ind,raw_line in enumerate(raw_data):
            
            
            line = raw_line.split("\\")
            words.append(line[1])
            phons.append(line[-2]) # Using SAMPA notation
                
    return words, phons
                               



In [None]:
path = "/Users/jannisborn/Desktop/LDS_Data/celex2/german/gpl/gpl.cd"
words, phons = extract_celex(path)

print(words[30], phons[30])

((w,p) , (word_dict, phon_dict)) = str_to_num_dataset(words,phons)

print(w.shape, p.shape, len(word_dict), len(phon_dict))

In [None]:
np.savez('celex.npz', words=w, phons=p, word_dict=word_dict, phon_dict=phon_dict)
data = np.load('data/celex.npz')
print(data['phons'].shape)
print(data['words'].shape)
print(data['phon_dict'])
print(data['word_dict'])

In [None]:
 
"""# Check which words are double in dataset
for ind,word in enumerate(ipa):
    ipa2 = ipa[:]
    del ipa2[ind]
    if word in ipa2:
        print(word, words[ind],phons[ind])"""

In [None]:


def BAS_json(path):
    """
    This method receives a path for the BAS-SprecherInnen corpus and iterates through all JSON files in all subfolders.
    It creates and returns a list of words and a list of pronounciations
    """
    
    import json, os

    words = []
    prons = []
    ind = 0
    # Read in filenames
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in [f for f in filenames if f.endswith(".json")]:

            if filename == 'SprecherInnen_DBconfig.json':
                continue

            # Open the json
            with open(os.path.join(dirpath,filename)) as json_file:
                data = json.load(json_file)

                for item in data['levels'][1]['items']:
                    words.append(item['labels'][0]['value'])
                    prons.append(item['labels'][1]['value'])

    return words,prons


def clean_corpus_BAS_Sprecherinnen(words,prons):
    """
    This method receives a list of words and a list of pronunciations of the BAS-Sprech. corpus and returns a cleaned dataset.
    Clearning means:    1) Removing multiple occurrences of words       2) Remove misspellings and ambiguities
                        3) Remove capitalization at begin of sentence   
    Homophon words (Meer, mehr) are kept!

    This method required manual inspection (once for each corpus).
    Returns a condensed list of words and pronounciations (strings) that can be converted in numerical values next.

    """

    # First, we remove multiple occurrences.
    # We cannot use set(words), set(prons) since some words are homophon 8(results in diff. lengths)
    all_tups = []
    for (w,p) in zip(words,prons):
        all_tups.append(tuple((w,p)))
    set_tup = set(all_tups)
    print('Amount of non-unique words in corpus is ', len(all_tups))
    unique_tups = dict(set_tup)
    print('Amount of unique words in corpus is ', len(set_tup))

    # Now we have removed multiple occurrences and we have a dict of tuples (word, pron)

    def find_poss_mistakes(unique_tups):
        """
        Receives a list of hopefully unique tupels (word,pron) and collect the tupels
        which may have incorrect spelling/pronounciations.
        """
        possible_mistakes = []
        for key, val in unique_tups.items():
            for keyy,vall in unique_tups.items():
                if key != keyy and val == vall:
                    # Detect multiple spellings of same pronounciation
                    possible_mistakes.append((key,val, keyy, vall))
                if key == keyy and val != vall:
                    # Detect multiple pronounciations of same spelling
                    possible_mistakes.append((key,val, keyy, vall))
                    
        return possible_mistakes
        
    poss_mist = find_poss_mistakes(unique_tups)
    """
    print("+++ Possible mistakes are +++")
    for k in range(len(poss_mist)):
        print(poss_mist[k][0],' -> ',poss_mist[k][1], 
              poss_mist[k][2],' -> ',poss_mist[k][2])
    """
        
    # Remove mistakes (after manual inspection)
    unique_tups.pop('BäckerInnen') # removing as a duplicate of Bäckerinnen
    unique_tups.pop('nu') # Duplicate of Nu
    unique_tups.pop('Abonentinnen') # Misspelled
    unique_tups.pop('Mit') # Duplicate of mit
    unique_tups.pop('A') # Duplicate of ah
    unique_tups.pop('Bei') # Duplicate of bei
    unique_tups.pop('backwaren') # Duplicate of Backwaren
    unique_tups.pop('-vertreterinnen') # Duplicate of Vertreterinnen
    unique_tups.pop('leu') # Duplicate of Leu
    unique_tups.pop('teil') # Duplicate of Teilt
    unique_tups.pop('Un') # Duplicate of un
    unique_tups.pop('Ver') # Duplicate of ver
    unique_tups.pop('AutorInnen') # Duplicate of Autorinnen
    unique_tups.pop('FreundInnen') # Duplicate of Freundinnen
    unique_tups.pop('-pflegerin') # Duplicate of Pflegerin
    unique_tups.pop('Neu') # Duplicate of neu
    unique_tups.pop('re') # Duplicate of Re
    unique_tups.pop('-kolleginnen') # Duplicate of Koleginnen
    unique_tups.pop('-trinkerinnen') # Duplicate of Trinkerinnen
    unique_tups.pop('Twitter-NutzerInnen') # Duplicate of Twitter-Nutzerinnen
    unique_tups.pop('kommissionen') # Duplicate of Koleginnen
    # Remaining: (Ihnen, ihnen), (dass,das), (Meer, mehr), (Ihres, ihres), (mal, Mal)

    wordss = list(unique_tups.keys())
    pronss = list(unique_tups.values())
    print('After clearning ', len(wordss), ' different words remain')

    return wordss, pronss




In [None]:
path = '/Users/jannisborn/Desktop/LDS_Data/BAS_SprecherInnen'
words, prons = BAS_json(path)
words, prons = clean_corpus_BAS_Sprecherinnen(words,prons)

((x,y) , (char2numX, char2numY)) = str_to_num_dataset(words,prons)
np.savez('BAS_G2P.npz', inputs=x,targets=y, inp_dict=char2numX,
        tar_dict=char2numY)

((x,y) , (char2numX, char2numY)) = str_to_num_dataset(prons,words)
np.savez('BAS_P2G.npz', inputs=x,targets=y, inp_dict=char2numX,
        tar_dict=char2numY)

print(type(x), type(y), type(char2numX),type(char2numY))

### Helper methods

In [None]:
# Insight 1: If you have k different classes, then k is the third 
#    dimension of the logits and then the target matrix must not 
#    contain values higher than k-1 (i.e. labels are 0, ..., k)

# Insight 2: sequence_loss expects unnormalized logits (BEFORE softmax!)
# -> This makes testing for me harder, since I cannot simply use values
# with a sum over 1 (will get normalized)