In [1]:
import os
from glob import iglob
import pandas as pd
import textgrid
from arpabetandipaconvertor import arpabet2phoneticalphabet

In [2]:
# Root of the dissertation checkout. Defaults to the original author's local
# path; set the DISSERTATION_DIR environment variable to run the notebook
# from a different machine or location without editing this cell.
PROJECT_DIR = os.environ.get('DISSERTATION_DIR', '/Users/jen/Dev/dissertation/dissertation')

# Directory holding one sub-directory of aligned TextGrid files per speaker id.
ALIGNED_DIR = os.path.join(PROJECT_DIR, 'aligned_librispeech')
LIBRI_27 = os.path.join(ALIGNED_DIR, '27')
LIBRI_26 = os.path.join(ALIGNED_DIR, '26')
LIBRI_19 = os.path.join(ALIGNED_DIR, '19')

# Wikipedia Homograph Data table (read below with pandas).
WHD = os.path.join(PROJECT_DIR, 'Data', 'WikipediaHomographData.csv')

In [3]:
# Load the homograph table once; keep its 'pronunciation' column as a plain
# Python list so later cells can test converted IPA strings for membership.
whd_df = pd.read_csv(WHD)
pronunciation = whd_df['pronunciation'].tolist()

In [4]:
# Spellings from the 'homograph' column, as a plain list used for the
# word-level membership checks further down.
homographs = whd_df['homograph'].tolist()

In [5]:
# Single shared converter instance; later cells call
# cvtr.convert_to_american_phonetic_alphabet() on space-joined phone strings.
cvtr = arpabet2phoneticalphabet.ARPAbet2PhoneticAlphabetConvertor()

In [6]:
def get_word_phones(file_tg):
    """Pair each non-empty word interval of a TextGrid with its phones.

    The file is read with ``textgrid.TextGrid.fromFile``. Tier 0 is treated
    as the word tier and tier 1 as the phone tier (presumably the aligner's
    output layout -- confirm for other TextGrid sources).

    Args:
        file_tg: Path of the TextGrid file to read.

    Returns:
        A list of ``{'word': mark, 'phones': [mark, ...]}`` dicts, one per
        word interval whose mark is non-empty, in tier order. ``phones``
        holds the marks of every tier-1 interval for which
        ``word.overlaps(phone)`` is true.
    """
    grid = textgrid.TextGrid.fromFile(file_tg)
    return [
        {
            'word': word_iv.mark,
            # Tier 1 is only touched once a marked word is found, so a grid
            # without marked words never needs a phone tier.
            'phones': [
                phone_iv.mark
                for phone_iv in grid.tiers[1].intervals
                if word_iv.overlaps(phone_iv)
            ],
        }
        for word_iv in grid.tiers[0].intervals
        if word_iv.mark
    ]

#[word_phone for word_phone in word_phones]

In [7]:
# Scan every aligned file under LIBRI_19: convert each word's ARPAbet phones
# to IPA and record the words whose IPA string appears among the WHD
# pronunciations. Each hit is stored as (ipa_string, word_phones) so the full
# utterance stays attached to the match.
ipa_whd = []
for f in iglob(os.path.join(LIBRI_19, '*')):
    word_phones = get_word_phones(f)
    for e in word_phones:
        phone_string = ' '.join(e['phones'])
        # Only the conversion is guarded, and only against ordinary errors:
        # a bare `except:` would also swallow KeyboardInterrupt and hide bugs
        # in the match-handling code below.
        try:
            ipa_string = cvtr.convert_to_american_phonetic_alphabet(phone_string)
        except Exception:
            print('Could not convert: {}'.format(phone_string))
            print('\n')
            continue
        if ipa_string in pronunciation:
            ipa_whd.append((ipa_string, word_phones))
            print("MFA-arpabet IPA match with WHD: {}".format(ipa_string))
            # Print the whole utterance the matching word came from.
            words = [wp['word'] for wp in word_phones]
            print(' '.join(words))
            print('\n')

    
#[word_phone for word_phone in word_phones]

MFA-arpabet IPA match with WHD: haʊs
asked her if she would have the goodness to show him the way you may see the house from this window sir was information on sarah's side which produced only a bow of acknowledgment from the gentleman and a silencing nod from her mother


MFA-arpabet IPA match with WHD: baʊ
asked her if she would have the goodness to show him the way you may see the house from this window sir was information on sarah's side which produced only a bow of acknowledgment from the gentleman and a silencing nod from her mother


MFA-arpabet IPA match with WHD: haʊs
hated confinement and cleanliness and loved nothing so well in the world as rolling down the green slope at the back of the house such was catherine morland at ten at fifteen appearances were mending


Could not convert: spn


MFA-arpabet IPA match with WHD: haʊs
and designed her for his daughter in law on discovering his error to turn her from the house seemed the best though to his feelings an inadequate proof 

In [8]:
# Scan every aligned file under LIBRI_26: convert each word's ARPAbet phones
# to IPA and record the words whose IPA string appears among the WHD
# pronunciations. Each hit is stored as (ipa_string, word_phones) so the full
# utterance stays attached to the match.
ipa_whd = []
for f in iglob(os.path.join(LIBRI_26, '*')):
    word_phones = get_word_phones(f)
    for e in word_phones:
        phone_string = ' '.join(e['phones'])
        # Only the conversion is guarded, and only against ordinary errors:
        # a bare `except:` would also swallow KeyboardInterrupt and hide bugs
        # in the match-handling code below.
        try:
            ipa_string = cvtr.convert_to_american_phonetic_alphabet(phone_string)
        except Exception:
            print('Could not convert: {}'.format(phone_string))
            print('\n')
            continue
        if ipa_string in pronunciation:
            ipa_whd.append((ipa_string, word_phones))
            print("MFA-arpabet IPA match with WHD: {}".format(ipa_string))
            # Print the whole utterance the matching word came from.
            words = [wp['word'] for wp in word_phones]
            print(' '.join(words))
            print('\n')

#[word_phone for word_phone in word_phones]

MFA-arpabet IPA match with WHD: wɪndz
still continuing very severe even till near the end of february attended with sharp though moderate winds the bills decreased again and the city grew healthy and everybody began to look upon the danger as good as over


MFA-arpabet IPA match with WHD: haʊs
concerning my own case and how i should dispose of myself that is to say whether i should resolve to stay in london or shut up my house and flee as many of my neighbours did i have set this particular down


MFA-arpabet IPA match with WHD: haʊs
and having the key in my pocket i used to go into the house and over most of the rooms to see that all was well for though it be something wonderful to tell


MFA-arpabet IPA match with WHD: haʊs
but after that i think it was about the twelfth of february another died in another house but in the same parish and in the same manner this turned the people's eyes pretty much towards that end of the town


MFA-arpabet IPA match with WHD: haʊs
two physicians and

In [9]:
# Scan every aligned file under LIBRI_27: convert each word's ARPAbet phones
# to IPA and record the words whose IPA string appears among the WHD
# pronunciations. Each hit is stored as (ipa_string, word_phones) so the full
# utterance stays attached to the match.
ipa_whd = []
for f in iglob(os.path.join(LIBRI_27, '*')):
    word_phones = get_word_phones(f)
    for e in word_phones:
        phone_string = ' '.join(e['phones'])
        # Only the conversion is guarded, and only against ordinary errors:
        # a bare `except:` would also swallow KeyboardInterrupt and hide bugs
        # in the match-handling code below.
        try:
            ipa_string = cvtr.convert_to_american_phonetic_alphabet(phone_string)
        except Exception:
            print('Could not convert: {}'.format(phone_string))
            print('\n')
            continue
        if ipa_string in pronunciation:
            ipa_whd.append((ipa_string, word_phones))
            print("MFA-arpabet IPA match with WHD: {}".format(ipa_string))
            # Print the whole utterance the matching word came from.
            words = [wp['word'] for wp in word_phones]
            print(' '.join(words))
            print('\n')

    
#[word_phone for word_phone in word_phones]

Could not convert: spn


MFA-arpabet IPA match with WHD: haʊs
and to the house of nassau the naval administration of the united provinces was conducted by five distinct boards of admiralty one of those boards sate at amsterdam was partly nominated by the authorities of that city


MFA-arpabet IPA match with WHD: haʊs
james had lately held language which encouraged the hope that he would not patiently submit to the ascendancy of france it seemed probable that he would consent to form a close alliance with the united provinces and the house of austria


Could not convert: spn


Could not convert: spn


MFA-arpabet IPA match with WHD: lɛd
when they had revolted from the way which god had given them to walk therein they were destroyed in battles by many nations and very many of them were led away captive


Could not convert: spn


Could not convert: spn


Could not convert: spn


Could not convert: spn


Could not convert: spn


Could not convert: spn


MFA-arpabet IPA match with WHD: haʊs

### Matching by word (spelling), not by IPA

In [14]:
# Scan every aligned file under LIBRI_26 for words whose spelling is a WHD
# homograph. For each hit, show the WHD pronunciations next to the IPA
# converted from the aligned phones, followed by the full utterance.
wrd_whd = []
for f in iglob(os.path.join(LIBRI_26, '*')):
    word_phones = get_word_phones(f)
    for e in word_phones:
        if e['word'] not in homographs:
            continue
        wrd_whd.append((e['word'], word_phones))
        print("WRD WHD: {}".format(e['word']))
        # Deliberately NOT named `pronunciation`: that global holds the full
        # WHD pronunciation list used by the IPA-matching cells, and rebinding
        # it here would break their `ipa_string in pronunciation` test on re-run.
        whd_pronunciations = whd_df.loc[whd_df['homograph'] == e['word'], 'pronunciation'].values
        print("WHD IPA representations: {}".format(whd_pronunciations))
        phone_string = ' '.join(e['phones'])
        # Guard only the conversion, and only against ordinary errors, so that
        # KeyboardInterrupt and unrelated bugs are not silently swallowed.
        try:
            ipa_string = cvtr.convert_to_american_phonetic_alphabet(phone_string)
        except Exception:
            print("Could not convert: {}".format(e['word']))
            continue
        # NOTE: appends to the `ipa_whd` list left behind by an earlier
        # IPA-matching cell; that cell must have been run first.
        ipa_whd.append((ipa_string, word_phones))
        print("MFA-arpabet IPA: {}".format(ipa_string))
        words = [wp['word'] for wp in word_phones]
        print(' '.join(words))
        print('\n')

WRD WHD: moderate
WHD IPA representations: ['mɑːdɚət' "mɑːdɚ'eɪt"]
MFA-arpabet IPA: ˈmɑdɝʌt
still continuing very severe even till near the end of february attended with sharp though moderate winds the bills decreased again and the city grew healthy and everybody began to look upon the danger as good as over


WRD WHD: winds
WHD IPA representations: ['wɪndz' 'waɪndz']
MFA-arpabet IPA: wɪndz
still continuing very severe even till near the end of february attended with sharp though moderate winds the bills decreased again and the city grew healthy and everybody began to look upon the danger as good as over


WRD WHD: increase
WHD IPA representations: ['ɪnˌkɹiːs' "ən'kɹiːs"]
MFA-arpabet IPA: ˈɪˌnkris
the like increase of the bills was observed in the parishes of saint bride's adjoining on one side of holborn parish and in the parish of saint james clerkenwell adjoining on the other side of holborn


WRD WHD: house
WHD IPA representations: ['haʊs' 'haʊz']
MFA-arpabet IPA: haʊs
concerning m

In [15]:
# Scan every aligned file under LIBRI_27 for words whose spelling is a WHD
# homograph. For each hit, show the WHD pronunciations next to the IPA
# converted from the aligned phones, followed by the full utterance.
wrd_whd = []
for f in iglob(os.path.join(LIBRI_27, '*')):
    word_phones = get_word_phones(f)
    for e in word_phones:
        if e['word'] not in homographs:
            continue
        wrd_whd.append((e['word'], word_phones))
        print("WRD WHD: {}".format(e['word']))
        # Deliberately NOT named `pronunciation`: that global holds the full
        # WHD pronunciation list used by the IPA-matching cells, and rebinding
        # it here would break their `ipa_string in pronunciation` test on re-run.
        whd_pronunciations = whd_df.loc[whd_df['homograph'] == e['word'], 'pronunciation'].values
        print("WHD IPA representations: {}".format(whd_pronunciations))
        phone_string = ' '.join(e['phones'])
        # Guard only the conversion, and only against ordinary errors, so that
        # KeyboardInterrupt and unrelated bugs are not silently swallowed.
        try:
            ipa_string = cvtr.convert_to_american_phonetic_alphabet(phone_string)
        except Exception:
            print("Could not convert: {}".format(e['word']))
            continue
        # NOTE: appends to the `ipa_whd` list left behind by an earlier
        # IPA-matching cell; that cell must have been run first.
        ipa_whd.append((ipa_string, word_phones))
        print("MFA-arpabet IPA: {}".format(ipa_string))
        words = [wp['word'] for wp in word_phones]
        print(' '.join(words))
        print('\n')

WRD WHD: learned
WHD IPA representations: ['lɚnəd' 'lɚnd']
MFA-arpabet IPA: lɝnd
locke was travelling on the continent for his health when he learned that he had been deprived of his home and of his bread without a trial or even a notice


WRD WHD: advocate
WHD IPA representations: ['ædvəkət' 'ædvəˌkeɪt']
MFA-arpabet IPA: ˈædvʌkʌt
here the earl published a manifesto drawn up in holland under the direction of the committee by james stewart a scotch advocate whose pen was a few months later employed in a very different way


WRD WHD: house
WHD IPA representations: ['haʊs' 'haʊz']
MFA-arpabet IPA: haʊs
and to the house of nassau the naval administration of the united provinces was conducted by five distinct boards of admiralty one of those boards sate at amsterdam was partly nominated by the authorities of that city


WRD WHD: close
WHD IPA representations: ['kloʊs' 'kloʊz']
MFA-arpabet IPA: klos
james had lately held language which encouraged the hope that he would not patiently submit t

In [16]:
# Scan every aligned file under LIBRI_19 for words whose spelling is a WHD
# homograph. For each hit, show the WHD pronunciations next to the IPA
# converted from the aligned phones, followed by the full utterance.
wrd_whd = []
for f in iglob(os.path.join(LIBRI_19, '*')):
    word_phones = get_word_phones(f)
    for e in word_phones:
        if e['word'] not in homographs:
            continue
        wrd_whd.append((e['word'], word_phones))
        print("WRD WHD: {}".format(e['word']))
        # Deliberately NOT named `pronunciation`: that global holds the full
        # WHD pronunciation list used by the IPA-matching cells, and rebinding
        # it here would break their `ipa_string in pronunciation` test on re-run.
        whd_pronunciations = whd_df.loc[whd_df['homograph'] == e['word'], 'pronunciation'].values
        print("WHD IPA representations: {}".format(whd_pronunciations))
        phone_string = ' '.join(e['phones'])
        # Guard only the conversion, and only against ordinary errors, so that
        # KeyboardInterrupt and unrelated bugs are not silently swallowed.
        try:
            ipa_string = cvtr.convert_to_american_phonetic_alphabet(phone_string)
        except Exception:
            print("Could not convert: {}".format(e['word']))
            continue
        # NOTE: appends to the `ipa_whd` list left behind by an earlier
        # IPA-matching cell; that cell must have been run first.
        ipa_whd.append((ipa_string, word_phones))
        print("MFA-arpabet IPA: {}".format(ipa_string))
        words = [wp['word'] for wp in word_phones]
        print(' '.join(words))
        print('\n')

WRD WHD: house
WHD IPA representations: ['haʊs' 'haʊz']
MFA-arpabet IPA: haʊs
asked her if she would have the goodness to show him the way you may see the house from this window sir was information on sarah's side which produced only a bow of acknowledgment from the gentleman and a silencing nod from her mother


WRD WHD: bow
WHD IPA representations: ['boʊ' 'baʊ']
MFA-arpabet IPA: baʊ
asked her if she would have the goodness to show him the way you may see the house from this window sir was information on sarah's side which produced only a bow of acknowledgment from the gentleman and a silencing nod from her mother


WRD WHD: read
WHD IPA representations: ['ɹɛd' 'ɹiːd']
MFA-arpabet IPA: rid
she brought herself to read them and though there seemed no chance of her throwing a whole party into raptures by a prelude on the pianoforte of her own composition she could listen to other people's performance with very little fatigue


WRD WHD: house
WHD IPA representations: ['haʊs' 'haʊz']
MFA-a