# IPA-based syllabification of English texts

## I. Preliminary steps

### Import necessary libraries and set up the working directory

In [2]:
### Necessary libraries
import os 
import pandas as pd
import IPython # To be able to run magic commands within Jupyter
from pyndl import preprocess
import regex
import gzip

### Set working directory
# TOP is the root of the local copy of the repository; the data and result
# paths defined further down in this notebook are all built from it.
TOP = '/media/Syllabifiers_repo/'
# WD is derived from TOP (same value as before) so that relocating the
# repository only requires editing TOP.
WD = TOP + 'package/'
# Move into the package directory before the local package is imported below.
os.chdir(WD)

### Import the local 'syllabifiers.ipa' package
import syllabifiers.ipa as sy

# Display options for dataframes
# Remember the current row limit before overriding it
PREVIOUS_MAX_ROWS = pd.get_option('display.max_rows')
# Max width of columns when displaying datasets
pd.set_option('display.max_colwidth', 100)
# Show at most 20 rows when a dataframe is displayed
pd.set_option('display.max_rows', 20)

### Define file paths

In [3]:
# Input data: a list of English words and a list of English sentences
ENGLISH_WORDS = TOP + 'illustrative_examples/Data_en/English_words.csv'
ENGLISH_SENTS = TOP + 'illustrative_examples/Data_en/English_sentences.csv'

# Main corpus: a sample of 1000 sentences from the BNC corpus
CORPUS_EN = TOP + 'illustrative_examples/Data_en/BNC_NoTags_1Ksample.txt'

# Output: location of the event file that the corpus simulation will create
S2L_EVENTS_EN = TOP + 'illustrative_examples/Results_en/S2L_events_en.gz'

### Define some useful variables for later

In [4]:
### Set of allowed English characters
# Start from the 26 lower-case letters, then append their upper-case forms
ENGLISH = "abcdefghijklmnopqrstuvwxyz"
ENGLISH += ENGLISH.upper()

### Compiled regular expression used to detect syllabic patterns for English
### (built by the package itself)
syllable_pattern_en = sy.regex_en_ipa()

### Compiled regular expression matching any single character that is NOT
### one of the allowed English letters (negated character class over ENGLISH)
not_symbol_pattern_en = regex.compile("[^" + ENGLISH + "]")

## II. Transcription to IPA

In [5]:
# Test 1: transcription of a lower-case word
sy.english_to_ipa("international")

'ɪntəɹnæʃənəl'

In [6]:
# Test 2: transcription of a capitalised word
sy.english_to_ipa("Online")

'ɔnlajn'

## III. Syllabification of a list of words

### Test the function that syllabifies a single word

In [7]:
# Test 1
# The word is bound once and reused for the label, the IPA transcription and
# the syllabic cues, so the three printed fields cannot drift apart.
word = 'calculation'
s = '''- Word: {word}
- IPA transcription: {ipa}
- Syllabic cues: {syl}
'''.format(word = word, 
           ipa = sy.english_to_ipa(word), 
           syl = sy.syllabify(word = word, 
                              ipa_converter = sy.english_to_ipa, 
                              syllable_pattern = syllable_pattern_en,
                              add_boundaries = True))
print(s)

- Word: calculation
- IPA transcription: kælkjəlejʃən
- Syllabic cues: #kælkj_lkjəl_lejʃ_ʃən#



In [24]:
# Test 2
# The word is bound once and reused for the label, the IPA transcription and
# the syllabic cues, so the three printed fields cannot drift apart.
word = 'sunny'
s = '''- Word: {word}
- IPA transcription: {ipa}
- Syllabic cues: {syl}
'''.format(word = word, 
           ipa = sy.english_to_ipa(word), 
           syl = sy.syllabify(word = word, 
                              ipa_converter = sy.english_to_ipa, 
                              syllable_pattern = syllable_pattern_en,
                              add_boundaries = True))
print(s)

- Word: sunny
- IPA transcription: sʌni
- Syllabic cues: #sʌn_ni#



In [25]:
# Test 3
# The word is bound once and reused for the label, the IPA transcription and
# the syllabic cues, so the three printed fields cannot drift apart.
word = 'sh'
s = '''- Word: {word}
- IPA transcription: {ipa}
- Syllabic cues: {syl}
'''.format(word = word, 
           ipa = sy.english_to_ipa(word), 
           syl = sy.syllabify(word = word, 
                              ipa_converter = sy.english_to_ipa, 
                              syllable_pattern = syllable_pattern_en,
                              add_boundaries = True))
print(s)

- Word: sh
- IPA transcription: ɛsejt͡ʃ
- Syllabic cues: #ɛs_sejt͡ʃ#



### Syllabify a list of English words

In [27]:
# Load the word list into a pandas dataframe with a single 'Word' column
# (the file is read without a header row; the column name is supplied here)
Words = pd.read_csv(ENGLISH_WORDS, header = None, names = ['Word'])
Words.head()

Unnamed: 0,Word
0,environmental
1,glass
2,answer
3,skill
4,sister


In [29]:
# Transcribe every word to IPA, then derive its syllabic cues
Words['IPA_transcription'] = Words['Word'].apply(sy.english_to_ipa)
# The extra positional arguments (IPA converter, syllable pattern) are
# forwarded to sy.syllabify after each word
Words['syllabic_cues'] = Words['Word'].apply(sy.syllabify,
                                             args = (sy.english_to_ipa, syllable_pattern_en))
# Show the augmented dataframe
Words

Unnamed: 0,Word,IPA_transcription,syllabic_cues
0,environmental,ɪnvajɹənmɛntəl,#ɪnv_nvajɹ_ɹənm_nmɛnt_ntəl#
1,glass,ɡlæs,#ɡlæs#
2,answer,ænsəɹ,#æns_nsəɹ#
3,skill,skɪl,#skɪl#
4,sister,sɪstəɹ,#sɪst_stəɹ#
5,PM,piɛm,#pi_ɛm#
6,professor,pɹəfɛsəɹ,#pɹəf_fɛs_səɹ#
7,operation,ɑpəɹejʃən,#ɑp_pəɹ_ɹejʃ_ʃən#
8,financial,fənænʃəl,#fən_nænʃ_nʃəl#
9,crime,kɹajm,#kɹajm#


## IV. Syllabification of a list of sentences 

### Test the function that syllabifies a sentence

In [9]:
# Test 1: sentence syllabified with as_event = False
sent1 = "Imagination is more important than knowledge" 
# Transcription first, cues second (same call order as the report fields)
ipa_sent1 = sy.english_to_ipa(sent1)
cues_sent1 = sy.syllabify_line(line = sent1,
                               ipa_converter = sy.english_to_ipa, 
                               syllable_pattern = syllable_pattern_en,
                               not_symbol_pattern = not_symbol_pattern_en,
                               add_boundaries = True,
                               as_event = False)
s = '''- Sentence: {sent}
- IPA transcription: {ipa}
- Syllabic cues: {syl}
'''.format(sent = sent1, ipa = ipa_sent1, syl = cues_sent1)
print(s)

- Sentence: Imagination is more important than knowledge
- IPA transcription: ɪmæd͡ʒənejʃən ɪz mɔɹ ɪmpɔɹtənt ðæn nɑləd͡ʒ
- Syllabic cues: ['#ɪm_mæd͡ʒ_d͡ʒən_nejʃ_ʃən#', '#ɪz#', '#mɔɹ#', '#ɪmp_mpɔɹt_ɹtənt#', '#ðæn#', '#nɑl_ləd͡ʒ#']



In [32]:
# Test 2: sentence with punctuation, syllabified with as_event = True
sent2 = "All models are wrong, but some are useful. "
# Transcription first, cues second (same call order as the report fields)
ipa_sent2 = sy.english_to_ipa(sent2)
cues_sent2 = sy.syllabify_line(line = sent2,
                               ipa_converter = sy.english_to_ipa, 
                               syllable_pattern = syllable_pattern_en,
                               not_symbol_pattern = not_symbol_pattern_en,
                               add_boundaries = True,
                               as_event = True)
s = '''- Sentence: {sent}
- IPA transcription: {ipa}
- Syllabic cues: {syl}
'''.format(sent = sent2, ipa = ipa_sent2, syl = cues_sent2)
print(s)

- Sentence: All models are wrong, but some are useful. 
- IPA transcription: ɔl mɑdəlz ɑɹ ɹɔŋ, bʌt sʌm ɑɹ jusfəl. 
- Syllabic cues: ('#ɔl#_#mɑd_dəlz#_#ɑɹ#_#ɹɔŋ#_#bʌt#_#sʌm#_#ɑɹ#_#jusf_sfəl#', 'all_models_are_wrong_but_some_are_useful')



In [33]:
# Test 3: a longer sentence, syllabified with as_event = False
sent3 = 'An approximate answer to the right problem is worth a good deal more than an exact answer to an approximate problem.'
# Transcription first, cues second (same call order as the report fields)
ipa_sent3 = sy.english_to_ipa(sent3)
cues_sent3 = sy.syllabify_line(line = sent3,
                               ipa_converter = sy.english_to_ipa, 
                               syllable_pattern = syllable_pattern_en,
                               not_symbol_pattern = not_symbol_pattern_en,
                               add_boundaries = True,
                               as_event = False)
s = '''- Sentence: {sent}
- IPA transcription: {ipa}
- Syllabic cues: {syl}
'''.format(sent = sent3, ipa = ipa_sent3, syl = cues_sent3)
print(s)

- Sentence: An approximate answer to the right problem is worth a good deal more than an exact answer to an approximate problem.
- IPA transcription: æn əpɹɑksəmət ænsəɹ tə ðə ɹajt pɹɑbləm ɪz wəɹθ ə ɡʊd dil mɔɹ ðæn æn ɪɡzækt ænsəɹ tə æn əpɹɑksəmət pɹɑbləm.
- Syllabic cues: ['#æn#', '#əpɹ_pɹɑks_ksəm_mət#', '#æns_nsəɹ#', '#tə#', '#ðə#', '#ɹajt#', '#pɹɑbl_bləm#', '#ɪz#', '#wəɹθ#', '#ə#', '#ɡʊd#', '#dil#', '#mɔɹ#', '#ðæn#', '#æn#', '#ɪɡz_ɡzækt#', '#æns_nsəɹ#', '#tə#', '#æn#', '#əpɹ_pɹɑks_ksəm_mət#', '#pɹɑbl_bləm#']



### Syllabify a set of English sentences

In [12]:
# Load the sentence list into a pandas dataframe (column names come from the file)
Sentences = pd.read_csv(filepath_or_buffer = ENGLISH_SENTS)
# Preview the first five rows
Sentences.head(n = 5)

Unnamed: 0,Sentence
0,what is aids?
1,how is infection transmitted?
2,"the medical aspects can be cancer, pneumonia, sudden blindness, dementia, dramatic weight loss o..."
3,"often infected people are rejected by family and friends, leaving them to face this chronic cond..."
4,did you know?


In [13]:
# Transcribe every sentence to IPA, then derive its syllabic cues
Sentences['IPA_transcription'] = Sentences['Sentence'].apply(sy.english_to_ipa)
# Positional extras (IPA converter and the two patterns) and the keyword
# options are forwarded to sy.syllabify_line after each sentence
Sentences['syllabic_cues'] = Sentences['Sentence'].apply(sy.syllabify_line,
                                                         args = (sy.english_to_ipa,
                                                                 syllable_pattern_en,
                                                                 not_symbol_pattern_en),
                                                         add_boundaries = True,
                                                         as_event = False)
# Show (at most) the first 20 rows of the augmented dataframe
Sentences.head(20)

Unnamed: 0,Sentence,IPA_transcription,syllabic_cues
0,what is aids?,wʌt ɪz ejdz?,"[#wʌt#, #ɪz#, #ejdz#]"
1,how is infection transmitted?,haw ɪz ɪnfɛkʃən tɹænsmɪtɪd?,"[#haw#, #ɪz#, #ɪnf_nfɛkʃ_kʃən#, #tɹænsm_nsmɪt_tɪd#]"
2,"the medical aspects can be cancer, pneumonia, sudden blindness, dementia, dramatic weight loss o...","ðə mɛdəkəl æspɛkts kæn bi kænsəɹ, numownjə, sʌdən blajndnəs, dɪmɛnʃiə, dɹəmætɪk wejt lɔs ɔɹ ɛni ...","[#ðə#, #mɛd_dək_kəl#, #æsp_spɛkts#, #kæn#, #bi#, #kæns_nsəɹ#, #num_mownj_njə#, #sʌd_dən#, #blajn..."
3,"often infected people are rejected by family and friends, leaving them to face this chronic cond...","ɔfən ɪnfɛktəd pipəl ɑɹ ɹɪd͡ʒɛktɪd baj fæməli ænd fɹɛndz, livɪŋ ðɛm tə fejs ðɪs kɹɑnɪk kəndɪʃən ə...","[#ɔf_fən#, #ɪnf_nfɛkt_ktəd#, #pip_pəl#, #ɑɹ#, #ɹɪd͡ʒ_d͡ʒɛkt_ktɪd#, #baj#, #fæm_məl_li#, #ænd#, #..."
4,did you know?,dɪd ju now?,"[#dɪd#, #ju#, #now#]"
5,there is no vaccine or cure currently available.,ðɛɹ ɪz now væksin ɔɹ kjʊɹ kəɹəntli əvejləbəl.,"[#ðɛɹ#, #ɪz#, #now#, #væks_ksin#, #ɔɹ#, #kjʊɹ#, #kəɹ_ɹəntl_ntli#, #əv_vejl_ləb_bəl#]"
6,women are twice as at risk from infection as men.,wɪmən ɑɹ twajs æz æt ɹɪsk fɹʌm ɪnfɛkʃən æz mɛn.,"[#wɪm_mən#, #ɑɹ#, #twajs#, #æz#, #æt#, #ɹɪsk#, #fɹʌm#, #ɪnf_nfɛkʃ_kʃən#, #æz#, #mɛn#]"
7,the major impact is yet to come.,ðə mejd͡ʒəɹ ɪmpækt ɪz jɛt tə kʌm.,"[#ðə#, #mejd͡ʒ_d͡ʒəɹ#, #ɪmp_mpækt#, #ɪz#, #jɛt#, #tə#, #kʌm#]"
8,raising money for your favourite charity can be fun.,ɹejzɪŋ mʌni fɔɹ jɔɹ fejvəɹɪt t͡ʃɛɹɪti kæn bi fʌn.,"[#ɹejz_zɪŋ#, #mʌn_ni#, #fɔɹ#, #jɔɹ#, #fejv_vəɹ_ɹɪt#, #t͡ʃɛɹ_ɹɪt_ti#, #kæn#, #bi#, #fʌn#]"
9,you can do it on your own or you can get together with family and friends.,ju kæn du ɪt ɑn jɔɹ own ɔɹ ju kæn ɡɛt təɡɛðəɹ wɪð fæməli ænd fɹɛndz.,"[#ju#, #kæn#, #du#, #ɪt#, #ɑn#, #jɔɹ#, #own#, #ɔɹ#, #ju#, #kæn#, #ɡɛt#, #təɡ_ɡɛð_ðəɹ#, #wɪð#, #f..."


## V. Creating event files with syllabic cues from an English corpus

### Check the corpus

In [4]:
### Check the corpus
# Print up to the first 9 lines of the corpus file.
# - The encoding is given explicitly so the preview does not depend on the
#   platform's default locale (assumes the corpus is UTF-8, like the event
#   file written from it -- TODO confirm).
# - zip() against range(9) stops quietly when the file has fewer than 9 lines,
#   instead of raising StopIteration.
# Each line keeps its trailing newline, so print() leaves a blank line between
# sentences.
with open(CORPUS_EN, encoding='utf-8') as c:
    for _, line in zip(range(9), c):
        print(line)

what is aids?

how is infection transmitted?

"the medical aspects can be cancer, pneumonia, sudden blindness, dementia, dramatic weight loss or any combination of these."

"often infected people are rejected by family and friends, leaving them to face this chronic condition alone."

did you know?

there is no vaccine or cure currently available.

women are twice as at risk from infection as men.

the major impact is yet to come.

raising money for your favourite charity can be fun.



### Run the syllabifier

In [5]:
# Generate the S2L event file from the corpus
# (the original author's note states that the corresponding L2L event file is produced as well)
corpus_settings = dict(corpus_path = CORPUS_EN,
                       event_file_path = S2L_EVENTS_EN,
                       ipa_converter = sy.english_to_ipa,
                       syllable_pattern = syllable_pattern_en,
                       not_symbol_pattern = not_symbol_pattern_en,
                       add_boundaries = True,
                       numcores = 8,
                       chunksize = 125)
sy.syllabify_corpus(**corpus_settings)

All processed in 0.14 minutes!tes


### Check the event file with syllabic cues

In [10]:
### Print the first 10 lines from the S2L event file 
# The gzip file is opened in text mode and decoded as UTF-8.
# zip() against range(10) stops quietly when the file holds fewer than
# 10 lines, instead of raising StopIteration.
# Each line keeps its trailing newline, so print() leaves a blank line between
# events.
with gzip.open(S2L_EVENTS_EN, 'rt', encoding='utf-8') as e:
    for _, line in zip(range(10), e):
        print(line)

#wʌt#_#ɪz#_#ejdz#	what_is_aids

#haw#_#ɪz#_#ɪnf_nfɛkʃ_kʃən#_#tɹænsm_nsmɪt_tɪd#	how_is_infection_transmitted

#ðə#_#mɛd_dək_kəl#_#æsp_spɛkts#_#kæn#_#bi#_#kæns_nsəɹ#_#num_mownj_njə#_#sʌd_dən#_#blajndn_ndnəs#_#dɪm_mɛnʃ_nʃi_ə#_#dɹəm_mæt_tɪk#_#wejt#_#lɔs#_#ɔɹ#_#ɛn_ni#_#kɑmb_mbən_nejʃ_ʃən#_#ʌv#_#ðiz#	the_medical_aspects_can_be_cancer_pneumonia_sudden_blindness_dementia_dramatic_weight_loss_or_any_combination_of_these

#ɔf_fən#_#ɪnf_nfɛkt_ktəd#_#pip_pəl#_#ɑɹ#_#ɹɪd͡ʒ_d͡ʒɛkt_ktɪd#_#baj#_#fæm_məl_li#_#ænd#_#fɹɛndz#_#liv_vɪŋ#_#ðɛm#_#tə#_#fejs#_#ðɪs#_#kɹɑn_nɪk#_#kənd_ndɪʃ_ʃən#_#əl_lown#	often_infected_people_are_rejected_by_family_and_friends_leaving_them_to_face_this_chronic_condition_alone

#dɪd#_#ju#_#now#	did_you_know

#ðɛɹ#_#ɪz#_#now#_#væks_ksin#_#ɔɹ#_#kjʊɹ#_#kəɹ_ɹəntl_ntli#_#əv_vejl_ləb_bəl#	there_is_no_vaccine_or_cure_currently_available

#wɪm_mən#_#ɑɹ#_#twajs#_#æz#_#æt#_#ɹɪsk#_#fɹʌm#_#ɪnf_nfɛkʃ_kʃən#_#æz#_#mɛn#	women_are_twice_as_at_risk_from_infection_as_men

#ðə#_#mejd͡ʒ_d͡ʒəɹ#_#ɪmp_mpæk