In [1]:
import pandas as pd 
import numpy as np 

# Import CMUdict

In [2]:
# CMUdict was downloaded from https://github.com/Alexir/CMUdict/blob/master/cmudict-0.7b
# and its punctuation entries were trimmed out beforehand.
# Every line of the file is loaded into the single column 'cmu'
# (assumes no commas are left in the trimmed file - TODO confirm).
cmu = pd.read_csv('02_cmudict-07b', names=['cmu'])
# Split on the first space only: headword on the left, everything after it
# (the pronunciation) on the right. `n` is passed by keyword because
# pandas >= 2.0 no longer accepts it positionally.
cmu[['word', 'cmu_pronunciation']] = cmu['cmu'].str.split(' ', n=1, expand=True)
cmu[1:10]

Unnamed: 0,cmu,word,cmu_pronunciation
1,A(1) EY1,A(1),EY1
2,A'S EY1 Z,A'S,EY1 Z
3,A. EY1,A.,EY1
4,A.'S EY1 Z,A.'S,EY1 Z
5,A.D. EY2 D IY1,A.D.,EY2 D IY1
6,A.M. EY2 EH1 M,A.M.,EY2 EH1 M
7,A.S EY1 Z,A.S,EY1 Z
8,A42128 EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T,A42128,EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T
9,AA EY2 EY1,AA,EY2 EY1


In [3]:
# Keep only the two parsed columns (headword and pronunciation).
cmu_dict = cmu.loc[:, ["word", "cmu_pronunciation"]]
cmu_dict.head()

Unnamed: 0,word,cmu_pronunciation
0,A,AH0
1,A(1),EY1
2,A'S,EY1 Z
3,A.,EY1
4,A.'S,EY1 Z


In [4]:
# Lookup table: headword -> one-element list holding its pronunciation string.
# Built straight from the two columns rather than by transposing the frame
# into one column per dictionary entry, which is needlessly slow for a
# dictionary of this size. If a headword occurs more than once, the last
# row wins.
cmudict = {
    word: [pronunciation]
    for word, pronunciation in zip(cmu_dict['word'], cmu_dict['cmu_pronunciation'])
}

# Input sentences

In [5]:
# Load the sentences to analyse (the displayed frame has the columns 'id' and 'sentence').
df_input = pd.read_excel(io='02_input_text.xlsx')
df_input

Unnamed: 0,id,sentence
0,1,"Those promises have fallen apart. India, e..."
1,2,He has come under criticism for departing to L...
2,3,2021 He said he would return to India when he ...
3,4,He told a British newspaper he had received ...
4,5,"When he. returns to India, he will travel..."


In [6]:
#data cleaning
import re

# Pipeline, in order:
#   1. missing sentences become empty strings;
#   2. every cell is cast to str, so a numeric-only cell does not break the
#      string operations that follow;
#   3. lower-case;
#   4. delete punctuation and straight/curly quotes (raw string: no invalid
#      escape sequences in the pattern);
#   5. collapse each whitespace run to one space and trim both ends.
df_input["clean"] = (
    df_input["sentence"]
    .fillna("")
    .astype(str)
    .str.lower()
    .str.replace(r"""[,.!?()‘'"“”’]""", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)
df_input

Unnamed: 0,id,sentence,clean
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...
2,3,2021 He said he would return to India when he ...,2021 he said he would return to india when he ...
3,4,He told a British newspaper he had received ...,he told a british newspaper he had received th...
4,5,"When he. returns to India, he will travel...",when he returns to india he will travel with g...


In [7]:
# Tokenise the cleaned sentence on whitespace. Splitting without an explicit
# separator gives an empty list for an empty sentence, whereas splitting on
# " " would give [''] - a phantom empty token. For non-empty sentences the
# result is the same, because whitespace was already collapsed and trimmed.
df_input["sep"] = df_input["clean"].str.split()
df_input.head()

Unnamed: 0,id,sentence,clean,sep
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ..."
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi..."
2,3,2021 He said he would return to India when he ...,2021 he said he would return to india when he ...,"[2021, he, said, he, would, return, to, india,..."
3,4,He told a British newspaper he had received ...,he told a british newspaper he had received th...,"[he, told, a, british, newspaper, he, had, rec..."
4,5,"When he. returns to India, he will travel...",when he returns to india he will travel with g...,"[when, he, returns, to, india, he, will, trave..."


In [8]:
# convert numbers to words
# !pip install num2words
import num2words
def convert_num_to_words(utterance):
    """Spell out the numeric tokens of a tokenised sentence.

    Parameters
    ----------
    utterance : iterable of str
        The tokens of one sentence.

    Returns
    -------
    str
        The tokens joined by single spaces, with every all-decimal token
        replaced by ``num2words.num2words(token)``. An empty input gives ''.
    """
    # isdecimal() rather than isdigit(): isdigit() is also true for characters
    # such as '²' or '①', which cannot be parsed as a number and would make
    # the conversion fail. Such tokens are now passed through unchanged.
    return ' '.join(
        num2words.num2words(token) if token.isdecimal() else token
        for token in utterance
    )

In [9]:
# Spell out the numeric tokens of every sentence, then turn each hyphen into a
# space (number words such as "twenty-one" come back hyphenated).
spelled_out = df_input["sep"].apply(convert_num_to_words)
df_input["words"] = spelled_out
df_input["words"] = df_input["words"].map(lambda text: str(text).replace("-", " "))
df_input

Unnamed: 0,id,sentence,clean,sep,words
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ...",those promises have fallen apart india engulfe...
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi...",he has come under criticism for departing to l...
2,3,2021 He said he would return to India when he ...,2021 he said he would return to india when he ...,"[2021, he, said, he, would, return, to, india,...",two thousand and twenty one he said he would r...
3,4,He told a British newspaper he had received ...,he told a british newspaper he had received th...,"[he, told, a, british, newspaper, he, had, rec...",he told a british newspaper he had received th...
4,5,"When he. returns to India, he will travel...",when he returns to india he will travel with g...,"[when, he, returns, to, india, he, will, trave...",when he returns to india he will travel with g...


# Count Syllables

In [10]:
# Continue on a copy so that the columns added below do not end up in df_input.
df = df_input.copy(deep=True)
df.head(n=2)

Unnamed: 0,id,sentence,clean,sep,words
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ...",those promises have fallen apart india engulfe...
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi...",he has come under criticism for departing to l...


In [11]:
# Upper-case each sentence (the CMUdict headwords are upper-case) and tokenise it.
# Splitting without a separator returns [] for an empty sentence; splitting on
# " " would return [''], and that empty token would later be scored as one
# syllable by the fallback counter.
# No fillna step: 'words' always holds strings, so the token lists are never
# missing, and a chained in-place fillna would not write back to df under
# pandas copy-on-write anyway.
df['list'] = df['words'].str.upper().str.split()
df.head()

Unnamed: 0,id,sentence,clean,sep,words,list
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ...",those promises have fallen apart india engulfe...,"[THOSE, PROMISES, HAVE, FALLEN, APART, INDIA, ..."
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi...",he has come under criticism for departing to l...,"[HE, HAS, COME, UNDER, CRITICISM, FOR, DEPARTI..."
2,3,2021 He said he would return to India when he ...,2021 he said he would return to india when he ...,"[2021, he, said, he, would, return, to, india,...",two thousand and twenty one he said he would r...,"[TWO, THOUSAND, AND, TWENTY, ONE, HE, SAID, HE..."
3,4,He told a British newspaper he had received ...,he told a british newspaper he had received th...,"[he, told, a, british, newspaper, he, had, rec...",he told a british newspaper he had received th...,"[HE, TOLD, A, BRITISH, NEWSPAPER, HE, HAD, REC..."
4,5,"When he. returns to India, he will travel...",when he returns to india he will travel with g...,"[when, he, returns, to, india, he, will, trave...",when he returns to india he will travel with g...,"[WHEN, HE, RETURNS, TO, INDIA, HE, WILL, TRAVE..."


In [12]:
# extract words that are not included in the CMUdict
def extract_nocmu(data):
    """Return the items of `data` that are not keys of `cmudict`.

    Order is preserved and repeated items are kept.
    """
    return [token for token in data if token not in cmudict]

In [13]:
# For every sentence, collect the tokens that have no CMUdict entry.
df["No_CMU_words"] = df["list"].map(extract_nocmu)
df.head()

Unnamed: 0,id,sentence,clean,sep,words,list,No_CMU_words
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ...",those promises have fallen apart india engulfe...,"[THOSE, PROMISES, HAVE, FALLEN, APART, INDIA, ...",[CORONAVIRUS]
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi...",he has come under criticism for departing to l...,"[HE, HAS, COME, UNDER, CRITICISM, FOR, DEPARTI...",[]
2,3,2021 He said he would return to India when he ...,2021 he said he would return to india when he ...,"[2021, he, said, he, would, return, to, india,...",two thousand and twenty one he said he would r...,"[TWO, THOUSAND, AND, TWENTY, ONE, HE, SAID, HE...",[]
3,4,He told a British newspaper he had received ...,he told a british newspaper he had received th...,"[he, told, a, british, newspaper, he, had, rec...",he told a british newspaper he had received th...,"[HE, TOLD, A, BRITISH, NEWSPAPER, HE, HAD, REC...",[INDIAS]
4,5,"When he. returns to India, he will travel...",when he returns to india he will travel with g...,"[when, he, returns, to, india, he, will, trave...",when he returns to india he will travel with g...,"[WHEN, HE, RETURNS, TO, INDIA, HE, WILL, TRAVE...",[]


In [14]:
# Replace every token that CMUdict knows with its pronunciation entry; tokens
# without an entry stay as plain strings.
# Each row gets a freshly built list instead of being edited in place, and
# items that are no longer strings are passed through untouched, so running
# this cell a second time does not fail with "unhashable type: 'list'".
df['list'] = df['list'].apply(
    lambda tokens: [
        cmudict.get(token, token) if isinstance(token, str) else token
        for token in tokens
    ]
)
df.head(2)

Unnamed: 0,id,sentence,clean,sep,words,list,No_CMU_words
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ...",those promises have fallen apart india engulfe...,"[[ DH OW1 Z], [ P R AA1 M AH0 S AH0 Z], [ HH A...",[CORONAVIRUS]
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi...",he has come under criticism for departing to l...,"[[ HH IY1], [ HH AE1 Z], [ K AH1 M], [ AH1 N D...",[]


In [15]:
# Regex-based syllable estimator, copied from
# https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word

import re

# Each maximal run of vowel letters (y included) counts as one candidate syllable.
VOWEL_RUNS = re.compile("[aeiouy]+", re.IGNORECASE)

# Every match here lowers the estimate by one.
EXCEPTIONS = re.compile(
    "[^aeiou]e[sd]?$|"  # trailing e (original author's examples: smite, scared)
    "[^e]ely$",         # adverbs (original author's example: nicely)
    re.IGNORECASE,
)

# Every match here raises the estimate by one.
ADDITIONAL = re.compile(
    # original author's note: undoes wrong subtractions made by EXCEPTIONS
    # (examples given: smile, scarred, raises, fated)
    "[^aeioulr][lr]e[sd]?$|[csgz]es$|[td]ed$|"
    # original author's note: miscellaneous cases
    # (examples given: flying, piano, video, prism, fire, evaluate)
    ".y[aeiou]|ia(?!n$)|eo|ism$|[^aeiou]ire$|[^gq]ua",
    re.IGNORECASE,
)

def count_syllables(word):
    """Estimate the number of syllables in `word` from its spelling.

    The estimate is (vowel runs) - (EXCEPTIONS matches) + (ADDITIONAL matches),
    and is never lower than 1 - the empty string also yields 1.
    """
    estimate = len(VOWEL_RUNS.findall(word))
    estimate -= len(EXCEPTIONS.findall(word))
    estimate += len(ADDITIONAL.findall(word))
    return estimate if estimate > 1 else 1

In [16]:
# Turn every word that CMUdict lacks into its regex-estimated syllable count.
# Each row gets a freshly built list instead of being edited in place, and
# items that are no longer strings (already counted) are passed through, so
# running this cell a second time does not fail inside count_syllables.
df['No_CMU_words'] = df['No_CMU_words'].apply(
    lambda words: [
        count_syllables(word) if isinstance(word, str) else word
        for word in words
    ]
)
df

Unnamed: 0,id,sentence,clean,sep,words,list,No_CMU_words
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ...",those promises have fallen apart india engulfe...,"[[ DH OW1 Z], [ P R AA1 M AH0 S AH0 Z], [ HH A...",[5]
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi...",he has come under criticism for departing to l...,"[[ HH IY1], [ HH AE1 Z], [ K AH1 M], [ AH1 N D...",[]
2,3,2021 He said he would return to India when he ...,2021 he said he would return to india when he ...,"[2021, he, said, he, would, return, to, india,...",two thousand and twenty one he said he would r...,"[[ T UW1], [ TH AW1 Z AH0 N D], [ AH0 N D], [ ...",[]
3,4,He told a British newspaper he had received ...,he told a british newspaper he had received th...,"[he, told, a, british, newspaper, he, had, rec...",he told a british newspaper he had received th...,"[[ HH IY1], [ T OW1 L D], [ AH0], [ B R IH1 T ...",[3]
4,5,"When he. returns to India, he will travel...",when he returns to india he will travel with g...,"[when, he, returns, to, india, he, will, trave...",when he returns to india he will travel with g...,"[[ W EH1 N], [ HH IY1], [ R IH0 T ER1 N Z], [ ...",[]


In [17]:
# Render each row's list as text and strip the list syntax ([, ] and ').
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the brackets and quotes in place. The
# raw-string character class matches the same three characters as the
# previous alternation, without invalid escape sequences.
df['ARPAsent'] = df['list'].astype(str).str.replace(r"[\[\]']", '', regex=True)
df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,id,sentence,clean,sep,words,list,No_CMU_words,ARPAsent
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ...",those promises have fallen apart india engulfe...,"[[ DH OW1 Z], [ P R AA1 M AH0 S AH0 Z], [ HH A...",[5],"DH OW1 Z, P R AA1 M AH0 S AH0 Z, HH AE1 V, ..."
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi...",he has come under criticism for departing to l...,"[[ HH IY1], [ HH AE1 Z], [ K AH1 M], [ AH1 N D...",[],"HH IY1, HH AE1 Z, K AH1 M, AH1 N D ER0, K..."
2,3,2021 He said he would return to India when he ...,2021 he said he would return to india when he ...,"[2021, he, said, he, would, return, to, india,...",two thousand and twenty one he said he would r...,"[[ T UW1], [ TH AW1 Z AH0 N D], [ AH0 N D], [ ...",[],"T UW1, TH AW1 Z AH0 N D, AH0 N D, T W EH1 ..."
3,4,He told a British newspaper he had received ...,he told a british newspaper he had received th...,"[he, told, a, british, newspaper, he, had, rec...",he told a british newspaper he had received th...,"[[ HH IY1], [ T OW1 L D], [ AH0], [ B R IH1 T ...",[3],"HH IY1, T OW1 L D, AH0, B R IH1 T IH0 SH, ..."
4,5,"When he. returns to India, he will travel...",when he returns to india he will travel with g...,"[when, he, returns, to, india, he, will, trave...",when he returns to india he will travel with g...,"[[ W EH1 N], [ HH IY1], [ R IH0 T ER1 N Z], [ ...",[],"W EH1 N, HH IY1, R IH0 T ER1 N Z, T UW1, ..."


In [18]:
def count_digits(string):
    """Return how many items of `string` satisfy ``str.isdigit()``.

    For a plain string this is the number of digit characters in it.
    """
    digit_items = [item for item in string if item.isdigit()]
    return len(digit_items)

In [19]:
# Number of digit characters in each ARPAbet string.
df['ARPAcnt'] = df['ARPAsent'].map(count_digits)
df.head()

Unnamed: 0,id,sentence,clean,sep,words,list,No_CMU_words,ARPAsent,ARPAcnt
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ...",those promises have fallen apart india engulfe...,"[[ DH OW1 Z], [ P R AA1 M AH0 S AH0 Z], [ HH A...",[5],"DH OW1 Z, P R AA1 M AH0 S AH0 Z, HH AE1 V, ...",48
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi...",he has come under criticism for departing to l...,"[[ HH IY1], [ HH AE1 Z], [ K AH1 M], [ AH1 N D...",[],"HH IY1, HH AE1 Z, K AH1 M, AH1 N D ER0, K...",31
2,3,2021 He said he would return to India when he ...,2021 he said he would return to india when he ...,"[2021, he, said, he, would, return, to, india,...",two thousand and twenty one he said he would r...,"[[ T UW1], [ TH AW1 Z AH0 N D], [ AH0 N D], [ ...",[],"T UW1, TH AW1 Z AH0 N D, AH0 N D, T W EH1 ...",28
3,4,He told a British newspaper he had received ...,he told a british newspaper he had received th...,"[he, told, a, british, newspaper, he, had, rec...",he told a british newspaper he had received th...,"[[ HH IY1], [ T OW1 L D], [ AH0], [ B R IH1 T ...",[3],"HH IY1, T OW1 L D, AH0, B R IH1 T IH0 SH, ...",37
4,5,"When he. returns to India, he will travel...",when he returns to india he will travel with g...,"[when, he, returns, to, india, he, will, trave...",when he returns to india he will travel with g...,"[[ W EH1 N], [ HH IY1], [ R IH0 T ER1 N Z], [ ...",[],"W EH1 N, HH IY1, R IH0 T ER1 N Z, T UW1, ...",20


In [20]:
# Per sentence: sum of the regex-estimated counts for the words CMUdict lacked.
df['NUMsum'] = df['No_CMU_words'].apply(sum)
# Total = digit count of the ARPAbet string + the fallback estimates.
df['Total Syllable_CMU'] = df['ARPAcnt'].add(df['NUMsum'])
# Save the full table.
df.to_excel(excel_writer='syllable_cmu.xlsx')
df

Unnamed: 0,id,sentence,clean,sep,words,list,No_CMU_words,ARPAsent,ARPAcnt,NUMsum,Total Syllable_CMU
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ...",those promises have fallen apart india engulfe...,"[[ DH OW1 Z], [ P R AA1 M AH0 S AH0 Z], [ HH A...",[5],"DH OW1 Z, P R AA1 M AH0 S AH0 Z, HH AE1 V, ...",48,5,53
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi...",he has come under criticism for departing to l...,"[[ HH IY1], [ HH AE1 Z], [ K AH1 M], [ AH1 N D...",[],"HH IY1, HH AE1 Z, K AH1 M, AH1 N D ER0, K...",31,0,31
2,3,2021 He said he would return to India when he ...,2021 he said he would return to india when he ...,"[2021, he, said, he, would, return, to, india,...",two thousand and twenty one he said he would r...,"[[ T UW1], [ TH AW1 Z AH0 N D], [ AH0 N D], [ ...",[],"T UW1, TH AW1 Z AH0 N D, AH0 N D, T W EH1 ...",28,0,28
3,4,He told a British newspaper he had received ...,he told a british newspaper he had received th...,"[he, told, a, british, newspaper, he, had, rec...",he told a british newspaper he had received th...,"[[ HH IY1], [ T OW1 L D], [ AH0], [ B R IH1 T ...",[3],"HH IY1, T OW1 L D, AH0, B R IH1 T IH0 SH, ...",37,3,40
4,5,"When he. returns to India, he will travel...",when he returns to india he will travel with g...,"[when, he, returns, to, india, he, will, trave...",when he returns to india he will travel with g...,"[[ W EH1 N], [ HH IY1], [ R IH0 T ER1 N Z], [ ...",[],"W EH1 N, HH IY1, R IH0 T ER1 N Z, T UW1, ...",20,0,20


In [21]:
# Alternative: the "syllables" library estimates the count directly from the
# text. Its numbers differ from the CMUdict-based totals (compare the
# 'syllables' and 'Total Syllable_CMU' columns).
import syllables
df["syllables"] = df["words"].apply(syllables.estimate)
# No `encoding` argument: pandas deprecated it in 1.5 (it was unused) and
# pandas >= 2.0 rejects it.
df.to_excel('syllables_library.xlsx')
df

Unnamed: 0,id,sentence,clean,sep,words,list,No_CMU_words,ARPAsent,ARPAcnt,NUMsum,Total Syllable_CMU,syllables
0,1,"Those promises have fallen apart. India, e...",those promises have fallen apart india engulfe...,"[those, promises, have, fallen, apart, india, ...",those promises have fallen apart india engulfe...,"[[ DH OW1 Z], [ P R AA1 M AH0 S AH0 Z], [ HH A...",[5],"DH OW1 Z, P R AA1 M AH0 S AH0 Z, HH AE1 V, ...",48,5,53,60
1,2,He has come under criticism for departing to L...,he has come under criticism for departing to l...,"[he, has, come, under, criticism, for, departi...",he has come under criticism for departing to l...,"[[ HH IY1], [ HH AE1 Z], [ K AH1 M], [ AH1 N D...",[],"HH IY1, HH AE1 Z, K AH1 M, AH1 N D ER0, K...",31,0,31,31
2,3,2021 He said he would return to India when he ...,2021 he said he would return to india when he ...,"[2021, he, said, he, would, return, to, india,...",two thousand and twenty one he said he would r...,"[[ T UW1], [ TH AW1 Z AH0 N D], [ AH0 N D], [ ...",[],"T UW1, TH AW1 Z AH0 N D, AH0 N D, T W EH1 ...",28,0,28,30
3,4,He told a British newspaper he had received ...,he told a british newspaper he had received th...,"[he, told, a, british, newspaper, he, had, rec...",he told a british newspaper he had received th...,"[[ HH IY1], [ T OW1 L D], [ AH0], [ B R IH1 T ...",[3],"HH IY1, T OW1 L D, AH0, B R IH1 T IH0 SH, ...",37,3,40,42
4,5,"When he. returns to India, he will travel...",when he returns to india he will travel with g...,"[when, he, returns, to, india, he, will, trave...",when he returns to india he will travel with g...,"[[ W EH1 N], [ HH IY1], [ R IH0 T ER1 N Z], [ ...",[],"W EH1 N, HH IY1, R IH0 T ER1 N Z, T UW1, ...",20,0,20,21


In [22]:
#end.