## Pronouncing Dictionaries

Map every encoded and recalled word to its phoneme breakdown, using the CMU Pronouncing Dictionary.

In [1]:
# imports
import cmlreaders as cml
import numpy as np
import pandas as pd; pd.set_option('display.max_columns', None)
import json
from tqdm.notebook import tqdm
import warnings; warnings.filterwarnings("ignore")
import nltk
from nltk.corpus import cmudict

# download CMU Pronouncing Dictionary
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to
[nltk_data]     /home1/hherrema/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [2]:
# load the 'ltp' data index (one row per subject/experiment/session)
df = cml.get_data_index('ltp')

# keep only ltpFR2 sessions, leaving out session 23
# NOTE(review): the reason session 23 is excluded is not recorded in this notebook — TODO document
df_select = df.loc[df['experiment'].eq('ltpFR2') & df['session'].ne(23)]
df_select

Unnamed: 0,all_events,experiment,import_type,math_events,original_session,session,subject,subject_alias,task_events
487,protocols/ltp/subjects/LTP093/experiments/ltpF...,ltpFR2,build,protocols/ltp/subjects/LTP093/experiments/ltpF...,0,0,LTP093,LTP093,protocols/ltp/subjects/LTP093/experiments/ltpF...
488,protocols/ltp/subjects/LTP093/experiments/ltpF...,ltpFR2,build,protocols/ltp/subjects/LTP093/experiments/ltpF...,1,1,LTP093,LTP093,protocols/ltp/subjects/LTP093/experiments/ltpF...
489,protocols/ltp/subjects/LTP093/experiments/ltpF...,ltpFR2,build,protocols/ltp/subjects/LTP093/experiments/ltpF...,10,10,LTP093,LTP093,protocols/ltp/subjects/LTP093/experiments/ltpF...
490,protocols/ltp/subjects/LTP093/experiments/ltpF...,ltpFR2,build,protocols/ltp/subjects/LTP093/experiments/ltpF...,11,11,LTP093,LTP093,protocols/ltp/subjects/LTP093/experiments/ltpF...
491,protocols/ltp/subjects/LTP093/experiments/ltpF...,ltpFR2,build,protocols/ltp/subjects/LTP093/experiments/ltpF...,12,12,LTP093,LTP093,protocols/ltp/subjects/LTP093/experiments/ltpF...
...,...,...,...,...,...,...,...,...,...
6526,protocols/ltp/subjects/LTP393/experiments/ltpF...,ltpFR2,build,protocols/ltp/subjects/LTP393/experiments/ltpF...,5,5,LTP393,LTP393,protocols/ltp/subjects/LTP393/experiments/ltpF...
6527,protocols/ltp/subjects/LTP393/experiments/ltpF...,ltpFR2,build,protocols/ltp/subjects/LTP393/experiments/ltpF...,6,6,LTP393,LTP393,protocols/ltp/subjects/LTP393/experiments/ltpF...
6528,protocols/ltp/subjects/LTP393/experiments/ltpF...,ltpFR2,build,protocols/ltp/subjects/LTP393/experiments/ltpF...,7,7,LTP393,LTP393,protocols/ltp/subjects/LTP393/experiments/ltpF...
6529,protocols/ltp/subjects/LTP393/experiments/ltpF...,ltpFR2,build,protocols/ltp/subjects/LTP393/experiments/ltpF...,8,8,LTP393,LTP393,protocols/ltp/subjects/LTP393/experiments/ltpF...


In [6]:
def find_all_words(df_select):
    """
    Collect every unique word that was presented or recalled in the selected sessions.

    Parameters
    ----------
    df_select : pd.DataFrame
        Data-index rows; each row must provide `subject`, `experiment` and `session`.

    Returns
    -------
    pd.DataFrame
        One row per unique `item_name`, with columns `type` and `item_name`, ordered
        by first appearance.  `type` is 'WORD' if the item was presented in any
        loaded session (even if it was first seen as a recall), otherwise 'REC_WORD'.
        An empty frame with the same columns is returned when nothing could be loaded.

    Notes
    -----
    Sessions whose events cannot be loaded are skipped (best effort); the number of
    skipped sessions is printed once at the end.
    """
    columns = ['type', 'item_name']
    session_words = []
    skipped = []
    for _, row in tqdm(df_select.iterrows(), total=len(df_select)):
        reader = cml.CMLReader(row.subject, row.experiment, row.session)

        try:
            evs = reader.load('events')

            # select encoding and recall events
            evs = evs.loc[evs['type'].isin(['WORD', 'REC_WORD']), columns]
        except Exception:
            # Exception (not BaseException) so that interrupting the kernel still works
            skipped.append((row.subject, row.session))
            continue

        # de-duplicate on (type, item_name) so that a word which is both recalled and
        # presented in this session keeps its 'WORD' row
        session_words.append(evs.drop_duplicates())

    if skipped:
        print(f'find_all_words: skipped {len(skipped)} session(s) whose events could not be loaded')

    # pd.concat raises on an empty list, so handle "nothing loaded" explicitly
    if not session_words:
        return pd.DataFrame(columns=columns)

    combined = pd.concat(session_words, ignore_index=True)

    # a word counts as presented if it has a 'WORD' event anywhere, regardless of
    # whether its first appearance happened to be a recall (e.g. an intrusion)
    presented = set(combined.loc[combined['type'] == 'WORD', 'item_name'])

    unique = combined.drop_duplicates(subset='item_name').reset_index(drop=True)
    unique.loc[unique['item_name'].isin(presented), 'type'] = 'WORD'
    return unique

In [7]:
# gather the unique presented / recalled words across every selected session
all_words = find_all_words(df_select)

0it [00:00, ?it/s]

In [8]:
def build_dictionary(all_words):
    """
    Map each word to its CMU Pronouncing Dictionary entry.

    Parameters
    ----------
    all_words : pd.DataFrame
        Must provide the columns `type` ('WORD' or 'REC_WORD') and `item_name`.

    Returns
    -------
    pronouncing_cml : dict
        Upper-cased word -> the dictionary's entry for that word (a list of
        pronunciations, each a list of phoneme strings).
    manual_add : np.ndarray
        Sorted, upper-cased presented words ('WORD' rows) that have no dictionary
        entry and therefore need a manual pronunciation.  Always a string array,
        including when it is empty.  Recalled-only words without an entry are dropped.
    """
    # load in the CMU Pronouncing Dictionary
    pronouncing_dict = cmudict.dict()

    pronouncing_cml = {}
    manual_add = []
    pairs = zip(all_words['type'], all_words['item_name'])
    for word_type, item_name in tqdm(pairs, total=len(all_words)):
        # skip missing / non-text items (e.g. NaN) instead of crashing on .lower()
        if not isinstance(item_name, str):
            continue

        phonemes = pronouncing_dict.get(item_name.lower())
        if phonemes:
            pronouncing_cml[item_name.upper()] = phonemes
        elif word_type == 'WORD':
            manual_add.append(item_name.upper())

    # explicit str dtype: np.sort([]) would yield a float64 array when nothing is missing
    return pronouncing_cml, np.array(sorted(manual_add), dtype=str)

In [9]:
# build the word -> phonemes mapping; manual_add lists the presented words
# that have no entry in the CMU dictionary
pronouncing_cml, manual_add = build_dictionary(all_words)
manual_add

0it [00:00, ?it/s]

array(['CHIPMUNK', 'DUSTPAN', 'NIGHTGOWN'], dtype='<U9')

#### Manual Inclusions

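"Chipmunk", "Dustpan", and "Nightgown" are in the ltpFR2 wordpool but have no entry in the CMU Pronouncing Dictionary. Their phoneme breakdowns are built manually below by concatenating the pronunciations of their component words.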

In [10]:
# load in the CMU Pronouncing Dictionary (word -> list of pronunciations),
# used below to look up the component words of the missing compounds
pronouncing_dict = cmudict.dict()

In [11]:
# look up the two halves of "chipmunk" separately
pronouncing_dict.get('chip'), pronouncing_dict.get('munk')

([['CH', 'IH1', 'P']], [['M', 'AH1', 'NG', 'K']])

In [12]:
# manual entry: phonemes of 'chip' followed by phonemes of 'munk'
# NOTE(review): both vowels keep primary stress ('IH1', 'AH1') because the halves are
# concatenated as-is; stress digits are part of the phoneme strings, so confirm this
# is acceptable for any downstream comparison of phoneme sets
pronouncing_cml['CHIPMUNK'] = [['CH', 'IH1', 'P', 'M', 'AH1', 'NG', 'K']]

In [13]:
# look up the two halves of "dustpan" separately
pronouncing_dict.get('dust'), pronouncing_dict.get('pan')

([['D', 'AH1', 'S', 'T']], [['P', 'AE1', 'N']])

In [14]:
# manual entry: phonemes of 'dust' followed by phonemes of 'pan'
# NOTE(review): both vowels keep primary stress ('AH1', 'AE1') — confirm this is intended
pronouncing_cml['DUSTPAN'] = [['D', 'AH1', 'S', 'T', 'P', 'AE1', 'N']]

In [15]:
# look up the two halves of "nightgown" separately
pronouncing_dict.get('night'), pronouncing_dict.get('gown')

([['N', 'AY1', 'T']], [['G', 'AW1', 'N']])

In [16]:
# manual entry: phonemes of 'night' followed by phonemes of 'gown'
# NOTE(review): both vowels keep primary stress ('AY1', 'AW1') — confirm this is intended
pronouncing_cml['NIGHTGOWN'] = [['N', 'AY1', 'T', 'G', 'AW1', 'N']]

In [18]:
# persist the word -> phonemes mapping as JSON in the working directory
with open('pronouncing_cml.json', 'w') as out_file:
    out_file.write(json.dumps(pronouncing_cml))

#### Examples for Paper

In [23]:
# alliterative = same 1st phoneme
# the displayed tuple is (Jaccard similarity of the two phoneme sets, shared phonemes, all phonemes)
shark, shoulder = (pronouncing_cml.get(word) for word in ('SHARK', 'SHOULDER'))
print(shark, shoulder)

# only the first listed pronunciation of each word is compared
p1, p2 = set(shark[0]), set(shoulder[0])

len(p1 & p2) / len(p1 | p2), p1 & p2, p1 | p2

[['SH', 'AA1', 'R', 'K']] [['SH', 'OW1', 'L', 'D', 'ER0']]


(0.125, {'SH'}, {'AA1', 'D', 'ER0', 'K', 'L', 'OW1', 'R', 'SH'})

#### Rhyme

In [44]:
# rhyme = same last 2 phonemes
# the displayed tuple is (Jaccard similarity of the two phoneme sets, shared phonemes, all phonemes)
palace, waitress = (pronouncing_cml.get(word) for word in ('PALACE', 'WAITRESS'))
print(palace, waitress)

# only the first listed pronunciation of each word is compared
p1, p2 = set(palace[0]), set(waitress[0])

len(p1 & p2) / len(p1 | p2), p1 & p2, p1 | p2

[['P', 'AE1', 'L', 'AH0', 'S']] [['W', 'EY1', 'T', 'R', 'AH0', 'S']]


(0.2222222222222222,
 {'AH0', 'S'},
 {'AE1', 'AH0', 'EY1', 'L', 'P', 'R', 'S', 'T', 'W'})

#### Middle

In [3]:
# the displayed tuple is (Jaccard similarity of the two phoneme sets, shared phonemes, all phonemes)
# NOTE(review): presumably "middle" means the shared phonemes are not confined to the
# word onset or ending — confirm the intended definition
doughnut, notebook = (pronouncing_cml.get(word) for word in ('DOUGHNUT', 'NOTEBOOK'))
print(doughnut, notebook)

# only the first listed pronunciation of each word is compared
p1, p2 = set(doughnut[0]), set(notebook[0])

len(p1 & p2) / len(p1 | p2), p1 & p2, p1 | p2

[['D', 'OW1', 'N', 'AH2', 'T']] [['N', 'OW1', 'T', 'B', 'UH2', 'K']]


(0.375, {'N', 'OW1', 'T'}, {'AH2', 'B', 'D', 'K', 'N', 'OW1', 'T', 'UH2'})