## Pronouncing Dictionaries

Map all encoded and recalled words to their phoneme breakdowns.

In [53]:
# imports
import cmlreaders as cml
import numpy as np
import pandas as pd; pd.set_option('display.max_columns', None)
import json
from tqdm.notebook import tqdm
import warnings; warnings.filterwarnings("ignore")
import nltk
from nltk.corpus import cmudict

# download the CMU Pronouncing Dictionary corpus so that `cmudict` can be read below
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to
[nltk_data]     /home1/hherrema/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [54]:
def find_all_words(df_select):
    """Collect every unique word presented or recalled in the selected sessions.

    Parameters
    ----------
    df_select : pandas.DataFrame
        One row per session; each row must provide ``subject``, ``experiment``,
        ``session``, ``localization`` and ``montage``.

    Returns
    -------
    numpy.ndarray
        Sorted unique items taken from all 'WORD' and 'REC_WORD' events.

    Notes
    -----
    Best effort: a session whose events cannot be loaded or parsed is skipped.
    """
    all_words = []
    # iterrows() is a generator without a length, so give tqdm the total explicitly
    for _, row in tqdm(df_select.iterrows(), total=len(df_select)):
        reader = cml.CMLReader(row.subject, row.experiment, row.session, row.localization, row.montage)

        try:
            evs = reader.load('events')

            # select encoding and recall events
            evs = evs[evs['type'].isin(['WORD', 'REC_WORD'])]

            # pyFR stores the word under 'item'; the other experiments use 'item_name'
            item_col = 'item' if row.experiment == 'pyFR' else 'item_name'
            all_words.extend(list(evs[item_col].unique()))

        except Exception:
            # Exception (not BaseException) so that a keyboard/kernel interrupt
            # still stops the loop instead of being swallowed as a bad session
            continue

    return np.unique(all_words)

In [55]:
def build_dictionary(all_words):
    """Map each word found in the CMU Pronouncing Dictionary to its phonemes.

    Lookups use the lowercased word; keys of the returned dict are the
    uppercased word. Words with no CMU entry are left out.
    """
    cmu_lookup = cmudict.dict()

    pronouncing_cml = {}
    for token in tqdm(all_words):
        breakdown = cmu_lookup.get(token.lower())
        if not breakdown:
            # no pronunciation available for this word
            continue
        pronouncing_cml[token.upper()] = breakdown

    return pronouncing_cml

In [56]:
# index of all available sessions
df = cml.get_data_index()

# intracranial sessions: FR1, pyFR, IFR1
df_intrac = df[df['experiment'].isin(['FR1', 'pyFR', 'IFR1'])]

# scalp sessions: ltpFR2, leaving out session 23
df_scalp = df[(df.session != 23) & (df.experiment == 'ltpFR2')]

# stack both selections into a single frame with a fresh index
df_select = pd.concat([df_intrac, df_scalp], ignore_index=True)

In [57]:
# unique encoded and recalled words across all selected sessions
all_words = find_all_words(df_select)

0it [00:00, ?it/s]

In [58]:
# uppercase word -> CMU phoneme breakdowns, for words that have a CMU entry
pronouncing_cml = build_dictionary(all_words)

  0%|          | 0/4594 [00:00<?, ?it/s]

#### Manual Inclusions

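"Chipmunk", "Dustpan", and "Nightgown" are in the ltpFR2 wordpool but have no phoneme breakdown in the CMU Pronouncing Dictionary. Their entries are built manually below by joining the phonemes of their component words.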

In [59]:
# check whether the three compound words received an entry
[x in pronouncing_cml for x in ['CHIPMUNK', 'DUSTPAN', 'NIGHTGOWN']]

[False, False, False]

In [60]:
# load in the CMU Pronouncing Dictionary at top level
# (build_dictionary only holds its copy in a local variable)
pronouncing_dict = cmudict.dict()

In [61]:
# look up the two component words of "chipmunk"
pronouncing_dict.get('chip'), pronouncing_dict.get('munk')

([['CH', 'IH1', 'P']], [['M', 'AH1', 'NG', 'K']])

In [62]:
# manual entry: phonemes of 'chip' followed by 'munk', wrapped in an outer list
# like the CMU entries (one inner list per pronunciation)
# NOTE(review): both vowels keep primary stress (IH1, AH1) -- confirm this is acceptable downstream
pronouncing_cml['CHIPMUNK'] = [['CH', 'IH1', 'P', 'M', 'AH1', 'NG', 'K']]

In [63]:
# look up the two component words of "dustpan"
pronouncing_dict.get('dust'), pronouncing_dict.get('pan')

([['D', 'AH1', 'S', 'T']], [['P', 'AE1', 'N']])

In [64]:
# manual entry: phonemes of 'dust' followed by 'pan', wrapped in an outer list
# like the CMU entries (one inner list per pronunciation)
# NOTE(review): both vowels keep primary stress (AH1, AE1) -- confirm this is acceptable downstream
pronouncing_cml['DUSTPAN'] = [['D', 'AH1', 'S', 'T', 'P', 'AE1', 'N']]

In [65]:
# look up the two component words of "nightgown"
pronouncing_dict.get('night'), pronouncing_dict.get('gown')

([['N', 'AY1', 'T']], [['G', 'AW1', 'N']])

In [66]:
# manual entry: phonemes of 'night' followed by 'gown', wrapped in an outer list
# like the CMU entries (one inner list per pronunciation)
# NOTE(review): both vowels keep primary stress (AY1, AW1) -- confirm this is acceptable downstream
pronouncing_cml['NIGHTGOWN'] = [['N', 'AY1', 'T', 'G', 'AW1', 'N']]

In [67]:
# save the word -> phoneme mapping as JSON in the working directory
with open('pronouncing_cml.json', 'w') as f:
    f.write(json.dumps(pronouncing_cml))