# Extend Word2Vec Model with Phonemes 



In [None]:
import math
import random

## Load Word2Vec model trained with 100B words


from https://code.google.com/archive/p/word2vec/ 
### and wrap-it up in a ready-to-use class 


In [None]:
model = None # cache of Model 

In [None]:
import logging

logger = logging.getLogger(__name__)

def set_logging_as(a_level):
    logger.setLevel(a_level)

    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(module)s:%(lineno)d : %(funcName)s(%(threadName)s) : %(message)s')

#     ,
#         level=a_level)

# initialization: 
set_logging_as(logging.DEBUG)    

In [None]:
logging.getLevelName(logger.getEffectiveLevel())

In [None]:
set_logging_as(logging.DEBUG)
logger.info("lalala")

In [None]:
set_logging_as(logging.CRITICAL)
logger.info("lalala")

In [None]:
# create logger
alogger = logging.getLogger(__name__)
alogger.setLevel(logging.DEBUG)

# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
alogger.addHandler(ch)


In [None]:
logging.getLevelName(alogger.getEffectiveLevel())

In [None]:
alogger.setLevel(logging.DEBUG)

In [None]:
alogger.info("lala")

In [None]:
data_dir = "./data"

In [None]:
import gensim 
import bisect 
import numpy as np
from typing import List, Dict

class ModelWrapper():
        
    default_shelf_filename = 'shelf_from0_for2999999.shelf'
        
    def __init__(self, m, sounds_dict = None):
        if m is None:
            print("Loading model...")
            self.model = gensim.models.word2vec.KeyedVectors.load_word2vec_format('{}/GoogleNews-vectors-negative300.bin.gz'.format(data_dir), binary=True)
            print("Model succesfully loaded")
        else:
            print("[init] Model provided. If you want me to FORCE re-load it, call ModelWrapper's constructor with 'None'")
            self.model = m            
        # sort all the words in the model, so that we can auto-complete queries quickly
        print("Sort all the words in the model, so that we can auto-complete queries quickly...")
        self.orig_words = [gensim.utils.to_unicode(word) for word in self.model.index2word]
        indices = [i for i, _ in sorted(enumerate(self.orig_words), key=lambda item: item[1].lower())]
        self.all_words = [self.orig_words[i].lower() for i in indices]  # lowercased, sorted as lowercased
        self.orig_words = [self.orig_words[i] for i in indices]  # original letter casing, but sorted as if lowercased
        
        # sounds dictionary 
        if sounds_dict is None:
            print("Loading default sounds dictionary from '{}'...".format(self.default_shelf_filename))
            self.sounds_dict = shelve.open(self.default_shelf_filename, flag='r')  
            print("Sounds dictionary succesfully loaded")
        else:
            self.sounds_dict = sounds_dict
    
        
        
        
    def suggest(self, term):
        """
        For a given prefix, return 10 words that exist in the model start start with that prefix
        """
        prefix = gensim.utils.to_unicode(term).strip().lower()
        count = 10
        pos = bisect.bisect_left(self.all_words, prefix)
        result = self.orig_words[pos: pos + count]
        logger.info("suggested %r: %s" % (prefix, result))
        return result      
    
    def most_similar(self, positive, negative):
        """
            positive: an array of positive words
            negative: an array of negative words 
        """                
        try:
            result = self.model.most_similar(
                positive=[word.strip() for word in positive if word],
                negative=[word.strip() for word in negative if word],
                topn=5)
        except:
            result = []
        logger.info("similars for %s vs. %s: %s" % (positive, negative, result))
        return {'similars': result}    
    
    def vec_repr(self, word):
        """
            If 'word' belongs in the vocabulary, returns its 
            word2vec representation. Otherwise returns a vector of 0's
            of the same length of the other words. 
        """
        try:
            return self.model.word_vec(word)
        except KeyError:
            logger.debug("'{}' not in Model. Returning [0]'s vector.".format(word))
            return np.zeros(self.model.vector_size)
        
    def sound_to_word(self, a_sound: str) -> str:
        return self.sounds_dict[a_sound]
    # self.sound_repr(a_sound)["word"]

    def sound_to_vec(self, a_sound: str) -> str:
        return self.vec_repr(self.sound_to_word(a_sound))

    def sound_repr(self, a_sound: str) -> Dict:
        # w = self.sounds_dict[a_sound]
        return {'word': self.sound_to_word(a_sound), 'vec': self.sound_to_vec(a_sound)}  
    

In [None]:
mw = ModelWrapper(model)
model = mw.model # just cache in case I re-call this cell

In [None]:
idx = 67810
print(key_list[idx])
mw.sound_to_word(key_list[idx])

In [None]:
key_list = list(mw.sounds_dict.keys())

In [None]:
idx = 6780
print("'{}' is the sound of '{}'".format(key_list[idx], mw.sounds_dict[key_list[idx]]))

In [None]:
mw.sounds_dict["ju:"]
mw.sound_to_word("ju:")

In [None]:
mw.model.syn0.shape


In [None]:
type(mw.model.vocab.items())

In [None]:
# my_dictionary = {k: f(v) for k, v in my_dictionary.items()}
mw.model.index2word[:10]

In [None]:
mw.model['skill']

In [None]:
# def build_phonemes_dict(from_idx, how_many_words):
#     sent = ' '.join(mw.model.index2word[from_idx:from_idx + how_many_words])
#     print(sent)
#     array_phonemes =  graphs2phones(sent)
#     print(array_phonemes)
#     assert len(array_phonemes) == how_many_words, "(Have {} phoneme-strings, {} words) Looks like some words in vocab have phonemes-strings > 1".format(len(array_phonemes), how_many_words)
#     zz = list(zip(array_phonemes, sent.split())) #  list(zip(graphs2phones(sent), sent.split()))
#     # print(list(zz))
#     return {ph: w for (ph, w) in list(zz)}
#     # print(list(zz_filtered))

#### In order to avoid 'one sound for several words' effect, I break the words: 

In [None]:
-

### Tryout with Shelves

In [None]:
import shelve 

def graphemes_to_phonemes_to_shelves(words_in_sent, shelf_filename):
    """
        Takes a list of words and returns a list of tuples
        (grapheme: phoneme)
        Example:
        > graphemes_to_phonemes(["luis", "papa"])
        [('luis', "lj'u:Iz"), ('papa', "pa#p'A:")]
    """
    MAX_LENGTH_TO_SPEAK = 10 # if I give more than this, espeak fails to do a good job 
    # First step: generate all sounds of words as if they were "alone" (ie, not in a sentence)
    # We want to avoid a combination of words making only 1 sound
    # For example (depending on accent): "what's up?"
    # So in order to do that we'll introduce a word with a unique sound between the words, 
    # generate phonemes and then process them smartly: 
    # separator for words in sentence 
    separator = {"str": "XXX"}
    separator["sound"] = ''.join(graphs2phones(separator["str"]))    
    # 
    how_many_words = len(words_in_sent)
    num_batches = (how_many_words // MAX_LENGTH_TO_SPEAK) + int(how_many_words % MAX_LENGTH_TO_SPEAK != 0)
    result_dict = shelve.open(shelf_filename, flag='c')
    try:
        for i in range(num_batches):
            logger.debug("{}: {} to {}".format(i, i * MAX_LENGTH_TO_SPEAK, (i + 1)*MAX_LENGTH_TO_SPEAK))
            words_in_batch = words_in_sent[i * MAX_LENGTH_TO_SPEAK: (i + 1)*MAX_LENGTH_TO_SPEAK]
            logger.debug("words_in_batch = {}".format(words_in_batch))
            sent_augm = ' '.join([w1 + ' ' + w2 for w1, w2 in list(zip([separator["str"]]*len(words_in_batch), words_in_batch))]) + " " + separator["str"]
            logger.debug("sent_augm = {}".format(sent_augm))
            phonemes_strs_augm = graphs2phones(sent_augm)
            logger.debug("phonemes_strs_augm = {}".format(phonemes_strs_augm))
            # there we go: all (indexes of) sounds that we are interested in. 
            seps_idxs = [i for i,v in enumerate(phonemes_strs_augm) if v.endswith(separator["sound"])]
            logger.debug("seps_idxs = {}".format(seps_idxs))
            how_many_separators = len(seps_idxs)
            logger.debug("how_many_separators = {}".format(how_many_separators))

            all_sounds = list(map(
                lambda t: ' '.join(phonemes_strs_augm[t[0] + 1: t[1]]),
                list(zip(seps_idxs[:-1], seps_idxs[1:]))))
            logger.debug("all sounds = {}".format(all_sounds))
            result_for_batch = list(zip(words_in_batch, all_sounds))
            for word, sound in result_for_batch:
                result_dict[sound] = word 
            result_dict.sync()
    finally:
        logger.info("Closing shelf '{}'".format(shelf_filename))
        result_dict.close()


In [None]:
len(mw.model.index2word)

In [None]:
# alogger.setLevel(logging.DEBUG) # very verbose 
alogger.setLevel(logging.WARNING) # very quiet 
start = 0
n = len(mw.model.index2word) - 1
# (graphemes_and_phonemes, secs) = take_time("graphemes_to_phonemes(mw.model.index2word[{}:{}])".format(start, start + n))
# # (graphemes_and_phonemes, secs) = take_time("graphemes_to_phonemes(['New_York', 'luis', 'papa', 'New_York', 'luis'])".format(start, start + n))
# print("It took {} secs to generate {} phonemes strings".format(secs, n))
# # print(graphemes_and_phonemes)
shelf_filename = "shelf_from{}_for{}.shelf".format(start, n)
(_, secs) = take_time("graphemes_to_phonemes_to_shelves(mw.model.index2word[{}:{}], '{}')".format(start, start + n, shelf_filename))
print("[dict] It took {} secs to generate {} phonemes strings".format(secs, n))
# print(as_dict)


In [None]:
graphemes_to_phonemes('"hey angel  you duh sexy"'.split())

In [None]:
shelf_filename

In [None]:
# shelf_filename = 'shelf_from0_for2999999.shelf'
result_dict = shelve.open(shelf_filename, flag='r')

In [None]:
key_list = list(result_dict.keys())

In [None]:
key_list[0]

In [None]:
result_dict[key_list[2]]

In [None]:
result_dict.close()

### END TRYOUT 

In [None]:
graphemes_to_phonemes(["luis", "papa"])

In [None]:
d1 = {"1": "uno"}
d2 = {"111": "uno11"}
{**d1, **d2}

In [None]:
n = 23
s = list(range(n)) 
len(s) // 10

In [None]:
s[20:30]

In [None]:
for i in range(3):
    print("{}: {} to {}".format(i, i * 10, (i + 1)*10))

In [None]:
alogger.setLevel(logging.DEBUG)
import random
import string 
def get_random_string(N: int) -> string: 
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=N))
sss = ' '.join(["XXX", "it", "XXX", "be", "XXX", "from", "XXX", "by", "XXX", "are", "XXX", "I", "XXX", "have", "XXX", "he", "XXX", "will", "XXX", "has", "XXX", "####", "XXX", "his", "XXX", "an", "XXX", "this", "XXX", "or", "XXX", "their", "XXX", "who", "XXX", "they", "XXX", "but", "XXX", "$"])
sss = ' '.join([get_random_string(N = 5)] * 30)
sss = ' '.join(["|", "it", "|", "be", "|", "from", "|", "by", "|", "are", "|", "I", "|", "have", "|", "he", "|", "will", "|", "has", "|", "####", "|", "his", "|", "an", "|", "this", "|", "or", "|", "their", "|", "who", "|", "they", "|", "but", "|", "$"])
print(sss)
phons = graphs2phones(sss)
print(phons)
print("len of orig sentence: strings = {}, chars = {}\nlen of sounds = {}".format(len(sss.split()), len(sss), len(phons)))


In [None]:
sss = ' '.join(["it", "be", "from", "by", "are", "I", "have", "he", "will", "has", "####", "his", "an", "this", "or", "their", "who", "they", "but", "$"])
print(sss)
phons = graphs2phones(sss)
print(phons)
print("len of orig sentence: strings = {}, chars = {}\nlen of sounds = {}".format(len(sss.split()), len(sss), len(phons)))


In [None]:
# alogger.setLevel(logging.DEBUG) # very verbose 
alogger.setLevel(logging.WARNING) # very quiet 
start = 55
n = 3000
# (graphemes_and_phonemes, secs) = take_time("graphemes_to_phonemes(mw.model.index2word[{}:{}])".format(start, start + n))
# # (graphemes_and_phonemes, secs) = take_time("graphemes_to_phonemes(['New_York', 'luis', 'papa', 'New_York', 'luis'])".format(start, start + n))
# print("It took {} secs to generate {} phonemes strings".format(secs, n))
# # print(graphemes_and_phonemes)
(as_dict, secs) = take_time("dict_phonemes_to_graphemes(mw.model.index2word[{}:{}])".format(start, start + n))
print("[dict] It took {} secs to generate {} phonemes strings".format(secs, n))
# print(as_dict)


In [None]:
len(mw.model.index2word)

In [None]:
as_dict = dict_phonemes_to_graphemes(mw.model.index2word)

In [None]:
import pickle
def save_dict_to(the_dict, dict_file_name):
    with open(dict_file_name, 'wb') as f:
        pickle.dump(the_dict, f)

def load_dict_from(dict_file_name) -> dict:
    with open(dict_file_name, 'rb') as f:
        return pickle.load(f)

In [None]:
f_name = "blabla.pickle"
save_dict_to(as_dict, dict_file_name = f_name)


In [None]:
as_dict_2 = load_dict_from(dict_file_name = f_name)

In [None]:
as_dict == as_dict_2

In [None]:
for w in mw.model.index2word[0:300]:
    phs = graphs2phones(w)
    if (len(phs) > 1):
        print("'{}' => {} (length {})".format(w, phs, len(phs)))
    else:
        if (len(phs) == 0):
            print("'{}' has no phonemes".format(w))

### Migrate this 

In [5]:
import logging
from subprocess import check_output
from timeit import default_timer as timer
import functools
import shelve


class PhonemesFromGraphemes:

    MAX_LENGTH_TO_SPEAK = 10  # if I give more than this, espeak fails to do a good job

    def __init__(self, a_logger = None):
        print("hello")
        if a_logger is None:
            # create logger
            self.alogger = logging.getLogger(__name__)
            if not len(self.alogger.handlers):
                self.alogger.setLevel(logging.DEBUG)

                # create console handler and set level to debug
                ch = logging.StreamHandler()
                ch.setLevel(logging.DEBUG)

                # create formatter
                formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

                # add formatter to ch
                ch.setFormatter(formatter)
                # add ch to logger
                self.alogger.addHandler(ch)
            print("Logger created")
        else:
            self.alogger = a_logger
            print("Logger copied")

    def set_log_level(self, log_level):
        """
            log_level: one of logging.{WARNING, ...} 

        """
        self.alogger.setLevel(log_level)

    def graphs2phones(self, s):
        """
            Graphemes to Phonemes: 
            Takes a sentence, returns an array of graphemes strings (one per number of words in original sentence)
            Example(s): 
            > graphs2phones('hello world bla and ble')
            > graphs2phones(' wasuuuppp!')
        """
        phs = check_output(["/usr/local/bin/speak", "-q", "-x" ,'-v', 'en-us' ,s]).decode('utf-8')
        self.alogger.debug("Return {} strings: {}".format(len(phs.split()), phs))
        return [w for w in phs.strip().split(" ") if w != ' ']

    def take_time(self, code_snippet_as_string):
        """
            Measures the time it takes to execute the code snippet
            provided as string. 
            Returns: the value of the invocation + number of seconds it took. 
            Example(s): 
            > r, secs = take_time("2 + 2")
            > print("result = {}, time = {} secs".format(r, secs))
        """
        start = timer()
        r = eval(code_snippet_as_string)
        end = timer()
        return (r, end - start)

    def graphemes_to_phonemes(self, words_in_sent):
        """
            Takes a list of words and returns a list of tuples
            (grapheme: phoneme)
            Example:
            > graphemes_to_phonemes(["luis", "papa"])
            [('luis', "lj'u:Iz"), ('papa', "pa#p'A:")]
        """
        # First step: generate all sounds of words as if they were "alone" (ie, not in a sentence)
        # We want to avoid a combination of words making only 1 sound
        # For example (depending on accent): "what's up?"
        # So in order to do that we'll introduce a word with a unique sound between the words,
        # generate phonemes and then process them smartly:
        # separator for words in sentence
        separator = {"str": "XXX"}
        separator["sound"] = ''.join(self.graphs2phones(separator["str"]))
        #
        how_many_words = len(words_in_sent)
        num_batches = (how_many_words // self.MAX_LENGTH_TO_SPEAK) + int(how_many_words % self.MAX_LENGTH_TO_SPEAK != 0)
        result_array = [] # {}
        for i in range(num_batches):
            self.alogger.debug("{}: {} to {}".format(i, i * self.MAX_LENGTH_TO_SPEAK, (i + 1 ) *self.MAX_LENGTH_TO_SPEAK))
            words_in_batch = words_in_sent[i * self.MAX_LENGTH_TO_SPEAK: (i + 1 ) *self.MAX_LENGTH_TO_SPEAK]
            self.alogger.debug("words_in_batch = {}".format(words_in_batch))
            sent_augm = ' '.join \
                ([w1 + ' ' + w2 for w1, w2 in list(zip([separator["str"] ] *len(words_in_batch), words_in_batch))]) + " " + separator["str"]
            self.alogger.debug("sent_augm = {}".format(sent_augm))
            phonemes_strs_augm = self.graphs2phones(sent_augm)
            self.alogger.debug("phonemes_strs_augm = {}".format(phonemes_strs_augm))
            # there we go: all (indexes of) sounds that we are interested in.
            seps_idxs = [i for i ,v in enumerate(phonemes_strs_augm) if v.endswith(separator["sound"])]
            self.alogger.debug("seps_idxs = {}".format(seps_idxs))
            how_many_separators = len(seps_idxs)
            self.alogger.debug("how_many_separators = {}".format(how_many_separators))

            all_sounds = list(map(
                lambda t: ' '.join(phonemes_strs_augm[t[0] + 1: t[1]]),
                list(zip(seps_idxs[:-1], seps_idxs[1:]))))
            self.alogger.debug("all sounds = {}".format(all_sounds))
            result_array += list(zip(words_in_batch, all_sounds))
        return result_array


    def dict_graphemes_to_phonemes(self, words_in_sent) -> dict:
        as_phon_graph_list = self.graphemes_to_phonemes(words_in_sent)
        return {ph: graph for (graph, ph) in as_phon_graph_list}


    def graphemes_to_phonemes_to_shelves(self, words_in_sent, shelf_filename):
        """
            Takes a list of words and returns a list of tuples
            (grapheme: phoneme)
            Example:
            > graphemes_to_phonemes(["luis", "papa"])
            [('luis', "lj'u:Iz"), ('papa', "pa#p'A:")]
        """
        # let's do this in batches:
        how_many_words = len(words_in_sent)
        num_batches = (how_many_words // self.MAX_LENGTH_TO_SPEAK) + int(how_many_words % self.MAX_LENGTH_TO_SPEAK != 0)
        result_dict = shelve.open(shelf_filename, flag='c')
        try:
            for i in range(num_batches):
                self.alogger.debug("batch {} out of {}: {} to {}".format(i + 1, num_batches, i * self.MAX_LENGTH_TO_SPEAK, (i + 1)*self.MAX_LENGTH_TO_SPEAK))
                words_in_batch = words_in_sent[i * self.MAX_LENGTH_TO_SPEAK: (i + 1)*self.MAX_LENGTH_TO_SPEAK]
                result_for_batch = self.graphemes_to_phonemes(words_in_batch)
                self.alogger.debug("[{}] result_for_batch = '{}'".format(i, result_for_batch))
                #
                for word, sound in result_for_batch:
                    ex_word = result_dict.get(sound)
                    if ex_word is None:
                        self.alogger.debug("For sound '{}' the dict is empty".format(sound))
                    else:
                        self.alogger.debug("For sound '{}' the dict already has => {}".format(sound, ex_word))
                    print((ex_word if ex_word is not None else []) + [word])
                    result_dict[sound] = (ex_word if ex_word is not None else []) + [word]
                    self.alogger.debug("After inserting word '{}' => '{}' :: {}".format(word, sound, result_dict[sound]))
                result_dict.sync()
        finally:
            self.alogger.info("Closing shelf '{}'".format(shelf_filename))
            result_dict.close()



In [6]:
pg = PhonemesFromGraphemes()

hello
Logger created


In [7]:
len(pg.alogger.handlers)

1

In [4]:
pg.graphemes_to_phonemes_to_shelves(words_in_sent = "you you you _You".split(), shelf_filename = "delete_this_shelf")

2017-06-05 06:12:57,183 - __main__ - DEBUG - batch 1 out of 1: 0 to 10
2017-06-05 06:12:57,198 - __main__ - DEBUG - Return 1 strings:  ,Eks,Eks'Eks

2017-06-05 06:12:57,199 - __main__ - DEBUG - 0: 0 to 10
2017-06-05 06:12:57,200 - __main__ - DEBUG - words_in_batch = ['you', 'you', 'you', '_You']
2017-06-05 06:12:57,201 - __main__ - DEBUG - sent_augm = XXX you XXX you XXX you XXX _You XXX
2017-06-05 06:12:57,220 - __main__ - DEBUG - Return 9 strings:  ,Eks,Eks'Eks ju: ,Eks,Eks'Eks ju: ,Eks,Eks'Eks ju: ,Eks,Eks'Eks ju: ,Eks,Eks'Eks

2017-06-05 06:12:57,221 - __main__ - DEBUG - phonemes_strs_augm = [",Eks,Eks'Eks", 'ju:', ",Eks,Eks'Eks", 'ju:', ",Eks,Eks'Eks", 'ju:', ",Eks,Eks'Eks", 'ju:', ",Eks,Eks'Eks"]
2017-06-05 06:12:57,222 - __main__ - DEBUG - seps_idxs = [0, 2, 4, 6, 8]
2017-06-05 06:12:57,222 - __main__ - DEBUG - how_many_separators = 5
2017-06-05 06:12:57,223 - __main__ - DEBUG - all sounds = ['ju:', 'ju:', 'ju:', 'ju:']
2017-06-05 06:12:57,224 - __main__ - DEBUG - [0] result_for

['you', 'you', 'you', '_You', 'you', 'you', 'you', '_You', 'you']
['you', 'you', 'you', '_You', 'you', 'you', 'you', '_You', 'you', 'you']
['you', 'you', 'you', '_You', 'you', 'you', 'you', '_You', 'you', 'you', 'you']
['you', 'you', 'you', '_You', 'you', 'you', 'you', '_You', 'you', 'you', 'you', '_You']
