# Extend Word2Vec Model with Phonemes 



In [22]:
import math
import random

## Load the Word2Vec model trained on 100B words

Pre-trained vectors from https://code.google.com/archive/p/word2vec/

### ...and wrap it up in a ready-to-use class


In [1]:
model = None # cache of Model 

In [2]:
import logging

logger = logging.getLogger(__name__)

def set_logging_as(a_level):
    """
        Set the threshold of the notebook-level `logger` to `a_level`
        (for instance logging.DEBUG or logging.CRITICAL).
    """
    logger.setLevel(level=a_level)

    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(module)s:%(lineno)d : %(funcName)s(%(threadName)s) : %(message)s')

#     ,
#         level=a_level)

# initialization: 
set_logging_as(logging.DEBUG)    

In [3]:
logging.getLevelName(logger.getEffectiveLevel())

'DEBUG'

In [4]:
set_logging_as(logging.DEBUG)
logger.info("lalala")

In [5]:
set_logging_as(logging.CRITICAL)
logger.info("lalala")

In [6]:
def ensure_console_handler(target_logger, fmt):
    """
        Make sure `target_logger` has exactly one plain console StreamHandler,
        set to DEBUG and using the formatter `fmt`, and return that handler.

        logging.getLogger(name) hands back the same logger object on every call,
        so blindly calling addHandler each time this cell is re-run stacks
        handlers and every message gets printed once per stacked handler.
        Reusing the handler that is already attached keeps the cell idempotent.
    """
    for attached in target_logger.handlers:
        # exact type check: do not hijack FileHandler & co., which subclass StreamHandler
        if type(attached) is logging.StreamHandler:
            handler = attached
            break
    else:
        handler = logging.StreamHandler()
        target_logger.addHandler(handler)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(fmt)
    return handler


# create logger
alogger = logging.getLogger(__name__)
alogger.setLevel(logging.DEBUG)

# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# create (or reuse) the console handler, set to debug level, with the formatter above
ch = ensure_console_handler(alogger, formatter)


In [7]:
logging.getLevelName(alogger.getEffectiveLevel())

'DEBUG'

In [8]:
alogger.setLevel(logging.DEBUG)

In [9]:
alogger.info("lala")

2017-05-29 13:41:39,943 - __main__ - INFO - lala


In [12]:
data_dir = "./data"

In [13]:
import gensim 
import bisect 
import numpy as np

class ModelWrapper():
    """
        Convenience wrapper around a gensim word2vec KeyedVectors model.

        Attributes set by the constructor:
          model      -- the wrapped gensim model
          orig_words -- vocabulary in original casing, ordered by lowercased form
          all_words  -- the same vocabulary lowercased, in the same order
                        (this is the list that gets binary-searched by `suggest`)
    """

    def __init__(self, m):
        """
            m: an already loaded model to wrap, or None to load
               'GoogleNews-vectors-negative300.bin.gz' from `data_dir`.
        """
        if m is None:
            print("Loading model...")
            self.model = gensim.models.word2vec.KeyedVectors.load_word2vec_format('{}/GoogleNews-vectors-negative300.bin.gz'.format(data_dir), binary=True)
            print("Model succesfully loaded")
        else:
            print("[init] Model provided. If you want me to FORCE re-load it, call ModelWrapper's constructor with 'None'")
            self.model = m
        # sort all the words in the model, so that we can auto-complete queries quickly
        print("Sort all the words in the model, so that we can auto-complete queries quickly...")
        vocabulary = [gensim.utils.to_unicode(word) for word in self.model.index2word]
        # sorted() is stable, so words that only differ by casing keep their model order
        self.orig_words = sorted(vocabulary, key=lambda word: word.lower())  # original letter casing, but sorted as if lowercased
        self.all_words = [word.lower() for word in self.orig_words]  # lowercased, sorted as lowercased

    def suggest(self, term, count=10):
        """
            For a given prefix, return up to `count` (default 10) words of the
            model that start with that prefix (case-insensitive), in original casing.
            Returns an empty list when no word starts with the prefix.
        """
        prefix = gensim.utils.to_unicode(term).strip().lower()
        pos = bisect.bisect_left(self.all_words, prefix)
        result = []
        candidates = zip(self.all_words[pos: pos + count], self.orig_words[pos: pos + count])
        for lowered, original in candidates:
            # the list is sorted, so the first non-matching word ends the run of matches
            if not lowered.startswith(prefix):
                break
            result.append(original)
        logger.info("suggested %r: %s" % (prefix, result))
        return result

    def most_similar(self, positive, negative, topn=5):
        """
            positive: an array of positive words
            negative: an array of negative words
            topn: how many neighbours to ask the model for (default 5)

            Returns {'similars': <whatever the model's most_similar returned>},
            or {'similars': []} when the query fails (best effort, the failure
            is logged as a warning).
        """
        try:
            result = self.model.most_similar(
                # drop empty and whitespace-only entries so they cannot poison the whole query
                positive=[word.strip() for word in positive if word and word.strip()],
                negative=[word.strip() for word in negative if word and word.strip()],
                topn=topn)
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C / SystemExit still get through
            logger.warning("most_similar failed for %s vs. %s: %r" % (positive, negative, err))
            result = []
        logger.info("similars for %s vs. %s: %s" % (positive, negative, result))
        return {'similars': result}

    def vec_repr(self, word):
        """
            If 'word' belongs in the vocabulary, returns its
            word2vec representation. Otherwise returns a vector of 0's
            of length `self.model.vector_size`.
        """
        try:
            return self.model.word_vec(word)
        except KeyError:
            logger.debug("'{}' not in Model. Returning [0]'s vector.".format(word))
            return np.zeros(self.model.vector_size)
            

In [14]:
mw = ModelWrapper(model)
model = mw.model # just cache in case I re-call this cell

Loading model...
Model succesfully loaded
Sort all the words in the model, so that we can auto-complete queries quickly...


In [15]:
mw.model.syn0.shape


(3000000, 300)

In [16]:
type(mw.model.vocab.items())

dict_items

In [17]:
# my_dictionary = {k: f(v) for k, v in my_dictionary.items()}
mw.model.index2word[:10]

['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']

In [23]:
random.uniform(0, 1)

0.8745957211429348

In [54]:
# (phoneme strings, word) for each of the first 100 vocabulary entries,
# phonemizing one word per graphs2phones call.
phonemes_words = list(map(lambda word: (graphs2phones(word), word), mw.model.index2word[:100]))
# Keep only the words that came back as exactly one phoneme string,
# keyed by that string (a later word with the same string overwrites an earlier one).
phonemes_dict = dict((phs[0], word) for phs, word in phonemes_words if len(phs) == 1)

In [132]:
def build_phonemes_dict(from_idx, how_many_words):
    """
        Phonemize a slice of the vocabulary in a single graphs2phones call.

        Joins the words mw.model.index2word[from_idx:from_idx + how_many_words]
        into one space-separated sentence, prints the sentence and the phoneme
        strings obtained for it, and returns a dict {phoneme string: word}
        built by pairing both sequences position by position.

        Raises AssertionError when the number of phoneme strings differs from
        `how_many_words`.
    """
    vocabulary_slice = mw.model.index2word[from_idx:from_idx + how_many_words]
    sentence = ' '.join(vocabulary_slice)
    print(sentence)
    phoneme_strs = graphs2phones(sentence)
    print(phoneme_strs)
    assert len(phoneme_strs) == how_many_words, "(Have {} phoneme-strings, {} words) Looks like some words in vocab have phonemes-strings > 1".format(len(phoneme_strs), how_many_words)
    # positional pairing; on duplicate phoneme strings the later word wins
    return dict(zip(phoneme_strs, sentence.split()))

In [133]:
from_idx = 3; how_many_words = 5
sent = ' '.join(mw.model.index2word[from_idx:from_idx + how_many_words])
print(sent)
len(sent.split())

that is on ## The


5

In [138]:
[' ']*10

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']

In [131]:
# Among the first 300 vocabulary entries, report every word whose phonemization
# is not exactly one string: either several strings, or none at all.
for w in mw.model.index2word[:300]:
    phs = graphs2phones(w)
    if len(phs) > 1:
        print("'{}' => {} (length {})".format(w, phs, len(phs)))
    elif not phs:
        print("'{}' has no phonemes".format(w))

'</s>' => ["sl'aS", "_:_:'Es"] (length 2)
'##th' => ["h'aShaS", "t,i:;'eItS"] (length 2)


In [147]:
'luis\n'[0:4]

'luis'

In [216]:
mw.model.index2word[0:8]

['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The']

#### To avoid the 'one sound for several words' effect, I separate the words with an 'XXX' marker:

In [230]:
words_in_sent = mw.model.index2word[:8]
# Put the 'XXX' marker before every word and once more at the very end,
# so that each word is enclosed by two markers.
sent_augm = ' '.join('XXX ' + word for word in words_in_sent) + ' XXX'
print(sent_augm)
phonemes_strs_augm = graphs2phones(sent_augm)
phonemes_strs_augm

XXX </s> XXX in XXX for XXX that XXX is XXX on XXX ## XXX The XXX


[",Eks,Eks'Eks",
 "sl'aS",
 "_:_:'Es",
 "_:_:,Eks,Eks'Eks",
 'In',
 ",Eks,Eks'Eks",
 'fO@r',
 ",Eks,Eks'Eks",
 'Dat',
 ",Eks,Eks'Eks",
 'Iz',
 ",Eks,Eks'Eks",
 ',0n',
 ",Eks,Eks'Eks",
 "h'aShaS",
 ",Eks,Eks'Eks",
 'DI2;',
 ",Eks,Eks'Eks"]

#### Let's generate all sounds, and keep the ones that are NOT the 'XXX' separator:

In [231]:
# Sound of the 'XXX' marker on its own, concatenated into a single string.
xxx_sound = str().join(graphs2phones('XXX'))
# Positions of the phoneme strings that do NOT end with the marker sound,
# i.e. the ones that belong to actual words.
idxs_phonemes_strs = [pos for pos in range(len(phonemes_strs_augm)) if not phonemes_strs_augm[pos].endswith(xxx_sound)]
idxs_phonemes_strs

[1, 2, 4, 6, 8, 10, 12, 14, 16]

#### Let's now detect the sounds that came from the same word (i.e., contiguous sounds):

In [211]:
def do_process(idxs):
    """
        Group a list of ascending indices into runs of consecutive values.

        idxs: list of ints, e.g. [1, 2, 4, 6]
        Returns a list of lists, one per run, e.g. [[1, 2], [4], [6]].
        An empty input gives [].

        A run can have any length (the first index is run[0], the last one is
        run[-1]). The work is done in a single pass, so long inputs do not hit
        Python's recursion limit.
    """
    runs = []
    for idx in idxs:
        if runs and runs[-1][-1] + 1 == idx:
            # idx directly follows the last index seen: same run
            runs[-1].append(idx)
        else:
            runs.append([idx])
    return runs
        

In [232]:
do_process(idxs_phonemes_strs)

[[1, 2], [4], [6], [8], [10], [12], [14], [16]]

In [253]:
# Pair every word with the phoneme string(s) found between two separators.
# Words and runs are matched by position (first word <-> first run, ...) rather
# than by computing the word index from the slot index: that arithmetic was only
# right when exactly one earlier word had produced an extra phoneme string.
# Assumes every word yields at least one non-separator phoneme string.
[(word, ' '.join(phonemes_strs_augm[run[0]:run[-1] + 1]))
 for word, run in zip(words_in_sent, do_process(idxs_phonemes_strs))]

[('</s>', "sl'aS _:_:'Es"),
 ('in', 'In'),
 ('for', 'fO@r'),
 ('that', 'Dat'),
 ('is', 'Iz'),
 ('on', ',0n'),
 ('##', "h'aShaS"),
 ('The', 'DI2;')]

In [247]:
idxs_phonemes_strs
# For each run of contiguous phoneme strings keep one index: its last element
# (which is also the only element when the run has length one).
idxs = [run[-1] for run in do_process(idxs_phonemes_strs)]
idxs

[2, 4, 6, 8, 10, 12, 14, 16]

In [242]:
words_in_sent

['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The']

In [245]:
words_in_sent[int(16/2 - 1)]

'The'

In [246]:
[(words_in_sent[int(i/2 - 1)], phonemes_strs_augm[i]) for i in idxs] 

[('in', 'In'),
 ('for', 'fO@r'),
 ('that', 'Dat'),
 ('is', 'Iz'),
 ('on', ',0n'),
 ('##', "h'aShaS"),
 ('The', 'DI2;')]

In [197]:
rr[eval('1:2')]

SyntaxError: invalid syntax (<string>, line 1)

In [136]:
(phonemes_dict_1, tInSecs_1) = take_time("build_phonemes_dict(0, 15)")
tInSecs_1

</s> in for that is on ## The with said was the at not as
["sl'aS", "_:_:'Es", '_:_:In', 'fO@', 'Dat', 'Iz', ',0n', "h'aShaS", 'D@2', 'wID', "s'Ed", 'w0zDI2;', 'at', 'n,0t', 'az']


0.029420072998618707

In [83]:
tInSecs

1.924127916994621

In [75]:
graphs2phones(' wasuuuppp!')

["w'0s@,VVp"]

In [84]:
(phonemes_dict, tInSecs) = take_time("{''.join(graphs2phones(w)): w for w in mw.model.index2word[1:300]}")

In [85]:
tInSecs

1.9037140449945582

In [88]:
graphs2phones('salut ca va bien')

["s@l'Vt", "k'A:", "v'A:", "baI'En"]

In [48]:
from timeit import default_timer as timer
def take_time(f):
    """
        Run `f` once and measure how long it takes.

        f: either a string holding a Python expression (it is passed to eval),
           or a callable taking no arguments (it is simply called).
        Returns a tuple (result, elapsed), where `elapsed` is the difference
        between two timeit.default_timer() readings.
    """
    # NOTE(review): eval runs arbitrary code -- only pass expressions you wrote yourself.
    start = timer()
    r = f() if callable(f) else eval(f)
    end = timer()
    return (r, end - start)


### Graphemes 2 Phonemes 

In [26]:
from subprocess import check_output
def graphs2phones(s):
    """
        Takes a sentence, returns a list of phoneme strings for it.

        The sentence is handed to the `speak` command line tool
        (`speak -q -x -v en-us <s>`), whose output is decoded as UTF-8 and split
        on any whitespace (spaces and line breaks alike), so the result never
        contains empty strings.

        The list usually has one phoneme string per word, but not always: in this
        notebook '</s>' comes back as two strings, and neighbouring words
        sometimes come back fused into one.

        Raises whatever subprocess.check_output raises when `speak` is missing
        or exits with an error.
    """
    phs = check_output(["speak", "-q", "-x", '-v', 'en-us', s]).decode('utf-8')
    return phs.split()

# example: 
graphs2phones('hello world bla and ble')

["h@l'oU", "w'3:ld", "bl'A:", '_:_:and', "bl'i:"]

In [35]:
graphs2phones('fuckk fuck fuc fuk')[0]

"f'Vkk"

In [31]:
mw.most_similar(positive = ['soccer'], negative = ['messi'])

2017-05-29 13:50:19,665 - __main__ - INFO - similars for ['soccer'] vs. ['messi']: [('Soccer', 0.48688480257987976), ('lacrosse', 0.4622202515602112), ('softball', 0.4572678506374359), ('Lacrosse', 0.4419728219509125), ('basketball', 0.4305872321128845)]


{'similars': [('Soccer', 0.48688480257987976),
  ('lacrosse', 0.4622202515602112),
  ('softball', 0.4572678506374359),
  ('Lacrosse', 0.4419728219509125),
  ('basketball', 0.4305872321128845)]}

## Let's sanity check the Word2Vec model we just wrapped up

In [32]:
assert np.count_nonzero(mw.vec_repr('piripiri')) == 0, "'piripiri' is present in this model???"
print("Sanity check: all good, 'piripiri' was assigned the empty vector as representation")

2017-05-29 13:50:21,973 - __main__ - DEBUG - 'piripiri' not in Model. Returning [0]'s vector.


Sanity check: all good, 'piripiri' was assigned the empty vector as representation


In [33]:
assert np.count_nonzero(mw.vec_repr('dog')) > 0, "'dog' is not present in this model???"
print("Sanity check: all good, 'dog' has a meaningful representation")

Sanity check: all good, 'dog' has a meaningful representation
