### Word2Vec using Gensim

In [1]:
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
sentences = [['first', 'sentence'], ['second', 'sentence']]
# train word2vec on the two sentences
# Gensim only requires that the input must provide sentences sequentially, 
# when iterated over. No need to keep everything in RAM: provide one sentence, process it, forget it, load another sentence
model = gensim.models.Word2Vec(sentences, min_count=1)

2018-11-23 23:42:34,891 : INFO : collecting all words and their counts
2018-11-23 23:42:34,901 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-23 23:42:34,903 : INFO : collected 3 word types from a corpus of 4 raw words and 2 sentences
2018-11-23 23:42:34,911 : INFO : Loading a fresh vocabulary
2018-11-23 23:42:34,915 : INFO : effective_min_count=1 retains 3 unique words (100% of original 3, drops 0)
2018-11-23 23:42:34,921 : INFO : effective_min_count=1 leaves 4 word corpus (100% of original 4, drops 0)
2018-11-23 23:42:34,925 : INFO : deleting the raw counts dictionary of 3 items
2018-11-23 23:42:34,932 : INFO : sample=0.001 downsamples 3 most-common words
2018-11-23 23:42:34,938 : INFO : downsampling leaves estimated 0 word corpus (5.7% of prior 4)
2018-11-23 23:42:34,940 : INFO : estimated required memory for 3 words and 100 dimensions: 3900 bytes
2018-11-23 23:42:34,948 : INFO : resetting layer weights
2018-11-23 23:42:34,972 : INFO : training mo

In [45]:
import os
word2vec = gensim.models.KeyedVectors.load_word2vec_format(fname='/home/karen/Downloads/data/glove.6B/glove.6B.100d.tmp.txt', binary=False)
word2vec

2018-11-24 00:24:28,072 : INFO : loading projection weights from /home/karen/Downloads/data/glove.6B/glove.6B.100d.tmp.txt
2018-11-24 00:26:45,137 : INFO : loaded (399999, 100) matrix from /home/karen/Downloads/data/glove.6B/glove.6B.100d.tmp.txt


<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f770ca3cd30>

In [44]:
with open('/home/karen/Downloads/data/glove.6B/glove.6B.100d.txt', 'r') as fread:
    with open('/home/karen/Downloads/data/glove.6B/glove.6B.100d.tmp.txt', 'w') as fwrite:
        fwrite.write('399999 100\n')
        for idx, line in enumerate(fread):
            fwrite.write(line)

In [43]:
!rm /home/karen/Downloads/data/glove.6B/glove.6B.100d.tmp.txt

In [46]:
word2vec.init_sims(replace=True)

2018-11-24 00:28:08,244 : INFO : precomputing L2-norms of word weight vectors


In [47]:
len(word2vec.vocab)

399999

In [56]:
word2vec.vectors_norm[word2vec.vocab['student'].index]

array([ 0.14084136,  0.10044176, -0.17062186, -0.03504317, -0.03259846,
        0.16226351,  0.00164017,  0.02982827, -0.04554551,  0.19863695,
       -0.08848356, -0.03040672,  0.06940352,  0.03717477, -0.05495993,
        0.05361199, -0.01390229, -0.01308273, -0.07149444,  0.08059222,
       -0.16011068, -0.03823438,  0.07592923, -0.01951167, -0.00767571,
       -0.14076705, -0.04336968, -0.19582428,  0.01398277,  0.07748415,
       -0.11785013,  0.21659192, -0.01383613,  0.07638032, -0.10138992,
       -0.1082482 , -0.03673607,  0.08005092, -0.05872075, -0.01275246,
       -0.147268  ,  0.00342861, -0.00873196,  0.02516351,  0.06088596,
       -0.05161836, -0.0100394 ,  0.03327952,  0.04757451,  0.04673779,
       -0.04099397, -0.11782714, -0.05391094,  0.07558075,  0.00579424,
       -0.30187368,  0.01918441, -0.0895591 ,  0.29233897,  0.04310788,
       -0.00594   ,  0.01523573, -0.04347936, -0.01092671,  0.11162515,
        0.01602999,  0.13726982,  0.06173506,  0.18407837,  0.16

#### Sentence2Vec by word2vec

In [59]:
def create_word2vec_matrix(text, word2vec):
    word2vec_matrix=[]
    count=0
    for line in text:
        word_lst=line.split()
        current_word2vec=[]
        for word in word_lst:
            if word in word2vec.vocab:
                # word2vec = token2idx
                vec = word2vec.vectors_norm[word2vec.vocab[word].index]
                if vec is not None:
                    current_word2vec.append(vec)
            else:
                print(word)
                count+=1
                continue
        # add up all the vector of each word to get the vector of a sentence 
        if np.array(current_word2vec).shape[0]!=0:
            sentence_word2vec = list(np.array(current_word2vec).mean(axis=0))
            word2vec_matrix.append(sentence_word2vec)
        current_word2vec=[]
    return word2vec_matrix, count

In [62]:
text = ['fantastic beasts and where to find them', 
        'fantastic beasts the crimes of grindelwald']
word2vec_matrix, count = create_word2vec_matrix(text, word2vec)

In [67]:
np.array(word2vec_matrix)

array([[-0.01363225,  0.0632612 ,  0.0532789 , -0.06067752, -0.03203793,
         0.0573948 , -0.05758385,  0.02891669, -0.01005686, -0.04475319,
         0.03142466, -0.00525038,  0.03889203, -0.01838762,  0.04951621,
        -0.0357757 ,  0.04723538,  0.06406571, -0.110898  ,  0.08045464,
         0.06799715, -0.03494107,  0.05513481, -0.04234928,  0.04503259,
        -0.0038799 , -0.02963971, -0.0674924 ,  0.0407795 , -0.05591179,
        -0.03720056,  0.01724851, -0.06198395, -0.00394624,  0.01963765,
         0.04357598, -0.03199228,  0.04052764,  0.01646447, -0.06384147,
        -0.06349035, -0.01205016, -0.03160828, -0.10188204, -0.02248716,
         0.05475311, -0.00057904, -0.00529261,  0.00310575, -0.06026373,
        -0.03455113,  0.0093182 ,  0.04103724,  0.22216587, -0.00798338,
        -0.37076157,  0.0060763 , -0.02901358,  0.19122097,  0.05265542,
        -0.0445339 ,  0.15990758, -0.07531472,  0.02351545,  0.13658723,
        -0.00910989,  0.08402754,  0.06993966,  0.0

In [70]:
count # so all words are in vocab

0

In [72]:
sentence1 = word2vec_matrix[0]
sentence2 = word2vec_matrix[1]
from scipy import spatial

# calculate the cosine similarity
1 - spatial.distance.cosine(sentence1, sentence2)

0.8124398589134216

#### Most similar words

In [95]:
word2vec.most_similar(positive=['kindergarten', 'college'], negative=['scientist'], topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('school', 0.7116979360580444),
 ('elementary', 0.7066525220870972),
 ('grades', 0.7055771350860596),
 ('schools', 0.6759970188140869),
 ('preschool', 0.6746401786804199),
 ('pupils', 0.6707763075828552),
 ('classes', 0.646867036819458),
 ('schooling', 0.6365389227867126),
 ('vocational', 0.6250067949295044),
 ('enrollment', 0.6218675374984741)]

#### Find the different word in a sentence

In [76]:
word2vec.doesnt_match("breakfast cereal dinner lunch".split())

  if np.issubdtype(vec.dtype, np.int):


'cereal'

#### calculate the similarity between two words

In [100]:
word2vec.similarity('apocalypse', 'disaster')

  if np.issubdtype(vec.dtype, np.int):


0.1993172

### [Word2Vec using Fasttext](https://pypi.org/project/fasttext/)

In [None]:
import io

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
    return data

load_vectors('/home/karen/Downloads/data/wiki-news-300d-1M.vec')

In [104]:
import fasttext

# Skipgram model
model = fasttext.skipgram('toy_data/second.txt', 'model')
model['fantastic']

[-0.000991184264421463,
 -0.001029246486723423,
 0.0006811252678744495,
 0.00011803481174865738,
 -0.0006219972274266183,
 0.001100853318348527,
 0.00014544253644999117,
 -0.0008069276809692383,
 0.0008861601236276329,
 0.0005640610470436513,
 0.00018239919154439121,
 0.002081118058413267,
 0.0011119148693978786,
 -8.017678737815004e-06,
 0.0019191585015505552,
 0.002273016609251499,
 0.0004999999655410647,
 0.0008727581007406116,
 0.0017401062650606036,
 0.00136960344389081,
 -0.00010665664740372449,
 0.0006562909111380577,
 -0.0005103441653773189,
 0.0006105859065428376,
 -0.001563229481689632,
 0.0024708015844225883,
 -7.241084676934406e-05,
 0.00035496175405569375,
 -0.0008005818235687912,
 0.001430544420145452,
 0.0005475004436448216,
 -0.000570164353121072,
 0.00010769572691060603,
 -2.468213642714545e-05,
 0.0015175550943240523,
 0.0003394550003577024,
 -0.0006585742812603712,
 -0.0010379229206591845,
 0.0002934975200332701,
 -0.0004705099854618311,
 -0.0005588103667832911,
 -0.

In [None]:
# CBOW model
model = fasttext.cbow('toy_data/second.txt', 'model')

In [103]:
# CBOW model
model = fasttext.cbow('toy_data/second.txt', 'model')
model['fantastic']

[-0.000991184264421463,
 -0.001029246486723423,
 0.0006811252678744495,
 0.00011803481174865738,
 -0.0006219972274266183,
 0.001100853318348527,
 0.00014544253644999117,
 -0.0008069276809692383,
 0.0008861601236276329,
 0.0005640610470436513,
 0.00018239919154439121,
 0.002081118058413267,
 0.0011119148693978786,
 -8.017678737815004e-06,
 0.0019191585015505552,
 0.002273016609251499,
 0.0004999999655410647,
 0.0008727581007406116,
 0.0017401062650606036,
 0.00136960344389081,
 -0.00010665664740372449,
 0.0006562909111380577,
 -0.0005103441653773189,
 0.0006105859065428376,
 -0.001563229481689632,
 0.0024708015844225883,
 -7.241084676934406e-05,
 0.00035496175405569375,
 -0.0008005818235687912,
 0.001430544420145452,
 0.0005475004436448216,
 -0.000570164353121072,
 0.00010769572691060603,
 -2.468213642714545e-05,
 0.0015175550943240523,
 0.0003394550003577024,
 -0.0006585742812603712,
 -0.0010379229206591845,
 0.0002934975200332701,
 -0.0004705099854618311,
 -0.0005588103667832911,
 -0.

In [106]:
model = fasttext.load_model('model.bin')
# model['fantastic']