Understanding word embeddings

Demystifying Word2vec

# Exploring the pretrained Word2vec model using gensim

In [13]:
import gensim
from gensim.models import KeyedVectors

In [14]:
import gensim.downloader as api

# Identifier of the pretrained embedding set to fetch through gensim's downloader.
PRETRAINED_VECTORS = 'word2vec-google-news-300'
model = api.load(PRETRAINED_VECTORS)



In [18]:
# Dimensionality of each word vector stored in the pretrained model.
model.vector_size

300

In [23]:
# Vocabulary size. `.vocab` was removed, so count the entries of
# `key_to_index` (word -> integer index) instead.
len(model.key_to_index)

3000000

In [24]:
# `.vocab` was removed, so the word -> index mapping is read from `key_to_index`.
# The vocabulary holds 3,000,000 entries; rendering the whole dictionary would
# flood the notebook, so only the leading entries are displayed.
PREVIEW_COUNT = 82  # number of leading vocabulary entries to show
{word: model.key_to_index[word] for word in model.index_to_key[:PREVIEW_COUNT]}

{'</s>': 0,
 'in': 1,
 'for': 2,
 'that': 3,
 'is': 4,
 'on': 5,
 '##': 6,
 'The': 7,
 'with': 8,
 'said': 9,
 'was': 10,
 'the': 11,
 'at': 12,
 'not': 13,
 'as': 14,
 'it': 15,
 'be': 16,
 'from': 17,
 'by': 18,
 'are': 19,
 'I': 20,
 'have': 21,
 'he': 22,
 'will': 23,
 'has': 24,
 '####': 25,
 'his': 26,
 'an': 27,
 'this': 28,
 'or': 29,
 'their': 30,
 'who': 31,
 'they': 32,
 'but': 33,
 '$': 34,
 'had': 35,
 'year': 36,
 'were': 37,
 'we': 38,
 'more': 39,
 '###': 40,
 'up': 41,
 'been': 42,
 'you': 43,
 'its': 44,
 'one': 45,
 'about': 46,
 'would': 47,
 'which': 48,
 'out': 49,
 'can': 50,
 'It': 51,
 'all': 52,
 'also': 53,
 'two': 54,
 'after': 55,
 'first': 56,
 'He': 57,
 'do': 58,
 'time': 59,
 'than': 60,
 'when': 61,
 'We': 62,
 'over': 63,
 'last': 64,
 'new': 65,
 'other': 66,
 'her': 67,
 'people': 68,
 'into': 69,
 'In': 70,
 'our': 71,
 'there': 72,
 'A': 73,
 'she': 74,
 'could': 75,
 'just': 76,
 'years': 77,
 'some': 78,
 'U.S.': 79,
 'three': 80,
 'million': 81

In [25]:
# Nearest neighbours of 'Delhi' in the embedding space, returned as
# (word, similarity) pairs; with no `topn` given, ten results come back.
model.most_similar('Delhi')

[('Kolkata', 0.7663769125938416),
 ('Mumbai', 0.7306069731712341),
 ('Lucknow', 0.7277829647064209),
 ('Patna', 0.7159016728401184),
 ('Guwahati', 0.7072612643241882),
 ('Jaipur', 0.6992815136909485),
 ('Hyderabad', 0.6983195543289185),
 ('Ranchi', 0.6962575912475586),
 ('Bhubaneswar', 0.6959235072135925),
 ('Chandigarh', 0.6940240263938904)]

In [26]:
# Analogy by vector arithmetic: man + queen - king, keeping only the best match.
analogy_query = dict(positive=['man', 'queen'], negative=['king'], topn=1)
result = model.most_similar(**analogy_query)
print(result)

[('woman', 0.7609435319900513)]


In [27]:
# NOTE(review): this cell repeats the analogy query of the preceding cell
# unchanged (man + queen - king); it can be dropped without losing anything.
result = model.most_similar(positive=['man', 'queen'],
negative=['king'], topn=1)
print(result)

[('woman', 0.7609435319900513)]


In [28]:
# Analogy by vector arithmetic: France + Rome - Italy, keeping only the best match.
capital_query = dict(positive=['France', 'Rome'], negative=['Italy'], topn=1)
result = model.most_similar(**capital_query)
print(result)

[('Paris', 0.7190686464309692)]


# Training a Word2vec model

## Building a basic Word2vec model

In [1]:
from gensim.models import Word2Vec

# Toy corpus: three short sentences, each split on whitespace into a token list.
raw_sentences = [
    "I am trying to understand Natural Language Processing",
    "Natural Language Processing is fun to learn",
    "There are numerous use cases of Natural Language Processing",
]
sentences = [text.split() for text in raw_sentences]

# min_count=1 keeps every word, including those that occur only once.
model = Word2Vec(sentences, min_count=1)

In [2]:
# No vector_size was passed when training, so this shows the library default.
model.vector_size

100

In [3]:
# Number of distinct words kept in the trained vocabulary
# (for a trained Word2Vec model the vectors live under `.wv`).
len(model.wv.key_to_index)

17

## Modifying the min_count parameter

In [4]:
# Retrain on the same corpus, now ignoring words that occur fewer than 2 times.
model = Word2Vec(sentences, min_count=2)

In [5]:
# Vocabulary size after pruning rare words with min_count=2.
len(model.wv.key_to_index)

4

In [6]:
# The surviving words, each mapped to its integer index.
model.wv.key_to_index

{'Processing': 0, 'Language': 1, 'Natural': 2, 'to': 3}

In [7]:
# Changing min_count leaves the vector dimensionality untouched.
model.vector_size

100

## Playing with the vector size

In [8]:
# Same pruning threshold as before, but with 300-dimensional vectors.
model = Word2Vec(
    sentences,
    min_count=2,
    vector_size=300,
)

In [9]:
# Confirms the dimensionality requested through vector_size.
model.vector_size

300

## Other important configurable parameters

In [10]:
model = Word2Vec(
    sentences,
    min_count=1,      # keep every word, however rare
    vector_size=300,  # embedding dimensionality
    workers=2,        # number of worker threads used for training
    sg=1,             # 1 selects skip-gram (0 would be CBOW)
    negative=1,       # noise words drawn per positive example in negative sampling
)

In [11]:
# With min_count=1 again, every distinct word of the corpus is retained.
len(model.wv.key_to_index)

17

In [12]:
# Full word -> index mapping of the retrained model (small enough to display whole).
model.wv.key_to_index

{'Processing': 0,
 'Natural': 1,
 'Language': 2,
 'to': 3,
 'of': 4,
 'am': 5,
 'trying': 6,
 'understand': 7,
 'is': 8,
 'cases': 9,
 'fun': 10,
 'learn': 11,
 'There': 12,
 'are': 13,
 'numerous': 14,
 'use': 15,
 'I': 16}

# Word mover’s distance

In [66]:
import gensim
from gensim.models import KeyedVectors
import numpy as np

In [30]:
# The Google News vectors are about 1.5 GB, so the copy loaded in the first
# section is reused whenever `model` still refers to it. The training section
# rebinds `model` to a toy Word2Vec instance (gensim 4 exposes `wmdistance`
# on KeyedVectors, not on Word2Vec), so on a top-to-bottom run the pretrained
# vectors are restored here before any distance is computed.
import gensim.downloader as api
if not isinstance(globals().get('model'), KeyedVectors):
    model = api.load('word2vec-google-news-300')

In [31]:
# Raw example sentences for the distance comparison: the first two describe
# a similar event in different words, the third is about an unrelated topic.
sentence_1 = "Obama speaks to the media in Illinois"
sentence_2 = "President greets the press in Chicago"
sentence_3 = "Apple is my favorite company"

In [54]:
# Import NLTK's stopword corpus reader and its downloader.
from nltk.corpus import stopwords
from nltk import download
download('stopwords')  # Fetch the stopword corpus if it is not already present.
stop_words = stopwords.words('english')  # English stopwords, consulted by preprocess().

def preprocess(sentence, excluded_words=None):
    """Lower-case a sentence, split it on whitespace and drop stopwords.

    Args:
        sentence: A raw string, or an already-tokenised iterable of words.
            Accepting tokens makes the function idempotent, so a cell that
            rebinds a variable to its own preprocessed value can be re-run
            without raising ``AttributeError`` on a list.
        excluded_words: Optional collection of words to remove. When omitted,
            the notebook-level ``stop_words`` is used.

    Returns:
        list: The remaining lower-cased tokens, in their original order.
    """
    if excluded_words is None:
        excluded_words = stop_words
    # A set gives constant-time membership checks instead of scanning a list.
    excluded = frozenset(excluded_words)

    if isinstance(sentence, str):
        tokens = sentence.lower().split()
    else:
        tokens = [str(token).lower() for token in sentence]
    return [token for token in tokens if token not in excluded]

# Rebind both names to their token lists: from here on sentence_1 and
# sentence_2 hold lists of words rather than the raw strings.
sentence_1 = preprocess(sentence_1)
sentence_2 = preprocess(sentence_2)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [62]:
# Inspect the tokens left after lower-casing and stopword removal.
sentence_1

['obama', 'speaks', 'media', 'illinois']

In [64]:
# Pin the version recorded in this notebook's install log so re-runs are
# reproducible; the %pip magic installs into the running kernel's environment.
%pip install pot==0.9.3

Collecting pot
  Downloading POT-0.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (823 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/823.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/823.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m614.4/823.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m819.2/823.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.0/823.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pot
Successfully installed pot-0.9.3


In [68]:
# Word Mover's Distance between the two preprocessed token lists;
# a smaller value means the sentences are closer in meaning.
word_mover_distance = model.wmdistance(sentence_1, sentence_2)
word_mover_distance

1.0174646858929572

In [69]:
# Tokenise the third sentence the same way; the name now holds a token list.
sentence_3 = preprocess(sentence_3)

In [71]:
# Distance from sentence 1 to the third sentence, for comparison with the
# sentence 1 / sentence 2 distance computed earlier.
word_mover_distance = model.wmdistance(sentence_1, sentence_3)
word_mover_distance

1.340475408759902