# Word Embeddings
More details are available in the official documentation: https://radimrehurek.com/gensim/auto_examples/index.html#documentation

In [31]:
!pip install gensim --upgrade
!pip install numpy --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
from gensim import downloader
import numpy as np

In [33]:
# Print the installed gensim version so the environment used for this run is
# recorded alongside the results.
import gensim
print(gensim.__version__)

4.2.0


## Loading The Pretrained Weights
Supported options are at https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models

In [34]:
# Identifiers of pretrained models, given by name rather than as filesystem
# paths (despite the _PATH suffix). GLOVE_PATH is passed to downloader.load in
# the next cell; WORD_2_VEC_PATH is not used in the cells visible here.
WORD_2_VEC_PATH = 'word2vec-google-news-300'
GLOVE_PATH = 'glove-twitter-200'

In [35]:
# Load the pretrained GloVe vectors by name; `glove` is used below for word
# lookups (`glove[word]`) and similarity queries (`glove.most_similar`).
# NOTE(review): presumably this downloads the model on first use and may be
# slow — confirm against the gensim downloader documentation.
glove = downloader.load(GLOVE_PATH)

## Using The Pre-Trained Vectors

In [36]:
def embed_sentence(sentence, keyed_vectors):
    """Look up one embedding per whitespace-separated word of ``sentence``.

    Words missing from ``keyed_vectors.key_to_index`` are reported with a
    printed message and skipped.

    Args:
        sentence: Text to embed; it is split on whitespace.
        keyed_vectors: Pretrained vectors exposing ``key_to_index``,
            ``vector_size``, ``vectors`` and ``__getitem__`` (e.g. the object
            returned by ``downloader.load``).

    Returns:
        A 2-D array with one row per in-vocabulary word. When no word is
        found the result has shape ``(0, vector_size)`` rather than ``(0,)``,
        so downstream code can always rely on a 2-D array.
    """
    vectors = []
    for word in sentence.split():
        if word not in keyed_vectors.key_to_index:
            print(f"{word} not an existing word in the model")
            continue
        vectors.append(keyed_vectors[word])
    if not vectors:
        # np.asarray([]) would collapse to a 1-D, shape-(0,) array.
        return np.empty((0, keyed_vectors.vector_size),
                        dtype=keyed_vectors.vectors.dtype)
    return np.asarray(vectors)


sen = "good morning to you asjfgsrafkjhbfgk"
representation = embed_sentence(sen, glove)
print(representation.shape)

asjfgsrafkjhbfgk not an existing word in the model
(4, 200)


In [37]:
# Display the stacked word vectors built above (one row per in-vocabulary word).
representation

array([[ 1.8223e-02, -1.2323e-02,  3.5569e-02,  2.4232e-01, -1.6069e-01,
         5.2370e-01,  4.4601e-01,  1.2796e-01, -3.9654e-01, -5.5359e-01,
        -4.2493e-01, -5.1860e-01, -4.7980e-01,  1.3340e-01, -2.0102e-02,
         1.1239e-02, -1.6348e-01, -6.4611e-02, -2.1307e-01,  1.8213e-01,
        -5.0681e-02, -4.7004e-02,  2.6059e-01,  2.6835e-01,  5.6722e-02,
         1.2509e+00, -2.3720e-01,  8.2913e-02,  1.9489e-01, -1.5868e-01,
        -3.0604e-01, -1.7661e-01,  1.5068e-03,  3.6496e-01,  2.2036e-01,
         6.2268e-01, -4.1441e-01,  9.6564e-02, -4.3795e-03,  2.7042e-01,
         2.5475e-01, -5.5140e-02,  1.1192e-01,  1.9500e-01,  2.8769e-01,
        -1.1948e-01,  4.3597e-01,  9.1972e-02, -1.5433e-01,  8.4403e-02,
        -1.3209e-01,  2.4921e-01, -1.4751e-01,  3.8311e-02,  6.5674e-02,
         1.9684e-01, -1.9819e-01,  5.0042e-02,  3.4048e-01, -1.3061e-01,
        -2.6501e-01, -3.0854e-01, -5.8837e-01, -2.3073e-01,  2.6364e-02,
        -3.4728e-01, -4.9131e-01,  9.0294e-02,  2.9

# Training A Model

In [61]:
TEXT_PATH = 'Alice_book'

# Build the training corpus in a single pass: every non-blank line of the text
# becomes one list of lower-cased, whitespace-separated tokens.
sentences = []
with open(TEXT_PATH, 'r', encoding='utf-8') as f:
    for raw_line in f:
        normalized = raw_line.strip().lower()
        if normalized:
            sentences.append(normalized.split())

# Preview the first two token lists.
sentences[0:2]

[['chapter', 'i.', 'down', 'the', 'rabbit-hole'],
 ['alice',
  'was',
  'beginning',
  'to',
  'get',
  'very',
  'tired',
  'of',
  'sitting',
  'by',
  'her',
  'sister',
  'on',
  'the']]

In [62]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): no cell visible here reads from or writes to /content/drive —
# confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
from gensim.models import Word2Vec

# Hyperparameters for training Word2Vec on the tokenised book text.
w2v_params = {
    "vector_size": 100,
    "window": 5,
    "min_count": 1,
    "workers": 4,
    "epochs": 100,
}

# Train on the token lists prepared earlier, then persist the model to disk.
model = Word2Vec(sentences=sentences, **w2v_params)
model.save("word2vec.model")

In [68]:
# Ten nearest neighbours of 'alice' in the model trained above, as
# (token, similarity) pairs.
sims = model.wv.most_similar('alice', topn=10)
sims

[('would,’', 0.5151599645614624),
 ('sharply', 0.5133723020553589),
 ('yet,’', 0.48043113946914673),
 ('curiouser!’', 0.477140337228775),
 ('cat,', 0.47523826360702515),
 ('she', 0.47274038195610046),
 ('alice,', 0.4722289443016052),
 ('herself.', 0.4717392921447754),
 ('politely,', 0.4706284701824188),
 ('him,’', 0.4643808603286743)]

# Some Nice Properties

In [69]:
# Five nearest neighbours of 'program' in the pretrained GloVe vectors.
glove.most_similar('program', topn=5)

[('programs', 0.6853476762771606),
 ('seminar', 0.6410127878189087),
 ('training', 0.6214897036552429),
 ('workshop', 0.5917727947235107),
 ('system', 0.5909943580627441)]

In [70]:
# Analogy query: words closest to 'woman' + 'king' - 'man'.
glove.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.6820898056030273),
 ('prince', 0.5875527262687683),
 ('princess', 0.5620489120483398),
 ('royal', 0.5522865056991577),
 ('mother', 0.5362966656684875)]

In [71]:
# Analogy query: words closest to 'paris' + 'germany' - 'berlin'.
glove.most_similar(positive=['paris','germany'], negative=['berlin'], topn = 5)

[('france', 0.7369073629379272),
 ('spain', 0.6768407821655273),
 ('portugal', 0.6567487716674805),
 ('italy', 0.6421886086463928),
 ('denmark', 0.6146384477615356)]

In [46]:
# Analogy query: words closest to 'walking' + 'swam' - 'swimming'.
# NOTE(review): this cell's execution count (46) is lower than that of the
# cells before it — re-run the notebook top to bottom to confirm the output.
glove.most_similar(positive=['walking','swam'], negative=['swimming'], topn = 5)

[('walked', 0.5864155292510986),
 ('drove', 0.5215498805046082),
 ('ran', 0.5134605169296265),
 ('sprinted', 0.4759795665740967),
 ('stood', 0.47308677434921265)]