In [1]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np
import gensim.downloader as api

print(list(api.info()['models'].keys()))


['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [7]:
model=api.load('glove-wiki-gigaword-50')



In [23]:
def skipgram(target_word, model, topn=5):
    similar_words = model.most_similar(positive=[target_word], topn=topn)
    return similar_words, [target_word]

In [24]:
predictions, word_used = skipgram("science", model)

print(f"Target word: {word_used[0]}")
print("Top predictions:")
for word, score in predictions:
    print(f"{word}: {score:.4f}")

Target word: science
Top predictions:
sciences: 0.8548
research: 0.8437
institute: 0.8386
studies: 0.8369
physics: 0.8314


In [17]:
def cbow_predict(context_words, model, topn=5):
    context_vector = []
    valid_words = []

    for word in context_words:
        if word in model.key_to_index:
            context_vector.append(model[word])
            valid_words.append(word)

    if not context_vector:
        return [], []

    mean_vector = np.mean(context_vector, axis=0)

    similar_words = model.most_similar(positive=[mean_vector], topn=topn)

    return similar_words, valid_words


In [25]:
context = ["sciences","research","institute","studies","physics"]
predictions, used_words = cbow_predict(context, model)

print("Used context words:", used_words)
print("Top predictions:")
for word, score in predictions:
    if word not in used_words:
        print(f"{word}: {score:.4f}")

Used context words: ['sciences', 'research', 'institute', 'studies', 'physics']
Top predictions:
science: 0.9228


In [28]:
context = ["messi", "maradona", "neymar", "forlan", "aguero"]
predictions, used_words = cbow_predict(context, model)

print("Used context words:", used_words)
print("Top predictions:")
for word, score in predictions:
    if word not in used_words:
        print(f"{word}: {score:.4f}")

Used context words: ['messi', 'maradona', 'neymar', 'forlan', 'aguero']
Top predictions:
figo: 0.8558
ronaldo: 0.8490
