In [116]:
import logging
from typing import List, Union


import numpy as np
import pandas as pd

# Get the interactive Tools for Matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.decomposition import PCA

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [92]:
# Configure the root logger once for the whole notebook (timestamp : level : message).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# Root logger handle used by the helper functions defined further down.
log = logging.getLogger()

In [94]:
# Path of the raw GloVe vectors file.
# NOTE(review): `datapath` comes from gensim.test.utils; given an absolute path it
# presumably returns that path unchanged — confirm, and consider a configurable data dir.
glove_file = datapath('/nlp/data/glove.6B.100d.txt')
# Temp-file path of the word2vec-format copy of the vectors; this is the file the
# model-loading cell reads.
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")

In [66]:
# Write the word2vec-format copy of the GloVe file before loading it. Without this
# step the temp file only exists if an earlier session happened to create it, so the
# load fails on a fresh kernel / clean temp directory.
glove2word2vec(glove_file, word2vec_glove_file)
# Load the converted vectors as a KeyedVectors instance.
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [95]:
# Reference vocabularies used as sentiment anchors. Despite the "_vectors" suffix,
# both arrays hold plain words (strings), not embedding vectors.
postive_vectors = np.array((
    'good',
    'positive',
    'well',
    'fantastic',
    'great',
    'nice',
))
negative_vectors = np.array((
    'bad',
    'negative',
    'awfully',
    'terrible',
    'awful',
    'horrible',
))

In [96]:
# Number of negative reference words.
len(negative_vectors)

6

In [111]:
def get_average_similarity(model: "KeyedVectors", word: str, vectors: np.array) -> Union[float, None]:
    """
    Compute the average similarity between a word and a collection of sentiment words.

    :param model: object whose ``similarity(w1, w2)`` returns a number and raises
        ``KeyError`` when one of the two words is unknown to it
    :param word: the word to score
    :param vectors: np.array (or any other sized iterable, e.g. a list) of reference words (strings)
    :return: mean of ``model.similarity(word, ref)`` over every reference word, or ``None``
        when ``vectors`` is empty or any lookup raises ``KeyError`` (a warning is logged
        in both cases)
    """
    # len() instead of .shape[0] so plain lists/tuples work as well as numpy arrays.
    count = len(vectors)
    if count == 0:
        # Nothing to average over; avoid dividing by zero.
        log.warning('No reference words given for %s.', word)
        return None

    similarity = 0
    for ref in vectors:
        try:
            similarity += model.similarity(word, ref)
        except KeyError as err:
            # Either `word` or `ref` may be the unknown one; the KeyError text says which.
            log.warning('Cannot compare %s with %s: %s', word, ref, err)
            return None
    return similarity / count
    
    
    
    
    

In [115]:
# Average similarity of 'lovely' to the negative reference words.
get_average_similarity(model=model, word='lovely', vectors=negative_vectors)

0.39019736647605896

In [114]:
# Average similarity of 'lovely' to the positive reference words.
get_average_similarity(model=model, word='lovely', vectors=postive_vectors)

0.4703294485807419

In [87]:
# Most similar words according to the loaded vectors. Called on `model` directly:
# it is already a KeyedVectors, and its `.wv` self-alias is deprecated.
model.similar_by_word('fuck')

  """Entry point for launching an IPython kernel.


[('shit', 0.6878010034561157),
 ('fucking', 0.6649960875511169),
 ('bleep', 0.6257128715515137),
 ('hell', 0.5946176052093506),
 ("'cause", 0.5863880515098572),
 ('wanna', 0.5832822322845459),
 ('crap', 0.5778754949569702),
 ('everytime', 0.5760316848754883),
 ('gotta', 0.5757386684417725),
 ("'em", 0.5745130777359009)]

In [88]:
# Probe how an out-of-vocabulary word is reported: `similarity` raises KeyError.
# The error is caught and shown so a full notebook run does not stop at this cell.
# Called on `model` directly; the `.wv` self-alias on KeyedVectors is deprecated.
try:
    model.similarity('fuckasd', 'good')
except KeyError as err:
    print(f'KeyError: {err}')

  """Entry point for launching an IPython kernel.


KeyError: "word 'fuckasd' not in vocabulary"

In [81]:
# Similarity between two opposite-sentiment words. Called on `model` directly;
# the `.wv` self-alias on KeyedVectors is deprecated.
model.similarity('bad', 'good')

  """Entry point for launching an IPython kernel.


0.7702798

In [77]:
# Similarity between a profanity and 'bad'. Called on `model` directly;
# the `.wv` self-alias on KeyedVectors is deprecated.
model.similarity('fuck', 'bad')

  """Entry point for launching an IPython kernel.


0.27072722

In [69]:
# Exploration: list the attributes available on the vectors object. `model.wv` is a
# deprecated alias of `model` itself, so inspect `model` directly.
dir(model)

  """Entry point for launching an IPython kernel.


['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'add',
 'closer_than',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'get_keras_embedding',
 'get_vector',
 'index2entity',
 'index2word',
 'init_sims',
 'load',
 'load_word2vec_format',
 'log_accuracy',
 'log_evaluate_word_pairs',
 'most_similar',
 'most_similar_cosmul',
 'most_similar_to_given',
 'n_similarity',
 'rank',
 'relative_cosine_similarity',
 'save',
 'save_word2vec_for

In [58]:
# Exploration: list the attributes and methods available on `model`.
dir(model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'add',
 'closer_than',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'get_keras_embedding',
 'get_vector',
 'index2entity',
 'index2word',
 'init_sims',
 'load',
 'load_word2vec_format',
 'log_accuracy',
 'log_evaluate_word_pairs',
 'most_similar',
 'most_similar_cosmul',
 'most_similar_to_given',
 'n_similarity',
 'rank',
 'relative_cosine_similarity',
 'save',
 'save_word2vec_for

In [60]:
# Distance between 'woman' and 'man' as reported by `model.distance`.
# NOTE(review): the saved output shows a KeyError for 'woman'; presumably a different
# model was loaded when this cell last ran — re-run to confirm.
model.distance('woman', 'man')

KeyError: "word 'woman' not in vocabulary"

TypeError: not all arguments converted during string formatting

In [7]:
# IPython help lookup for `model` (development aid; prints type, file and docstring).
? model

[0;31mType:[0m        Word2VecKeyedVectors
[0;31mString form:[0m <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7fca21712be0>
[0;31mFile:[0m        ~/.local/share/virtualenvs/nlp-67IoPmqz/lib/python3.7/site-packages/gensim/models/keyedvectors.py
[0;31mDocstring:[0m  
Mapping between words and vectors for the :class:`~gensim.models.Word2Vec` model.
Used to perform operations on the vectors such as vector lookup, distance, similarity etc.


In [45]:
# Called on `model` directly; the `.wv` self-alias on KeyedVectors is deprecated.
# NOTE(review): 'negtive' looks like a typo for 'negative' and is what triggers the
# KeyError in the saved output — confirm whether the out-of-vocabulary lookup was intended.
model.similarity('negtive', 'positive')

  """Entry point for launching an IPython kernel.


KeyError: "word 'negtive' not in vocabulary"

In [8]:
dir(model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'add',
 'closer_than',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'get_keras_embedding',
 'get_vector',
 'index2entity',
 'index2word',
 'init_sims',
 'load',
 'load_word2vec_format',
 'log_accuracy',
 'log_evaluate_word_pairs',
 'most_similar',
 'most_similar_cosmul',
 'most_similar_to_given',
 'n_similarity',
 'rank',
 'relative_cosine_similarity',
 'save',
 'save_word2vec_for