# Visualize Closest Words in Word2Vec Embeddings

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from gensim.models import Word2Vec



In [3]:
def _load_word2vec_model(path="word2vec.model"):
    """Load the Word2Vec model stored at `path`, reusing it on later calls.

    The loaded model is memoised on this function object so that repeated
    calls (e.g. one per widget interaction) do not re-read the file from disk.
    Re-running the cell that defines this function discards the cache.
    """
    cache = _load_word2vec_model.__dict__.setdefault("_cache", {})
    if path not in cache:
        cache[path] = Word2Vec.load(path)
    return cache[path]


def plot_n_similar(word, n_similar):
    """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors
    of a query word and its most similar words.

    The neighbourhood consists of the 30 most similar words (or 'n_similar' of them, if that is larger).
    The 'word' is plotted in red. The top 'n_similar' most similar words are plotted in blue and
    the rest in green.

    Parameters
    ----------
    word : str
        Query word; it must be in the vocabulary of the model saved as "word2vec.model".
    n_similar : int
        Number of closest words to highlight in blue. Negative values are treated as 0.

    Returns
    -------
    None
        The figure is drawn on a new 9x9 inch matplotlib figure. If 'word' is not in the
        vocabulary, a message is printed and nothing is plotted.
    """
    model = _load_word2vec_model("word2vec.model")
    keyed_vectors = model.wv

    # A free-text widget easily produces words the model has never seen;
    # report that instead of failing with a KeyError traceback.
    if word not in keyed_vectors:
        print("'{}' is not in the model vocabulary.".format(word))
        return

    n_similar = max(0, n_similar)

    # gets the list of most similar words once; the first 'n_similar' are the
    # highlighted ones, the remainder (up to 30 in total) is the background
    neighbours = [name for name, _ in
                  keyed_vectors.most_similar(positive=[word], topn=max(30, n_similar))]
    closest = neighbours[:n_similar]
    others = neighbours[n_similar:]

    word_labels = [word] + closest + others
    color_list = ['red'] + ['blue'] * len(closest) + ['green'] * len(others)

    # stacks the query vector and every neighbour vector in a single step
    # (one row per label, in the same order as word_labels)
    arrays = np.vstack([keyed_vectors[label] for label in word_labels])
    n_samples, n_features = arrays.shape

    # Reduces the dimensionality to (at most) 20 dimensions with PCA;
    # n_components may not exceed either dimension of the input
    reduc = PCA(n_components=min(20, n_samples, n_features)).fit_transform(arrays)

    # Kept from the original implementation: only changes how NumPy prints floats
    np.set_printoptions(suppress=True)

    # Finds t-SNE coordinates for 2 dimensions; perplexity has to stay below
    # the number of samples
    perplexity = min(15, max(1, n_samples - 1))
    Y = TSNE(n_components=2, random_state=0, perplexity=perplexity).fit_transform(reduc)

    # Sets everything up to plot
    df = pd.DataFrame({'x': Y[:, 0],
                       'y': Y[:, 1],
                       'words': word_labels,
                       'color': color_list})

    fig, ax = plt.subplots(figsize=(9, 9))

    # Basic plot, drawn explicitly on the axes created above
    sns.regplot(data=df,
                x="x",
                y="y",
                fit_reg=False,
                marker="o",
                scatter_kws={'s': 40,
                             'facecolors': color_list},
                ax=ax)

    # Adds annotations one by one with a loop
    for point in df.itertuples(index=False):
        ax.text(point.x,
                point.y,
                '  ' + point.words.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=point.color,
                weight='normal')

    # Leaves a margin around the points so the labels are not clipped
    ax.set_xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    ax.set_ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)

    ax.set_title('t-SNE visualization for {}'.format(word.title()))

In [12]:
# visually interact
import ipywidgets as widgets

# Slider selecting how many of the closest words are highlighted.
# continuous_update=False: redraw only when the handle is released, since
# every redraw recomputes the t-SNE projection.
n_similar = widgets.IntSlider(description="# Closest Words",
                              value=5,
                              min=1,
                              max=25,
                              continuous_update=False,
                              style={'description_width': 'initial'})

# The query word is a text box; with continuous_update=False the plot is
# refreshed when the text is submitted rather than on each keystroke, so
# half-typed words are not looked up.
# The trailing ';' keeps the notebook from echoing the function repr.
widgets.interact(plot_n_similar,
                 word=widgets.Text(value="toronto",
                                   description="word",
                                   continuous_update=False),
                 n_similar=n_similar);

interactive(children=(Text(value='toronto', description='word'), IntSlider(value=5, description='# Closest Wor…

<function __main__.plot_n_similar>