# Using word2vec to Semantically Plot Self-Improvement

   There are a lot of good guides and tools for exploring word2vec, I'd recommend taking a look at all of them
   :
   
   https://github.com/dominiek/word2vec-explorer
   
   https://gist.github.com/aneesha/da9216fb8d84245f7af6edaa14f4efa9#file-display_closestwords_tsnescatterplot-ipynb
   
   https://medium.com/explore-artificial-intelligence/word2vec-a-baby-step-in-deep-learning-but-a-giant-leap-towards-natural-language-processing-40fe4e8602ba
   
   https://github.com/pg2455/U.S-Presidential-Speeches
   
   https://matplotlib.org/index.html
   
   
   

In [1]:
import os
import gensim
import random
# Need the interactive Tools for Matplotlib
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE

In [2]:
# If you haven't downloaded word2vec embeddings, you should do so here
# load pre-trained word2vec embeddings
# The embeddings can be downloaded from command prompt:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"


--2019-02-12 10:10:15--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.178.125
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.178.125|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [3]:
# load pre-trained word2vec embeddings
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [4]:
# Let's just see what this looks like
model.similar_by_word('computer')

[('computers', 0.7979379892349243),
 ('laptop', 0.6640493273735046),
 ('laptop_computer', 0.6548868417739868),
 ('Computer', 0.6473334431648254),
 ('com_puter', 0.6082079410552979),
 ('technician_Leonard_Luchko', 0.5662748217582703),
 ('mainframes_minicomputers', 0.5617721080780029),
 ('laptop_computers', 0.5585449934005737),
 ('PC', 0.5539618134498596),
 ('maker_Dell_DELL.O', 0.5519254207611084)]

In [5]:
# This function uses TSNE to reduce all the demensions of a vector down to 2 
# so we can plot it on a graph
def display_closestwords_tsnescatterplot(model, word):
    
    arr = np.empty((0,300), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.similar_by_word(word)
    
    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)
    plt.show()


In [6]:
display_closestwords_tsnescatterplot(model, 'compassionate')

<IPython.core.display.Javascript object>

In [7]:
# This function uses TSNE to reduce all the demensions of a vector down to 3 
# so we can plot it on a 3D graph
def display_closestwords_tsnescatterplot3d(model, word):
    
    arr = np.empty((0,300), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.similar_by_word(word)
    
    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    # find tsne coords for 3 dimensions
    tsne = TSNE(n_components=3, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    z_coords = Y[:, 2]
    # display scatter plot

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    c = ['b', 'r', 'g', 'y']
    for label, xs, ys, zs in zip(word_labels, x_coords, y_coords, z_coords):
        ax.scatter(xs, ys, zs, c=random.choice(c))
        ax.text(xs, ys, zs, label)
    plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)
    plt.show()

In [8]:
display_closestwords_tsnescatterplot3d(model, "compassionate")

<IPython.core.display.Javascript object>

In [11]:
model.most_similar(positive= 'sugar', topn= 6)

[('refined_sugar', 0.7480024099349976),
 ('cane_sugar', 0.63625168800354),
 ('turbinado', 0.6269999742507935),
 ('Sugar', 0.6100630760192871),
 ('cocoa', 0.6048755645751953),
 ('Refined_sugar', 0.6003995537757874)]

# Plotting the Path

In [61]:
# Vector Path gives us an array of words that are on a semantic path

def vector_path(model, word1, word2):
    direction = model[word2] - model[word1]
    stops = 5
    breadcrumbs = [word1]
    for i in range(stops):
        if i == 0:
            lastword = word1
        else:
            lastword = breadcrumbs[i-1]
        nextwords = model.similar_by_vector(model[lastword]+(direction*(i/stops)))
        #print(nextwords)
        similarity = 0.0
        X = ''
        for j,y in nextwords:
            if word2 == j:
                X = word2
                print('found')
                break
            elif model.similarity(j,word2) > similarity:
                similarity = model.similarity(j,word2)
                X = j  
        breadcrumbs.append(X)
        if X == word2:
            break
        #update direction
        direction = model[word2] - model[X]
    #print(breadcrumbs)
    return breadcrumbs
    #closest = sorted_by_similarity(words, direction)[:10]

In [62]:
print(vector_path(model, 'understanding', 'remorseful'))

found
['understanding', 'understood', 'empathy', 'remorseful']


In [66]:
# Similar to our closest word functions above, this plots the words we found
# to be on a path and relates them with arrows, or the quiver function
def printVectorPath(model, worda, wordb):
    wordarray = vector_path(model, worda,wordb)
    print(wordarray)
    arr = np.empty((0,300), dtype='f')
    # add the vector for each of the closest words to the array
    for word in wordarray:
        arr = np.append(arr, np.array([model[word]]), axis=0)
        
    # find tsne coords for 3 dimensions
    tsne = TSNE(n_components=3, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    z_coords = Y[:, 2]
    # display scatter plot

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    c = ['b', 'r', 'g']
    for label, xs, ys, zs in zip(wordarray, x_coords, y_coords, z_coords):
        ax.scatter(xs, ys, zs, c=random.choice(c))
        ax.text(xs, ys, zs, label)
        # plots our arrows between the previous word and the current word
        if label!= wordarray[0]:
            ax.quiver(U, V, W, xs, ys, zs)
        U,V,W = xs, ys, zs
    plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)
    plt.show()

In [67]:
printVectorPath(model, 'understanding', 'remorseful')

found
['understanding', 'understood', 'empathy', 'remorseful']


<IPython.core.display.Javascript object>

## Your Top Current Adjectives

In [55]:
adjective1 = 'bold'
adjective2 = 'fun'
adjective3 = 'reliable'
current_adj = [adjective1, adjective2, adjective3]

## Your Goal Adjectives

In [56]:
adjective4 = 'innovative'
adjective5 = 'entertaining'
adjective6 = 'compassionate'
goal_adj = [adjective4, adjective5, adjective6]

## Compute Closest Aligned

In [57]:
tuples = []
for worda in current_adj:
    print(worda)
    distance = 0.0
    closest = ()
    for wordb in goal_adj:
        if distance < model.similarity(worda,wordb):
            distance = model.similarity(worda,wordb)
            closest = worda, wordb
    tuples.append(closest)
print(tuples)


bold
fun
reliable
[('bold', 'innovative'), ('fun', 'entertaining'), ('reliable', 'innovative')]


## Your Plots

In [68]:
adjective1,adjective4 = tuples[0]
adjective2,adjective5 = tuples[1]
adjective3,adjective6 = tuples[2]


printVectorPath(model, adjective1, adjective4)



found
['bold', 'menthol_flake', 'shoulder_silhouettes_updos', 'innovative']


<IPython.core.display.Javascript object>

found
['fun', 'entertaining']


<IPython.core.display.Javascript object>

found
['reliable', 'efficient', 'most_accourate', 'innovative']


<IPython.core.display.Javascript object>

In [None]:
printVectorPath(model, adjective2, adjective5)

In [None]:
printVectorPath(model, adjective3, adjective6)