In [1]:
import os

import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec

In [2]:
# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): `nlp` is not referenced again in the cells visible here, and
# newer spaCy releases reportedly dropped shortcut names such as 'en' in
# favour of full package names (e.g. 'en_core_web_sm') — confirm against the
# installed spaCy version before changing.
nlp = spacy.load('en')

In [3]:

# Every path below is derived from this root folder.
start_data_dir = '.'
data_directory = os.path.join(start_data_dir, 'data', 'yelp_dataset')

# Intermediate artifacts live in a sub-folder of the dataset directory.
intermediate_directory = os.path.join(data_directory, 'intermediate')

# Saved word2vec model that is loaded further down.
word2vec_filepath = os.path.join(intermediate_directory, 'word2vec_model_all')

# t-SNE model file (not read in the cells visible here) and the saved
# array of t-SNE coordinates.
tsne_filepath = os.path.join(intermediate_directory, u'tsne_model')
tsne_vectors_filepath = os.path.join(intermediate_directory, u'tsne_vectors.npy')


In [4]:
# Restore the saved word2vec model from disk.
food2vec = Word2Vec.load(word2vec_filepath)

# Prepare the normalized vectors; later cells read them via `wv.syn0norm`.
food2vec.init_sims()

training_epochs = food2vec.train_count
print(u'{} training epochs so far.'.format(training_epochs))

12 training epochs so far.


In [5]:
# Collect one (term, integer index, term count) triple for every
# entry in the food2vec model vocabulary.
ordered_vocab = [
    (word, entry.index, entry.count)
    for word, entry in food2vec.wv.vocab.items()
]

def sort_func(tup):
    """Return the sort key for a (term, index, count) triple.

    The key is the negated count, so an ascending sort places the
    terms with the highest counts first.
    """
    _term, _index, frequency = tup
    return -frequency

# Order the vocabulary by descending term count (most common first).
ordered_vocab.sort(key=sort_func)

# Split the triples into three parallel tuples:
# terms, integer indices, and counts.
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# Build a DataFrame whose rows are the normalized food2vec vectors,
# labelled by term and following the frequency ordering above.
word_vectors = pd.DataFrame(
    data=food2vec.wv.syn0norm[term_indices, :],
    index=ordered_terms,
)

word_vectors



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
-PRON-,-0.041108,0.036578,0.254178,0.100293,0.024046,-0.015691,0.156560,-0.025353,0.056881,0.251399,...,0.037885,0.228471,0.006402,0.044171,-0.117742,-0.040299,-0.077579,0.083768,0.023044,0.085933
be,0.005479,0.063909,0.010632,0.042974,0.001877,0.120524,-0.091634,-0.005393,0.098206,0.191996,...,0.083242,0.081308,0.050597,0.069064,-0.014169,0.131986,-0.154185,0.138034,-0.064113,0.163660
the,-0.019968,-0.018599,0.030403,0.091279,0.090386,0.141655,0.115172,-0.087189,0.027388,0.157450,...,0.100098,0.093134,0.026690,0.066760,-0.089390,-0.100121,-0.186524,0.088263,-0.226336,0.263659
and,0.081139,0.013136,0.106516,0.099658,-0.019699,0.056712,-0.047497,-0.051626,0.069902,0.082603,...,0.091325,0.218942,0.044745,0.031279,0.111612,0.032223,-0.146463,0.064543,-0.198934,0.187167
a,-0.001371,0.201263,-0.025526,0.072634,-0.027957,-0.021553,0.203276,-0.073604,0.015611,0.081752,...,0.113039,-0.065582,0.099271,0.013485,-0.096199,-0.034326,-0.162997,0.027286,-0.093052,0.245475
to,0.092182,-0.102969,0.153912,0.103185,-0.035485,-0.072193,0.197997,-0.077186,-0.057525,0.174217,...,0.030722,0.134326,0.116173,0.024675,0.056588,-0.070667,-0.110054,-0.084305,-0.085341,0.235801
have,-0.038690,0.125542,0.245807,0.060274,0.017346,-0.093866,0.053227,0.001765,-0.009886,0.054052,...,0.056817,0.280355,-0.028588,0.097306,0.054263,0.056908,-0.055594,-0.134492,0.034391,0.244539
of,-0.090765,0.168838,-0.179161,0.054272,0.026726,0.031894,0.017067,-0.022372,-0.134089,0.005996,...,0.086960,-0.000199,0.155634,0.062147,-0.112930,0.002126,-0.072175,0.048358,-0.145528,0.135202
for,-0.074182,0.045114,0.150311,0.002723,0.157037,-0.103920,-0.029432,0.025852,-0.048811,0.128088,...,-0.049487,0.116073,0.098516,0.153534,-0.054499,-0.157289,-0.206760,-0.135975,-0.038987,0.024615
in,-0.063061,0.038797,-0.090068,0.130691,0.201582,-0.088885,0.191099,-0.017693,0.011806,-0.053722,...,0.163475,0.150726,0.103329,0.043864,0.089603,0.017072,-0.148797,0.026060,-0.086892,0.099905


In [6]:
# Drop stop words before pairing terms with their t-SNE coordinates.
# `word_vectors` (terms as row labels, most frequent first) is built in the
# previous cell from the same inputs, so it is not rebuilt here.
# NOTE(review): the saved coordinates are assumed to have been computed from
# exactly this filtered frame, in this row order — confirm against the code
# that produced the .npy file.
tsne_input = word_vectors.drop(spacy.lang.en.STOP_WORDS, errors=u'ignore')

# Load the saved coordinates with numpy directly: the `pd.np` alias is
# deprecated and no longer exists in current pandas releases.
tsne_coordinates = np.load(tsne_vectors_filepath)

# Label each coordinate row with its term; the DataFrame constructor raises
# if the array shape does not match the index length or the two columns.
tsne_vectors = pd.DataFrame(tsne_coordinates,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])

  """Entry point for launching an IPython kernel.


In [7]:
# Preview the first rows of the term -> (x_coord, y_coord) table.
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord
-PRON-,-31.02989,-16.328735
good,-4.699564,8.958115
food,15.531963,-9.4257
order,0.24242,30.115559
great,-9.514729,8.594784


In [8]:
# Copy the row labels (the terms) into a regular column; the plot's hover
# tooltip refers to it as '@word'.
tsne_vectors[u'word'] = tsne_vectors.index

In [9]:
# Load IPython's autoreload extension; mode 1 only re-imports modules that
# were registered with %aimport, which here is just bokeh.
# NOTE(review): auto-reloading a third-party package looks like a debugging
# leftover — confirm it is still needed.
%load_ext autoreload
%autoreload 1
%aimport bokeh

In [13]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

# Route Bokeh output to the notebook so show() renders plots inline.
output_notebook()

In [15]:
# Wrap the coordinate table in a ColumnDataSource so the glyphs and the
# hover tool can refer to its columns (x_coord, y_coord, word) by name.
plot_data = ColumnDataSource(tsne_vectors)

# Create the figure and configure its title, dimensions, and tools.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which
# Bokeh deprecated and later removed.
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   width=800,
                   height=800,
                   tools=(u'pan, wheel_zoom, box_zoom,'
                          u'box_select, reset'),
                   active_scroll=u'wheel_zoom')

# Add a hover tool that shows the word under the cursor.
tsne_plot.add_tools(HoverTool(tooltips=u'@word'))

# Draw every word as a faint circle; the low alpha values keep dense
# regions readable, and the outline turns black on hover.
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# Strip axes, grid, and outline: t-SNE coordinates have no meaningful
# units, only relative positions matter.
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Render the plot; the trailing semicolon suppresses the cell's repr output.
show(tsne_plot);