# Test workbook  
### Various iterations to examine the visual impact of changing parameters in sklearn.manifold.TSNE
### Additional check to ensure that the embeddings generated using the TensorFlow code show similar relationships to those generated using the full w2v implementation in gensim

In [1]:
import custom_embedding_functions as embed

import numpy as np
import pandas as pd
import re
import os
import bokeh.plotting as bp
from bokeh.models import HoverTool, LabelSet
from bokeh.io import output_notebook
from nltk.corpus import stopwords
from scipy import stats, optimize
from sklearn.manifold import TSNE
from six.moves import xrange
from tempfile import gettempdir
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from collections import OrderedDict
bp.output_notebook()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonhodgkinson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the raw movie-script corpus and lower-case it
with open('../Data/Movie_Scripts.txt', 'r') as file:
    clean = file.read().lower()

# Replace every character that is not a lower-case ASCII letter with a space, then tokenise
clean = re.sub("[^a-z]", " ", clean)
full_text, _ = embed.tokenize(clean, exclude_stopwords=True)

# Build the dataset structures; reversed_dictionary is what plot_data later uses for labels
data, count, dictionary, reversed_dictionary = embed.build_dataset(full_text, 10000)

# Train the embedding layer of the NN; the returned weights are the word vectors
final_embeddings = embed.build_graph_and_train(
    data,
    batch_size=100,
    embedding_size=100,
    skip_window=1,
    num_skips=2,
    num_sampled=20,
)

# Sanity check: expect a v*n matrix, where v = vocab size and n = embedding dimensions
print("Embedding matrix shape: ", final_embeddings.shape)


Document contains  74946 distinct words
Total word count =  5350737
Initialized
Average loss at step  0 :  82.4874496459961
Average loss at step  20000 :  8.894759816868604
Average loss at step  40000 :  3.814242089881748
Average loss at step  60000 :  3.7127262602224946
Average loss at step  80000 :  3.7116149370133877
Average loss at step  100000 :  3.6538093282818793
Embedding matrix shape:  (10000, 100)


#### a) Perplexity = 30, learning_rate=200, iterations = 5000

In [3]:
# Number of embedding rows (words) intended to be projected and displayed.
# NOTE(review): every plot_data(...) call in this notebook passes plot_only=400 as a literal
# rather than reading this variable, so changing it here alone does not change those plots.
plot_only = 400

def plot_data(final_embeddings, plot_only=400, perplexity=30, n_components=2, learning_rate=200, n_iter=5000,
              labels=None, random_state=None):
    """Project the leading rows of an embedding matrix with t-SNE and show a labelled Bokeh scatter plot.

    Parameters
    ----------
    final_embeddings : 2-D array, one row per word
        Row ``i`` is labelled with ``labels[i]``.
    plot_only : int
        Number of leading rows to project and draw. Capped at the number of rows available so
        that the points and their labels always have the same length.
    perplexity, n_components, learning_rate, n_iter
        Forwarded to ``sklearn.manifold.TSNE``. Only the first two output components are drawn,
        so ``n_components`` must be at least 2.
    labels : mapping or sequence indexed by row number, optional
        Text shown next to each point. Defaults to the notebook-level ``reversed_dictionary``.
    random_state : int or None, optional
        Forwarded to ``TSNE``; pass an int to make repeated runs produce the same layout.

    Returns
    -------
    None. The figure is displayed with ``bp.show``.
    """
    # Keep the number of labels in step with the number of rows that can actually be sliced
    plot_only = min(plot_only, len(final_embeddings))
    label_lookup = reversed_dictionary if labels is None else labels

    # Set the parameters for the t-SNE algorithm. learning_rate is forwarded explicitly:
    # previously it was accepted by this function but never reached TSNE.
    tsne = TSNE(perplexity=perplexity, n_components=n_components, init='pca', n_iter=n_iter,
                learning_rate=learning_rate, method='exact', random_state=random_state)
    # Use t-SNE to transform a slice of the embedding matrix
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])

    # Plot the data
    x = low_dim_embs[:, 0]
    y = low_dim_embs[:, 1]
    lab = [label_lookup[i] for i in range(plot_only)]

    # Colour each point by position. The shifted coordinates are clipped to one byte so that the
    # "%02x" formatting always yields a valid #rrggbb string, even for far-out or negative values.
    red = np.clip(x.astype(int) + 180, 0, 255)
    green = np.clip(y.astype(int) + 180, 0, 255)
    colors = ["#%02x%02x%02x" % (r, g, 160) for r, g in zip(red, green)]

    tools = 'zoom_in, zoom_out, pan, save'
    p = bp.figure(title='t-SNE projection of embeddings based on the text of movie scripts', tools=tools)
    p.circle(x, y, radius=1, fill_color=colors, fill_alpha=0.8, line_color=None)
    # add labels
    source = bp.ColumnDataSource(data=dict(x = x, y = y, lab = lab))
    text_labels = LabelSet(x='x', y='y', text='lab', level='glyph', x_offset=3, y_offset=3,
                           source=source, render_mode='canvas', text_font_size="7pt")
    p.add_layout(text_labels)
    p.plot_height=750
    p.plot_width=750
    bp.show(p)
    
# (a) baseline call: perplexity=30, learning_rate=200, n_iter=5000
plot_data(
    final_embeddings,
    plot_only=400,
    perplexity=30,
    n_components=2,
    learning_rate=200,
    n_iter=5000,
)

![](img/bokeh_plot_1.png)

#### b) Perplexity = 30, learning_rate = 100, iterations = 5000

t-SNE uses a non-convex objective function, which is minimized using gradient descent from a random initialization (incidentally, this is why multiple runs with the same parameters can yield slightly different plots). The learning rate therefore has the same meaning as elsewhere in ML, i.e. it controls the size of the adjustment made in each iteration. A value that is too small can cause the results to be compressed into a dense cloud with no visible clustering (i.e. the optimizer may not have converged on a local minimum within the given number of iterations), while a value that is too large can also cause a failure to converge, which will look like a ball of roughly equidistant points.

In [4]:
# (b) same arguments as (a) except learning_rate=100
plot_data(
    final_embeddings,
    plot_only=400,
    perplexity=30,
    n_components=2,
    learning_rate=100,
    n_iter=5000,
)

![](img/bokeh_plot_2.png)

#### c) Perplexity = 30, learning_rate = 200, iterations = 7500

In [5]:
# (c) same arguments as (a) except n_iter=7500
plot_data(
    final_embeddings,
    plot_only=400,
    perplexity=30,
    n_components=2,
    learning_rate=200,
    n_iter=7500,
)

![](img/bokeh_plot_3.png)

#### d) Perplexity = 10, learning rate = 200, iterations = 5000

Perplexity has the same meaning as elsewhere in NLP, i.e. it is a measure of the uncertainty of a distribution, and is defined as $2^{H}$, where $H$ is the Shannon entropy in bits. A lower perplexity implies that we are selecting from a smaller number of possible values for an unknown word. For the purposes of t-SNE, though, it can be thought of as a dial that sets the effective number of nearest neighbors considered in the manifold learning process (i.e. the number of neighboring words whose distances the algorithm should focus on preserving). Typically, larger datasets require a larger perplexity for t-SNE to yield meaningful clusters.

In [6]:
# (d) same arguments as (a) except perplexity=10
plot_data(
    final_embeddings,
    plot_only=400,
    perplexity=10,
    n_components=2,
    learning_rate=200,
    n_iter=5000,
)

![](img/bokeh_plot_4.png)

#### e) Perplexity = 20, learning rate = 200, iterations = 5000

In [7]:
# (e) same arguments as (a) except perplexity=20
plot_data(
    final_embeddings,
    plot_only=400,
    perplexity=20,
    n_components=2,
    learning_rate=200,
    n_iter=5000,
)

![](img/bokeh_plot_5.png)

#### f) Perplexity = 30, learning_rate=200, iterations = 5000, ie same as (a) but with dimension = 300

In [9]:
# Retrain on the same `data` with 300-dimensional embeddings; every other training argument
# matches the first training run. Note that this rebinds `final_embeddings`.
final_embeddings = embed.build_graph_and_train(
    data,
    batch_size=100,
    embedding_size=300,
    skip_window=1,
    num_skips=2,
    num_sampled=20,
)

# (f) same t-SNE arguments as (a), applied to the 300-dimensional embeddings
plot_data(
    final_embeddings,
    plot_only=400,
    perplexity=30,
    n_components=2,
    learning_rate=200,
    n_iter=5000,
)

Initialized
Average loss at step  0 :  101.82319641113281
Average loss at step  20000 :  9.001744016996957
Average loss at step  40000 :  3.9096954645633697
Average loss at step  60000 :  3.7703053982771935
Average loss at step  80000 :  3.777942430484295
Average loss at step  100000 :  3.709189660924673


![](img/bokeh_plot_6.png)

### Compare to embeddings created by gensim word2vec  
  
To reuse the same `plot_data` function, we need two things:  
a) final_embeddings: a numpy array of the embedding vectors, where words have been ranked by frequency  
b) reversed_dictionary: a mapping of the frequency rank to the actual word (ie the label)

#### g) Perplexity = 30, learning_rate = 200, iterations = 7500, ie same as (c) but with embeddings created using gensim

In [10]:
# Load the gensim word vectors (memory-mapped, read-only)
movie_vectors = KeyedVectors.load('../Data/movie_model.wv', mmap='r')

# Each vocab entry carries an integer `.index`; build the index -> word mapping from it.
# NOTE(review): the notebook treats this index as the word's frequency rank -- confirm for this model.
movie_dictionary = {}
for word, vocab_entry in movie_vectors.vocab.items():
    movie_dictionary[vocab_entry.index] = word

# Rebuild the mapping with its keys in ascending order
reversed_dictionary = {rank: movie_dictionary[rank] for rank in sorted(movie_dictionary)}
# (evaluate `reversed_dictionary` on its own line here to inspect the mapping)

In [11]:
# Gather the vectors in index order, so that row i holds the vector of reversed_dictionary[i],
# and convert the result to a numpy array in one step
final_embeddings = np.asarray(
    [movie_vectors.get_vector(reversed_dictionary[idx]) for idx in range(len(reversed_dictionary))]
)

# Shown as the cell output; should be vocab*dim
final_embeddings.shape

(52523, 100)

In [12]:
# Spot-check the row/label alignment: look up the word stored under index 16724 (an arbitrary
# pick). The following cells compare that row of final_embeddings with gensim's own vector.
reversed_dictionary[16724]

'snowstorm'

In [13]:
# Element-wise difference between row 16724 of final_embeddings and the vector gensim returns
# for 'snowstorm'; an all-zero result means the row matches the label for this entry.
final_embeddings[16724] - movie_vectors.get_vector('snowstorm')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [14]:
# Confirm the same check as a single number: count the non-zero entries of the difference
# (0 means the two vectors are identical)
np.count_nonzero(final_embeddings[16724] - movie_vectors.get_vector('snowstorm'))

0

In [15]:
# (g) the gensim-derived final_embeddings / reversed_dictionary are now in place, so the same
# plot function can be used; arguments match (c)
plot_data(
    final_embeddings,
    plot_only=400,
    perplexity=30,
    n_components=2,
    learning_rate=200,
    n_iter=7500,
)

![](img/bokeh_plot_1_1.png)