## Explore Daedalus Word Vector Space Models
The first three lines in the first cell run code found in utility scripts which reside in the same folder as the notebook.
The remaining lines import dependent libraries and set up the notebook.

### Setup Notebook and Dependencies
The first three lines in the first cell run code found in utility scripts which reside in the same folder as the notebook. The remaining lines import dependent libraries and set up the notebook.

In [17]:
# Setup Environment
%run ./common/wordvector-utility
%run ./common/vectorspace-utility
%run ./common/widgets-utility

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

from IPython.core.display import display, HTML, clear_output
from IPython.core.interactiveshell import InteractiveShell
from nltk import word_tokenize

import ipywidgets as widgets
import bokeh.models as bm
import bokeh.plotting as bp
import bokeh.io as bio
import pandas as pd
import numpy as np
import types

%autosave 120
%config IPCompleter.greedy=True

InteractiveShell.ast_node_interactivity = "all"
TOOLS = "pan,wheel_zoom,box_zoom,reset,previewsave"

bp.output_notebook()


Autosaving every 120 seconds


In [18]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) { return false; }

<IPython.core.display.Javascript object>

### Select Word Vector Space Model (to be Used in Subsequent Steps)
Available vector space models are stored in the sub-folder ./data. New models can be added simply by uploading them to **~/notebooks/VaticanTexts/data** (for instance with WinSCP).
- **get_model_names** retrieves the filenames of all dat-files. 
- **load_model_vector** loads and returns the word vectors of a model stored in the ./data sub-folder.
Remember to re-run all dependent cells (use the **run** button or **shift-enter**) after a new model is loaded. Automatic execution of subsequent cells is not enabled.

In [19]:
# Current Model
class ModelState:
    """Keep track of the available word vector models and the one currently loaded.

    Instance fields:
        data_folder: folder that is scanned for model files.
        filenames:   model filenames reported by WordVectorUtility.get_model_names.
        filename:    name of the currently selected model (None if the folder is empty).
        wordvectors: the loaded model, or None until set_model has been called.
    """

    def __init__(self, data_folder):
        """Scan *data_folder* for models and preselect the first one (nothing is loaded yet)."""
        self.data_folder = data_folder
        self.filenames = WordVectorUtility.get_model_names(data_folder)
        # An empty folder used to raise IndexError here; keep the object usable instead.
        self.filename = self.filenames[0] if len(self.filenames) > 0 else None
        # Same attribute name that set_model assigns and the other cells read.
        self.wordvectors = None

    def set_model(self, filename=None):
        """Load the model stored in *filename* (default: the currently selected model).

        Stores the result in ``self.wordvectors`` and prints a confirmation. When no
        model is available at all, a message is printed and nothing is loaded.
        """
        # Imported locally so the method does not rely on another cell or a
        # %run script having put ``os`` into the notebook namespace.
        import os

        filename = filename or self.filename
        if filename is None:
            print('No model found in {}'.format(self.data_folder))
            return

        self.filename = filename
        self.wordvectors = WordVectorUtility.load_model_vector(os.path.join(self.data_folder, filename))
        print('Model {} loaded...'.format(self.filename))

state = ModelState('./vsm-data')

z = BaseWidgetUtility()
z.filename = z.create_select_widget(description='Model', options=state.filenames, value=state.filename, layout=widgets.Layout(width='75%'))
w = widgets.interactive(state.set_model, filename=z.filename)
display(widgets.VBox((z.filename,) + (w.children[-1],)))
w.update()

### Reduce the high-dimensional word vectors to 2D using t-SNE (sklearn)

t-SNE transforms a set of coordinates in a high-dimensional vector space (our word vectors) into a "faithful" representation in a lower-dimensional space, e.g. 3D-space or a 2D-plane. The algorithm was introduced in 2008 by van der Maaten and Hinton [\[1\]](http://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf). t-SNE tries to preserve *local distances* when coordinates are transformed to lower dimensions (i.e. it preserves clusters).
Things to consider:
1. What is the impact of reducing *only a subset* of the coordinates (compared to reducing the entire vocabulary)? That is, if we are interested in a subset of words, we can 1) reduce the entire vocabulary or 2) only reduce the subset (for speed). 
2. t-SNE has a “perplexity” parameter which affects the local vs global clustering of the data. The perplexity affects the (guessed) distribution of neighbouring points to a given point. It is important to try different values for this parameter.

In [25]:
# Setup T-SNE Plot
def setup_tsne_plot():
    """Create and show the t-SNE scatter plot and return the objects needed to update it.

    The figure starts with a single placeholder point (empty label at 0, 0) and is
    displayed immediately with a notebook handle, so later changes to the data source
    can be pushed to the already rendered output.

    Returns:
        types.SimpleNamespace with
            handle: notebook handle returned by ``bp.show``
            source: data source with the columns ``w`` (label), ``x`` and ``y``
            points: the scatter renderer
            labels: the LabelSet drawing the words
    """
    xp = bp.figure(
        plot_width=900, plot_height=600, title='T-SNE Dimensionality Reduction of Word Vector Space Model',
        tools=TOOLS, toolbar_location="right",
        x_axis_type=None, y_axis_type=None
    )
    # Go through the ``bm`` alias from the setup cell: the bare ColumnDataSource name
    # is only imported by a later cell and would raise NameError on a fresh kernel.
    source = bm.ColumnDataSource(dict(w=[''],x=[0], y=[0]))
    crs = xp.scatter(x='x', y='y', source=source)
    crl = bm.LabelSet(x='x', y='y', text='w', level='glyph', x_offset=-2, y_offset=5, source=source)
    xp.add_layout(crl)
    handle = bp.show(xp, notebook_handle=True)
    return types.SimpleNamespace(handle=handle,source=source,points=crs,labels=crl)

def tsne_plot_reduce(words_of_interest, method='tsne', perplexity=30):
    """Reduce the vectors of *words_of_interest* in the current model to two dimensions.

    The vectors are collected with WordVectorUtility.create_X_m_space_matrix and then
    handed to VectorSpaceHelper.reduce_dimensions with a fixed random seed.

    Args:
        words_of_interest: words whose vectors should be reduced.
        method: reducer name forwarded to VectorSpaceHelper.reduce_dimensions.
        perplexity: perplexity setting forwarded to the reducer.

    Returns:
        Tuple ``(X_2_space, index2word)``: the reduced coordinates and the second value
        returned by create_X_m_space_matrix (presumably the word for each row — verify
        against the utility script).
    """
    matrix, vocabulary = WordVectorUtility.create_X_m_space_matrix(
        state.wordvectors, words_of_interest
    )
    reduced = VectorSpaceHelper.reduce_dimensions(
        matrix,
        method=method,
        n_components=2,
        init='pca',
        random_state=55887,
        perplexity=perplexity,
    )
    return reduced, vocabulary
    
# Render the t-SNE figure once; the returned namespace carries the notebook handle and
# data source that later cells use to update the plot in place.
tsne_plot = setup_tsne_plot()


In [26]:
# T-SNE Plot
def update_tsne_plot(x,y,w):
    """Replace the plotted coordinates (*x*, *y*) and labels (*w*) and refresh the shown figure."""
    new_columns = {'w': w, 'x': x, 'y': y}
    tsne_plot.source.data.update(new_columns)
    # Push the changed data source to the figure that is already on screen.
    bio.push_notebook(handle=tsne_plot.handle)
    
def reduce_dimensions_and_display_words(raw_text, method='tsne'):
    """Tokenize *raw_text*, reduce the words' vectors to 2D and show them in the t-SNE plot.

    Nothing happens while the user is in the middle of a word (text not ending with a
    space) or when fewer than three distinct tokens have been entered.

    Args:
        raw_text: contents of the word input widget.
        method: reducer name forwarded to tsne_plot_reduce. The interactive wrapper
            passes this keyword, so the callback has to accept it; the default keeps
            the previous behaviour for direct calls.
    """
    # Wait until the current word is finished before doing any (slow) reduction.
    if raw_text and raw_text[-1] != ' ':
        return

    words_of_interest = list(set(word_tokenize(raw_text)))

    try:
        if len(words_of_interest) >= 3:
            u.progress.value = 2
            X_2_space, index2word = tsne_plot_reduce(words_of_interest, method=method)
            u.progress.value = 3
            update_tsne_plot(x=X_2_space[:, 0], y=X_2_space[:, 1], w=index2word)
            u.progress.value = 5
    finally:
        # Always return the progress bar to idle, also when the reduction fails.
        u.progress.value = 0
    
# Widget bundle for the word plot: a progress bar, a free-text word input and a
# reducer selector.
# NOTE(review): `wf` is not defined in the visible cells — presumably provided by
# ./common/widgets-utility; confirm.
u = BaseWidgetUtility(
    progress = wf.create_int_progress_widget(
        description='Idle', min=0, max=5, step=1, value=0, layout=widgets.Layout(width='80%')
    ),
    words=wf.create_text_area_input_widget(
        description='Words', placeholder='(enter or paste words to be plotted)',
        value='',layout=widgets.Layout(width='80%', height='200px') #, continuous_update=False
    ),
    method=wf.create_select_widget(description='Method', options=['tsne', 'pca'], value='pca')
)

# Call reduce_dimensions_and_display_words with the widget values as keyword
# arguments (raw_text, method).
iwa = widgets.interactive(reduce_dimensions_and_display_words, raw_text=u.words, method=u.method)
# Layout: input and selector side by side, then the progress bar, then the last child
# of the interactive container.
display(widgets.VBox(
    (widgets.HBox((u.words,) + (u.method,)),) +
    (u.progress,) + (iwa.children[-1],)))
# Run the callback once with the initial widget values.
iwa.update()

In [22]:
# Preview: the first 250 vocabulary keys of the loaded model as one space-separated string.
' '.join(list(state.wordvectors.vocab.keys())[:250])

'vehicles projektledare analyser addendum moderniseringar fördröjt assessorn synnerlig mätningarna advancement quality energiska bildning tillkommer go venedig massiv informationsbitar kyrkas kopparverk jämförelsevis cern broschyrer landsvägen praktisera hvari pentium lösta 1860s skellefteå modifierade elan ledningarna navigera dockebyggnaden synen heritage ridån ocb individual jämnhet rutnät göran bomärke utdelning öpnas beryktade levnadsbeskrivning längsgående gasmarknaden kablarna designade halvledarteknik utger pasteur avtog stannade continental minnesbilder tv-program häri mobilen registrering framstöt kallad allsidig vistelsen exklusivt regulatorn egde jnträsenterna gastemperaturen teda inhämtas försiktiga enig torkar tangenter kornen gammaldags observed metallurg magnetosfären systemtänkande funderingar avspelning underkastades tändkulemotorn sällan knöts watnet dekorerat ningen informationssökning ökningen page vrids murbruk strumpor udden förbinder trakter polen tävla mål knag

### Word Vector Expression Calculator
The **compute_most_similar_expression** function in WordVectorUtility parses a string of words, each (optionally) prefixed with a plus or minus sign. The extracted "positive" and "negative" words are then used as arguments to gensim's **most_similar** function, which, using numeric vector operations (add and subtract), finds the words most similar (cosine similarity) to the result of the given expression.

In [23]:
#
def update_expression_plot(exp_points, result_points, expr_trail):
    """Load new column data into the three sources of the expression plot and refresh it.

    Args:
        exp_points: columns for the expression words source.
        result_points: columns for the result words source.
        expr_trail: columns for the expression trail (arrow) source.
    """
    plot = expression_plot
    updates = (
        (plot.expr_words_source, exp_points),
        (plot.result_words_source, result_points),
        (plot.expr_trail_source, expr_trail),
    )
    for data_source, columns in updates:
        data_source.data.update(columns)
    # Push the changed sources to the figure that is already on screen.
    bio.push_notebook(handle=plot.handle)


def setup_expression_plot():
    """Create and show the word vector expression plot and return its update hooks.

    The figure contains a blue cross at the origin, black points and labels for the
    words of the expression, green points and labels for the result words, red arrows
    from the origin to each expression word and blue arrows for the expression trail.
    All sources start with a single placeholder row.

    Returns:
        types.SimpleNamespace with
            handle: notebook handle returned by ``bp.show``
            points / labels: renderer and LabelSet of the expression words
            expr_words_source, result_words_source, expr_trail_source: the data
                sources that are updated when a new expression is computed
    """
    xp = bp.figure(
        plot_width=900, plot_height=600, title='Word Vector Expressions',
        tools=TOOLS, toolbar_location="right",
        # axes are kept visible here (x_axis_type=None, y_axis_type=None is not set)
    )

    # Mark the origin, which is where all expression arrows start.
    xp.cross(x=0, y=0, size=10, color='blue')

    # Go through the ``bm`` alias from the setup cell: the bare ColumnDataSource name
    # is only imported by a later cell and would raise NameError on a fresh kernel.
    expr_words_source = bm.ColumnDataSource(dict(w=[''],x=[0], y=[0]))
    result_words_source = bm.ColumnDataSource(dict(w=[''],x=[0], y=[0]))
    expr_trail_source = bm.ColumnDataSource(dict(w=[''],x=[0], y=[0], x2=[0], y2=[0]))

    # Words that are part of the expression.
    crs = xp.scatter(x='x', y='y', color='black', source=expr_words_source)
    crl = bm.LabelSet(x='x', y='y', text='w', level='glyph', x_offset=-2, y_offset=5, source=expr_words_source)
    xp.add_layout(crl)

    # Words returned as most similar to the expression.
    xp.scatter(x='x', y='y', size=5, color='green', source=result_words_source)
    rl = bm.LabelSet(x='x', y='y', text='w', level='glyph', x_offset=-2, y_offset=5, source=result_words_source)
    xp.add_layout(rl)

    # One arrow from the origin to every expression word.
    xp.add_layout(bm.Arrow(
        x_start=0,
        y_start=0,
        x_end='x',
        y_end='y',
        line_color='red',
        source=expr_words_source,
        end=bm.NormalHead(size=10, fill_color='black', fill_alpha=1.0, line_alpha=0.2),
    ))

    # One arrow per trail segment, from (x, y) to (x2, y2).
    xp.add_layout(bm.Arrow(
        x_start='x',
        y_start='y',
        x_end='x2',
        y_end='y2',
        line_color='blue',
        source=expr_trail_source,
        end=bm.NormalHead(size=10, fill_color='black', fill_alpha=1.0, line_alpha=0.2),
    ))

    handle = bp.show(xp, notebook_handle=True)
    return types.SimpleNamespace(
        handle=handle,
        points=crs,
        labels=crl,
        expr_words_source=expr_words_source,
        result_words_source=result_words_source,
        expr_trail_source=expr_trail_source
    )

    
# Render the expression figure once; the returned namespace carries the notebook handle
# and the data sources that update_expression_plot writes to.
expression_plot = setup_expression_plot()

In [24]:
# Calculator
# NOTE(review): history_state does not appear to be read anywhere in the visible
# cells — confirm whether it is still needed.
history_state = ['', 'man - pojke + flicka']
# Widget bundle for the expression calculator: reducer selector, perplexity slider and
# expression input. This rebinds `z`, which the model selection cell also uses.
# NOTE(review): `wf` is not defined in the visible cells — presumably provided by
# ./common/widgets-utility; confirm.
z = BaseWidgetUtility(
    method = wf.create_select_widget(
        description='Reducer',
        options=['pca', 'tsne' ],
        value='tsne'
    ),
    # NOTE(review): value=0 lies outside the declared min=10..max=100 range — confirm
    # the intended start value.
    perplexity=wf.create_int_slider(description='Perplexity', min=10, max=100, step=1, value=0),
    expression = wf.create_text_input_widget(
        description='Expression',
        placeholder='(enter expression e.g. sverige + oslo - stockholm)',
        value='',
        layout=widgets.Layout(width='90%')
    )
)

def _build_expression_trail(expr_points):
    """Turn the signed expression points into arrow segments (columns x, y, w, x2, y2).

    Starting at the origin, each point is added (positive word) or subtracted
    (negative word) to the running position. Every segment starts at one running
    position and ends at the next one.
    """
    trail = [dict(x=0, y=0, w='', x2=0, y2=0)]
    for point in expr_points:
        previous = trail[-1]
        sign = 1 if point['s'] else -1
        step = dict(
            x=previous['x'] + sign * point['x'],
            y=previous['y'] + sign * point['y'],
            w=previous['w'] + (' + ' if point['s'] else ' - ') + point['w'],
            x2=0,
            y2=0,
        )
        # The previous position's arrow ends where this step starts.
        previous['x2'], previous['y2'] = step['x'], step['y']
        trail.append(step)

    # Drop the origin entry and the final position (which has no outgoing arrow).
    segments = trail[1:-1]
    return {key: [segment[key] for segment in segments] for key in ('x', 'y', 'w', 'x2', 'y2')}


def compute_expression(expression, method, perplexity, n_top=10):
    """Evaluate a word vector expression and show the result in the expression plot.

    The expression is resolved with WordVectorUtility.compute_most_similar_expression.
    The words of the expression and the (at most *n_top*) most similar words are
    reduced to 2D, pushed to the expression plot and the result words are listed in a
    table together with their weights.

    Args:
        expression: expression text, e.g. ``sverige + oslo - stockholm``.
        method: reducer name forwarded to tsne_plot_reduce.
        perplexity: perplexity forwarded to tsne_plot_reduce.
        n_top: number of most similar words to consider.
    """
    result, options = WordVectorUtility.compute_most_similar_expression(state.wordvectors, expression)

    if result is None or options is None:
        return

    positives = options['positives'] or []
    negatives = options['negatives'] or []
    expression_words = positives + negatives

    # Filter words and weights together: filtering only the words left the weights
    # longer than the word list, which broke the table at the end of this function.
    hits = [(word, weight) for word, weight in result[:n_top] if word not in expression_words]
    result_words = [word for word, _ in hits]
    result_weights = [weight for _, weight in hits]

    if not expression_words and not result_words:
        # Nothing to plot (previously an empty result crashed on unpacking).
        return

    # Honour the selected reducer; it used to be silently ignored.
    X_2_space, index2word = tsne_plot_reduce(
        expression_words + result_words, method=method, perplexity=perplexity
    )

    word2index = {word: index for index, word in enumerate(index2word)}
    expr_index = [word2index[x] for x in expression_words]
    result_index = [word2index[x] for x in result_words]

    expr_words = dict(
        x=list(X_2_space[expr_index, 0]),
        y=list(X_2_space[expr_index, 1]),
        w=[index2word[i] for i in expr_index],
        # True for words that are added, False for words that are subtracted
        s=[index2word[i] in positives for i in expr_index],
    )
    result_points = dict(
        x=list(X_2_space[result_index, 0]),
        y=list(X_2_space[result_index, 1]),
        w=[index2word[i] for i in result_index],
    )

    # Row-wise view of expr_words: one dict (x, y, w, s) per expression word.
    expr_points = [dict(zip(expr_words, row)) for row in zip(*expr_words.values())]
    expr_trail = _build_expression_trail(expr_points)

    update_expression_plot(expr_words, result_points, expr_trail)

    df = pd.DataFrame(result_points).assign(weight=result_weights)
    display(HTML(df.to_html()))
        

# Call compute_expression with the calculator widget values as keyword arguments.
w = widgets.interactive(compute_expression, expression=z.expression, method=z.method, perplexity=z.perplexity)
# Layout: expression input on top, reducer and perplexity side by side, then the last
# child of the interactive container.
display(widgets.VBox(
    (widgets.HBox((z.expression,)),) +
    (widgets.HBox((z.method,) + (z.perplexity,)),) +
    (w.children[-1],)
    )
)
# w.update()
# sverige + oslo - stockholm  

### Test - T-SNE Reduction of Words

Plot top N words with different perplexities:

In [None]:

def scatter_plot_xy_words(
    df_words, title='', xlabel='', ylabel='',
    line_data=False, filename=None, default_color='blue',
    plot_width=900, plot_height=900
):
    """Build a labelled scatter plot of words from a DataFrame.

    Args:
        df_words: DataFrame with the columns ``x``, ``y`` and ``words``; an optional
            ``color`` column sets the colour of each point.
        title, xlabel, ylabel: figure title and axis labels.
        line_data: when true, an arrow is drawn from row 0 to row 1, from row 2 to
            row 3 and so on (rows are treated as consecutive pairs).
        filename: not used by this function.
        default_color: point colour when ``df_words`` has no ``color`` column.
        plot_width, plot_height: figure size.

    Returns:
        The figure; it is not shown by this function.
    """
    plot = bp.figure(
        plot_width=plot_width, plot_height=plot_height, title=title,
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", toolbar_location="above"
    )

    # One shared data source for the points, the labels and the hover tool.
    source = bm.ColumnDataSource(df_words)
    color = 'color' if 'color' in df_words.columns else default_color

    plot.scatter(x='x', y='y', size=8, source=source, alpha=0.5, color=color)

    plot.xaxis[0].axis_label = xlabel
    plot.yaxis[0].axis_label = ylabel

    labels = bm.LabelSet(x='x', y='y', text='words', level='glyph',
                      text_font_size="9pt", x_offset=5, y_offset=5, source=source, render_mode='canvas')

    # The "hover" entry in the tools string creates the HoverTool configured here.
    hover = plot.select(dict(type=bm.HoverTool))
    hover.tooltips={"word": "@words"}

    if line_data:
        end_arrow = bm.OpenHead(line_color="firebrick", line_width=1, size=10)
        for i in range(1, len(df_words.index), 2):
            x_start, y_start = df_words.iloc[i-1]['x'], df_words.iloc[i-1]['y']
            x_end, y_end = df_words.iloc[i]['x'], df_words.iloc[i]['y']
            plot.add_layout(bm.Arrow(end=end_arrow, x_start=x_start, y_start=y_start, x_end=x_end, y_end=y_end))

    plot.add_layout(labels)
    return plot


# The cells below call this function as bokeh_scatter_plot_xy_words; keep that name
# working.
bokeh_scatter_plot_xy_words = scatter_plot_xy_words

def reduce_dimensions(X_m_space, n_components=2, perplexity=30):
    """Reduce the selected word vectors to *n_components* dimensions with scikit-learn's TSNE.

    Args:
        X_m_space: the word vectors to reduce, one row per word.
        n_components: number of output dimensions (2 gives XY coordinates).
        perplexity: t-SNE perplexity setting.

    Returns:
        The result of ``TSNE.fit_transform`` for *X_m_space*. The random state is fixed
        to 0.
    """
    from sklearn.manifold import TSNE

    return TSNE(
        n_components=n_components,
        perplexity=perplexity,
        verbose=0,
        random_state=0,
    ).fit_transform(X_m_space)

TITLES = { 'title': "Word vectors reduced to XY using T-SNE", 'xlabel': 'X-component', 'ylabel': 'Y-component' }


# Uses names from the setup cell (widgets, bp) so this cell does not depend on
# imports made by later cells.
@widgets.interact(w=(0,100),perplexity=(1,100))
def update(w=10,perplexity=30):
    """Plot the first *w* vocabulary words of the current model, reduced to 2D with t-SNE.

    Args:
        w: number of vocabulary words to include (slider 0..100).
        perplexity: t-SNE perplexity (slider 1..100).
    """
    words_of_interest = list(state.wordvectors.vocab.keys())[:w]
    if not words_of_interest:
        # Slider at 0: there is nothing to reduce or plot.
        return

    X_m_space = [state.wordvectors[word] for word in words_of_interest]
    X_2_space = reduce_dimensions(X_m_space, n_components=2, perplexity=perplexity)
    tsne_df = pd.DataFrame(X_2_space, columns=['x', 'y']).assign(words=words_of_interest)
    # Call the plot helper by the name it is defined under in this notebook.
    p2 = scatter_plot_xy_words(tsne_df, line_data=True, **TITLES)
    bp.show(p2)
    

### Plot word pairs

In [None]:
# Plot five word pairs; consecutive entries in the list form a pair.
# NOTE(review): this cell looks out of date — `word_vectors` is not assigned in the
# visible cells, and reduce_dimensions as defined in this notebook takes a matrix of
# vectors (not a model plus a word list) and its result is used here as a DataFrame.
# Confirm before running.
words_of_interest =  ['heaven', 'hell', 'boy', 'girl', 'husband', 'wife', 'son', 'daughter', 'father', 'mother']
tsne_df = reduce_dimensions(word_vectors, words_of_interest)
p3 = bokeh_scatter_plot_xy_words(tsne_df, line_data=True, **TITLES)
show(p3, notebook_handle=True)

In [None]:
# Two hand-picked word lists: words related to "holy" and their antonyms.
holy_words = ['divine', 'hallowed', 'humble', 'pure', 'revered', 'righteous', 'spiritual', 'sublime', 'believing', 'clean', 'devotional', 'faithful', 'good', 'innocent', 'moral', 'perfect', 'upright', 'angelic', 'blessed', 'chaste', 'consecrated', 'dedicated', 'devoted', 'devout', 'faultless', 'glorified', 'god-fearing', 'godlike', 'godly', 'immaculate', 'just', 'messianic', 'pietistic', 'pious', 'prayerful', 'reverent', 'sacrosanct', 'sainted', 'saintlike', 'saintly', 'sanctified', 'seraphic', 'spotless', 'uncorrupt', 'undefiled', 'untainted', 'unworldly', 'venerable', 'venerated']
holy_antonyms = ['lewd', 'nefarious', 'shameless', 'sinful', 'vicious', 'vile', 'wanton', 'warped', 'wicked', 'abandoned', 'base', 'debased', 'debauched', 'degenerate', 'degraded', 'dirty', 'fast', 'low', 'mean', 'perverted', 'twisted', 'vitiate', 'vitiated', 'bad', 'dirty-minded', 'dissolute', 'evil', 'filthy', 'flagitous', 'gone to the dogs', 'kinky', 'lascivious', 'licentious', 'miscreant', 'profligate', 'putrid', 'rotten', 'unhealthy', 'unnatural', 'villainous']

# Keep only the words that exist in the model's vocabulary, then colour the points
# green (holy words) or firebrick (antonyms).
# NOTE(review): this cell looks out of date — `word_vectors` is not assigned in the
# visible cells, and reduce_dimensions as defined in this notebook takes a matrix of
# vectors (not a model plus a word list) and its result is used here as a DataFrame.
# Confirm before running.
words_of_interest = [ x for x in holy_words + holy_antonyms if x in word_vectors.vocab.keys() ]
tsne_df = reduce_dimensions(word_vectors, words_of_interest)
tsne_df['color'] = tsne_df.words.apply(lambda x: 'green' if x in holy_words else 'firebrick')
p4 = bokeh_scatter_plot_xy_words(tsne_df, **TITLES)
show(p4)

### Visualize anthologies

In [None]:
def compute_similarity_to_anthologies(word_vectors, scale_x_pair, scale_y_pair, word_list):
    """Score each word against two axes that are each defined by a pair of words.

    An axis is the difference vector ``word_vectors[pair[0]] - word_vectors[pair[1]]``.
    The score of a word on an axis is the cosine similarity between the word's vector
    and that difference vector.

    Args:
        word_vectors: mapping from word to vector (indexable by word).
        scale_x_pair: two words defining the x axis.
        scale_y_pair: two words defining the y axis.
        word_list: words to score; every word must be present in *word_vectors*.

    Returns:
        DataFrame with the columns ``words``, ``x`` and ``y``.
    """
    # Imported locally: no visible cell brings scipy's ``spatial`` module into scope,
    # so the previous ``spatial.distance.cosine`` lookup depended on hidden state.
    from scipy.spatial.distance import cosine

    scale_x = word_vectors[scale_x_pair[0]] - word_vectors[scale_x_pair[1]]
    scale_y = word_vectors[scale_y_pair[0]] - word_vectors[scale_y_pair[1]]

    # scipy returns the cosine *distance*; similarity is its complement.
    word_x_similarity = [1 - cosine(scale_x, word_vectors[x]) for x in word_list]
    word_y_similarity = [1 - cosine(scale_y, word_vectors[x]) for x in word_list]

    df = pd.DataFrame({ 'words': word_list, 'x': word_x_similarity, 'y': word_y_similarity })

    return df

def compute_similarity_to_single_words(word_vectors, word_x, word_y, word_list):
    """Score each word in *word_list* by its similarity to two anchor words.

    Args:
        word_vectors: model object offering ``similarity(word_a, word_b)``.
        word_x: anchor word for the ``x`` column.
        word_y: anchor word for the ``y`` column.
        word_list: words to score.

    Returns:
        DataFrame with the columns ``words``, ``x`` and ``y``.
    """
    def similarities_to(anchor):
        # Similarity of every listed word to one anchor word, in list order.
        return [word_vectors.similarity(word, anchor) for word in word_list]

    return pd.DataFrame({
        'words': word_list,
        'x': similarities_to(word_x),
        'y': similarities_to(word_y),
    })

def seed_word_toplist(word_vectors, seed_word, topn=100):
    """Return *seed_word* followed by the words of its *topn* most similar entries.

    Uses ``word_vectors.most_similar`` and keeps the first item of each returned
    entry. (An earlier variant of this helper called ``most_similar_cosmul`` instead.)
    """
    neighbours = word_vectors.most_similar(seed_word, topn=topn)
    toplist = [seed_word]
    toplist.extend(entry[0] for entry in neighbours)
    return toplist
    

In [None]:
import bokeh.plotting as bp
from bokeh.models import ColumnDataSource, HoverTool, BoxSelectTool, LabelSet, Label, Arrow, OpenHead
from bokeh.plotting import figure, show, output_notebook, output_file

TOOLS = "pan,wheel_zoom,box_zoom,reset,hover,previewsave"

def bokeh_plot_xy_words(df_words, title='', xlabel='', ylabel='', line_data=False, filename=None, default_color='blue'):
    """Build a 700x600 figure with one labelled diamond marker per word.

    Args:
        df_words: DataFrame with the columns ``x``, ``y`` and ``words``; an optional
            ``color`` column sets the colour of each marker.
        title, xlabel, ylabel: figure title and axis labels.
        line_data, filename: accepted but not used by this function.
        default_color: marker colour when ``df_words`` has no ``color`` column.

    Returns:
        The figure; it is not shown by this function.
    """
    fig = bp.figure(plot_width=700, plot_height=600, title=title, tools=TOOLS, toolbar_location="above")

    # A 'color' column, when present, is referenced by name; otherwise one fixed colour.
    if 'color' in df_words.columns:
        marker_color = 'color'
    else:
        marker_color = default_color

    x_axis, y_axis = fig.xaxis[0], fig.yaxis[0]
    x_axis.axis_label = xlabel
    y_axis.axis_label = ylabel

    label_source = ColumnDataSource(df_words)

    fig.diamond(x='x', y='y', size=8, source=df_words, alpha=0.5, color=marker_color)

    fig.add_layout(
        LabelSet(
            x='x', y='y', text='words', level='glyph', text_font_size="9pt",
            x_offset=5, y_offset=5, source=label_source, render_mode='canvas',
        )
    )

    return fig

def show_similarity_to_anthologies(word_vectors, xpair, ypair, word_list):
    """Plot the words of *word_list* on two axes that are each defined by a word pair.

    Words missing from the model's vocabulary are skipped. Each axis label shows the
    second word of the pair, 50 spaces, then the first word of the pair.
    """
    def axis_label(pair):
        return '{}{}{}'.format(pair[1], 50 * ' ', pair[0])

    known_words = [word for word in word_list if word in word_vectors.vocab.keys()]
    similarity_df = compute_similarity_to_anthologies(word_vectors, xpair, ypair, known_words)
    figure_ = bokeh_plot_xy_words(
        similarity_df, xlabel=axis_label(xpair), ylabel=axis_label(ypair)
    )
    show(figure_)


In [None]:
# Example: place the holy words and their antonyms on a west/east by south/north plane.
# NOTE(review): `word_vectors` is not assigned in the visible cells — presumably it
# should be the currently loaded model; confirm before running.
xpair = ('west', 'east')
ypair = ('south', 'north')
word_list = holy_words + holy_antonyms
show_similarity_to_anthologies(word_vectors, xpair, ypair, word_list)

In [None]:
from ipywidgets import interact
import numpy as np

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
output_notebook()

# Demo data: one period of a sine wave sampled at 2000 points.
x = np.linspace(0, 2*np.pi, 2000)
y = np.sin(x)

# Figure with a fixed y range; `r` is the line renderer whose data the interactive
# callback in the next cell overwrites.
p = figure(title="simple line example", plot_height=300, plot_width=600, y_range=(-5,5))
r = p.line(x, y, color="#2222aa", line_width=3)

In [None]:
def update(f, w=1, A=1, phi=0):
    """Redraw the demo line as ``A * f(w * x + phi)`` and push the change to the notebook.

    Args:
        f: name of the function to plot: "sin", "cos" or "tan".
        w: angular frequency.
        A: amplitude.
        phi: phase offset.

    Raises:
        ValueError: if *f* is not one of the supported function names.
    """
    functions = {"sin": np.sin, "cos": np.cos, "tan": np.tan}
    if f not in functions:
        # Previously an unknown name left ``func`` unbound and failed further down.
        raise ValueError('Unknown function: {}'.format(f))
    r.data_source.data['y'] = A * functions[f](w * x + phi)
    push_notebook()


# Show the figure once, before wiring up the sliders. Calling show() inside the
# callback rendered a new figure on every slider change, and the first
# push_notebook() had no shown figure to update.
show(p, notebook_handle=True)

interact(update, f=["sin", "cos", "tan"], w=(0,100), A=(1,5), phi=(0, 20, 0.1))