# Explore Daedalus Word Vector Space Models
The first three lines in the first cell run code found in utility scripts which reside in the same folder as the notebook.
The remaining lines import dependent libraries and set up the notebook.

In [4]:
%run ./utility
%run ./model_utility
%run ./bokeh_plot_utility

In [5]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import bokeh; bokeh.__version__
from bokeh.plotting import show, output_notebook, output_file

In [6]:

# Configure Bokeh (imported above from bokeh.plotting) to render plots inline in the notebook.
output_notebook()

# %autosave 120
#  %config IPCompleter.greedy=True


In [7]:
%%javascript
// Replace the output area's scroll check with a function that always answers
// "no", regardless of how many lines the output has.
// NOTE(review): presumably this keeps large plots from being put in a scroll box - confirm.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

Available vector space models are stored in the subfolder ./data, and the function **get_model_names** retrieves the filenames of all dat-files found there, i.e. it returns the available models stored in the ./data sub-folder. New models can be added simply by uploading them to **~/notebooks/VaticanTexts/data** (with for instance WinSCP). The function **load_model_vector** loads a specific model as an instance of gensim's Word2Vec model class.

Remember to re-run all dependent cells (use the **run** button or **shift-enter**) after a new model is loaded. Automatic execution of subsequent cells is not enabled.

In [8]:
word_vectors = None

@interact(model_filename=ModelUtility.get_model_names('./data'))
def load_model(model_filename):
    """Load the chosen model file from ./data into the global `word_vectors`.

    The decorator offers the names returned by
    ModelUtility.get_model_names('./data') as choices for `model_filename`.
    The loaded object replaces the module-level `word_vectors`, which the
    cells further down read.
    """
    global word_vectors
    model_path = os.path.join('./data', model_filename)
    word_vectors = ModelUtility.load_model_vector(model_path)
    print('Model {} loaded...'.format(model_filename))

### Word vector expression examples
The **compute_most_similar_expression** function in ModelUtility parses a string of words, each (optionally) prefixed with a plus or minus sign. The extracted "positive" and "negative" words are then used as arguments to gensim's **most_similar** function, which, using numeric vector operations (add and subtract), finds the words most similar (by cosine similarity) to the resulting vector.

In [17]:
# Short alias for the expression helper; the query cells below call it by this name.
w2vcompute = ModelUtility.compute_most_similar_expression

# Keep the first ten entries of the result for the expression "man - boy + girl".
w2vcompute(word_vectors, "man - boy + girl")[:10]

[('woman', 0.7157337069511414),
 ('creature', 0.5413386225700378),
 ('person', 0.5249775648117065),
 ('drunken', 0.4849020838737488),
 ('maiden', 0.4841741621494293),
 ('child', 0.4810275435447693),
 ('canaanite', 0.4793747663497925),
 ('prophetess', 0.4675884246826172),
 ('son', 0.4525896906852722),
 ('man-god', 0.45154744386672974)]

In [18]:
# NOTE(review): this is not the last expression of the cell, so its value is
# computed but not displayed.
w2vcompute(word_vectors, "heaven - good + evil")[:10]

# The same positive/negative words passed directly to most_similar; as the last
# expression of the cell, this is the result that gets displayed.
word_vectors.most_similar(positive=['heaven', 'evil'], negative=['good'])


[('flung', 0.550287127494812),
 ('abyss', 0.5427220463752747),
 ('heavens', 0.5331960916519165),
 ('fury', 0.5196661949157715),
 ('curtain', 0.5175411105155945),
 ('sin', 0.5130163431167603),
 ('clings', 0.5093719959259033),
 ('tempter', 0.5081292390823364),
 ('death', 0.499502032995224),
 ('underworld', 0.49372774362564087)]

In [19]:
# Keep only the first three entries of the result for "christ - good + evil".
w2vcompute(word_vectors, "christ - good + evil")[:3]

[('crucified', 0.5988434553146362),
 ('death', 0.5572134852409363),
 ('risen', 0.5409574508666992)]

In [24]:
# Keep only the first three entries of the result for "italy - rome + london".
# NOTE(review): the saved output of this cell lists ten rows, so it was produced
# by a different slice than [:3] - re-run the cell to refresh it.
w2vcompute(word_vectors, "italy - rome + london")[:3]



[('presbyterate', 0.3858250379562378),
 ('parishes', 0.3382936120033264),
 ('turin', 0.3314526677131653),
 ('umbria', 0.32949939370155334),
 ('diocese', 0.328837513923645),
 ('local', 0.32045096158981323),
 ('diocesan', 0.3154292702674866),
 ('molise', 0.3135276734828949),
 ('incardinated', 0.30767229199409485),
 ('naples', 0.3055073320865631)]

### Reduce the high-dimensional word vectors to 2D using t-SNE (sklearn)

t-SNE transforms a set of coordinates in a high-dimensional vector space (our word vectors) into a "faithful" representation in a lower-dimensional space, e.g. 3D-space or a 2D-plane. The algorithm was introduced in 2008 by van der Maaten and Hinton [\[1\]](http://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf). t-SNE tries to preserve *local distances* when coordinates are transformed to lower dimensions (i.e. it preserves clusters).
Things to consider:
1. What is the impact of reducing *only a subset* of the coordinates (compared to reducing the entire vocabulary)? That is, if we are interested in a subset of words, we can 1) reduce the entire vocabulary or 2) only reduce the subset (for speed). 
2. t-SNE has a “perplexity” parameter which affects the local vs global clustering of the data. The perplexity affects the (guessed) distribution of neighbouring points to a given point. It is important to try different values for this parameter.

In [25]:
# dimensionality reduction - selected word vectors are converted to 2d vectors
def reduce_dimensions(word_vectors, words_of_interest=None, n_components=2, perplexity=30):
    """Project the vectors of the selected words onto a few dimensions with t-SNE.

    Parameters
    ----------
    word_vectors
        Object that returns a vector for ``word_vectors[word]``. When
        `words_of_interest` is None it must also expose a ``vocab`` mapping
        whose keys are the words of the model.
    words_of_interest : iterable of str, optional
        Words to project. None (the default) selects every key of
        ``word_vectors.vocab``.
    n_components : int
        Number of output dimensions, forwarded to TSNE.
    perplexity : number
        Forwarded to TSNE unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per word, in the order of `words_of_interest`. The coordinate
        columns are named 'x', 'y' (and 'z' for a third component; 'dim0',
        'dim1', ... when more than three are requested), followed by a
        'words' column holding the word of each row.
    """
    from sklearn.manifold import TSNE

    # None means "the whole vocabulary". Previously the model object itself was
    # handed to t-SNE and None was written to the 'words' column.
    if words_of_interest is None:
        words_of_interest = word_vectors.vocab.keys()
    # Materialize once so that a generator or key view can be both looked up
    # and stored in the result.
    words_of_interest = list(words_of_interest)

    vectors = [word_vectors[w] for w in words_of_interest]

    # random_state is fixed so that repeated runs give the same layout.
    tsne_model = TSNE(n_components=n_components, perplexity=perplexity, verbose=0, random_state=0)
    tsne_w2v = tsne_model.fit_transform(vectors)

    # Column names follow the number of components instead of assuming two.
    if n_components <= 3:
        columns = ['x', 'y', 'z'][:n_components]
    else:
        columns = ['dim{}'.format(i) for i in range(n_components)]

    # put everything in a dataframe
    tsne_df = pd.DataFrame(tsne_w2v, columns=columns)
    tsne_df['words'] = words_of_interest
    return tsne_df

''' 
total_tsne_df = reduce_dimensions(word_vectors, list(word_vectors.vocab.keys()), n_components=2)
'''

' \ntotal_tsne_df = reduce_dimensions(word_vectors, list(word_vectors.vocab.keys()), n_components=2)\n'

Plot top N words with different perplexities:

In [26]:
TITLES = { 'title': "Word vectors reduced to XY using T-SNE", 'xlabel': 'X-component', 'ylabel': 'Y-component' }
@interact(w=(0,100),perplexity=(1,100))
def update(w=10,perplexity=30):
    """Plot the first `w` vocabulary words after reducing them to 2D with t-SNE.

    Parameters
    ----------
    w : int
        Number of words taken from the start of ``word_vectors.vocab``.
    perplexity : int
        Passed on to reduce_dimensions.

    Reads the module-level `word_vectors`; nothing is returned, the plot is
    shown as a side effect.
    """
    words_of_interest = list(word_vectors.vocab.keys())[:w]
    if not words_of_interest:
        # The slider allows w=0, and an empty selection gives t-SNE nothing to fit.
        print('No words selected - increase w to plot.')
        return
    tsne_df = reduce_dimensions(word_vectors, words_of_interest,perplexity=perplexity)
    p2 = bokeh_scatter_plot_xy_words(tsne_df, line_data=False, **TITLES)
    show(p2)
    

### Plot word pairs

In [28]:
# Hand-picked words, reduced to 2D with the default reduce_dimensions settings.
words_of_interest =  ['heaven', 'hell', 'boy', 'girl', 'husband', 'wife', 'son', 'daughter', 'father', 'mother']
tsne_df = reduce_dimensions(word_vectors, words_of_interest)
# NOTE(review): line_data=True presumably makes the helper connect the words of
# each pair with a line - confirm in bokeh_plot_utility.
p3 = bokeh_scatter_plot_xy_words(tsne_df, line_data=True, **TITLES)
show(p3, notebook_handle=True)

In [32]:
# Two hand-written word lists that are plotted together below.
# NOTE(review): 'flagitous' looks like a misspelling and 'gone to the dogs' is a
# multi-word phrase; both are kept only if they occur in the model vocabulary.
holy_words = ['divine', 'hallowed', 'humble', 'pure', 'revered', 'righteous', 'spiritual', 'sublime', 'believing', 'clean', 'devotional', 'faithful', 'good', 'innocent', 'moral', 'perfect', 'upright', 'angelic', 'blessed', 'chaste', 'consecrated', 'dedicated', 'devoted', 'devout', 'faultless', 'glorified', 'god-fearing', 'godlike', 'godly', 'immaculate', 'just', 'messianic', 'pietistic', 'pious', 'prayerful', 'reverent', 'sacrosanct', 'sainted', 'saintlike', 'saintly', 'sanctified', 'seraphic', 'spotless', 'uncorrupt', 'undefiled', 'untainted', 'unworldly', 'venerable', 'venerated']
holy_antonyms = ['lewd', 'nefarious', 'shameless', 'sinful', 'vicious', 'vile', 'wanton', 'warped', 'wicked', 'abandoned', 'base', 'debased', 'debauched', 'degenerate', 'degraded', 'dirty', 'fast', 'low', 'mean', 'perverted', 'twisted', 'vitiate', 'vitiated', 'bad', 'dirty-minded', 'dissolute', 'evil', 'filthy', 'flagitous', 'gone to the dogs', 'kinky', 'lascivious', 'licentious', 'miscreant', 'profligate', 'putrid', 'rotten', 'unhealthy', 'unnatural', 'villainous']

# Keep only the words the loaded model knows, so that the vector lookup cannot fail.
words_of_interest = [ x for x in holy_words + holy_antonyms if x in word_vectors.vocab.keys() ]
tsne_df = reduce_dimensions(word_vectors, words_of_interest)
# Colour per row: green for entries of holy_words, firebrick for everything else.
tsne_df['color'] = tsne_df.words.apply(lambda x: 'green' if x in holy_words else 'firebrick')
p4 = bokeh_scatter_plot_xy_words(tsne_df, **TITLES)
show(p4)

### Visualize anthologies

In [33]:
def compute_similarity_to_anthologies(word_vectors, scale_x_pair, scale_y_pair, word_list):
    """Score every word against two axes, each spanned by a pair of words.

    An axis is the vector of the first word of its pair minus the vector of
    the second word. Each word's score on an axis is one minus the cosine
    distance between the axis and the word's vector.

    Returns a DataFrame with the columns 'words', 'x' and 'y', one row per
    entry of `word_list`.
    """
    def axis_direction(pair):
        # first word of the pair minus the second
        return word_vectors[pair[0]] - word_vectors[pair[1]]

    def scores_along(direction):
        scores = []
        for word in word_list:
            scores.append(1 - spatial.distance.cosine(direction, word_vectors[word]))
        return scores

    x_direction = axis_direction(scale_x_pair)
    y_direction = axis_direction(scale_y_pair)

    x_scores = scores_along(x_direction)
    y_scores = scores_along(y_direction)

    return pd.DataFrame({ 'words': word_list, 'x': x_scores, 'y': y_scores })

def compute_similarity_to_single_words(word_vectors, word_x, word_y, word_list):
    """Score every word in `word_list` against two reference words.

    The 'x' column holds ``word_vectors.similarity(word, word_x)`` and the 'y'
    column ``word_vectors.similarity(word, word_y)``.

    Returns a DataFrame with the columns 'words', 'x' and 'y', one row per
    entry of `word_list`.
    """
    def similarity_column(reference_word):
        return [ word_vectors.similarity(candidate, reference_word) for candidate in word_list ]

    x_scores = similarity_column(word_x)
    y_scores = similarity_column(word_y)

    return pd.DataFrame({ 'words': word_list, 'x': x_scores, 'y': y_scores })

def seed_word_toplist(word_vectors, seed_word, topn=100):
    """Return `seed_word` followed by the words that most_similar reports for it.

    `topn` is forwarded to ``word_vectors.most_similar``; only the first
    element of each returned entry (the word) is kept.
    """
    neighbours = word_vectors.most_similar(seed_word, topn=topn)
    toplist = [ seed_word ]
    for entry in neighbours:
        toplist.append(entry[0])
    return toplist
    

In [34]:
import bokeh.plotting as bp
from bokeh.models import ColumnDataSource, HoverTool, BoxSelectTool, LabelSet, Label, Arrow, OpenHead
from bokeh.plotting import figure, show, output_notebook, output_file

TOOLS = "pan,wheel_zoom,box_zoom,reset,hover,previewsave"

def bokeh_plot_xy_words(df_words, title='', xlabel='', ylabel='', line_data=False, filename=None, default_color='blue'):
    """Build a Bokeh scatter plot with one labelled diamond marker per word.

    Parameters
    ----------
    df_words : pandas.DataFrame
        Needs the columns 'x', 'y' and 'words'. An optional 'color' column
        supplies one colour per row.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    line_data, filename
        Accepted but not used by this function.
    default_color : str
        Marker colour used when `df_words` has no 'color' column.

    Returns
    -------
    The Bokeh figure. It is not shown here; the caller decides when to display it.
    """
    plot = bp.figure(plot_width=700, plot_height=600, title=title, tools=TOOLS, toolbar_location="above") #, x_axis_type=None, y_axis_type=None, min_border=1)

    # A column name when per-row colours exist, otherwise one colour for all markers.
    color = 'color' if 'color' in df_words.columns else default_color

    plot.xaxis[0].axis_label = xlabel
    plot.yaxis[0].axis_label = ylabel

    # One data source shared by the markers and their labels. Previously the
    # glyph was given the raw DataFrame while only the labels used `source`.
    source = ColumnDataSource(df_words)

    plot.diamond(x='x', y='y', size=8, source=source, alpha=0.5, color=color)

    labels = LabelSet(x='x', y='y', text='words', level='glyph',text_font_size="9pt", x_offset=5, y_offset=5, source=source, render_mode='canvas')
    plot.add_layout(labels)

    return plot

def show_similarity_to_anthologies(word_vectors, xpair, ypair, word_list):
    """Plot the in-vocabulary words of `word_list` against two word-pair axes.

    Words missing from ``word_vectors.vocab`` are dropped before the scores
    are computed with compute_similarity_to_anthologies. Each axis label shows
    the second word of its pair, 50 spaces, then the first word. The plot is
    shown as a side effect; nothing is returned.
    """
    def axis_label(pair):
        return '{}{}{}'.format(pair[1], 50 * ' ', pair[0])

    vocabulary = word_vectors.vocab.keys()
    known_words = [ word for word in word_list if word in vocabulary ]

    similarity_df = compute_similarity_to_anthologies(word_vectors, xpair, ypair, known_words)

    plot = bokeh_plot_xy_words(similarity_df, xlabel=axis_label(xpair), ylabel=axis_label(ypair))
    show(plot)


In [35]:
# Each axis is the vector of the first word minus the vector of the second
# (see compute_similarity_to_anthologies): x is west - east, y is south - north.
xpair = ('west', 'east')
ypair = ('south', 'north')
# Reuses the two word lists defined in the earlier plotting cell.
word_list = holy_words + holy_antonyms
show_similarity_to_anthologies(word_vectors, xpair, ypair, word_list)