## The Culture of International Relations - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

In [10]:
# Setup
%load_ext autoreload
%autoreload 2

import sys, os, collections, zipfile
import re, typing.re

# Put '.' and '..' at the front of the module search path unless they are
# already listed, so modules in the current or parent directory can be imported.
sys.path = list(set(['.', '..']) - set(sys.path)) + sys.path

import nltk, textacy, spacy 
import pandas as pd
import ipywidgets as widgets
import bokeh, bokeh.plotting, bokeh.models, matplotlib.pyplot as plt
import common.utility as utility
import common.widgets_utility as widgets_utility
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.treaty_state as treaty_repository
import treaty_corpus
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning) 

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display, set_matplotlib_formats

logger = utility.getLogger('corpus_text_analysis')

import pickle
import topic_model
import topic_model_utility
import textacy_corpus_utility as textacy_utility

# Apply the project's default pandas display settings.
utility.setup_default_pd_display(pd)

# Data location, tagset table and treaty index shared by the cells below.
DATA_FOLDER, PATTERN = '../data',  '*.txt'
PERIOD_GROUP = 'years_1945-1972'
DF_TAGSET = pd.read_csv('../data/tagset.csv', sep='\t').fillna('')
WTI_INDEX = treaty_repository.load_wti_index(data_folder=DATA_FOLDER)
TREATY_TIME_GROUPINGS = WTI_INDEX.get_treaty_time_groupings()

%matplotlib inline
# set_matplotlib_formats('svg')   
bokeh.plotting.output_notebook()

# Accessors for the corpus container and its corpus; evaluated lazily on each call.
current_corpus_container = lambda: textacy_utility.CorpusContainer.container()
current_corpus = lambda: textacy_utility.CorpusContainer.corpus()

# Accessors for the topic model singleton and two of its attributes.
current_state = lambda: topic_model_utility.TopicModelContainer.singleton()
current_data = lambda: current_state().data
current_topic_model = lambda: current_state().topic_model


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


2018-12-27 21:49:00,560 : INFO : WTI index loaded!


## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>

In [11]:
import textacy_corpus_gui
# Show the corpus load GUI; any failure is logged instead of raised.
try:
    textacy_corpus_gui.display_corpus_load_gui(DATA_FOLDER, WTI_INDEX, current_corpus_container())
except Exception as ex:
    logger.error(ex)

VBox(children=(IntProgress(value=0, layout=Layout(width='90%'), max=5), HBox(children=(Dropdown(description='C…

## <span style='color: green;'>MODEL</span> Compute or Load a Topic Model<span style='color: red; float: right'>MANDATORY RUN</span>

### <span style='color: green;'>MODEL</span> Compute a new Topic Model<span style='color: red; float: right'>OPTIONAL</span>

<span style='color: red'>TODO</span>: Check if display order is correct (compare to relevant_topic_ids order)


In [12]:
import topic_model_gui
# Show the topic model GUI. Failures are logged and then re-raised so that the
# traceback stays visible in the cell output (the log call used to be
# unreachable because it came after the bare `raise`).
try:
    topic_model_gui.display_topic_model_gui(current_state(), current_corpus(), DF_TAGSET)
except Exception as ex:
    logger.error(ex)
    raise

VBox(children=(IntProgress(value=0, layout=Layout(width='90%'), max=5), HBox(children=(VBox(children=(IntSlide…

### <span style='color: green;'>MODEL</span> Store or Load a Topic Model<span style='color: red; float: right'>OPTIONAL</span>

In [7]:
import pickle
import glob
import topic_model
import topic_model_utility

def get_persisted_model_paths():
    """Return the sorted paths of all '*.pickle' files located directly in DATA_FOLDER."""
    pattern = os.path.join(DATA_FOLDER, '*.pickle')
    return sorted(glob.glob(pattern))

def get_store_filename(identifier):
    """Build the path under DATA_FOLDER where a model tagged with `identifier` is stored.

    Starts from 'topic_model.pickle' and passes the path through
    `utility.path_add_date` and then `utility.path_add_suffix(..., identifier)`.
    """
    base_path = os.path.join(DATA_FOLDER, 'topic_model.pickle')
    dated_path = utility.path_add_date(base_path)
    return utility.path_add_suffix(dated_path, identifier)
    
def display_persist_topic_model_gui(state):
    """Display a GUI for storing the current topic model or loading a stored one.

    Parameters
    ----------
    state
        Topic model container. `state.data` is replaced when a model is loaded
        and read when a model is stored; `state.topic_model` is read after a
        load to list the topics of the loaded model.
    """
    # `types` is not imported explicitly anywhere in the visible notebook cells;
    # import it here instead of relying on it leaking in via a star import.
    import types

    gui = types.SimpleNamespace(
        stored_path=widgets.Dropdown(description='Path', options=get_persisted_model_paths(), layout=widgets.Layout(width='40%')),
        load=widgets.Button(description='Load', button_style='Success', layout=widgets.Layout(width='80px')),
        store=widgets.Button(description='Store', button_style='Success', layout=widgets.Layout(width='80px')),
        identifier=widgets.Text(description='Identifier', layout=widgets.Layout(width='300px')),
        output=widgets.Output()
    )

    boxes = widgets.VBox([
        widgets.HBox([gui.stored_path, gui.load, gui.store, gui.identifier ]),
        widgets.HBox([
            widgets.Label(value="", layout=widgets.Layout(width='40%')),
            widgets.Label(value="Stored models will be named ./data/topic_model_yyyymmdd_$identifier$.pickle", layout=widgets.Layout(width='40%')),
        ]),
        widgets.VBox([gui.output])
    ])

    def load_handler(*args):
        """Load the model selected in the dropdown into `state` and list its topics."""
        # Clear stale messages/tables first, the same way store_handler does.
        gui.output.clear_output()

        with gui.output:

            if gui.stored_path.value is None:
                print("Please specify which model to load.")
                return

            # NOTE(review): the stored files are '*.pickle'; presumably load_model
            # unpickles them, so only load files from trusted sources - confirm.
            state.data = topic_model.load_model(gui.stored_path.value)

            topics = topic_model_utility.get_lda_topics(state.topic_model, n_tokens=20)

            display(topics)

    def store_handler(*args):
        """Validate the identifier, store `state.data` and refresh the path dropdown."""
        gui.output.clear_output()

        with gui.output:

            if gui.identifier.value == '':
                print("Please specify a unique identifier for the model.")
                return

            # The identifier becomes part of the filename, so it must survive
            # the filename whitelist unchanged.
            if gui.identifier.value != utility.filename_whitelist(gui.identifier.value):
                print("Please use ONLY valid filename characters in identifier.")
                return

            filename = get_store_filename(gui.identifier.value)

            topic_model.store_model(state.data, filename)

            # Re-scan the data folder so the new file shows up and gets selected.
            gui.stored_path.options = get_persisted_model_paths()
            gui.stored_path.value = filename if filename in gui.stored_path.options else None

            print('Model stored in file {}'.format(filename))

    gui.load.on_click(load_handler)
    gui.store.on_click(store_handler)

    display(boxes)

# Render the store/load GUI bound to the topic model singleton.
display_persist_topic_model_gui(current_state())


VBox(children=(HBox(children=(Dropdown(description='Path', layout=Layout(width='40%'), options=('../data/topic…

## <span style='color: green;'>VISUALIZE</span> Display Topic's Word Distribution as a Wordcloud<span style='color: red; float: right'>TRY IT</span>

In [8]:
# Display LDA topic's token wordcloud
# Keyword arguments that display_wordcloud forwards (via plot_wordcloud) to wordcloud.WordCloud.
opts = { 'max_font_size': 100, 'background_color': 'white', 'width': 900, 'height': 600 }
import wordcloud
import matplotlib.pyplot as plt

def plot_wordcloud(df, token='token', weight='weight', figsize=(14, 14/1.618), **args):
    """Render the (token, weight) rows of `df` as a wordcloud image.

    `token` and `weight` name the columns to read; `figsize` is the matplotlib
    figure size; all remaining keyword arguments go to `wordcloud.WordCloud`.
    """
    # Collect the rows as a set of (token, weight) pairs before building the mapping.
    pairs = {tuple(row) for row in df[[token, weight]].values}
    cloud = wordcloud.WordCloud(**args)
    cloud.fit_words(dict(pairs))

    plt.figure(figsize=figsize)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    
def display_wordcloud(
    state,
    topic_id=0,
    n_words=100,
    output_format='Wordcloud',
    gui=None
):
    """Display one topic's tokens as a wordcloud or as a table.

    Parameters
    ----------
    state
        Topic model container; reads `num_topics` and `processed.topic_token_weights`.
    topic_id
        Id of the topic to display.
    n_words
        Maximum number of tokens to show.
    output_format
        'Wordcloud' renders an image; any other value displays a table.
    gui
        Widget container; its `progress`, `text`, `topic_id` and `n_topics`
        attributes are updated.
    """
    def tick(n=None):
        gui.progress.value = (gui.progress.value + 1) if n is None else n

    # A different model may have been computed or loaded since the GUI was
    # built; re-sync the slider bounds with the current number of topics.
    if gui.n_topics != state.num_topics:
        gui.n_topics = state.num_topics
        gui.topic_id.value = 0
        gui.topic_id.max=state.num_topics - 1

    tick(1)

    try:
        topic_token_weights = state.processed.topic_token_weights

        tokens = topic_model_utility.get_topic_title(topic_token_weights, topic_id, n_tokens=n_words)
        gui.text.value = 'ID {}: {}'.format(topic_id, tokens)

        tick()

        if output_format == 'Wordcloud':
            df = topic_token_weights.loc[(topic_token_weights.topic_id == topic_id)]
            plot_wordcloud(df, 'token', 'weight', max_words=n_words, **opts)
        else:
            tick()
            # Use `n_tokens`, the keyword used by the other get_topic_tokens
            # call in this notebook (this call used to pass `n_words=`).
            df = topic_model_utility.get_topic_tokens(topic_token_weights, topic_id=topic_id, n_tokens=n_words)
            tick()
            display(df)
    finally:
        # Always reset the progress bar, even if rendering failed.
        tick(0)
    
def display_wordcloud_gui(state):
    """Build and show the interactive controls for the topic wordcloud/table view.

    `state` supplies `num_topics` for the slider bounds and is handed on,
    unchanged, to `display_wordcloud` on every widget change.
    """
    text_id = 'tx02'
    format_choices = ['Wordcloud', 'Table']
    topic_count = state.num_topics

    gui = widgets_utility.WidgetUtility(
        n_topics=topic_count,
        text_id=text_id,
        text=widgets_config.text(text_id),
        topic_id=widgets.IntSlider(description='Topic ID', min=0, max=topic_count - 1, step=1, value=0, continuous_update=False),
        word_count=widgets.IntSlider(description='#Words', min=5, max=250, step=1, value=25, continuous_update=False),
        output_format=widgets.Dropdown(description='Format', options=format_choices, value=format_choices[0], layout=widgets.Layout(width="200px")),
        progress=widgets.IntProgress(min=0, max=4, step=1, value=0, layout=widgets.Layout(width="95%"))
    )

    # Step buttons for moving to the previous/next topic id.
    gui.prev_topic_id = gui.create_prev_id_button('topic_id', topic_count)
    gui.next_topic_id = gui.create_next_id_button('topic_id', topic_count)

    interactive_view = widgets.interactive(
        display_wordcloud,
        state=widgets.fixed(state),
        topic_id=gui.topic_id,
        n_words=gui.word_count,
        output_format=gui.output_format,
        gui=widgets.fixed(gui)
    )

    control_row = widgets.HBox([gui.prev_topic_id, gui.next_topic_id, gui.topic_id, gui.word_count, gui.output_format])

    # The last child of the interactive container is its output area.
    display(widgets.VBox([gui.text, control_row, gui.progress, interactive_view.children[-1]]))

    interactive_view.update()

# Show the wordcloud GUI; a TopicModelException is logged at info level
# instead of raised, other exceptions propagate.
try:
    display_wordcloud_gui(current_state())
except topic_model_utility.TopicModelException as ex:
    logger.info(ex)
    

VBox(children=(HTML(value="<span class='tx02'></span>", placeholder=''), HBox(children=(Button(description='<<…

## <span style='color: green'>EXPLORE </span> pyLDAvis <span style='float: right; color: red'>TRY IT</span>
http://www.aclweb.org/anthology/W14-3110 presented at the 2014 ACL Workshop on Interactive Language Learning, Visualization, and Interfaces in Baltimore on June 27, 2014.
https://github.com/bmabey/pyLDAvis

In [14]:
import pyLDAvis, pyLDAvis.gensim, pyLDAvis.sklearn
import gensim
# Switch pyLDAvis to notebook mode before any visualization is displayed.
pyLDAvis.enable_notebook()
def display_pyLDAvis(state):
    """Prepare and display a pyLDAvis visualization of the model held in `state.data`.

    A textacy TopicModel is unwrapped to its inner `.model`, an LdaMallet
    wrapper is converted with `topic_model_utility.malletmodel2ldamodel`, and
    any other model is used as is. The sklearn or gensim flavour of
    `prepare` is chosen from the type name of the resulting model.
    """
    container = state.data
    wrapped = container.topic_model

    if isinstance(wrapped, textacy.tm.topic_model.TopicModel):
        model = wrapped.model
    elif isinstance(wrapped, gensim.models.wrappers.LdaMallet):
        model = topic_model_utility.malletmodel2ldamodel(wrapped)
    else:
        model = wrapped

    is_sklearn_model = 'sklearn' in str(type(model))
    prepare = pyLDAvis.sklearn.prepare if is_sklearn_model else pyLDAvis.gensim.prepare

    display(prepare(model, container.bow_corpus, container.id2term))
    
# Render pyLDAvis for the current topic model; unlike the other cells, errors are not caught here.
display_pyLDAvis(current_state())


## <span style='color: green;'>VISUALIZE</span> Display Topic's Word Distribution as a Chart<span style='color: red; float: right'>TRY IT</span>


In [20]:
# Display topic's word distribution
import numpy as np

def plot_topic_word_distribution(tokens, **args):
    """Create a bokeh scatter plot of token weights with rotated token labels.

    `tokens` must provide the columns 'xs', 'ys' and 'token'; `args` are
    forwarded to `bokeh.plotting.figure`. Returns the figure.
    """
    source = bokeh.models.ColumnDataSource(tokens)

    p = bokeh.plotting.figure(toolbar_location="right", **args)
    p.circle(x='xs', y='ys', source=source)

    label_style = dict(level='overlay', text_font_size='8pt', angle=np.pi/6.0)

    # Even rows get left-aligned labels shifted by +5, odd rows right-aligned
    # labels shifted by -5, so labels of neighbouring points point in opposite directions.
    for start, (align, offset) in enumerate([('left', 5), ('right', -5)]):
        labels = bokeh.models.LabelSet(
            x='xs', y='ys', text_align=align, text='token', text_baseline='middle',
            y_offset=offset,
            x_offset=offset,
            source=bokeh.models.ColumnDataSource(tokens.iloc[start::2]),
            **label_style
        )
        p.add_layout(labels)

    p.xaxis[0].axis_label = 'Token #'
    p.yaxis[0].axis_label = 'Probability%'

    # Strip grid lines, axis lines and ticks for a cleaner look.
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "6pt"
    p.axis.major_label_standoff = 0

    return p

def display_topic_tokens(state, topic_id=0, n_words=100, output_format='Chart', gui=None):
    """Display the top tokens of a topic as a chart or as a table.

    Token weights are multiplied by 100 and sorted in descending order before
    display. `gui` supplies the progress bar and the topic slider, which is
    re-synced when the number of topics in `state` has changed.
    """
    def tick(n=None):
        gui.progress.value = (gui.progress.value + 1) if n is None else n

    if gui.n_topics != state.num_topics:
        gui.n_topics = state.num_topics
        gui.topic_id.value = 0
        gui.topic_id.max=state.num_topics - 1

    tick(1)

    tokens = (
        topic_model_utility
        .get_topic_tokens(state.processed.topic_token_weights, topic_id=topic_id, n_tokens=n_words)
        .copy()
        .drop('topic_id', axis=1)
        .assign(weight=lambda frame: 100.0 * frame.weight)
        .sort_values('weight', axis=0, ascending=False)
        .reset_index()
        .head(n_words)
    )

    if output_format != 'Chart':
        display(tokens)
    else:
        tick()
        # x is the rank (the fresh index after sorting), y the scaled weight.
        tokens = tokens.assign(xs=tokens.index, ys=tokens.weight)
        figure = plot_topic_word_distribution(tokens, plot_width=1200, plot_height=500, title='', tools='box_zoom,wheel_zoom,pan,reset')
        bokeh.plotting.show(figure)
        tick()

    tick(0)
    
def display_topic_distribution_gui(state):
    """Build and show the interactive controls for the topic word distribution view.

    `state` supplies `num_topics` for the slider bounds and is handed on,
    unchanged, to `display_topic_tokens` on every widget change.
    """
    text_id = 'wc01'
    format_choices = ['Chart', 'Table']
    topic_count = state.num_topics

    gui = widgets_utility.WidgetUtility(
        n_topics=topic_count,
        text_id=text_id,
        text=widgets_config.text(text_id),
        topic_id=widgets.IntSlider(description='Topic ID', min=0, max=topic_count - 1, step=1, value=0),
        n_words=widgets.IntSlider(description='#Words', min=5, max=500, step=1, value=75),
        output_format=widgets.Dropdown(description='Format', options=format_choices, value=format_choices[0], layout=widgets.Layout(width="200px")),
        progress=widgets.IntProgress(min=0, max=4, step=1, value=0, layout=widgets.Layout(width="95%"))
    )

    # Step buttons for moving to the previous/next topic id.
    gui.prev_topic_id = gui.create_prev_id_button('topic_id', topic_count)
    gui.next_topic_id = gui.create_next_id_button('topic_id', topic_count)

    interactive_view = widgets.interactive(
        display_topic_tokens,
        state=widgets.fixed(state),
        topic_id=gui.topic_id,
        n_words=gui.n_words,
        output_format=gui.output_format,
        gui=widgets.fixed(gui)
    )

    control_row = widgets.HBox([gui.prev_topic_id, gui.next_topic_id, gui.topic_id, gui.n_words, gui.output_format])

    # The last child of the interactive container is its output area.
    display(widgets.VBox([gui.text, control_row, gui.progress, interactive_view.children[-1]]))

    interactive_view.update()

# Show the word distribution GUI; any failure is logged instead of raised.
try:
    display_topic_distribution_gui(current_state())
except Exception as ex:
    logger.error(ex)
    


VBox(children=(HTML(value="<span class='wc01'></span>", placeholder=''), HBox(children=(Button(description='<<…

## <span style='color: green;'>VISUALIZE</span> Display Topic's Trend Over Time or Documents<span style='color: red; float: right'>RUN</span>
- Displays topic's share over documents.

- BUG? Some values are > 1.0

In [25]:
# Plot a topic's yearly weight over time in selected LDA topic model
import math

def plot_topic_trend(df, category_column, value_column, x_label=None, y_label=None, **figopts):
    """Create a bokeh bar chart of `value_column` per `category_column`.

    Parameters
    ----------
    df
        Data frame holding both columns.
    category_column
        Column used for the x axis; its values are converted to strings.
    value_column
        Column used for the bar heights.
    x_label, y_label
        Optional axis labels; default to the title-cased column names with
        underscores replaced by spaces.
    **figopts
        Extra options for `bokeh.plotting.figure`, merged with the defaults
        through `utility.extend`.

    Returns
    -------
    The bokeh figure.
    """
    # `np.str` was only an alias for the builtin `str` and has been removed
    # from NumPy; the builtin gives the same result.
    xs = df[category_column].astype(str)
    ys = df[value_column]

    figopts = utility.extend(dict(title='', toolbar_location="right"), figopts)

    p = bokeh.plotting.figure(**figopts)

    p.vbar(x=xs, top=ys, width=0.5, fill_color="#b3de69")

    p.xaxis.major_label_orientation = math.pi/4
    p.xgrid.grid_line_color = None
    p.xaxis[0].axis_label = (x_label or category_column.title().replace('_', ' ')).title()
    p.yaxis[0].axis_label = (y_label or value_column.title().replace('_', ' ')).title()
    p.y_range.start = 0.0
    p.x_range.range_padding = 0.01

    return p

def display_topic_trend(
    state,
    topic_id,
    year,
    year_aggregate,
    threshold=0.01,
    output_format='Chart',
    topic_changed=utility.noop
):
    """Display a topic's weight per year, or per treaty within one selected year.

    With `year` set to None the weights above `threshold` are grouped by
    signing year and the `year_aggregate` column ('mean' or 'max') is shown.
    Otherwise the individual treaty weights for that year are shown.
    `topic_changed` is called with `topic_id` before any data is processed.
    """
    figopts = dict(plot_width=1000, plot_height=700, title='', toolbar_location="right")

    document_topic_weights = state.processed.document_topic_weights

    topic_changed(topic_id)

    all_years = year is None
    value_column = year_aggregate if all_years else 'weight'

    df = document_topic_weights[document_topic_weights.topic_id == topic_id]
    if not all_years:
        df = df[df.signed_year == year]
    df = df[df.weight > threshold].reset_index()

    if all_years:
        category_column = 'signed_year'
        df = df.groupby([category_column, 'topic_id']).agg([np.mean, np.max])['weight'].reset_index()
        df.columns = [category_column, 'topic_id', 'mean', 'max']
        # The x axis spans every year in the data, including years with no bar.
        first_year = document_topic_weights.signed_year.min()
        last_year = document_topic_weights.signed_year.max()
        figopts['x_range'] = [str(value) for value in range(first_year, last_year + 1)]
    else:
        category_column = 'treaty'
        df['treaty'] = df.treaty_id + ' ' + df.party1 + ' ' + df.party2
        figopts['x_range'] = df['treaty'].unique()

    if output_format == 'Table':
        display(df)
        return

    bokeh.plotting.show(plot_topic_trend(df, category_column, value_column, **figopts))

def display_topic_trend_gui(state):
    """Build and show the interactive controls for the topic trend view.

    `state` supplies `num_topics`, `processed.year_period` and
    `processed.topic_token_weights`, and is handed on to `display_topic_trend`.
    """
    text_id = 'topic_share_plot'

    period = state.processed.year_period
    year_options = [('all years', None)]
    year_options += [(value, value) for value in range(period[0], period[1] + 1)]

    gui = widgets_utility.WidgetUtility(
        n_topics=state.num_topics,
        text_id=text_id,
        text=widgets_config.text(text_id),
        year=widgets.Dropdown(description='Year', options=year_options, value=None),
        year_aggregate=widgets.Dropdown(description='Aggregate', options=['mean', 'max'], value='max'),
        threshold=widgets.FloatSlider(description='Threshold', min=0.0, max=0.25, step=0.01, value=0.10, continuous_update=False),
        topic_id=widgets.IntSlider(description='Topic ID', min=0, max=state.num_topics - 1, step=1, value=0, continuous_update=False),
        output_format=widgets.Dropdown(description='Format', options=['Chart', 'Table'], value='Chart'),
        progress=widgets.IntProgress(min=0, max=4, step=1, value=0, layout=widgets.Layout(width="50%")),
    )

    # Step buttons for moving to the previous/next topic id.
    gui.prev_topic_id = gui.create_prev_id_button('topic_id', state.num_topics)
    gui.next_topic_id = gui.create_next_id_button('topic_id', state.num_topics)

    def refresh_topic_title(topic_id):
        """Re-sync the slider with the model's topic count and show the topic's title."""
        if gui.n_topics != state.num_topics:
            gui.n_topics = state.num_topics
            gui.topic_id.value = 0
            gui.topic_id.max = state.num_topics - 1

        title = topic_model_utility.get_topic_title(state.processed.topic_token_weights, topic_id, n_tokens=200)
        gui.text.value = 'ID {}: {}'.format(topic_id, title)

    interactive_view = widgets.interactive(
        display_topic_trend,
        state=widgets.fixed(state),
        topic_id=gui.topic_id,
        year=gui.year,
        year_aggregate=gui.year_aggregate,
        threshold=gui.threshold,
        output_format=gui.output_format,
        topic_changed=widgets.fixed(refresh_topic_title)
    )

    top_row = widgets.HBox([gui.prev_topic_id, gui.next_topic_id, gui.year, gui.year_aggregate, gui.output_format])
    bottom_row = widgets.HBox([gui.topic_id, gui.threshold, gui.progress])

    # The last child of the interactive container is its output area.
    display(widgets.VBox([gui.text, top_row, bottom_row, interactive_view.children[-1]]))

    interactive_view.update()

# Show the topic trend GUI; any failure is logged instead of raised.
try:
    display_topic_trend_gui(current_state())
except Exception as ex:
    logger.error(ex)

VBox(children=(HTML(value="<span class='topic_share_plot'></span>", placeholder=''), HBox(children=(Button(des…

## <span style='color: green;'>VISUALIZE</span> Display Topic to Document Network<span style='color: red; float: right'>TRY IT</span>
The green nodes are documents, and the blue nodes are topics. The edges (lines) indicate the strength of a topic in the connected document. The width of an edge is proportional to the strength of the connection. Note that only edges with a strength above a certain threshold are displayed.

In [35]:
# Visualize year-to-topic network by means of topic-document-weights
from common.plot_utility import layout_algorithms, PlotNetworkUtility
from common.network_utility import NetworkUtility, DISTANCE_METRICS, NetworkMetricHelper

def plot_document_topic_network(network, layout, scale=1.0, titles=None):
    """Create a bokeh plot of a bipartite document/topic network.

    `network` and `layout` are handed to the NetworkUtility helpers that build
    the node and edge data sources. `titles` supplies the hover texts for topic
    nodes (its `.index` is used as the text ids). `scale` is accepted but not
    used here: the edge source is always built with `scale=6.0`.
    Returns the figure.
    """
    tools = "pan,wheel_zoom,box_zoom,reset,hover,previewsave"

    document_nodes, topic_nodes = NetworkUtility.get_bipartite_node_set(network, bipartite=0)

    document_source = NetworkUtility.get_node_subset_source(network, layout, document_nodes)
    topic_source = NetworkUtility.get_node_subset_source(network, layout, topic_nodes)
    edge_source = NetworkUtility.get_edges_source(network, layout, scale=6.0, normalize=False)

    # Edge transparency is derived from the edge weights.
    edge_source.add(NetworkMetricHelper.compute_alpha_vector(edge_source.data['weights']), 'alphas')

    p = bokeh.plotting.figure(plot_width=1000, plot_height=600, x_axis_type=None, y_axis_type=None, tools=tools)

    p.multi_line('xs', 'ys', line_width='weights', alpha='alphas', color='black', source=edge_source)
    p.circle('x', 'y', size=40, source=document_source, color='lightgreen', level='overlay', line_width=1, alpha=1.0)
    topic_renderer = p.circle('x', 'y', size=25, source=topic_source, color='skyblue', level='overlay', alpha=1.00)

    # Hovering over a topic node shows the topic's title in the 'nx_id1' element.
    hover_callback = widgets_utility.wf.glyph_hover_callback(
        topic_source, 'node_id', text_ids=titles.index, text=titles, element_id='nx_id1'
    )
    p.add_tools(bokeh.models.HoverTool(renderers=[topic_renderer], tooltips=None, callback=hover_callback))

    text_opts = dict(x='x', y='y', text='name', level='overlay', x_offset=0, y_offset=0, text_font_size='8pt')

    # Centered name labels on top of the document nodes, then the topic nodes.
    for node_source in (document_source, topic_source):
        p.add_layout(
            bokeh.models.LabelSet(
                source=node_source, text_color='black', text_align='center', text_baseline='middle', **text_opts
            )
        )

    return p
        
def display_document_topic_network(layout_algorithm, state, threshold=0.10, parties=None, period=None, ignores=None, scale=1.0, output_format='network', tick=utility.noop):
    """Display the document-to-topic network, or its underlying table.

    Parameters
    ----------
    layout_algorithm
        Key into `layout_algorithms` selecting the network layout.
    state
        Topic model container; reads `processed.topic_token_weights` and
        `processed.document_topic_weights`.
    threshold
        Only document/topic weights above this value are kept.
    parties
        Optional parties; a row is kept if party1 or party2 is among them.
    period
        Optional (first_year, last_year) pair, both inclusive.
    ignores
        Optional topic ids to exclude.
    scale
        Forwarded to the layout argument helper and the plot function.
    output_format
        'network' shows the plot, 'table' shows the filtered data frame.
    tick
        Progress callback: `tick(n)` sets the progress, `tick()` advances it.
    """
    tick(1)

    try:
        topic_token_weights = state.processed.topic_token_weights
        document_topic_weights = state.processed.document_topic_weights

        titles = topic_model_utility.get_topic_titles(topic_token_weights)

        df = document_topic_weights[document_topic_weights.weight > threshold].reset_index()

        if len(parties or []) > 0:
            df = df[df.party1.isin(parties) | df.party2.isin(parties)]

        if len(period or []) == 2:
            df = df[(df.signed_year>=period[0]) & (df.signed_year<=period[1])]

        if len(ignores or []) > 0:
            df = df[~df.topic_id.isin(ignores)]

        # Bail out before any further processing: clamping an empty weight list
        # is pointless and may fail inside the helper.
        if len(df) == 0:
            print('No data')
            return

        # Work on an explicit copy so the column assignments below never write
        # to a filtered view of another frame.
        df = df.copy()
        df['weight'] = utility.clamp_values(list(df.weight), (0.1, 2.0))
        df['title'] = df.treaty_id + ' ' + df.party1 + ' ' + df.party2

        network = NetworkUtility.create_bipartite_network(df, 'title', 'topic_id')
        tick()

        if output_format == 'network':
            args = PlotNetworkUtility.layout_args(layout_algorithm, network, scale)
            layout = (layout_algorithms[layout_algorithm])(network, **args)
            tick()
            p = plot_document_topic_network(network, layout, scale=scale, titles=titles)
            bokeh.plotting.show(p)

        elif output_format == 'table':
            display(df)
    finally:
        # Reset the progress indicator on every exit path, including the
        # 'No data' early return and errors.
        tick(0)
        
def document_topic_network_gui(wti_index, state):
    """Build and show the interactive controls for the document/topic network view.

    Parameters
    ----------
    wti_index
        Treaty index; supplies the party preset options and the country list.
    state
        Topic model container; supplies `processed.year_period` and
        `num_topics`, and is handed on to `display_document_topic_network`.
    """
    # `types` is not imported explicitly anywhere in the visible notebook cells;
    # import it here instead of relying on it leaking in via a star import.
    import types

    lw = lambda w: widgets.Layout(width=w)

    text_id = 'nx_id1'
    layout_options = [ 'Circular', 'Kamada-Kawai', 'Fruchterman-Reingold']
    party_preset_options = wti_index.get_party_preset_options()
    parties_options = [ x for x in wti_index.get_countries_list() if x not in ['ALL', 'ALL OTHER'] ]
    year_min, year_max = state.processed.year_period

    n_topics = state.num_topics

    # The slider used to be capped at year_min + 5, which made later years
    # unreachable. It now spans the full period, while the initial selection
    # stays at the first years (what the capped slider effectively showed).
    initial_period = (year_min, min(year_max, year_min + 5))

    gui = types.SimpleNamespace(
        text=widgets_config.text(text_id),
        # `continuous_update=False` (previously misspelled `continues_update`)
        # defers the redraw until the slider is released.
        period=widgets.IntRangeSlider(description='Time', min=year_min, max=year_max, step=1, value=initial_period, continuous_update=False),
        scale=widgets.FloatSlider(description='Scale', min=0.0, max=1.0, step=0.01, value=0.1, continuous_update=False),
        threshold=widgets.FloatSlider(description='Threshold', min=0.0, max=1.0, step=0.01, value=0.50, continuous_update=False),
        output_format=widgets_utility.dropdown('Output', { 'Network': 'network', 'Table': 'table' }, 'network', layout=lw('200px')),
        layout=widgets_utility.dropdown('Layout', layout_options, 'Fruchterman-Reingold', layout=lw('250px')),
        parties=widgets.SelectMultiple(description='Parties', options=parties_options, value=['FRANCE'], rows=7, layout=lw('180px')),
        party_preset=widgets_config.dropdown('Presets', party_preset_options, None, layout=lw('180px')),
        progress=widgets.IntProgress(min=0, max=4, step=1, value=0, layout=widgets.Layout(width="99%")),
        ignores=widgets.SelectMultiple(description='Ignore', options=[('', None)] + [ ('Topic #'+str(i), i) for i in range(0, n_topics) ], value=[], rows=8, layout=lw('180px')),
    )

    def tick(x=None):
        gui.progress.value = gui.progress.value + 1 if x is None else x

    def on_party_preset_change(change):  # pylint: disable=W0613
        """Copy the chosen preset into the parties selection ('ALL' selects every option)."""
        if gui.party_preset.value is None:
            return
        gui.parties.value = gui.parties.options if 'ALL' in gui.party_preset.value else gui.party_preset.value

    gui.party_preset.observe(on_party_preset_change, names='value')

    iw = widgets.interactive(
        display_document_topic_network,
        layout_algorithm=gui.layout,
        state=widgets.fixed(state),
        threshold=gui.threshold,
        parties=gui.parties,
        period=gui.period,
        ignores=gui.ignores,
        scale=gui.scale,
        output_format=gui.output_format,
        tick=widgets.fixed(tick)
    )

    # The last child of the interactive container is its output area.
    display(widgets.VBox([
        widgets.HBox([
            widgets.VBox([gui.layout, gui.threshold, gui.scale, gui.period]), 
            widgets.VBox([gui.parties, gui.party_preset]), 
            widgets.VBox([gui.ignores]), 
            widgets.VBox([gui.output_format, gui.progress]),
        ]),
        iw.children[-1],
        gui.text,
    ]))
    iw.update()

# Show the document/topic network GUI; any failure is logged instead of raised.
try:
    document_topic_network_gui(WTI_INDEX, current_state())
except Exception as ex:
    logger.error(ex)
    

VBox(children=(HBox(children=(VBox(children=(Dropdown(description='Layout', index=2, layout=Layout(width='250p…

## <span style='color: green;'>VISUALIZE</span> Topic Trends Overview<span style='color: red; float: right'>TRY IT</span>

- The topic shares are displayed as a scattered heatmap plot, using a color gradient based on each topic's weight in the document.
- [Stanford’s Termite software](http://vis.stanford.edu/papers/termite) uses a similar visualization.

In [39]:
# plot_topic_relevance_by_year
import bokeh.transform

def get_topic_weight_by_year_or_document(document_topic_weights, key='mean', year=None):
    """Aggregate topic weights per year, or per document within one year.

    Parameters
    ----------
    document_topic_weights
        Data frame with at least 'topic_id' and 'weight' columns plus the
        pivot column ('year' or 'document_id').
    key
        Key into `config.AGGREGATES` selecting the aggregate function.
    year
        If None, weights are grouped by 'year'; otherwise only rows for that
        year are kept and grouped by 'document_id'.

    Returns
    -------
    (df, pivot_column): the aggregated frame and the name of the grouping column.
    """
    pivot_column = 'year' if year is None else 'document_id'

    # The previous body called `self.get_document_topic_weights(year)` although
    # this is a plain function, so it always failed with a NameError. Use the
    # frame that is passed in instead.
    df = document_topic_weights
    if year is not None:
        # NOTE(review): filters on a `year` column, following the hint left in
        # the original code; other cells use `signed_year` - confirm the schema.
        df = df[df.year == year]

    df = df \
        .groupby([pivot_column, 'topic_id']) \
        .agg(config.AGGREGATES[key])[['weight']].reset_index()
    return df, pivot_column
    
def setup_glyph_coloring(df):
    """Create the color transform and color bar used by the topic heatmap.

    Weights are mapped linearly from the fixed range 0.0-1.0 onto a ten-step
    white-to-green palette. `df` is currently not used (the mapper range is
    fixed instead of data driven); it is kept for interface compatibility.

    Returns
    -------
    (color_transform, color_bar): a transform on the 'weight' field and the
    matching bokeh ColorBar.
    """
    colors = ['#ffffff', '#f7fcf5', '#e5f5e0', '#c7e9c0', '#a1d99b', '#74c476', '#41ab5d', '#238b45', '#006d2c', '#00441b']
    mapper = bokeh.models.LinearColorMapper(palette=colors, low=0.0, high=1.0)
    color_transform = bokeh.transform.transform('weight', mapper)
    color_bar = bokeh.models.ColorBar(color_mapper=mapper, location=(0, 0),
                         ticker=bokeh.models.BasicTicker(desired_num_ticks=len(colors)),
                         formatter=bokeh.models.PrintfTickFormatter(format=" %5.2f"))
    return color_transform, color_bar

def compute_int_range_categories(values):
    """Return the sorted unique values of `values` as axis categories.

    If every unique value passes `utility.isint`, the values are sorted
    numerically and returned as strings; otherwise they are returned in their
    natural sort order, unconverted.
    """
    unique_values = values.unique()
    if not all(utility.isint(value) for value in unique_values):
        return sorted(list(unique_values))
    numbers = sorted([int(value) for value in unique_values])
    return [str(number) for number in numbers]

# Figure options that display_doc_topic_heatmap passes on to plot_topic_relevance_by_year.
HEATMAP_FIGOPTS = dict(title="Topic heatmap", toolbar_location="right",  x_axis_location="above", plot_width=1000)

def plot_topic_relevance_by_year(df, xs, ys, flip_axis, titles, text_id, **figopts):
    """Create a bokeh heatmap of topic weights.

    `xs` and `ys` name the category columns for the axes (swapped when
    `flip_axis` is True). Cell colors come from the 'weight' column via
    `setup_glyph_coloring`. Hovering a cell shows the matching entry of
    `titles` in the element identified by `text_id`. Returns the figure.
    """
    # Rows are drawn a little taller when the axes are flipped.
    if flip_axis is True:
        xs, ys, line_height = ys, xs, 10
    else:
        line_height = 7

    x_range = compute_int_range_categories(df[xs])
    y_range = compute_int_range_categories(df[ys])

    color_transform, color_bar = setup_glyph_coloring(df)

    source = bokeh.models.ColumnDataSource(df)

    if x_range is not None:
        figopts['x_range'] = x_range

    if y_range is not None:
        # The plot height grows with the number of rows, with a floor of 500.
        figopts.update(y_range=y_range, plot_height=max(len(y_range) * line_height, 500))

    p = bokeh.plotting.figure(**figopts)

    cells = p.rect(
        x=xs, y=ys, width=1, height=1, source=source,
        line_color=None, fill_color=color_transform, alpha=1.0, hover_color='red'
    )

    p.x_range.range_padding = 0
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "8pt"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 1.0
    p.add_layout(color_bar, 'right')

    hover_callback = widgets_utility.WidgetUtility.glyph_hover_callback(
        source, 'topic_id', titles.index, titles, text_id
    )
    p.add_tools(bokeh.models.HoverTool(tooltips=None, callback=hover_callback, renderers=[cells]))

    return p

def display_doc_topic_heatmap(state, key='max', flip_axis=False, glyph='Circle', year=None, year_aggregate=None, output_format=None):
    """Display topic weights per year, or per treaty for one year, as a heatmap or table.

    Parameters
    ----------
    state
        Topic model container; reads `processed.topic_token_weights` and
        `processed.document_topic_weights`.
    key
        Aggregate column ('mean' or 'max') used when `year_aggregate` is not given.
    flip_axis
        Swap the x and y axes of the heatmap.
    glyph
        Not used; kept for interface compatibility.
    year
        None shows the yearly aggregates for all years; otherwise the
        individual treaties signed in that year are shown.
    year_aggregate
        Aggregate column ('mean' or 'max') shown in the all-years view.
    output_format
        'Heatmap' (case-insensitive, also the default) shows the plot; any
        other value displays the data frame.
    """
    try:
        # The None defaults used to crash (`None.lower()`, `df[None]`); fall
        # back to sensible values instead.
        aggregate = year_aggregate or key
        output_format = output_format or 'Heatmap'

        titles = topic_model_utility.get_topic_titles(state.processed.topic_token_weights, n_tokens=100)

        # Every step below builds a new frame (groupby/assign), so the shared
        # frame held by `state` is never modified.
        df = state.processed.document_topic_weights

        if year is None:
            # Display the aggregate value grouped by year.
            df = df.groupby(['signed_year', 'topic_id']).agg([np.mean, np.max])['weight'].reset_index()
            df.columns = ['signed_year', 'topic_id', 'mean', 'max']
            df = df.assign(weight=df[aggregate], signed_year=df.signed_year.astype(str))
            category_column = 'signed_year'

        else:
            # Display the individual treaties for the selected year.
            df = df[(df.signed_year == year)]
            df = df.assign(treaty=df.treaty_id + ' ' + df.party1 + ' ' + df.party2)
            df = df[['treaty', 'treaty_id', 'topic_id', 'weight']]
            category_column = 'treaty'

        df = df.assign(document_id=df.index.astype(str), topic_id=df.topic_id.astype(str))

        if output_format.lower() == 'heatmap':

            p = plot_topic_relevance_by_year(
                df,
                xs=category_column,
                ys='topic_id',
                flip_axis=flip_axis,
                titles=titles,
                text_id='topic_relevance',
                **HEATMAP_FIGOPTS)

            bokeh.plotting.show(p)

        else:
            display(df)

    except Exception as ex:
        # Log first, then re-raise (the log call used to be unreachable
        # because it came after the bare `raise`).
        logger.error(ex)
        raise
        
def doc_topic_heatmap_gui(state):
    """Build and display the widget panel that drives ``display_doc_topic_heatmap``.

    The year dropdown offers 'all years' plus every year in
    ``state.processed.year_period`` (both ends included). The panel is rendered
    once and the interactive output is refreshed immediately.
    """

    def fixed_width(value):
        return widgets.Layout(width=value)

    text_id = 'topic_relevance'

    first_year, last_year = state.processed.year_period
    year_choices = [('all years', None)]
    year_choices += [(y, y) for y in range(first_year, last_year + 1)]

    gui = types.SimpleNamespace(
        text_id=text_id,
        text=widgets_config.text(text_id),
        flip_axis=widgets.ToggleButton(value=True, description='Flip', icon='', layout=fixed_width("80px")),
        year=widgets.Dropdown(description='Year', options=year_choices, value=None, layout=fixed_width("160px")),
        year_aggregate=widgets.Dropdown(description='Aggregate', options=['mean', 'max'], value='max', layout=fixed_width("160px")),
        output_format=widgets.Dropdown(description='Output', options=['Heatmap', 'Table'], value='Heatmap', layout=fixed_width("180px"))
    )

    # Bind the controls to the display function; `state` is passed through unchanged.
    interactive_view = widgets.interactive(
        display_doc_topic_heatmap,
        state=widgets.fixed(state),
        flip_axis=gui.flip_axis,
        year=gui.year,
        year_aggregate=gui.year_aggregate,
        output_format=gui.output_format
    )

    controls_row = widgets.HBox([gui.year, gui.year_aggregate, gui.output_format, gui.flip_axis ])
    # The last child of the interactive container is its output area.
    output_row = widgets.HBox([interactive_view.children[-1]])
    display(widgets.VBox([controls_row, output_row, gui.text]))

    interactive_view.update()

# Build and show the document-topic heatmap GUI for the current topic model state.
# Any failure is written to the notebook logger instead of being raised.
try:
    doc_topic_heatmap_gui(current_state())
except Exception as ex:
    logger.error(ex)
    

VBox(children=(HBox(children=(Dropdown(description='Year', layout=Layout(width='160px'), options=(('all years'…

In [41]:

# TODO: something is wrong with the weight values - inspect the raw
# document-topic weights of the current model here.
current_state().processed.document_topic_weights

## <span style='color: green;'>VISUALIZE</span> Topic Cooccurrence<span style='color: red; float: right'>TRY IT</span>

Computes a weighted graph of topics co-occurring in the same document. Two topics are defined as co-occurring if they both exist in the same document with weights above the threshold. Edge weights are the number of co-occurrences (each document counts as a binary yes or no). Node size reflects topic proportions over the entire corpus (normalized by document length), and is computed in the same way as node sizes in LDAvis.

In [None]:
# Visualize topic co-occurrence

import common.plot_utility as plot_utility
import common.network_utility as network_utility
import bokeh.plotting # import figure, show, output_notebook, output_file

bokeh.plotting.output_notebook()

def get_topic_titles(topic_token_weights, topic_id=None, n_words=100):
    """Return one title string per topic, built from its highest-weighted tokens.

    Rows are optionally restricted to ``topic_id``, ordered by descending
    ``weight``, and the first ``n_words`` tokens of each topic are title-cased
    and joined with single spaces. The result is indexed by ``topic_id``.
    """
    if topic_id is None:
        candidates = topic_token_weights
    else:
        candidates = topic_token_weights[(topic_token_weights.topic_id == topic_id)]

    def as_title(group):
        return ' '.join(group.token[:n_words].str.title())

    ranked = candidates.sort_values('weight', ascending=False)
    return ranked.groupby('topic_id').apply(as_title)

# FIXME: add doc token length to df_documents
def get_topic_proportions(corpus_documents, document_topic_weights):
    """Delegate to ``topic_model.compute_topic_proportions``.

    Note the argument order: the weights are passed first, the documents second.
    """
    return topic_model.compute_topic_proportions(document_topic_weights, corpus_documents)
    
def display_topic_co_occurrence_network(
    tm_data,
    parties=None,
    period=None,
    ignores=None,
    threshold=0.10,
    layout='Fruchterman-Reingold',
    scale=1.0,
    output_format='table'
):
    """Show which topics co-occur in the same documents, as a table or a network.

    Two topics co-occur in a document when both have a weight of at least
    ``threshold`` there. The edge weight is the number of such documents.

    Parameters
    ----------
    tm_data : object
        Must expose ``compiled_data`` with ``topic_token_weights``,
        ``document_topic_weights`` and ``documents``.
    parties : sequence or None
        When non-empty, keep only rows whose ``party1`` or ``party2`` is listed.
    period : (int, int) or None
        Inclusive ``signed_year`` range to keep.
    ignores : sequence or None
        Topic ids to exclude.
    threshold : float
        Minimum document-topic weight for a topic to count as present.
    layout, scale :
        Forwarded to ``plot_utility.PlotNetworkUtility.plot_network``.
    output_format : str
        'table' displays the edge list; any other value plots the network.

    Errors propagate to the caller unchanged. The original wrapped the body in a
    ``try/except`` that re-raised immediately, which had the same effect.
    """
    model_data = tm_data.compiled_data

    titles = topic_model_utility.get_topic_titles(model_data.topic_token_weights)

    # Work on a copy: the original added 'document_id' to the model's own frame,
    # silently changing shared state on every widget update.
    df = model_data.document_topic_weights.copy()
    df['document_id'] = df.index

    # Node sizes are computed before any filtering, i.e. over the whole corpus.
    node_sizes = topic_model.compute_topic_proportions(df, model_data.documents)

    if ignores is not None:
        df = df[~df.topic_id.isin(ignores)]

    if len(parties or []) > 0:
        df = df[df.party1.isin(parties) | df.party2.isin(parties)]

    if period is not None:
        # between() includes both ends by default; the boolean `inclusive=True`
        # form is no longer accepted by current pandas.
        df = df[df.signed_year.between(period[0], period[1])]

    df = df.loc[(df.weight >= threshold)]
    # Self-join on document, then keep each unordered topic pair once.
    df = pd.merge(df, df, how='inner', left_on='document_id', right_on='document_id')
    df = df.loc[(df.topic_id_x < df.topic_id_y)]
    df = df.groupby([df.topic_id_x, df.topic_id_y]).size().reset_index()
    df.columns = ['source', 'target', 'weight']

    if len(df) == 0:
        print('No data. Please change selections.')
        return

    if output_format == 'table':
        display(df)
    else:
        network = network_utility.NetworkUtility.create_network(df, source_field='source', target_field='target', weight='weight')
        p = plot_utility.PlotNetworkUtility.plot_network(
            network=network,
            layout_algorithm=layout,
            scale=scale,
            threshold=0.0,
            node_description=titles,
            node_proportions=node_sizes,
            weight_scale=10.0,
            normalize_weights=True,
            element_id='cooc_id',
            figsize=(900,500)
        )
        bokeh.plotting.show(p)

def topic_coocurrence_network_gui(wti_index, tm_data):
    """Build and display the control panel for the topic co-occurrence network.

    Parameters
    ----------
    wti_index : object
        Supplies ``get_party_preset_options()`` and ``get_countries_list()``.
    tm_data : object
        Supplies ``tm_model.num_topics`` and ``compiled_data.year_period``; it is
        passed unchanged to ``display_topic_co_occurrence_network``.

    The sliders now pass ``continuous_update=False``. The original spelled the
    keyword ``continues_update``, which is not a slider trait, so the display was
    recomputed on every intermediate slider position.
    """

    lw = lambda w: widgets.Layout(width=w)
    n_topics = tm_data.tm_model.num_topics

    text_id = 'cooc_id'
    layout_options = [ 'Circular', 'Kamada-Kawai', 'Fruchterman-Reingold']
    party_preset_options = wti_index.get_party_preset_options()
    parties_options = [ x for x in wti_index.get_countries_list() if x != 'ALL OTHER' ]
    year_min, year_max = tm_data.compiled_data.year_period

    gui = types.SimpleNamespace(
        n_topics=n_topics,
        text=widgets_utility.wf.create_text_widget(text_id),
        period=widgets.IntRangeSlider(description='Time', min=year_min, max=year_max, step=1, value=(year_min, year_max), continuous_update=False),
        scale=widgets.FloatSlider(description='Scale', min=0.0, max=1.0, step=0.01, value=0.1, continuous_update=False),
        threshold=widgets.FloatSlider(description='Threshold', min=0.0, max=1.0, step=0.01, value=0.20, continuous_update=False),
        output_format=widgets_utility.dropdown('Output', { 'Network': 'network', 'Table': 'table' }, 'network', layout=lw('200px')),
        layout=widgets_utility.dropdown('Layout', layout_options, 'Fruchterman-Reingold', layout=lw('250px')),
        parties=widgets.SelectMultiple(description='Parties', options=parties_options, value=[], rows=7, layout=lw('180px')),
        party_preset=widgets_config.dropdown('Presets', party_preset_options, None, layout=lw('180px')),
        progress=widgets.IntProgress(min=0, max=4, step=1, value=0, layout=widgets.Layout(width="99%")),
        ignores=widgets.SelectMultiple(description='Ignore', options=[('', None)] + [ ('Topic #'+str(i), i) for i in range(0, n_topics) ], value=[], rows=8, layout=lw('180px')),
    )

    # Progress helper: advance by one step, or jump to an explicit value.
    # It is not called anywhere in this function.
    def tick(x=None):
        gui.progress.value = gui.progress.value + 1 if x is None else x

    def on_party_preset_change(change):  # pylint: disable=W0613
        # Copy the preset into the party selection; a preset containing 'ALL'
        # selects every available party.
        if gui.party_preset.value is None:
            return
        gui.parties.value = gui.parties.options if 'ALL' in gui.party_preset.value else gui.party_preset.value

    gui.party_preset.observe(on_party_preset_change, names='value')

    iw = widgets.interactive(
        display_topic_co_occurrence_network,
        tm_data=widgets.fixed(tm_data),
        parties=gui.parties,
        period=gui.period,
        ignores=gui.ignores,
        threshold=gui.threshold,
        layout=gui.layout,
        scale=gui.scale,
        output_format=gui.output_format
    )
    display(widgets.VBox([
        gui.text,
        widgets.HBox([
            widgets.VBox([gui.layout, gui.threshold, gui.scale, gui.period]),
            widgets.VBox([gui.parties, gui.party_preset]),
            widgets.VBox([gui.ignores]),
            widgets.VBox([gui.output_format, gui.progress]),
        ]),
        # The last child of the interactive container is its output area.
        iw.children[-1]
    ]))
    iw.update()
    
# Launch the topic co-occurrence GUI for the current model. Errors are logged
# rather than raised.
# NOTE(review): get_current_model is not defined in this part of the notebook -
# confirm that an earlier cell provides it.
try:
    tm_data = get_current_model()
    topic_coocurrence_network_gui(WTI_INDEX, tm_data)
except Exception as ex:
    logger.error(ex)

## <span style='color: green'>EXPLORE </span> Topic Similarity <span style='float: right; color: red'>WORK IN PROGRESS</span>


#### <span style='color: green'>EXPLORE </span> Topic Similarity Network<span style='float: right; color: red'>WORK IN PROGRESS</span>
This plot displays topic similarity based on **euclidean or cosine distances** between the **topic-to-word vectors**. Please note that the computations can take some time to execute, especially for larger LDA models.

In [None]:
# Visualization
import types

# Module-level cache shared with display_correlation_network, so that the vector
# space and distance matrix can be reused between widget updates. The guard below
# is commented out, so the cache is reset every time this cell runs.
# if 'zy_data' not in globals():
zy_data = types.SimpleNamespace(
    basename=None,                  # state.basename the cached values were computed for
    network=None,                   # network built from the thresholded distances
    X_n_space=None,                 # first result of state.compute_topic_terms_vector_space
    X_n_space_feature_names=None,   # second result of state.compute_topic_terms_vector_space
    distance_matrix=None,           # output of VectorSpaceHelper_compute_distance_matrix
    metric=None,                    # compared against the requested metric when checking the cache
    topic_proportions=None,         # not referenced by the code in this cell
    n_words = 0                     # n_words the cached vector space was computed with
)

def plot_clustering_dendogram(clustering):
    """Draw the dendrogram of ``clustering`` on a 16x6 figure, show it, then close it."""
    plt.figure(figsize=(16,6))
    # https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.dendrogram.html
    dendrogram(clustering)  # the returned plotting dict is not needed
    plt.show()
    plt.close()

def VectorSpaceHelper_compute_distance_matrix(X_n_space, metric='euclidean'):
    """Return the square pairwise distance matrix between the rows of ``X_n_space``.

    Parameters
    ----------
    X_n_space : array-like or sparse matrix
        One row per item. Objects with a ``toarray`` method are converted to dense first.
    metric : str or callable
        A metric name understood by ``scipy.spatial.distance.pdist``
        (case-insensitive), one of the special names 'kullback–leibler' /
        'scipy.stats.entropy', or a callable ``f(u, v)``. Callables are passed to
        ``pdist`` as they are; the original called ``.lower()`` on them and failed.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_rows)
    """
    # https://se.mathworks.com/help/stats/pdist.html
    if isinstance(metric, str):
        metric = metric.lower()
        # Map the special names onto callables; other names go to pdist as they are.
        if metric == 'kullback–leibler':
            metric = VectorSpaceHelper.kullback_leibler_divergence
        elif metric == 'scipy.stats.entropy':
            metric = scipy.stats.entropy
    X = X_n_space.toarray() if hasattr(X_n_space, 'toarray') else X_n_space
    distances = distance.pdist(X, metric=metric)
    return distance.squareform(distances)
    
def display_correlation_network(
    layout_algorithm,
    threshold=0.10,
    scale=1.0,
    metric='Euclidean',
    n_words=200,
    output_format='Network'
):
    """Display topic similarity as a network plot or as an edge list.

    Reads the module-level ``state`` and ``zy`` and updates the ``zy_data`` cache.

    Parameters
    ----------
    layout_algorithm : forwarded to ``PlotNetworkUtility.plot_network``.
    threshold : float
        Passed to ``VectorSpaceHelper.lower_triangle_iterator`` and to the plot.
    scale : forwarded to the plot.
    metric : str
        Key into ``DISTANCE_METRICS``.
    n_words : int
        Passed to ``state.compute_topic_terms_vector_space``.
    output_format : str
        'List' displays the edges as an HTML table; any other value plots the network.

    The vector space and distance matrix are recomputed only when the model
    (``state.basename``), the metric or ``n_words`` changed. The original never
    stored the metric it had used, so its cache test always failed and everything
    was recomputed on every call. The edge list and network depend on
    ``threshold`` and are rebuilt on every call, as the original effectively did.

    Errors are caught and printed, and the progress bar is reset to 0.
    """
    global state, zy_data, zy

    try:

        zy.progress.value = 1
        metric = DISTANCE_METRICS[metric]

        node_description = state.get_topics_tokens_as_text()
        node_proportions = state.get_topic_proportions()

        zy.progress.value = 2
        cache_is_stale = (
            zy_data.distance_matrix is None
            or state.basename != zy_data.basename
            or zy_data.metric != metric
            or zy_data.n_words != n_words
        )
        if cache_is_stale:

            zy_data.X_n_space, zy_data.X_n_space_feature_names = state.compute_topic_terms_vector_space(n_words=n_words)

            zy.progress.value = 3
            zy_data.distance_matrix = VectorSpaceHelper_compute_distance_matrix(zy_data.X_n_space, metric=metric)

            # Record the cache keys only after the computation succeeded, so a
            # failed attempt is retried on the next call.
            zy_data.basename = state.basename
            zy_data.n_words = n_words
            zy_data.metric = metric

        edges_data = VectorSpaceHelper.lower_triangle_iterator(zy_data.distance_matrix, threshold)

        zy.progress.value = 4
        if output_format == 'List':
            df = pd.DataFrame(edges_data, columns=['x', 'y', 'weight'])
            zy.progress.value = 5
            display(HTML(df.to_html()))
        else:
            zy.progress.value = 5
            # The edges depend on the threshold, so the network is rebuilt each time.
            zy_data.network = NetworkUtility.create_network_from_xyw_list(edges_data)
            zy.progress.value = 6
            p = PlotNetworkUtility.plot_network(
                network=zy_data.network,
                layout_algorithm=layout_algorithm,
                scale=scale,
                threshold=threshold,
                node_description=node_description,
                node_proportions=node_proportions,
                element_id='nx_id3',
                figsize=(1000,600)
            )
            show(p)

        zy.progress.value = 7
        zy.progress.value = 0
    except Exception as ex:
        # logger.exception(ex)
        print('Error: {}'.format(ex))
        print('Empty set: please change filters')
        zy.progress.value = 0

# Controls for the topic similarity network. `state`, `wf`, `DISTANCE_METRICS`
# and `layout_algorithms` are not defined in this cell.
# NOTE(review): confirm that earlier cells provide them.
# NOTE(review): presumably the trailing '*' in the '#words*' and 'Metric*' labels
# marks the controls that force the distance matrix to be recomputed - confirm.
zy = widgets_utility.WidgetUtility(
    n_topics=state.n_topics,
    text_id='nx_id3',
    text=wf.create_text_widget('nx_id3'),
    scale=wf.create_float_slider('Scale', min=0.0, max=1.0, step=0.01, value=0.1),
    # The year slider is created here but is neither bound to
    # display_correlation_network nor placed in the layout below.
    year=wf.create_int_slider(
        description='Year', min=state.min_year, max=state.max_year, step=1, value=state.min_year
    ),
    n_words=wf.create_int_slider(description='#words*', min=10, max=500, step=1, value=20),
    metric=wf.create_select_widget(label='Metric*', values=list(DISTANCE_METRICS.keys()), default='Euclidean'),
    threshold=wf.create_float_slider('Threshold', min=0.0, max=1.0, step=0.01, value=0.01),
    output_format=wf.create_select_widget('Format', ['Network', 'List'], default='Network'),
    layout=wf.create_select_widget('Layout', list(layout_algorithms.keys()), default='Fruchterman-Reingold'),
    # max=7 matches the highest progress value set by display_correlation_network.
    progress=wf.create_int_progress_widget(min=0, max=7, step=1, value=0, layout=widgets.Layout(width="90%"))
) 
    
# Bind the controls to the display function.
wy = widgets.interactive(
    display_correlation_network,
    layout_algorithm=zy.layout,
    threshold=zy.threshold,
    scale=zy.scale,
    metric=zy.metric,
    n_words=zy.n_words,
    output_format=zy.output_format
)

# Layout: text, two rows of controls, the progress bar, then the interactive
# output area (the last child of `wy`).
display(widgets.VBox(
    (zy.text, ) +
    (widgets.HBox((zy.threshold,) + (zy.metric,) + (zy.output_format,)),) +
    (widgets.HBox((zy.n_words,) + (zy.layout,) + (zy.scale,)),) +
    (zy.progress,) +
    (wy.children[-1],)))

# Render once with the initial widget values.
wy.update()
                                   

In [None]:
import gensim
# Read the corpus: one document per non-empty line, tokens separated by single spaces.
with open('sttm_corpus_text.txt', 'r') as f:
    corpus = [ x.rstrip().split(' ') for x in f.read().split('\n') if x != '' ]
    
# Build the gensim vocabulary and the bag-of-words form of every document.
id2word = gensim.corpora.dictionary.Dictionary(documents=corpus)
bow_corpus = [ id2word.doc2bow(doc) for doc in corpus ]

# Path to the STTM jar, relative to the notebook's working directory.
sstm_jar_path = '../../../source/STTM/STTM.jar'

# NOTE(review): WrapperSTTM is neither defined nor imported in this cell -
# confirm where it comes from and what the constructor does with these arguments.
btm_model = WrapperSTTM(
     sstm_jar_path,
     model='BTM',
     corpus=bow_corpus,
     id2word=id2word,
     #vectors,
     num_topics=20,
     #alpha=0.1,
     #beta=0.01,
     iterations=2000,
     prefix='results/',
     name='test_model'
     #twords=20,
     #sstep=0
)
                    

# pyLDAvis tests

In [7]:
"""
pyLDAvis Prepare
===============
Main transformation functions for preparing LDAdata to the visualization's data structures
"""

from __future__ import absolute_import
from past.builtins import basestring
from collections import namedtuple
import json
import logging
from joblib import Parallel, delayed, cpu_count
import numpy as np
import pandas as pd
from scipy.stats import entropy
from scipy.spatial.distance import pdist, squareform
#from .utils import NumPyEncoder
try:
    from sklearn.manifold import MDS, TSNE
    sklearn_present = True
except ImportError:
    sklearn_present = False


def __num_dist_rows__(array, ndigits=2):
    return array.shape[0] - int((pd.DataFrame(array).sum(axis=1) < 0.999).sum())


class ValidationError(ValueError):
    """Error type for input data that fails validation; a ``ValueError`` subclass."""


def _input_check(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency):
    """Check that the model inputs have consistent shapes and row sums.

    Returns a list with one message per failed check, or ``None`` when every
    check passes. All checks are always evaluated. The two "sum to 1" checks
    rely on ``__num_dist_rows__``, which only flags rows summing to less than 0.999.
    """
    tt_shape = topic_term_dists.shape
    dt_shape = doc_topic_dists.shape
    vocab_size = len(vocab)

    # (failed?, message) pairs, in the order the messages are reported.
    checks = [
        (dt_shape[1] != tt_shape[0],
         'Number of rows of topic_term_dists does not match number of columns of doc_topic_dists; both should be equal to the number of topics in the model.'),
        (len(doc_lengths) != dt_shape[0],
         'Length of doc_lengths not equal to the number of rows in doc_topic_dists; both should be equal to the number of documents in the data.'),
        (tt_shape[1] != vocab_size,
         'Number of terms in vocabulary does not match the number of columns of topic_term_dists (where each row of topic_term_dists is a probability distribution of terms for a given topic).'),
        (len(term_frequency) != vocab_size,
         'Length of term_frequency not equal to the number of terms in the vocabulary (len of vocab).'),
        (__num_dist_rows__(topic_term_dists) != tt_shape[0],
         'Not all rows (distributions) in topic_term_dists sum to 1.'),
        (__num_dist_rows__(doc_topic_dists) != dt_shape[0],
         'Not all rows (distributions) in doc_topic_dists sum to 1.'),
    ]

    problems = [message for failed, message in checks if failed]
    return problems if problems else None


def _input_validate(*args):
    """Run ``_input_check`` on ``args`` and raise ``ValidationError`` if it reports problems.

    The error message is a newline followed by one ' * ' bullet line per problem.
    """
    problems = _input_check(*args)
    if not problems:
        return
    bullet_list = ''.join('\n * ' + problem for problem in problems)
    raise ValidationError(bullet_list)


def _jensen_shannon(_P, _Q):
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))


def _pcoa(pair_dists, n_components=2):
    """Principal Coordinate Analysis,
    aka Classical Multidimensional Scaling
    """
    # code referenced from skbio.stats.ordination.pcoa
    # https://github.com/biocore/scikit-bio/blob/0.5.0/skbio/stats/ordination/_principal_coordinate_analysis.py

    # pairwise distance matrix is assumed symmetric
    pair_dists = np.asarray(pair_dists, np.float64)

    # perform SVD on double centred distance matrix
    n = pair_dists.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n
    B = - H.dot(pair_dists ** 2).dot(H) / 2
    eigvals, eigvecs = np.linalg.eig(B)

    # Take first n_components of eigenvalues and eigenvectors
    # sorted in decreasing order
    ix = eigvals.argsort()[::-1][:n_components]
    eigvals = eigvals[ix]
    eigvecs = eigvecs[:, ix]

    # replace any remaining negative eigenvalues and associated eigenvectors with zeroes
    # at least 1 eigenvalue must be zero
    eigvals[np.isclose(eigvals, 0)] = 0
    if np.any(eigvals < 0):
        ix_neg = eigvals < 0
        eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape)
        eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape)

    return np.sqrt(eigvals) * eigvecs


def js_PCoA(distributions):
    """Dimension reduction via Jensen-Shannon divergence and Principal Coordinate
    Analysis (classical multidimensional scaling).

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities, one distribution per row.

    Returns
    -------
    pcoa : array, shape (`n_dists`, 2)
    """
    condensed = pdist(distributions, metric=_jensen_shannon)
    return _pcoa(squareform(condensed))


def js_MMDS(distributions, **kwargs):
    """Dimension reduction via Jensen-Shannon divergence and metric multidimensional scaling.

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities, one distribution per row.

    **kwargs : keyword arguments passed on to `sklearn.manifold.MDS()`

    Returns
    -------
    mmds : array, shape (`n_dists`, 2)
    """
    condensed = pdist(distributions, metric=_jensen_shannon)
    pairwise = squareform(condensed)
    # The embedding is fitted on the precomputed divergences with a fixed seed.
    embedder = MDS(n_components=2, random_state=0, dissimilarity='precomputed', **kwargs)
    return embedder.fit_transform(pairwise)


def js_TSNE(distributions, **kwargs):
    """Dimension reduction via Jensen-Shannon divergence and t-distributed
    Stochastic Neighbor Embedding.

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities, one distribution per row.

    **kwargs : keyword arguments passed on to `sklearn.manifold.TSNE()`

    Returns
    -------
    tsne : array, shape (`n_dists`, 2)
    """
    condensed = pdist(distributions, metric=_jensen_shannon)
    pairwise = squareform(condensed)
    # The embedding is fitted on the precomputed divergences with a fixed seed.
    embedder = TSNE(n_components=2, random_state=0, metric='precomputed', **kwargs)
    return embedder.fit_transform(pairwise)


def _df_with_names(data, index_name, columns_name):
    if type(data) == pd.DataFrame:
      # we want our index to be numbered
      df = pd.DataFrame(data.values)
    else:
        df = pd.DataFrame(data)
    df.index.name = index_name
    df.columns.name = columns_name
    return df

def _series_with_name(data, name):
    if type(data) == pd.Series:
        data.name = name
        # ensures a numeric index
        return data.reset_index()[name]
    else:
        return pd.Series(data, name=name)

def _topic_coordinates(mds, topic_term_dists, topic_proportion):
    K = topic_term_dists.shape[0]
    mds_res = mds(topic_term_dists)
    assert mds_res.shape == (K, 2)
    mds_df = pd.DataFrame({'x': mds_res[:,0], 'y': mds_res[:,1], 'topics': range(1, K + 1), \
                          'cluster': 1, 'Freq': topic_proportion * 100})
    # note: cluster (should?) be deprecated soon. See: https://github.com/cpsievert/LDAvis/issues/26
    return mds_df

def _chunks(l, n):
    """ Yield successive n-sized chunks from l.
    """
    for i in range(0, len(l), n):
        yield l[i:i+n]

def _job_chunks(l, n_jobs):
    """Split ``l`` into chunks for parallel processing.

    The value derived from ``n_jobs`` is passed to ``_chunks`` as the chunk
    length: ``n_jobs`` itself when non-negative, ``cpu_count() + 1 - n_jobs``
    when negative.
    """
    # NOTE(review): joblib documents negative n_jobs as n_cpus + 1 + n_jobs; the
    # subtraction here is kept exactly as in the original - confirm the intent.
    chunk_length = cpu_count() + 1 - n_jobs if n_jobs < 0 else n_jobs
    return _chunks(l, chunk_length)

def _find_relevance(log_ttd, log_lift, R, lambda_):
    relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift
    return relevance.T.apply(lambda s: s.sort_values(ascending=False).index).head(R)

def _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq):
    """Run ``_find_relevance`` for every value in ``lambda_seq`` and stack the results."""
    per_lambda = []
    for lambda_ in lambda_seq:
        per_lambda.append(_find_relevance(log_ttd, log_lift, R, lambda_))
    return pd.concat(per_lambda)

def _topic_info(topic_term_dists, topic_proportion, term_frequency, term_topic_freq, vocab, lambda_step, R, n_jobs):
    """Build the term table: the 'Default' terms plus the top terms of every topic.

    Parameters
    ----------
    topic_term_dists : DataFrame, topics x terms
    topic_proportion : per-topic proportions, in the same topic order
    term_frequency : per-term totals
    term_topic_freq : DataFrame, topics x terms, per-topic term totals
    vocab : Series of term strings
    lambda_step : step of the lambda grid used for relevance (0 .. 1)
    R : number of terms kept per category
    n_jobs : passed to ``joblib.Parallel`` and ``_job_chunks``

    Returns
    -------
    DataFrame with one block of rows for category 'Default' followed by one
    block per topic ('Topic1', 'Topic2', ...).

    The 'saliency' helper column is dropped with ``drop(columns=...)``; the
    original passed the axis positionally, which current pandas rejects.
    """
    # marginal distribution over terms (width of blue bars)
    term_proportion = term_frequency / term_frequency.sum()

    # compute the distinctiveness and saliency of the terms:
    # this determines the R terms that are displayed when no topic is selected
    topic_given_term = topic_term_dists / topic_term_dists.sum()
    kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
    distinctiveness = kernel.sum()
    saliency = term_proportion * distinctiveness

    # Order the terms for the "default" view by decreasing saliency:
    default_term_info = (
        pd.DataFrame({'saliency': saliency, 'Term': vocab,
                      'Freq': term_frequency, 'Total': term_frequency,
                      'Category': 'Default'})
        .sort_values(by='saliency', ascending=False)
        .head(R)
        .drop(columns='saliency')
    )
    # Rounding Freq and Total to integer values to match LDAvis code:
    default_term_info['Freq'] = np.floor(default_term_info['Freq'])
    default_term_info['Total'] = np.floor(default_term_info['Total'])
    ranks = np.arange(R, 0, -1)
    default_term_info['logprob'] = default_term_info['loglift'] = ranks

    ## compute relevance and top terms for each topic
    log_lift = np.log(topic_term_dists / term_proportion)
    log_ttd = np.log(topic_term_dists)
    lambda_seq = np.arange(0, 1 + lambda_step, lambda_step)

    def topic_top_term_df(tup):
        # tup is (1-based new topic id, (original topic id, that topic's term labels)).
        new_topic_id, (original_topic_id, topic_terms) = tup
        term_ix = topic_terms.unique()
        return pd.DataFrame({'Term': vocab[term_ix],
                             'Freq': term_topic_freq.loc[original_topic_id, term_ix],
                             'Total': term_frequency[term_ix],
                             'logprob': log_ttd.loc[original_topic_id, term_ix].round(4),
                             'loglift': log_lift.loc[original_topic_id, term_ix].round(4),
                             'Category': 'Topic%d' % new_topic_id})

    # The lambda grid is split into chunks that are ranked in parallel.
    top_terms = pd.concat(Parallel(n_jobs=n_jobs)(delayed(_find_relevance_chunks)(log_ttd, log_lift, R, ls)
                                                  for ls in _job_chunks(lambda_seq, n_jobs)))
    topic_dfs = map(topic_top_term_df, enumerate(top_terms.T.iterrows(), 1))
    return pd.concat([default_term_info] + list(topic_dfs), sort=True)


def _token_table(topic_info, term_topic_freq, vocab, term_frequency):
    # last, to compute the areas of the circles when a term is highlighted
    # we must gather all unique terms that could show up (for every combination
    # of topic and value of lambda) and compute its distribution over topics.

    # term-topic frequency table of unique terms across all topics and all values of lambda
    term_ix = topic_info.index.unique()
    term_ix = np.sort(term_ix)

    top_topic_terms_freq = term_topic_freq[term_ix]
    # use the new ordering for the topics
    K = len(term_topic_freq)
    top_topic_terms_freq.index = range(1, K + 1)
    top_topic_terms_freq.index.name = 'Topic'

    # we filter to Freq >= 0.5 to avoid sending too much data to the browser
    token_table = pd.DataFrame({'Freq': top_topic_terms_freq.unstack()}). \
                 reset_index().set_index('term'). \
                 query('Freq >= 0.5')

    token_table['Freq'] = token_table['Freq'].round()
    token_table['Term'] = vocab[token_table.index.values].values
    # Normalize token frequencies:
    token_table['Freq'] = token_table.Freq / term_frequency[token_table.index]
    return token_table.sort_values(by=['Term', 'Topic'])


def vis_prepare(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency, \
            R=30, lambda_step=0.01, mds=js_PCoA, n_jobs=-1, \
            plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, sort_topics=True):
    """Transforms the topic model distributions and related corpus data into
    the data structures needed for the visualization.

    Parameters
    ----------
    topic_term_dists : array-like, shape (`n_topics`, `n_terms`)
        Matrix of topic-term probabilities. Where `n_terms` is `len(vocab)`.
    doc_topic_dists : array-like, shape (`n_docs`, `n_topics`)
        Matrix of document-topic probabilities.
    doc_lengths : array-like, shape `n_docs`
        The length of each document, i.e. the number of words in each document.
        The order of the numbers should be consistent with the ordering of the
        docs in `doc_topic_dists`.
    vocab : array-like, shape `n_terms`
        List of all the words in the corpus used to train the model.
    term_frequency : array-like, shape `n_terms`
        The count of each particular term over the entire corpus. The ordering
        of these counts should correspond with `vocab` and `topic_term_dists`.
    R : int
        The number of terms to display in the barcharts of the visualization.
        Default is 30. Recommended to be roughly between 10 and 50.
    lambda_step : float, between 0 and 1
        Determines the interstep distance in the grid of lambda values over
        which to iterate when computing relevance.
        Default is 0.01. Recommended to be between 0.01 and 0.1.
    mds : function or a string representation of function
        A function that takes `topic_term_dists` as an input and outputs a
        `n_topics` by `2`  distance matrix. The output approximates the distance
        between topics. See :func:`js_PCoA` for details on the default function.
        A string representation currently accepts `pcoa` (or upper case variant),
        `mmds` (or upper case variant) and `tsne` (or upper case variant),
        if `sklearn` package is installed for the latter two.
    n_jobs : int
        The number of cores to be used to do the computations. The regular
        joblib conventions are followed so `-1`, which is the default, will
        use all cores.
    plot_opts : dict, with keys 'xlab' and `ylab`
        Dictionary of plotting options, right now only used for the axis labels.
        The dict is copied before it is stored in the result, so neither the
        shared default nor the caller's dict can be changed through the
        returned object.
    sort_topics : sort topics by topic proportion (percentage of tokens covered). Set to false to
        to keep original topic order.

    Returns
    -------
    prepared_data : PreparedData
        A named tuple containing all the data structures required to create
        the visualization. To be passed on to functions like :func:`display`.

    Raises
    ------
    ValidationError
        If the inputs have inconsistent shapes or rows that do not sum to 1.

    Notes
    -----
    This implements the method of `Sievert, C. and Shirley, K. (2014):
    LDAvis: A Method for Visualizing and Interpreting Topics, ACL Workshop on
    Interactive Language Learning, Visualization, and Interfaces.`

    http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

    See Also
    --------
    :func:`save_json`: save json representation of a figure to file
    :func:`save_html` : save html representation of a figure to file
    :func:`show` : launch a local server and show a figure in a browser
    :func:`display` : embed figure within the IPython notebook
    :func:`enable_notebook` : automatically embed visualizations in IPython notebook
    """
    # The default dict in the signature is created once and shared by every call;
    # hand out a private copy instead of the shared object.
    if plot_opts is not None:
        plot_opts = dict(plot_opts)

    # parse mds
    if isinstance(mds, basestring):
        mds = mds.lower()
        if mds == 'pcoa':
            mds = js_PCoA
        elif mds in ('mmds', 'tsne'):
            if sklearn_present:
                mds_opts = {'mmds': js_MMDS, 'tsne': js_TSNE}
                mds = mds_opts[mds]
            else:
                logging.warning('sklearn not present, switch to PCoA')
                mds = js_PCoA
        else:
            logging.warning('Unknown mds `%s`, switch to PCoA' % mds)
            mds = js_PCoA

    topic_term_dists = _df_with_names(topic_term_dists, 'topic', 'term')
    doc_topic_dists  = _df_with_names(doc_topic_dists, 'doc', 'topic')
    term_frequency   = _series_with_name(term_frequency, 'term_frequency')
    doc_lengths      = _series_with_name(doc_lengths, 'doc_length')
    vocab            = _series_with_name(vocab, 'vocab')
    _input_validate(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency)
    R = min(R, len(vocab))

    # Per-topic totals: each document's topic weights scaled by its length.
    topic_freq       = (doc_topic_dists.T * doc_lengths).T.sum()
    # topic_freq       = np.dot(doc_topic_dists.T, doc_lengths)
    if (sort_topics):
        topic_proportion = (topic_freq / topic_freq.sum()).sort_values(ascending=False)
    else:
        topic_proportion = (topic_freq / topic_freq.sum())

    topic_order      = topic_proportion.index
    # reorder all data based on new ordering of topics
    topic_freq       = topic_freq[topic_order]
    topic_term_dists = topic_term_dists.iloc[topic_order]
    doc_topic_dists  = doc_topic_dists[topic_order]

    # token counts for each term-topic combination (widths of red bars)
    term_topic_freq = (topic_term_dists.T * topic_freq).T
    ## Quick fix for red bar width bug.  We calculate the
    ## term frequencies internally, using the topic term distributions and the
    ## topic frequencies, rather than using the user-supplied term frequencies.
    ## For a detailed discussion, see: https://github.com/cpsievert/LDAvis/pull/41
    term_frequency = np.sum(term_topic_freq, axis=0)

    topic_info         = _topic_info(topic_term_dists, topic_proportion, term_frequency, term_topic_freq, vocab, lambda_step, R, n_jobs)
    token_table        = _token_table(topic_info, term_topic_freq, vocab, term_frequency)
    topic_coordinates = _topic_coordinates(mds, topic_term_dists, topic_proportion)
    # Topic ids are 1-based on the client side.
    client_topic_order = [x + 1 for x in topic_order]

    return PreparedData(topic_coordinates, topic_info, token_table, R, lambda_step, plot_opts, client_topic_order)

class PreparedData(namedtuple('PreparedData', ['topic_coordinates', 'topic_info', 'token_table',\
                                               'R', 'lambda_step', 'plot_opts', 'topic_order'])):
    """Named tuple bundling the prepared data; ``to_dict`` converts it to a plain dict."""

    def to_dict(self):
        """Return the data as a dict; the three tables are converted column-wise to lists."""
        tables = (
            ('mdsDat', self.topic_coordinates),
            ('tinfo', self.topic_info),
            ('token.table', self.token_table),
        )
        payload = {key: table.to_dict(orient='list') for key, table in tables}
        payload['R'] = self.R
        payload['lambda.step'] = self.lambda_step
        payload['plot.opts'] = self.plot_opts
        payload['topic.order'] = self.topic_order
        return payload

    #def to_json(self):
    #    return json.dumps(self.to_dict(), cls=NumPyEncoder)


In [8]:
"""
pyLDAvis Gensim
===============
Helper functions to visualize LDA models trained by Gensim
"""

from __future__ import absolute_import
import funcy as fp
import numpy as np
import pandas as pd
from scipy.sparse import issparse
from past.builtins import xrange
#from . import prepare as vis_prepare

def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    """Collect the arrays needed for the visualisation from a Gensim model.

    Parameters
    ----------
    topic_model :
        Trained topic model. If it exposes ``lda_alpha`` / ``lda_beta`` those
        attributes are used (the HDP case, per the inline comment below);
        otherwise ``num_topics``, ``inference`` and ``state.get_lambda()`` are used.
    corpus :
        Either a bag-of-words corpus or a matrix accepted by
        ``gensim.matutils.ismatrix`` with terms along axis 0 and documents
        along axis 1.
    dictionary :
        Object with a ``token2id`` mapping and a length equal to the number
        of terms.
    doc_topic_dists : optional
        Pre-computed document-topic weights: a list in bag-of-words form, a
        scipy sparse matrix (transposed before use, i.e. presumably
        topics x documents - confirm against callers), or a dense
        documents x topics array. Inferred from ``topic_model`` when ``None``.

    Returns
    -------
    dict
        Keys ``topic_term_dists``, ``doc_topic_dists``, ``doc_lengths``,
        ``vocab`` and ``term_frequency``.

    Raises
    ------
    AssertionError
        If the extracted arrays disagree with the dictionary size, the corpus
        length or the number of topics.
    """
    import gensim

    if not gensim.matutils.ismatrix(corpus):
        corpus_csc = gensim.matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = gensim.matutils.Sparse2Corpus(corpus_csc)

    vocab = list(dictionary.token2id.keys())
    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    # np.asarray (instead of the np.matrix-only ``.A``) works whether ``sum``
    # returns a matrix or a plain ndarray.  The float cast guarantees that the
    # ``beta`` replacement below is not truncated to 0 for integer-typed counts.
    term_freqs = np.asarray(corpus_csc.sum(axis=1), dtype=np.float64).ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = np.asarray(corpus_csc.sum(axis=0)).ravel()

    assert term_freqs.shape[0] == len(dictionary), 'Term frequencies and dictionary have different shape {} != {}'.format(term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(corpus), 'Document lengths and corpus have different sizes {} != {}'.format(doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = gensim.matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        # Normalise each document row.  The row sums are forced into an
        # (n_docs, 1) column: for a plain ndarray ``sum(axis=1)`` is 1-D and
        # would otherwise broadcast along the topic axis (raising, or silently
        # mis-normalising when n_docs == n_topics).
        row_totals = np.asarray(doc_topic_dists.sum(axis=1)).reshape(-1, 1)
        doc_topic_dists = doc_topic_dists / row_totals

    assert doc_topic_dists.shape[1] == num_topics, 'Document topics and number of topics do not match {} != {}'.format(doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    # Reorder the term axis so columns line up with ``vocab``.
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]

    return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists,
            'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}

def prepare(topic_model, corpus, dictionary, doc_topic_dist=None, **kwargs):
    """Build the visualisation data for a Gensim topic model.

    The model, corpus and dictionary are converted by ``_extract_data`` into
    a dict of arrays; that dict is merged with ``kwargs`` and the result is
    forwarded as keyword arguments to ``vis_prepare``.

    NOTE(review): ``vis_prepare`` is not bound in this cell (the import that
    provided it is commented out above) - confirm it is defined elsewhere in
    the notebook before this function is called.

    Parameters
    ----------
    topic_model : gensim.models.ldamodel.LdaModel
        An already trained Gensim model.

    corpus : list of bag-of-words documents (tuples) or scipy CSC matrix
        The same documents that were used to train the model, for example
        ``[(50, 3), (63, 5), ...]`` per document. A bag-of-words corpus is
        converted to a CSC matrix on every call, so when calling ``prepare``
        repeatedly it is cheaper to run ``gensim.matutils.corpus2csc(corpus)``
        once and pass the matrix in.

    dictionary : gensim.corpora.Dictionary
        The dictionary used to create the corpus; supplies the actual terms
        (not just their ids).

    doc_topic_dist : optional, default None
        Document-topic distribution to visualise. When omitted it is inferred
        from the model, which can be expensive for large corpora - pass it
        explicitly when calling ``prepare`` more than once.

    **kwargs :
        Additional keyword arguments handed on to ``vis_prepare``
        (see `pyLDAvis.prepare`).

    Returns
    -------
    prepared_data : PreparedData
        The data structures used in the visualization.

    Example
    -------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/Gensim%20Newsgroup.ipynb
    """
    extracted = _extract_data(topic_model, corpus, dictionary, doc_topic_dist)
    prepare_args = fp.merge(extracted, kwargs)
    return vis_prepare(**prepare_args)
