In [2]:
import pandas as pd
# Import Bokeh Libraries
from bokeh.io import curdoc
from bokeh.layouts import column, row
from bokeh.models import (Button, ColumnDataSource, Legend, GroupFilter, IndexFilter, CDSView, DataTable, TableColumn, TextInput, LassoSelectTool, BoxSelectTool)
from bokeh.plotting import figure, show
from bokeh.models import DataTable, TableColumn, ColorBar, HTMLTemplateFormatter, Spinner, RangeSlider
from bokeh.io import output_notebook
from bokeh.application import Application
from bokeh.application.handlers import FunctionHandler
import numpy as np
from bokeh.palettes import Spectral6, Magma256, Category20
from bokeh.transform import factor_cmap, linear_cmap
from bokeh.models.callbacks import CustomJS
from bokeh.events import SelectionGeometry

In [4]:
# Configure Bokeh to display its output inside notebook cells.
output_notebook()

In [30]:
def bulk_text(df, num_topics, cat_name, keywords=None):
    """Build a Bokeh server app for selecting scatter points and exporting their text.

    The app shows a scatter plot (one glyph renderer per category, toggled
    through the legend), a spinner controlling the circle size, a data table
    listing the currently selected rows in random order, and a filename box
    plus SAVE button that writes the ``text`` column of the selected rows to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``x``, ``y`` and ``text``, the column named by
        ``cat_name``, and the table columns ``Headline``, ``Id`` and ``topic``.
        The frame is copied; the caller's object is not modified.
    num_topics : iterable of str
        The category values to plot (despite the name, this is the collection
        of group labels, not a count). One renderer is created per value.
    cat_name : str
        Name of the categorical column used to split the points into groups.
    keywords : optional
        When truthy, each row's text is passed to ``determine_keyword`` to
        fill a ``color`` column; rows whose result is ``'none'`` are drawn
        with alpha 0.4, all others fully opaque.

    Returns
    -------
    bokeh.application.Application
        An application that can be handed to ``show`` in a notebook.
    """
    # Work on a private copy so the helper columns added below do not leak
    # into the caller's DataFrame.
    df = df.copy()
    df['alpha'] = 0.5
    if keywords:
        # NOTE(review): determine_keyword is not defined in the visible part of
        # this notebook; it must already exist in the kernel namespace.
        df['color'] = [determine_keyword(str(t), keywords) for t in df['text']]
        df['alpha'] = [0.4 if c == 'none' else 1 for c in df['color']]

    # Use the full 20-colour palette and wrap around, so that any number of
    # topics can be drawn without running off the end of the colour list.
    palette = Category20[20]

    def make_doc(doc):
        """Create a fresh set of models for one session and add them to ``doc``."""
        # Bokeh models can be owned by only one document, so everything is
        # built here, once per session, instead of being shared across sessions.

        # Positional indices (into df) of the points currently selected.
        highlighted_idx = []

        table_source = ColumnDataSource(data=df)
        plot_source = ColumnDataSource(data=df)

        columns = [
            TableColumn(field="Headline", title="Headline"),
            TableColumn(field="text", title="text", width=1000),
            TableColumn(field="Id", title="Id"),
            TableColumn(field="topic", title="topic"),
        ]
        data_table = DataTable(source=table_source, columns=columns, width=1500, height=700)

        p = figure(
            title="",
            sizing_mode="scale_both",
            tools=["lasso_select", "box_select", "pan", "box_zoom", "wheel_zoom", "reset"],
        )
        # No drag or inspect tool is active until the user picks one.
        p.toolbar.active_drag = None
        p.toolbar.active_inspect = None

        # A CDSView with a GroupFilter draws one category from the shared
        # source, giving each category its own renderer and legend entry.
        legend_items = []
        for col_index, topic in enumerate(num_topics):
            view = CDSView(
                source=plot_source,
                filters=[GroupFilter(column_name=cat_name, group=topic)],
            )
            renderer = p.circle(
                x='x',
                y='y',
                size=8,
                source=plot_source,
                color=palette[col_index % len(palette)],
                view=view,
                alpha="alpha",
            )
            legend_items.append((topic, [renderer]))

        legend = Legend(items=legend_items)
        legend.click_policy = "hide"
        p.add_layout(legend, 'right')

        # The plot is narrower when the keyword "color" column is present.
        p.plot_width = 350 if "color" in df.columns else 1000
        p.plot_height = 700

        # Drive the size of every circle glyph from a single spinner.
        spinner = Spinner(title="Circle Size", low=1, high=60, step=1, value=8, width=200)
        for _, renderers in legend_items:
            spinner.js_link("value", renderers[0].glyph, "size")

        text_filename = TextInput(value="out.csv", title="Filename:")
        save_btn = Button(label="SAVE")

        def update(attr, old, new):
            """Show the newly selected rows, shuffled, in the data table."""
            nonlocal highlighted_idx
            highlighted_idx = list(new)
            subset = df.iloc[highlighted_idx]
            table_source.data = subset.iloc[np.random.permutation(len(subset))]

        def save():
            """Write the ``text`` column of the selected rows to the chosen file."""
            df.iloc[highlighted_idx][['text']].to_csv(text_filename.value, index=False)

        plot_source.selected.on_change('indices', update)
        save_btn.on_click(save)

        doc.add_root(row(column(spinner)))
        doc.add_root(row(column(p), column(text_filename, save_btn)))
        doc.add_root(row(data_table))

    return Application(FunctionHandler(make_doc))
    

In [31]:
# Requires a dataset with x, y coordinates and a categorical variable in string format.
# Any other fields/columns you would like to display in the DataTable can be added.
# In our case the categorical variable is the assigned topic.
cat_name = 'topic'
df = pd.read_csv('aug-30-copy-2.csv')

# Cast the categorical column to strings first and take the categories from the cast
# values: raw values that differ only by type (e.g. 1 and '1') then collapse into a
# single category instead of producing duplicate legend entries.
df[cat_name] = df[cat_name].astype(str)
categories = df[cat_name].unique().tolist()

app = bulk_text(df, categories, cat_name)
show(app)