In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import BallTree
from umap import UMAP

from bokeh.io import output_notebook
from bokeh.io.notebook import show_app
from bokeh.layouts import row
from bokeh.models.tools import TapTool,WheelZoomTool
from bokeh.models.widgets import PreText
from bokeh.layouts import layout, column, row, widgetbox

import holoviews as hv
from holoviews.operation.datashader import datashade
# NOTE(review): `bokeh` is not referenced by name in the visible cells; this import is
# presumably kept for its side effect of registering the 'bokeh' renderer looked up below — confirm.
from holoviews.plotting import bokeh
# Select Bokeh as the active HoloViews plotting backend
hv.Store.current_backend = 'bokeh'
# Renderer instance configured for server mode; my_app uses it to build a plot bound to a Bokeh document
renderer = hv.Store.renderers['bokeh'].instance(mode='server', holomap='server')
# Load BokehJS into the notebook so Bokeh output is displayed inline
output_notebook()

import logging
# Raise the root logger's threshold to ERROR so that lower-severity messages
# (warnings, info) do not clutter the notebook output
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

# Interactive exploration

To better understand your data, a common workflow is to embed it in a meaningful 2-dimensional space and explore it by zooming around and clicking on clusters of points. We're building a little viewing tool with Bokeh, Datashader and HoloViews. These libraries are useful because they allow us to visualize large datasets (e.g. if you try to plot more than 10,000 individual points directly, your browser tends to become slow or unresponsive).

## Key python packages
- holoviews==1.10.7
- datashader==0.6.6
- bokeh==0.13.0
- umap-learn==0.3.0


## Fetch the 20 newsgroups dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the training split of the 20 newsgroups dataset and keep only its
# 'data' entry, i.e. the list of raw document texts
d = fetch_20newsgroups(subset='train')['data']

In [3]:
# Each item is the raw text of one newsgroup post (headers such as From/Subject included),
# and posts vary in length. Show the first one as an example.

d[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

## Embed the data into a vector space

In [4]:
# Turn every document into a TF-IDF weighted term vector using scikit-learn's
# vectorizer with its default settings

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(d)

In [4]:
# UMAP to reduce the TF-IDF vectors into a 2-dimensional space
# https://github.com/lmcinnes/umap
#
# UMAP is a stochastic algorithm: without a fixed seed the layout changes on every
# run, so a cluster found in one session cannot be located again after a
# Restart & Run All. Pin random_state to make the embedding reproducible.

embed = UMAP(n_components=2, random_state=42).fit_transform(X)

# Optional cache: uncomment the first line once to save the embedding, then use the
# second line instead of recomputing it. Only unpickle files you created yourself.
#pickle.dump(embed, open('embed.p', 'wb'))
#embed = pickle.load(open('embed.p', 'rb'))

## Build a small interactive application

In [5]:
# When a user clicks somewhere on the plot, we need the embedded point closest to the
# click position. A BallTree built once over the 2-D embedding answers that
# nearest-neighbour query quickly (it is queried from the tap callback in my_app).

tree = BallTree(embed, leaf_size=2) 

In [8]:
# A typical Bokeh application

def my_app(doc):
    """Populate a Bokeh document with the datashaded embedding and a text pane.

    The embedding ``embed`` is rendered as a datashaded HoloViews plot. Tapping
    the plot looks up the embedded point nearest to the tap position in
    ``tree``; when that point lies closer than 0.1 (in data coordinates) the
    matching entry of ``d`` is shown in the pane next to the plot.

    Parameters
    ----------
    doc : the Bokeh document to which the plot/text layout is added as a root.
    """
    # Rasterise the embedding and obtain the underlying Bokeh figure
    shaded = datashade(hv.Points(embed), x_sampling=0.01, y_sampling=0.01)
    fig = renderer.get_plot(shaded, doc=doc).state
    fig.height = 400
    fig.width = 400

    # Pane showing the text of the selected document; created before the
    # callback below that closes over it
    pre = PreText(text="Left click on something", width=400)

    def on_tap(event):
        # Left-click on the plot: find the single nearest embedded point
        click_xy = np.array([[event.x, event.y]])
        distances, indices = tree.query(click_xy, k=1)
        nearest_dist = distances[0][0]
        nearest_idx = indices[0][0]
        # Ignore clicks that do not land close to any point
        if nearest_dist < 0.1:
            pre.update(text=d[nearest_idx])

    fig.add_tools(TapTool())
    fig.on_event('tap', on_tap)

    # Make the wheel-zoom tool the active scroll tool
    for tool in fig.tools:
        if not isinstance(tool, WheelZoomTool):
            continue
        fig.toolbar.active_scroll = tool

    # Plot and text pane side by side
    side_by_side = row(fig, pre)
    doc.add_root(layout([[side_by_side]], sizing_mode='scale_width'))

## Interrogate the data
- Use mouse wheel to zoom in and out
- Left-click on a point to display its text
- Pan around by holding down the left mouse button

In [9]:
# Note that the port parameter is a separate port used by the Bokeh server
# In certain networking situations you may need to know this port beforehand
# NOTE(review): notebook_url is hard-coded to localhost:8888 — presumably it has to match the
# address this notebook is actually served from; adjust it if Jupyter runs elsewhere.

show_app(my_app, None, notebook_url="http://localhost:8888", port=8889)