In [1]:
import pandas as pd
from umap import UMAP
from sklearn.pipeline import make_pipeline 

# pip install "embetter[text]"
from embetter.text import SentenceEncoder

# Two-stage pipeline: the sentence encoder produces embeddings and UMAP
# (default settings) projects them for plotting.
enc = SentenceEncoder('all-MiniLM-L6-v2')
umap = UMAP()
text_emb_pipeline = make_pipeline(enc, umap)

# Read the sentences from the "text" column of the CSV fixture.
sentences = pd.read_csv("tests/data/text.csv")['text'].tolist()

# Fit the whole pipeline and project every sentence in one pass.
X_tfm = text_emb_pipeline.fit_transform(sentences)

# Assemble the plotting frame (nothing is written to disk here): the first
# two projected columns become x / y. Note! Text column must be named "text".
df = pd.DataFrame({
    "text": sentences,
    "x": X_tfm[:, 0],
    "y": X_tfm[:, 1],
})

In [2]:
# Embed the sentences with the encoder on its own (no projection step) and
# keep the resulting vectors in X for the cells that follow.
X = enc.transform(sentences)

In [3]:
import jscatter
import numpy as np
import pandas as pd
from ipywidgets import HBox, VBox, HTML, Layout, Button, Text
from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity

class BaseTextExplorer:
    """Interactive scatter plot for browsing the texts behind selected points.

    The widget shows a jscatter plot of ``dataf`` next to an HTML panel that
    lists (a sample of) the texts of the currently selected points. When both
    an encoder and the embedding matrix are supplied, a text box is added:
    typing a query colours and sizes every point by its cosine similarity to
    the encoded query.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Must provide the columns ``"x"``, ``"y"`` (plot coordinates) and
        ``"text"`` (the string shown for each point).
    X : array-like, optional
        Embeddings used for the similarity search; passed as the first
        argument to ``cosine_similarity``. Presumably one row per row of
        ``dataf`` -- this is not checked here.
    encoder : object, optional
        Anything with a ``transform(list_of_str)`` method whose output can be
        compared against ``X`` with ``cosine_similarity``.

    Attributes
    ----------
    elem : the top-level widget layout returned by :meth:`show`.
    dists : the raw ``(1, n)`` similarity array of the last query (only set
        after a query has been typed).
    """

    def __init__(self, dataf, X=None, encoder=None):
        self.dataf = dataf
        self.X = X
        self.encoder = encoder
        self.scatter = jscatter.Scatter(data=self.dataf, x="x", y="y", width=500, height=500)
        self.html = HTML(layout=Layout(width='600px', overflow_y='scroll', height='400px'))
        self.sample_btn = Button(description='resample')

        # Render the scatter view once and place it in whichever layout applies,
        # instead of building a layout that is thrown away immediately.
        scatter_view = self.scatter.show()
        # Explicit ``is not None``: truthiness would call ``__len__`` on
        # pipeline-like encoders and is not what is meant here.
        if self.encoder is not None and self.X is not None:
            self.text_input = Text(value='', placeholder='Type something', description='String:')
            # Only react to changes of the typed value, not to every trait.
            self.text_input.observe(self._on_query_change, names='value')
            left_panel = VBox([self.text_input, scatter_view])
        else:
            left_panel = scatter_view
        self.elem = HBox([left_panel, VBox([self.sample_btn, self.html])])

        self.scatter.widget.observe(lambda d: self.update(), ['selection'])
        self.sample_btn.on_click(lambda d: self.update())

    def _on_query_change(self, change):
        """Recolour and resize the points by similarity to the typed query.

        Uses this instance's encoder, embeddings and scatter (not notebook
        globals), so several explorers can coexist under any variable name.
        """
        query_vec = self.encoder.transform([self.text_input.value])
        sims = cosine_similarity(self.X, query_vec).reshape(1, -1)
        self.dists = sims
        # Rescale into a small positive range; the 0.1 in the denominator
        # avoids a division by zero when all similarities are equal.
        norm_sims = 0.01 + (sims - sims.min()) / (0.1 + sims.max() - sims.min())
        self.scatter.color(by=norm_sims[0])
        self.scatter.size(by=norm_sims[0])

    def show(self):
        """Return the widget layout so the notebook can display it."""
        return self.elem

    def update(self):
        """Refresh the HTML panel with the texts of the current selection.

        At most 10 texts are shown; larger selections are randomly sampled,
        which is why the "resample" button simply calls this method again.
        """
        from html import escape

        # Read the selection once so the size check and the lookup agree.
        selected = self.dataf.iloc[self.scatter.selection()]
        if len(selected) > 10:
            selected = selected.sample(10)
        # Escape the text: raw "<", ">" or "&" would otherwise be interpreted
        # as markup and garble (or inject into) the panel.
        self.html.value = ''.join(
            [f'<p style="margin: 0px">{escape(str(t))}</p>' for t in selected["text"]]
        )

    def observe(self, func):
        """Register ``func`` as an additional observer of selection changes."""
        self.scatter.widget.observe(func, ['selection'])

    @property
    def selected_idx(self):
        """Positional indices of the currently selected points."""
        return self.scatter.selection()

    @property
    def selected_texts(self):
        """List of the ``"text"`` values of the currently selected points."""
        return list(self.dataf.iloc[self.selected_idx]["text"])

    @property
    def selected_dataframe(self):
        """Rows of ``dataf`` for the currently selected points."""
        return self.dataf.iloc[self.selected_idx]

    def _repr_html_(self):
        # NOTE(review): this displays the layout as a side effect and returns
        # whatever ``display`` returns rather than an HTML string; consider
        # ``_ipython_display_`` instead -- confirm against the IPython version
        # in use before changing.
        return display(self.elem)

# Build the explorer on the projected frame, also passing the encoder and the
# embeddings X, then return its widget layout for display.
explorer = BaseTextExplorer(df, encoder=enc, X=X)
explorer.show()

HBox(children=(VBox(children=(Text(value='', description='String:', placeholder='Type something'), HBox(childr…

In [8]:
from sklearn.decomposition import PCA

# Same idea as before, but with a 2-component PCA as the projection step.
pca = PCA(n_components=2)
text_emb_pipeline = make_pipeline(enc, pca)

# Fit PCA directly on the embeddings already held in X and project them.
X_tfm_pca = pca.fit_transform(X)

# Rebuild the plotting frame with the PCA coordinates (nothing is written to
# disk here). Note! Text column must be named "text".
df = pd.DataFrame({
    "text": sentences,
    "x": X_tfm_pca[:, 0],
    "y": X_tfm_pca[:, 1],
})

In [9]:
# Rebuild the explorer now that df holds the PCA coordinates; the encoder and
# embeddings X are passed through unchanged.
explorer = BaseTextExplorer(df, encoder=enc, X=X)
explorer.show()

HBox(children=(VBox(children=(Text(value='', description='String:', placeholder='Type something'), HBox(childr…

In [29]:
import altair as alt
import pandas as pd
import numpy as np

# Seeded RNG so the generated series is identical on every run.
rand = np.random.RandomState(42)

# 100 points: x is the step index, y the running sum of normal draws.
df = pd.DataFrame(dict(xval=range(100), yval=rand.randn(100).cumsum()))

# A 0-100 slider bound to the "cutoff" parameter, starting at 50.
slider = alt.binding_range(min=0, max=100, step=1)
cutoff = alt.param(name="cutoff", bind=slider, value=50)

# Points whose xval lies below the cutoff are red, all others blue.
chart = (
    alt.Chart(df)
    .mark_point()
    .encode(
        x='xval',
        y='yval',
        color=alt.condition(alt.datum.xval < cutoff, alt.value('red'), alt.value('blue')),
    )
    .add_params(cutoff)
)

# Wrap the chart in a JupyterChart widget and display it.
jchart = alt.JupyterChart(chart)
jchart

JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'data': {'name': 'da…

In [30]:
# Inspect the current parameter values exposed by the JupyterChart widget
# (here the "cutoff" parameter bound to the slider).
jchart.params

Params({'cutoff': 50.0})

In [37]:
import altair as alt
from vega_datasets import data

# Flights sample referenced by URL; the "date" field is parsed as a date.
source = alt.UrlData(data.flights_2k.url, format={'parse': {'date': 'date'}})

# Interval selection along x only, named "select".
brush = alt.selection_interval(name='select', encodings=['x'])

# Shared histogram definition used by both layers: the repeated column is
# binned (at most 20 bins) on x, with the record count on y.
base = (
    alt.Chart(width=160, height=130)
    .mark_bar()
    .encode(
        x=alt.X(alt.repeat('column')).bin(maxbins=20),
        y='count()',
    )
)

# Bottom layer: all records in gray; this layer carries the brush.
background = base.encode(color=alt.value('#ddd')).add_params(brush)

# Top layer: only the records inside the brush, in the default colour.
highlight = base.transform_filter(brush)

# Stack the two layers on the shared data, derive "time" as the hour of the
# date, and repeat the layered chart once per column.
c = alt.JupyterChart(
    alt.layer(background, highlight, data=source)
    .transform_calculate(time="hours(datum.date)")
    .repeat(column=["distance", "delay", "time"])
)

In [38]:
# Display the repeated, brush-linked histogram widget.
c

JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'repeat': {'column':…

In [39]:
# Inspect the selection state exposed by the JupyterChart widget (here the
# interval selection named "select").
c.selections

Selections({'select': IntervalSelection(name='select', value={}, store=[])})