In [1]:
# Import Libraries
from sentence_transformers import SentenceTransformer
from top2vec import Top2Vec
import os
import pandas as pd
from umap import UMAP

# Import Bokeh Libraries
from bokeh.io import curdoc
from bokeh.layouts import column, row
from bokeh.models import (Button, ColumnDataSource, DataTable, TableColumn, TextInput)
from bokeh.plotting import figure, show
from bokeh.models import DataTable, TableColumn, ColorBar, HTMLTemplateFormatter, Spinner, RangeSlider
from bokeh.io import output_notebook
from bokeh.application import Application
from bokeh.application.handlers import FunctionHandler
import numpy as np
from bokeh.palettes import Spectral6, Magma256
from bokeh.transform import factor_cmap, linear_cmap

In [2]:
# Configure Bokeh to render inline in notebook cells; must run before any show() call below.
output_notebook()

In [11]:
def bulk_text(df, num_topics, keywords=None):
    """Build a Bokeh application for exploring documents on a scatter plot.

    The app shows a scatter plot of the ``x``/``y`` columns coloured by
    ``topic``, a table listing the currently selected rows, a spinner that
    controls the marker size, and a filename box + SAVE button that writes
    the ``text`` column of the selected rows to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``x``, ``y``, ``topic``, ``text``,
        ``Headline`` and ``Id``. NOTE: the frame is modified in place - an
        ``alpha`` column is added/overwritten, and a ``color`` column too
        when ``keywords`` is given.
    num_topics : array-like of numbers
        Topic ids; only their smallest and largest values are used, to set
        the range of the colour mapper. NaN entries are ignored.
    keywords : optional
        When truthy, every text is labelled via ``determine_keyword`` and
        rows labelled ``'none'`` are drawn more transparently.
        NOTE(review): ``determine_keyword`` is not defined in the visible
        part of this notebook - confirm it exists before passing keywords.

    Returns
    -------
    bokeh.application.Application
        The application wrapping the plot, controls and table.
    """
    df['alpha'] = 0.5
    if keywords:
        df['color'] = [determine_keyword(str(t), keywords) for t in df['text']]
        df['alpha'] = [0.4 if c == 'none' else 1 for c in df['color']]

    # Positional indices of the current selection. Shared by the two
    # callbacks below through the enclosing scope (not a module global), so
    # SAVE works before anything is selected and separate apps do not share
    # selection state.
    highlighted_idx = []

    columns = [
        TableColumn(field="Headline", title="Headline"),
        TableColumn(field="text", title="text", width=1000),
        TableColumn(field="Id", title="Id"),
        TableColumn(field="topic", title="topic"),
        # TableColumn(field="article", title="article", formatter=HTMLTemplateFormatter(template=r'<a href="<%= link %>", target="_blank">View Article</a>')),
    ]

    def update(attr, old, new):
        """Selection callback: show the selected rows (shuffled) in the table.

        ``new`` is the list of selected indices of the scatter data source.
        """
        nonlocal highlighted_idx
        highlighted_idx = new
        subset = df.iloc[new]
        # Shuffle so the table is not always ordered like the source frame.
        subset = subset.iloc[np.random.permutation(len(subset))]
        source.data = subset

    def save():
        """Button callback: write the ``text`` of the selected rows to CSV."""
        df.iloc[highlighted_idx][['text']].to_csv(text_filename.value, index=False)

    # `source` feeds the table (replaced on every selection); `source_orig`
    # feeds the scatter plot and always holds the full frame.
    source = ColumnDataSource(data=dict())
    source_orig = ColumnDataSource(data=df)

    data_table = DataTable(source=source, columns=columns, width=1500, height=700)
    source.data = df

    # nanmin/nanmax: the builtin min()/max() give position-dependent results
    # when the topic array contains NaN (e.g. rows left unmatched by a merge).
    colors = linear_cmap('topic', palette=Spectral6, low=np.nanmin(num_topics), high=np.nanmax(num_topics))

    p = figure(title="", sizing_mode="scale_both", tools=["lasso_select", "box_select", "pan", "box_zoom", "wheel_zoom", "reset"])
    p.toolbar.active_drag = None
    p.toolbar.active_inspect = None

    circle_kwargs = {"x": "x", "y": "y", "size": 1, "source": source_orig, "alpha": "alpha", "fill_color": colors, "line_color": colors, "legend_field": "topic"}

    scatter = p.circle(**circle_kwargs)
    # NOTE(review): plot_width/plot_height are the Bokeh 2.x property names;
    # confirm against the installed Bokeh version before upgrading.
    p.plot_width = 350 if "color" in df.columns else 1000
    p.plot_height = 700

    # Spinner for node size, linked client-side to the glyph's size.
    spinner = Spinner(title="Circle Size", low=1, high=60, step=1, value=scatter.glyph.size, width=200)
    spinner.js_link("value", scatter.glyph, "size")

    scatter.data_source.selected.on_change('indices', update)

    text_filename = TextInput(value="out.csv", title="Filename:")
    save_btn = Button(label="SAVE")
    save_btn.on_click(save)

    plot = column(p)
    controls_main = column(spinner)
    controls = column(text_filename, save_btn)

    def make_doc(doc):
        """Attach the pre-built layout pieces to a new Bokeh document."""
        # NOTE(review): the models are created once, outside this function.
        # Bokeh models can presumably belong to only one document, so serving
        # a second session from the same app may fail - confirm, and move the
        # model construction in here if multiple sessions are needed.
        doc.add_root(row(controls_main))
        doc.add_root(row(plot, controls))
        doc.add_root(row(data_table))

    handler = FunctionHandler(make_doc)
    app = Application(handler)
    # Debug output retained so the recorded cell outputs still match.
    print(app)
    print("SPACER")
    return app

def gen_embeddings(csv_name, model_name='all-MiniLM-L6-v2', random_state=None):
    """Embed the ``text`` column of a CSV file and project it with UMAP.

    Parameters
    ----------
    csv_name : str or path-like
        CSV file to load; it must contain a ``text`` column.
    model_name : str, optional
        Name passed to ``SentenceTransformer``. Defaults to the model that
        was previously hard-coded.
    random_state : optional
        Forwarded to ``UMAP``. The default ``None`` keeps the previous
        (unseeded) behaviour; pass an int for a repeatable layout.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with two extra columns, ``x`` and ``y``, holding
        the first two UMAP coordinates of each row's text embedding.
    """
    # Read the data first so a bad path fails before the model is loaded.
    df = pd.read_csv(csv_name)

    encoder = SentenceTransformer(model_name)
    # Hand the encoder a plain list rather than a pandas Series.
    embeddings = encoder.encode(df["text"].tolist())

    # Reduce the embedding dimensions with UMAP.
    reducer = UMAP(random_state=random_state)
    coords = reducer.fit_transform(embeddings)

    df['x'] = coords[:, 0]
    df['y'] = coords[:, 1]
    return df

def top2vec_run(embedding_model, speed, min_count, docs, ids, embed_df, out_csv="aug-30-copy.csv"):
    """Fit Top2Vec, attach the topics to ``embed_df`` and display the Bokeh app.

    Parameters
    ----------
    embedding_model, speed, min_count
        Forwarded unchanged to ``Top2Vec``.
    docs : list
        Documents to model.
    ids : list
        Document ids, passed to ``Top2Vec`` as ``document_ids``.
    embed_df : pandas.DataFrame
        Frame to enrich with a ``topic`` column; it is left-merged with the
        per-document topics on the columns the two frames share.
    out_csv : str or path-like, optional
        Where the merged frame is written. Defaults to the path that was
        previously hard-coded.

    Returns
    -------
    None
        The merged frame is written to ``out_csv`` and the app is shown.
    """
    top_model = Top2Vec(docs, document_ids=ids, min_count=min_count, embedding_model=embedding_model, speed=speed)

    # Pair every original document id with its Top2Vec-assigned topic number.
    doc_topics = top_model.get_documents_topics(top_model.document_ids)[0]
    top_df = pd.DataFrame({'Id': top_model.document_ids, 'topic': doc_topics})

    # Attach the topic to the frame that already carries the UMAP coordinates.
    df_merged = embed_df.merge(top_df, how='left')
    df_merged.to_csv(out_csv)

    # Distinct topic ids; rows the left merge could not match have a NaN
    # topic, which must not take part in the colour-range computation.
    num_topics = df_merged['topic'].dropna().unique()

    # Create and display the Bokeh viz.
    app = bulk_text(df_merged, num_topics)
    show(app)
    
    
    
    
    
    
    
    
    

    

In [4]:
# Embed the "text" column of copy-set.csv and add UMAP coordinates as columns x and y.
embed_df = gen_embeddings("copy-set.csv")

In [6]:
# Parse the source CSV once (it was previously read twice) and pull out the
# document ids and texts that Top2Vec needs.
copy_set_df = pd.read_csv("copy-set.csv")
ids = copy_set_df['Id'].tolist()
docs = copy_set_df['text'].tolist()

In [12]:
# Topic-model the documents with embedding_model='all-MiniLM-L6-v2', speed='deep-learn', min_count=5, then show the app.
top2vec_run('all-MiniLM-L6-v2', 'deep-learn', 5, docs, ids, embed_df)

2022-08-30 12:29:30,211 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training
2022-08-30 12:29:30,635 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
INFO:top2vec:Downloading all-MiniLM-L6-v2 model
2022-08-30 12:29:32,929 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-08-30 12:29:46,400 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-08-30 12:29:48,035 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-08-30 12:29:48,043 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics


<bokeh.application.application.Application object at 0x29745a970>
SPACER


In [36]:
# Same pipeline with embedding_model='doc2vec', speed='fast-learn', min_count=5, for comparison.
top2vec_run('doc2vec', 'fast-learn', 5, docs, ids, embed_df)

2022-08-25 16:22:11,909 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training
2022-08-25 16:22:12,278 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-08-25 16:22:20,390 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-08-25 16:22:21,820 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-08-25 16:22:21,826 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics


<bokeh.application.application.Application object at 0x2c291e040>
SPACER
<bokeh.application.application.Application object at 0x2c291e040>
