# Using Bulk to Visualize Wire Stories
We can use a program called Bulk to visualize our vectorized newspaper articles! Bulk is an open-source project that allows for quick and easy visualization of embeddings.

### Importing libraries and the folder of newspapers

In [None]:
import os
import pandas as pd
from umap import UMAP
from sentence_transformers import SentenceTransformer
# Collect the first line of every file found in ./nov-fin.
master = []
for article in os.listdir('./nov-fin'):
    with open(os.path.join('./nov-fin', article), 'r', encoding='utf-8', errors='ignore') as f:
        # readline() reads only the first line instead of loading the whole
        # file, and yields '' for an empty file rather than raising IndexError.
        master.append(f.readline())

### Importing some more libraries and creating our vectors of the newspaper articles

In [9]:
import pandas as pd
from umap import UMAP

from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model (a MiniLM paraphrase model)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load original dataset
df = pd.read_csv("ok.csv")
# Pass the encoder a plain list of strings: it indexes its input positionally,
# which on a pandas Series only works while the index is the default 0..n-1.
sentences = df["text"].tolist()

# Calculate embeddings
X = model.encode(sentences)

# Reduce the dimensions with UMAP.
# UMAP is stochastic; a fixed random_state makes the 2-D layout (and therefore
# the contents of done.csv) reproducible across kernel restarts.
umap = UMAP(random_state=42)
X_tfm = umap.fit_transform(X)

# Apply coordinates
df['x'] = X_tfm[:, 0]
df['y'] = X_tfm[:, 1]
df.to_csv("done.csv")

In [10]:
# Order the articles from the highest Id down to the lowest.
result = df.sort_values('Id', ascending=False)

In [11]:
# Print each row label of `result` followed by the first line of the matching
# ./nov-fin/<Id>.txt file.
# The text is held in a local variable: a row yielded by iterrows() is a copy,
# so assigning into it never changes `result` and only obscures the intent.
for index, article_id in result['Id'].items():
    article_path = os.path.join('./nov-fin', str(article_id) + '.txt')
    with open(article_path, 'r', encoding='utf-8', errors='ignore') as f:
        # readline() avoids loading the whole file and returns '' for an
        # empty file instead of raising IndexError.
        first_line = f.readline()
    print(index)
    print(first_line)

341
340
"Poles Report Mass Murder Of Their Jews Infants, Cripples Shot On Himmler's Orders, Officials Announce LONDON, Nov. 24. Elderly per-ons. children, infants and cripples nmong the Jewish population of Poland are being shot or forced to undergo death dealing hardships as a means of carrying out orders of the gestapo ehlof. llelnrteh Hltnmler. that half the Polish Jews must dp exterminated by the end of December. according to a report Issued Tuesday by Polish government ofliclnls in Lon- This report said only abled bodied iws were beinc snared because they ide valuable \"slave labor\" for (he Oen Polish authorities- here cav statistics showing that up to October 1 Jews had been elimi nated.\" Polish officials here said that October rnlotn card wolf printed for Jews living in the She I e popul that district last March was 433,000. RuthlewneM Charged declaring thBt a reduction of the Jewish population In Poland oy SO percent is merely thr \"first step toward Its complete liquidation.

### Importing the Bokeh Python Data Visualization Library

In [13]:


from bokeh.io import curdoc
from bokeh.layouts import column, row
from bokeh.models import (Button, ColumnDataSource, DataTable, TableColumn, TextInput)
from bokeh.plotting import figure, show
from bokeh.models import DataTable, TableColumn, ColorBar, HTMLTemplateFormatter, Spinner, RangeSlider
from bokeh.io import output_notebook
from bokeh.application import Application
from bokeh.application.handlers import FunctionHandler
import numpy as np



In [14]:


# Configure Bokeh to render its output inline in this notebook.
output_notebook()

import pandas as pd

### Using Bulk to Visualize our documents

In [17]:
def bulk_text(path, keywords=None):
    """Build a Bokeh application for exploring embedded texts.

    The app shows a scatter plot of the points, a data table that follows the
    current lasso/box selection, a spinner controlling the circle size, and a
    filename input plus SAVE button that writes the selected texts to a CSV.

    Parameters
    ----------
    path : str
        CSV file to load. The code below reads the columns ``text``, ``Id``,
        ``link``, ``x`` and ``y`` from it.
    keywords : optional
        When truthy, a ``color`` column is computed for every text via
        ``determine_keyword`` and points whose color is ``'none'`` are drawn
        with a lower alpha.

    Returns
    -------
    bokeh.application.Application
        Application wrapping the document; pass it to ``show``.
    """
    df = pd.read_csv(path)
    df['alpha'] = 0.5
    if keywords:
        # NOTE(review): determine_keyword is not defined in the visible part of
        # this notebook - presumably it comes from the bulk package; confirm it
        # is in scope before passing `keywords`.
        df['color'] = [determine_keyword(str(t), keywords) for t in df['text']]
        df['alpha'] = [0.4 if c == 'none' else 1 for c in df['color']]

    # Positional indices of the currently selected points; shared by the two
    # callbacks below through `nonlocal`.
    highlighted_idx = []

    # mapper, df = get_color_mapping(df)
    columns = [
        TableColumn(field="text", title="text", width=1000),
        TableColumn(field="Id", title="Id"),
        TableColumn(field="link", title="article", formatter=HTMLTemplateFormatter(template=r'<a href="<%= link %>", target="_blank">View Article</a>')),
    ]

    def update(attr, old, new):
        """Callback used for plot update when lasso selecting"""
        # `nonlocal` (not `global`) so the selection is stored in this call's
        # own state instead of leaking into the module namespace.
        nonlocal highlighted_idx
        subset = df.iloc[new]
        highlighted_idx = new
        # Show the selected rows in the table in random order.
        subset = subset.iloc[np.random.permutation(len(subset))]
        source.data = subset

    def save():
        """Callback used to save highlighted data points"""
        # With `nonlocal`, saving before anything has been selected writes a
        # header-only CSV rather than raising NameError on an undefined global.
        nonlocal highlighted_idx
        df.iloc[highlighted_idx][['text']].to_csv(text_filename.value, index=False)

    # `source` feeds the table (replaced on every selection); `source_orig`
    # feeds the scatter plot and always holds the full frame.
    source = ColumnDataSource(data=dict())
    source_orig = ColumnDataSource(data=df)

    data_table = DataTable(source=source, columns=columns, width=1500, height=700)
    source.data = df

    p = figure(title="", sizing_mode="scale_both", tools=["lasso_select", "box_select", "pan", "box_zoom", "wheel_zoom", "reset"])
    p.toolbar.active_drag = None
    p.toolbar.active_inspect = None

    circle_kwargs = {"x": "x", "y": "y", "size": 1, "source": source_orig, "alpha": "alpha"}

    scatter = p.circle(**circle_kwargs)
    p.plot_width = 1000
    if "color" in df.columns:
        p.plot_width=350
    p.plot_height = 700
    ## Spinner for Node Size
    spinner = Spinner(title="Circle Size", low = 1, high=60, step=1, value=scatter.glyph.size, width=200)
    # Client-side link: changing the spinner updates the glyph size directly.
    spinner.js_link("value", scatter.glyph, "size")

    # Refresh the table whenever the selection on the scatter plot changes.
    scatter.data_source.selected.on_change('indices', update)

    text_filename = TextInput(value="out.csv", title="Filename:")
    save_btn = Button(label="SAVE")
    save_btn.on_click(save)

    plot = column(p)
    controls_main = column(spinner)
    controls = column(text_filename, save_btn)

    def make_doc(doc):
        """Attach the three layout rows to the Bokeh document."""
        doc.add_root(row(controls_main))
        doc.add_root(row(plot, controls))
        doc.add_root(row(data_table))
    handler = FunctionHandler(make_doc)
    app=Application(handler)
    return app


# Build the Bokeh application from done.csv (the frame with the x/y
# coordinates written earlier in this notebook) and display it.
app = bulk_text("./done.csv")
show(app)
