In [None]:
import pandas as pd
import src.constants as const
import pickle
import os
import numpy as np

In [None]:
# load data
df = pd.read_pickle(os.path.join(const.ARTIFACTS_DIR, "journals-with-topics.pkl"))

embedding_type = "gensim-doc2vec" # "doc2vec"

# Keep only documents that have an abstract. The explicit copy makes `df` an
# independent frame, so the column assignments in later cells do not write
# into a slice of the unfiltered frame (SettingWithCopyWarning).
df = df[df["dc:description"].notna()].copy()

# NOTE: pickle.load executes arbitrary code from the file; only load
# artifacts that were produced by this project.
p = os.path.join(const.BOKEH_DIR, f"X-embedding-{embedding_type}.pkl")
with open(p, "rb") as fh:  # context manager closes the handle deterministically
    X_embedded = pickle.load(fh)

p = os.path.join(const.BOKEH_DIR, f"y-pred-{embedding_type}.pkl")
with open(p, "rb") as fh:
    y_pred = pickle.load(fh)

In [None]:

# Long journal titles (already title-cased) and the short form shown in the plot.
_JOURNAL_ALIASES = (
    ("Affilia - Journal Of Women And Social Work", "Affilia"),
    ("The Social Service Review", "Social Service Review"),
)


def _clean_journal_name(name):
    """Decode the HTML ampersand entity, title-case the name, then apply the aliases."""
    cleaned = name.replace("&amp;", "and").title()
    for long_form, short_form in _JOURNAL_ALIASES:
        cleaned = cleaned.replace(long_form, short_form)
    return cleaned


df["prism:publicationName"] = df["prism:publicationName"].apply(_clean_journal_name)

# Show the distinct journal names and how many there are.
unique_journals = df["prism:publicationName"].unique()
print(unique_journals)
print(len(unique_journals))


In [None]:
# preprocessing

def pre(x):
    """Render a cell value as a single display string.

    Parameters
    ----------
    x : object
        Either a list of values (e.g. several author names) or any single
        value.

    Returns
    -------
    str
        For a list (or list subclass): its items, each converted with
        ``str``, joined by ``", "``. For anything else: ``str(x)``.
    """
    if isinstance(x, list):
        # str() each item so a stray non-string entry cannot break the join.
        return ", ".join(str(item) for item in x)
    return str(x)


# Numeric timestamp column for the plot's data source.
# NOTE(review): the int64 cast presumably yields nanoseconds since the epoch
# (datetime64[ns] column); floor-dividing by 1e6 then gives milliseconds,
# which is the unit Bokeh/JavaScript use for dates -- confirm against the
# date-range filter in the JS callback.
df["ts"] = df["prism:coverDate"].values.astype(np.int64) // 1e6
df["author:name:pretty"] = df["author:name"].apply(pre)

topic_strings = []
topic_top = []

for topics, props in zip(df["lda:topics"], df["lda:topics:props"]):
    # "topic (12.3%), topic (4.5%)": str.join places separators only between
    # items, so no trailing ", " is left if the two lists differ in length.
    topic_strings.append(
        ", ".join(f"{topic} ({prop:.1%})" for topic, prop in zip(topics, props))
    )
    # The topic with the highest proportion represents the document.
    topic_top.append(topics[np.argmax(props)])

# Re-attach on df's own index so rows line up after the earlier filtering.
df["lda:topics:pretty"] = pd.Series(topic_strings, index=df.index)
df["lda:topics:top"] = pd.Series(topic_top, index=df.index)


In [None]:
from bokeh.plotting import output_file
# Send the rendered document to a standalone HTML file in the Bokeh directory.
interactive_html = os.path.join(const.BOKEH_DIR, "interactive.html")
output_file(interactive_html)


In [None]:
from bokeh.plotting import figure, show
from bokeh.palettes import Category20, Turbo256, linear_palette
from bokeh.transform import linear_cmap, factor_cmap
from bokeh.models import HoverTool, Div
from datetime import date
from bokeh.models import ColumnDataSource


# Columns shared by the scatter glyph, the hover tool and the JS callback.
# NOTE(review): x1_backup/x2_backup duplicate the coordinates -- presumably so
# the callback can restore points after moving/hiding them; confirm in
# demo.lib.call_backs.
data = {"x1": X_embedded[:,0],
        "x2": X_embedded[:,1],
        "x1_backup": X_embedded[:,0],
        "x2_backup": X_embedded[:,1],
        "label": df["lda:topics:top"],
        "label2": y_pred,
        "topics": df["lda:topics:pretty"],
        "title": df["dc:title"],
        "date": df["prism:coverDate"].apply(lambda x: x.strftime("%d.%m.%Y")),
        "timestamp": df["ts"],
        "author": df["author:name:pretty"],
        "journal": df["prism:publicationName"],
        "abstract": df["dc:description"],
        "doi": df["prism:doi"]
    }

source = ColumnDataSource(data=data)

# One colour per journal, sampled evenly from the Turbo palette. The factors
# are computed once and passed as a plain list of strings.
journal_factors = list(df["prism:publicationName"].unique())
color_mapper = factor_cmap(
    field_name="journal",
    palette=linear_palette(Turbo256, len(journal_factors)),
    factors=journal_factors,
    )


# hover over information
hover = HoverTool(tooltips=[
    ("Title", '<div style="width:400px;">@title{safe}</div>'),
    ("Date", "@date"),
    ("Author(s)", "@author{safe}"),
    ("Topics(s)", "@topics"),
    ("Journal", "@journal"),
    ("Abstract", '<div style="width:400px;">@abstract{safe}</div>'), # wrap abstracts
    ("DOI", "@doi")
], point_policy="follow_mouse")

# width/height replace the deprecated plot_width/plot_height keywords
# (removed in Bokeh 3).
plot = figure(width=1800, height=1000, tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset', 'save', 'tap'],
           title="Clustering of the Scopus Literature with t-SNE and K-Means",)

# legend_field groups the legend by the "journal" column of the source; it is
# the explicit replacement for the deprecated `legend=` keyword.
plot.scatter(source=source,x="x1",y="x2",size=5,fill_color=color_mapper,line_alpha=0.3,
          line_color="black", legend_field="journal")


In [None]:
# Print the installed Bokeh version for reference.
import bokeh
print(bokeh.__version__)

# reload changes in local modules
# (the reload below re-executes demo.lib.call_backs, so edits made to that
# module after its first import are picked up without restarting the kernel)
import importlib
import demo.lib.call_backs as cb
# Widget models used for the controls built in this cell.
from bokeh.models import TextInput, DateRangeSlider, MultiChoice, Paragraph
importlib.reload(cb)


# Callback object built from the shared data source by the local helper module.
callback = cb.selection_callback(source)

# --- widgets ---------------------------------------------------------------
text_search = TextInput(title="Search:")

text_info = Paragraph(text=f"Documents: {len(df)}", height=25, name="textinfo")

slider_start = date(1960, 1, 1)
slider_end = date.today()
date_range_slider = DateRangeSlider(
    value=(slider_start, slider_end),
    start=slider_start,
    end=slider_end,
    step=1,
)

# Every distinct top topic is offered as an option and pre-selected.
OPTIONS = [str(i) for i in df["lda:topics:top"].unique()]
cluster_choice = MultiChoice(value=OPTIONS, options=OPTIONS)

# --- wiring ----------------------------------------------------------------
# Each input widget fires the same callback whenever its value changes.
for input_widget in (text_search, date_range_slider, cluster_choice):
    input_widget.js_on_change("value", callback)

# pass call back arguments
for arg_name, arg_widget in (
    ("text_search", text_search),
    ("date_range_slider", date_range_slider),
    ("cluster_choice", cluster_choice),
    ("text_info", text_info),
):
    callback.args[arg_name] = arg_widget


In [None]:
# Figure sizing: scale in both directions, with a 5 px margin.
plot.margin = 5
plot.sizing_mode = "scale_both"

# Legend: visible, and clicking an entry hides the matching glyphs.
plot.legend.click_policy = "hide"
plot.legend.visible = True

# Place the first legend in the panel to the right of the plot area.
first_legend = plot.legend[0]
plot.add_layout(first_legend, "right")

In [None]:
from bokeh.layouts import column
# Stack the controls above the plot in a single column and display it.
controls = [date_range_slider, text_search, cluster_choice, text_info]
l = column([*controls, plot])
show(l)

