In [1]:
import pandas as pd
import src.constants as const
import pickle
import os
import numpy as np
from os.path import join

In [2]:
# Load the articles together with their STM topic assignments.
df = pd.read_pickle(os.path.join(const.ARTIFACTS_DIR, "journals-with-stm-topics.pkl"))

# Drop articles whose "dc:description" field is missing.
df = df[~df["dc:description"].isna()]

# Pre-computed embedding; its first two columns are used as plot coordinates.
# The context manager closes the file handle as soon as the data is read
# (the previous bare open() left it open until garbage collection).
# NOTE(review): pickle executes arbitrary code while loading; only load
# artifacts that were produced by this project.
# NOTE(review): assumes the embedding rows are aligned with the filtered df -- confirm.
with open(join(const.BOKEH_DIR, "X-embedding-stm-tfidf.pkl"), "rb") as embedding_file:
    X_embedded = pickle.load(embedding_file)

# y_pred = list(pickle.load(open(const.TFIDF_CLUSTERING, "rb")))

# First entry of each article's "stm:topics" list.
y_pred = list(df["stm:topics"].apply(lambda x: x[0]))

In [3]:
# Topic names are taken from the CSV header; the first column is skipped.
topics_list = list(pd.read_csv(os.path.join(const.ARTIFACTS_DIR, "Topics_40.csv"), delimiter=";").columns[1:])

# Replace the HTML entity with a backslash followed by "&". The backslash is
# written as "\\" because "\&" is not a valid escape sequence and triggers a
# DeprecationWarning/SyntaxWarning; the resulting string is unchanged.
topics_list = [t.replace("&amp;", "\\&") for t in topics_list]

In [4]:

def _normalise_journal_name(raw_name):
    """Return the display form of a journal name.

    Replaces the HTML ampersand entity with "and", title-cases the result and
    shortens two journal names to their common short form.
    """
    name = raw_name.replace("&amp;", "and").title()
    name = name.replace("Affilia - Journal Of Women And Social Work", "Affilia")
    return name.replace("The Social Service Review", "Social Service Review")


df["prism:publicationName"] = df["prism:publicationName"].apply(_normalise_journal_name)

# Show the distinct journal names and how many there are.
journal_names = df["prism:publicationName"].unique()
print(journal_names)
print(len(journal_names))


['Journal Of Teaching In Social Work' 'Families In Society' 'Affilia'
 'Journal Of Sociology And Social Welfare'
 'Research On Social Work Practice' 'Social Work'
 'Child And Family Social Work' 'Child And Adolescent Social Work Journal'
 'Social Work In Health Care' 'International Social Work'
 'Journal Of Social Service Research' 'Social Work Research'
 'Health And Social Work' 'British Journal Of Social Work'
 'Journal Of Gerontological Social Work'
 'Journal Of Social Work Education' 'Social Service Review'
 'European Journal Of Social Work'
 'Journal Of The Society For Social Work And Research'
 'Journal Of Social Work Practice' 'Journal Of Community Practice'
 'Australian Social Work' 'Qualitative Social Work']
23


In [5]:
# preprocessing

def pre(x):
    """Render a field value as a single display string.

    Args:
        x: Either a list/tuple of items (e.g. author names) or any other value.

    Returns:
        The items joined with ", " when ``x`` is a list or tuple, otherwise
        ``str(x)``.
    """
    # isinstance (instead of an exact type comparison) also covers list
    # subclasses and tuples; items are passed through str() so that a
    # non-string entry cannot make join() raise.
    if isinstance(x, (list, tuple)):
        return ", ".join(map(str, x))
    return str(x)


# Casting the cover dates to int64 and floor-dividing by 1e6 converts
# nanoseconds to milliseconds (assuming the column is datetime64[ns] -- confirm).
# NOTE(review): presumably needed because the JS callback compares this value
# with the DateRangeSlider value, which Bokeh expresses in epoch milliseconds.
df["ts"] = df["prism:coverDate"].values.astype(np.int64) // 1e6
df["author:name:pretty"] = df["author:name"].apply(pre)

topic_strings = []
topic_top = []

for topics, props in zip(df["stm:topics"], df["stm:topics:probs"]):
    # One "Topic name (12.3%)" entry per (topic, probability) pair. str.join
    # places the separator only between entries, so no trailing ", " can
    # appear even if the two lists differ in length.
    labelled = [f"{topics_list[topic]} ({prop:.1%})" for topic, prop in zip(topics, props)]
    topic_strings.append(", ".join(labelled))

    # Topic with the highest probability for this article.
    topic_top.append(topics[np.argmax(props)])

# NOTE(review): the columns are named "lda:*" although the values come from the
# "stm:*" columns; the names are kept because other code may reference them.
df["lda:topics:pretty"] = pd.Series(topic_strings, index=df.index)
df["lda:topics:top"] = pd.Series(topic_top, index=df.index)


In [6]:
from bokeh.plotting import output_file

# Send the rendered document to a static HTML file in the Bokeh directory.
interactive_html = join(const.BOKEH_DIR, "interactive.html")
output_file(interactive_html)


In [7]:
from bokeh.plotting import figure, show
from bokeh.palettes import Category20, Turbo256, linear_palette
from bokeh.transform import linear_cmap, factor_cmap
from bokeh.models import HoverTool, Div
from datetime import date
from bokeh.models import ColumnDataSource


# Columns of the Bokeh data source, one entry per article.
# x1/x2 are the plot coordinates; x1_backup/x2_backup hold a second copy of the
# same values (presumably used by the JS callbacks to restore positions -- confirm
# in demo/callbacks.py).
cover_dates = df["prism:coverDate"]

data = dict(
    x1=X_embedded[:, 0],
    x2=X_embedded[:, 1],
    x1_backup=X_embedded[:, 0],
    x2_backup=X_embedded[:, 1],
    label=df["lda:topics:top"],
    label2=y_pred,
    topics=df["lda:topics:pretty"],
    title=df["dc:title"],
    date=cover_dates.apply(lambda d: d.strftime("%d.%m.%Y")),
    timestamp=df["ts"],
    author=df["author:name:pretty"],
    journal=df["prism:publicationName"],
    # Human-readable topic name for each entry of y_pred.
    cluster=[topics_list[topic_id] for topic_id in y_pred],
    abstract=df["dc:description"],
    doi=df["prism:doi"],
)

source = ColumnDataSource(data=data)


# Map every topic name to its own colour. The palette must have one colour per
# entry of `factors`: sizing it by the number of distinct values in y_pred
# would leave the trailing topics without a colour (Bokeh falls back to
# nan_color) whenever some topic never occurs in y_pred.
color_mapper = factor_cmap(
    field_name="cluster",
    palette=linear_palette(Turbo256, len(topics_list)),
    factors=topics_list,
    )

# color_mapper = factor_cmap(
#     field_name="journal",
#     palette=linear_palette(Turbo256, len(df["prism:publicationName"].unique())),
#     factors=df["prism:publicationName"].unique(),
#     )

#     linear_cmap(
#     field_name="journal",
#     palette=Turbo256, #Category20[20],
#     low=min(df["lda:topics:top"]),
#     high=max(df["lda:topics:top"])
#     )


# Tooltip rows shown while hovering over a point; "@name" refers to a column of
# the data source. Title and abstract are wrapped in a fixed-width div so long
# text breaks into lines.
# NOTE(review): the "{safe}" suffix asks Bokeh to insert the value without HTML
# escaping -- confirm the underlying article data is trusted.
hover_tooltips = [
    ("Title", '<div style="width:400px;">@title{safe}</div>'),
    ("Date", "@date{safe}"),
    ("Author(s)", "@author{safe}"),
    ("LDA Topics(s)", "@topics{safe}"),
    ("Journal", "@journal{safe}"),
    ("Abstract", '<div style="width:400px;">@abstract{safe}</div>'),
    ("DOI", "@doi"),
]

hover = HoverTool(tooltips=hover_tooltips, point_policy="follow_mouse")

# Main figure with the hover tool defined above plus the standard navigation
# tools; 'tap' is needed for the click-to-select callback.
plot = figure(plot_width=1800, plot_height=1000,
              tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset', 'save', 'tap'],
              title=None, toolbar_location="above")

# One marker per article, coloured by topic name. `legend_field` replaces the
# deprecated `legend` keyword (removed in Bokeh 3); for a data-source column
# name it builds the same per-value legend.
plot.scatter(source=source, x="x1", y="x2", size=5, fill_color=color_mapper,
             line_alpha=0.3, line_color="black", legend_field="cluster")




In [8]:
import importlib

import bokeh
from bokeh.models import CustomJS, DateRangeSlider, Div, MultiChoice, TapTool, TextInput, Title

import demo.callbacks as cb

# Show which Bokeh version this notebook is running against.
print(bokeh.__version__)

# Re-import the local callbacks module so that edits to it take effect
# without restarting the kernel.
importlib.reload(cb)



2.2.3


<module 'demo.callbacks' from '/home/ki/projects/scopus/scopus-mining/demo/callbacks.py'>

In [9]:
# Text panel that is handed to the tap callback as `current_selection`.
div_info = Div(text="Click on an article for details.", height=150)

# JavaScript callback; its code comes from the local callbacks module.
callback_selected = CustomJS(
    args={"source": source, "current_selection": div_info},
    code=cb.selected_code(),
)

# Attach the callback to the plot's tap tool(s).
taptool = plot.select(type=TapTool)
taptool.callback = callback_selected

In [10]:
# One callback object, built by the local callbacks module, is shared by all
# filter widgets below.
input_callback = cb.input_callback(source)

# Free-text search box.
text_search = TextInput(title="Search:")

# Label and counter for the number of documents.
text_cout_label = Div(text="<b>Total Documents:</b>", height=25)
text_count = Div(text=f"{len(df)}", height=25)

# Publication-date filter, initially spanning the whole range.
slider_start = date(1960, 1, 1)
slider_end = date.today()
date_range_slider = DateRangeSlider(
    title="Publication Date",
    value=(slider_start, slider_end),
    start=slider_start,
    end=slider_end,
    step=1,
)

# Journal and topic filters, with every option selected initially.
JOURNAL_OPTIONS = [str(name) for name in df["prism:publicationName"].unique()]
journal_choice = MultiChoice(value=JOURNAL_OPTIONS, options=JOURNAL_OPTIONS)
topic_choice = MultiChoice(value=topics_list, options=topics_list)

# Run the shared callback whenever one of the filter values changes.
for filter_widget in (text_search, date_range_slider, journal_choice, topic_choice):
    filter_widget.js_on_change("value", input_callback)

# pass call back arguments
callback_widgets = {
    "text_search": text_search,
    "date_range_slider": date_range_slider,
    "journal_choice": journal_choice,
    "text_count": text_count,
    "topic_choice": topic_choice,
}
for arg_name, arg_widget in callback_widgets.items():
    input_callback.args[arg_name] = arg_widget

In [11]:
# Page heading, rendered as HTML; placed at the top of the final layout.
title = Div(text="<h1>SWORM - Social Work Research Map</h1>")

In [12]:
# Let the figure scale with the available space.
plot.sizing_mode = "scale_both"
plot.margin = 5

# plot.legend.location = (1000,0)
plot.legend.visible = True
plot.legend.label_text_font_size = "10px"

# plot.legend.click_policy="hide"
# Place the legend in the panel to the right of the plot. add_layout() appends
# unconditionally, so the check keeps this cell idempotent: re-running it does
# not add the same legend to the right panel a second time.
legend = plot.legend[0]
if not any(item is legend for item in plot.right):
    plot.add_layout(legend, 'right')

# Hide the Bokeh logo in the toolbar.
plot.toolbar.logo = None

In [13]:
from bokeh.layouts import column, row
from bokeh.models.widgets import Tabs, Panel

# Right-hand side: document counter above the article-details panel.
counter_row = row(text_cout_label, text_count)
info = column(counter_row, div_info)

# Journal and topic filters share one tabbed widget.
journal_pane = Panel(child=journal_choice, title="Journals")
topic_pane = Panel(child=topic_choice, title="Topics")
tab = Tabs(tabs=[journal_pane, topic_pane])

# Left-hand side: all filter controls stacked vertically.
selection = column(date_range_slider, text_search, tab)

# Page: heading on top, then filters | plot | info side by side.
content = row(selection, plot, info)
layout = column(title, content)

# scaling -- set on the finished layout object rather than passed to column()
layout.sizing_mode = "scale_both"

show(layout)




