## Research Visualisations

This notebook compares Fight for Sight (FFS) publications with general eye research publications using visualisation maps, word clouds, and plots of the fraction of publications in each topic over time.

### Import Libraries

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import textwrap

import pickle
from sklearn.externals import joblib

import pandas as pd

from PreprocessText import lemmatize_abstracts

from wordcloud import WordCloud

import seaborn as sns

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from bokeh.io import output_notebook, output_file, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource,HoverTool,Circle,Legend, LegendItem
output_notebook()

### Year Range to Investigate

In [2]:
# First and last publication year kept by the filters in the loading cell
# (both bounds are compared with >= / <=, so the range is inclusive).
START_YEAR = 2008
END_YEAR = 2018

### Load Data and Apply Preprocessing Steps

Using models pre-trained on a dataset of half a million eye research publications.

In [3]:
print('Done.')

print('Loading and transforming ffs data')
# Fight for Sight papers, indexed by their `pmid` column.
df = pd.read_pickle('data/EPMC/ffs_papers.pkl').set_index('pmid')
print(len(df),'rows')

# Keep only the rows whose abstract text is present.
has_abstract = df['abstractText'].notnull()
ffs_abstracts = df.loc[has_abstract, 'abstractText']

# Publication year of every row in the frame (with or without an abstract).
pubYear = (
    pd.to_datetime(df['firstPublicationDate'])
    .dt.year
    .astype(int)
    .rename('pubYear')
)

# Restrict both series to START_YEAR..END_YEAR, inclusive at both ends.
in_range = (pubYear >= START_YEAR) & (pubYear <= END_YEAR)
ffs_abstracts = ffs_abstracts[in_range]
pubYear = pubYear[in_range]

ffs_abstracts = lemmatize_abstracts(ffs_abstracts)

# Ids and titles matching the abstracts returned by lemmatize_abstracts.
ffs_pmids = ffs_abstracts.index
ffs_titles = df.loc[ffs_pmids, 'title']

# The raw frame is not used again below.
del df

print('Counting')
countvec = joblib.load('models/sklearn/CountVec_eyekw.joblib')
features = countvec.get_feature_names()

# Dense count table: rows follow the abstracts' index, columns are the
# vectoriser's feature names.
ffs_counts = pd.DataFrame(
    countvec.transform(ffs_abstracts).toarray(),
    index=ffs_abstracts.index,
    columns=features,
)

# The vectoriser is not needed once the counts exist.
del countvec

print('LDA')
lda = joblib.load('models/sklearn/LDA_eyekw.joblib')
# Topic/word weights of the loaded model, with the vectoriser's features as columns.
topics = pd.DataFrame(lda.components_, columns=features)

# Topic weights of every FFS abstract, indexed by paper id.
ffs_lda_vectors = pd.DataFrame(lda.transform(ffs_counts), index=ffs_pmids)

# select only papers with a strong topic association
ffs_lda_vectors = ffs_lda_vectors[ffs_lda_vectors.max(axis=1) > 0.2]
# Keep every companion object aligned with the papers that remain.
ffs_abstracts = ffs_abstracts.loc[ffs_lda_vectors.index]
ffs_pmids = ffs_pmids[ffs_pmids.isin(ffs_lda_vectors.index)]
ffs_titles = ffs_titles.loc[ffs_lda_vectors.index]
ffs_counts = ffs_counts.loc[ffs_lda_vectors.index]

# Highest-weighted topic of each paper and the weight it carries.
ffs_topics = ffs_lda_vectors.idxmax(axis=1)
ffs_topicpercent = ffs_lda_vectors.max(axis=1)

# For each paper: the five features with the largest (count * topic weight)
# product for the paper's top topic, joined into one display string.
# Built in a single pass with an explicit object dtype, so the Series never
# starts life as an untyped (float) empty Series that has to be upcast when
# the first string is assigned.
ffs_topicwords = pd.Series(
    [
        ', '.join((counts * topics.loc[ffs_topics.loc[pmid]]).nlargest(5).index)
        for pmid, counts in ffs_counts.iterrows()
    ],
    index=ffs_counts.index,
    dtype=object,
)

# The model is not used again below.
del lda

print('UMAP')
# Pickled reducer; presumably a fitted UMAP model, judging by the file name.
# NOTE(review): pickle.load can execute arbitrary code, so only load this file
# from a trusted source.
umap_file = 'models/umap/eyekws_LDA50_umap_neighbors_30_metric_euclidean_dist_0.pkl'
with open(umap_file,'rb') as f:
    umapper = pickle.load(f)
# Project the FFS topic vectors with the loaded reducer; the plotting cell
# reads columns 0 and 1 of the result as x and y.
ffs_umap = umapper.transform(ffs_lda_vectors)
# The reducer is not used again below.
del umapper
print('Done.')

print('Loading eye keyword data')

# Stored datapoints for the wider literature, limited to the configured
# publication years (between() is inclusive at both ends by default).
kw_umap = pd.read_pickle('models/umap/eyekws_datapoints.pkl')
kw_umap = kw_umap[kw_umap['pubYear'].between(START_YEAR, END_YEAR)]

# Stored LDA vectors for the same literature, limited to the same years.
kw_lda_vectors = pd.read_pickle('data/EPMC/eyekw_ABSTRACTS_LDAVEC.pkl')
kw_lda_vectors = kw_lda_vectors[kw_lda_vectors['pubYear'].between(START_YEAR, END_YEAR)]

print('Done.')

Done.
Loading and transforming ffs data
1602 rows
Removing missing abstracts
Initial preprocessing: case, punctuation, whitespace
Lemmatizing
Counting




LDA




UMAP
Done.
Loading eye keyword data
Done.


### UMAP Visualisation of Whole Field

In [4]:
# Canvas shared by the background field and the FFS overlay.
p = figure(
    title='Fight for Sight Publication Topics Compared to All Eyesight Research',
    plot_width=1600,
    plot_height=900,
)

# One hex colour per background point: normalise the `topic` values, map them
# through the hsv colormap and scale the resulting RGBA rows to 0-255.
topic_rgba = 255 * mpl.cm.hsv(mpl.colors.Normalize()(kw_umap['topic'].values))
colours = [
    "#{:02x}{:02x}{:02x}".format(int(r), int(g), int(b)) for r, g, b, _ in topic_rgba
]

# Background layer: every eye-research paper as a tiny, faint dot.
kw_source = ColumnDataSource(dict(x=kw_umap['umap_x'], y=kw_umap['umap_y'], c=colours))
kw_glyph = Circle(x='x', y='y', fill_color='c', line_color='c', size=1, fill_alpha=0.1)
kw_renderer = p.add_glyph(kw_source, kw_glyph)

# Foreground layer: FFS papers as larger black dots, carrying the fields that
# the hover tooltip displays.
ffs_source = ColumnDataSource(dict(
    x=ffs_umap[:, 0],
    y=ffs_umap[:, 1],
    topic=ffs_topics,
    pmid=ffs_pmids,
    title=ffs_titles,
    value=ffs_topicpercent,
    words=ffs_topicwords,
))
ffs_glyph = Circle(x='x', y='y', fill_color='black', line_color='white', size=8, fill_alpha=1)
ffs_renderer = p.add_glyph(ffs_source, ffs_glyph)

# tooltips (attached to the FFS renderer only)
hover = HoverTool(
    renderers=[ffs_renderer],
    tooltips=[
        ('PMID', '@pmid'),
        ('Title', '@title'),
        ('Topic', '@topic (@value{0%})'),
        ('Keywords', '@words'),
    ],
)
p.add_tools(hover)

# Single-entry legend identifying the black dots.
li = LegendItem(label='Fight for Sight Publications', renderers=[ffs_renderer])
legend1 = Legend(items=[li], location='top_left')
p.add_layout(legend1)

# Hide axes and grid lines.
p.axis.visible = False
p.grid.visible = False

output_file('topics_interactive.html', title='topics_interactive')
show(p)

### Information on a Topic: FFS Compared to All Eyesight Research

Choose a topic number in the dropdown list to display information about that topic at the bottom of the notebook - a wordcloud representing keywords in that topic, a visualisation map coloured to show where that topic fits in the map, and a plot showing the change in that topic over time.

e.g. Topic 33: FFS high activity, Topic 8: FFS low activity.

In [5]:
import ipywidgets as widgets
from ipywidgets.widgets.interaction import show_inline_matplotlib_plots

# Per publication year, the fraction of FFS papers whose weight for each topic
# exceeds 0.1.
ffs_topic_by_year = (ffs_lda_vectors > 0.1).groupby(pubYear).mean()

# The same per-year fraction for the wider eye-research literature.
kw_pubYear = kw_lda_vectors['pubYear']
kw_topic_by_year = (kw_lda_vectors > 0.1).groupby(kw_pubYear).mean()

# Character width used for separator rules and wrapped titles.
print_width = 120

def print_top(titles,values):
    """Print each paper id with its match percentage, then its wrapped title.

    titles -- Series of title strings indexed by paper id.
    values -- Series of match fractions, looked up with the same ids.

    Titles are wrapped to the module-level ``print_width`` and each entry is
    followed by a blank line.
    """
    for pmid, title in titles.items():
        match = '({:.0%} match)'.format(values.loc[pmid])
        print('PMID', pmid, match)
        print(textwrap.fill(title, width=print_width))
        print('')
    
# Dropdown offering the topic numbers 0-49 as strings, with '2' preselected.
opts = list(map(str, range(50)))
topic_selector = widgets.Dropdown(
    options=opts,
    value='2',
    description='Topic:',
    disabled=False,
)

# Output area that show_topic renders into.
out = widgets.Output()

@out.capture(clear_output=True)
def show_topic(change):
    """Render the summary of the topic selected in the dropdown.

    Used as the dropdown's ``observe`` callback. ``change['new']`` is the
    newly selected option and is converted to an integer topic number.

    In order, the function shows:
      1. a word cloud built from the topic's word weights,
      2. the five FFS papers with the largest weight for the topic,
      3. the five papers from the wider literature with the largest weight,
      4. a regression plot of the per-year fraction of papers in the topic
         for both groups.

    The ``out.capture`` decorator already clears ``out`` and routes everything
    produced here into it, so the body needs no additional ``with out:``.
    """
    topic = int(change['new'])

    print('='*print_width)
    print('TOPIC',topic,': Keywords')
    print('='*print_width)

    # Topic word cloud, sized by the topic's word weights
    text = topics.iloc[topic].sort_values(ascending=False).to_dict()

    wordcloud = WordCloud(width=1600,
                          height=900,
                          prefer_horizontal=0.9,
                          max_words=100,
                          min_font_size=8,
                          max_font_size=None,
                          font_step=1,
                          background_color='white',
                          relative_scaling='auto',
                          colormap='tab10').generate_from_frequencies(text)

    plt.figure(figsize=(16,9))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout()

    # Emit the figure now so it appears above the text printed next.
    show_inline_matplotlib_plots()

    # Top FFS Papers Matching this Topic
    print('='*print_width)
    print('TOPIC',topic,': Top 5 Matches in Fight for Sight Publications')
    print('='*print_width)
    ffs_top5 = ffs_lda_vectors[topic].nlargest(5)
    print_top(ffs_titles.loc[ffs_top5.index],ffs_top5)

    # Top Eye KW Papers Matching this Topic
    print('='*print_width)
    print('TOPIC',topic,': Top 5 Matches in Eyesight Research Publications')
    print('='*print_width)
    kw_top5 = kw_lda_vectors[topic].nlargest(5)
    print_top(kw_umap.loc[kw_top5.index,'title'],kw_top5)

    # Topic trend plot
    print('='*print_width)
    print('TOPIC',topic,': Trend')
    print('='*print_width)

    # Per-year fraction of papers in this topic for each group.
    X_ffs = ffs_topic_by_year[topic].index.values
    y_ffs = ffs_topic_by_year[topic].values

    X_eyekw = kw_topic_by_year[topic].index.values
    y_eyekw = kw_topic_by_year[topic].values

    plt.figure(figsize=(16,5))
    ax = sns.regplot(x=X_eyekw, y=y_eyekw,label='All Research',color='#009fe3')
    # Draw the second series on the same axes explicitly.
    sns.regplot(x=X_ffs, y=y_ffs,label='Fight for Sight Research',color='#77b800',ax=ax)

    plt.legend(fontsize=18)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlabel('Publication Year',fontsize=18)
    plt.ylabel('Fraction of Publications',fontsize=18)

    # Keep only the left and bottom spines and their ticks.
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    show_inline_matplotlib_plots()

# Re-render whenever the dropdown's selected value changes.
topic_selector.observe(show_topic, names='value')

display(topic_selector)
display(out)

# `observe` only fires when the value changes, so without this call the
# preselected topic is never drawn and the output area stays empty until the
# user picks a different entry. Render the current selection once.
show_topic({'new': topic_selector.value})

Dropdown(description='Topic:', index=2, options=('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',…

Output()