In [1]:
#pip install scispacy

In [None]:
#pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz

In [None]:
import glob
import re
import pandas as pd
import numpy as np
from pprint import pprint
from sys import getsizeof # Get size of a variable in bytes
from tqdm import tqdm
import pickle

#from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Spacy
import scispacy
import spacy
import en_core_sci_lg

# Gensim
import gensim
from gensim import corpora, models
from gensim.matutils  import Sparse2Corpus
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import plotly.graph_objects as go

%matplotlib inline

In [None]:
def sorted_nicely(l): 
    """ Sort the given iterable in the way that humans expect."""
    """Taken from: http://stackoverflow.com/questions/2669059/how-to-sort-alpha-numeric-set-in-python"""
    convert = lambda text: int(text) if text.isdigit() else text 
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
    return sorted(l, key = alphanum_key)
###END FUNCTION sorted_nicely(l)

<b>Read keywords (taken from external data sources) from csv files and store in dataframe<br>
Files are obtained from 2.fulltext_external-keywords.ipynb.<br>
They are stored as chunks of 500 documents each for ease of handling.
</b>

In [None]:
# Get file names
files = glob.glob(f'keyword_match_data/*.csv', recursive=True)
filelist = sorted_nicely(files)
print("# of csv files read:", len(filelist))

In [None]:
# Create a dataframe from contents of all files.
kw_df = pd.read_csv(filelist[0])
for fname in filelist[1:]:
    #print(fname)
    kw_df = pd.concat([kw_df, pd.read_csv(fname)], ignore_index=True)

In [36]:
# Check dataframe
kw_df

Unnamed: 0,paper_id,keyword_match
0,053bbe38ee5f3c97ade5b24a07b34b9930d3f474,"['severe acute respiratory syndrome', 'glioma'..."
1,060f83d466d0f77513f8e3d2b55e149894211b64,"['pseudomonas aeruginosa', 'human metapneumovi..."
2,ce19f6664fc109d488b095746ef87195906df39d,"['notch', 'nr', 'falls', 'hepatitis b', 'hepat..."
3,c83346b9e5e45cc987f59be4e9ad1e63c483ac9a,"['dengue', 'water', 'vesicular stomatitis viru..."
4,af9d045b63744eb78fa9b2c1ad2a27169d9e0f01,"['yellow fever', 'apoptosis', 'nr', 'atopic de..."
...,...,...
36452,1f55628e0dcd9b8f71cf130bbfc0baa1c4381900,"['nr', 'apoptosis', 'tiletamine', 'arginine', ..."
36453,5d2ed78357c0c88b4500d644f4a717d0192a0d2d,"['water', 'hepatitis e', 'serine', 'aved', 'et..."
36454,e701970d6ca6c78bd19df6c7cb72224cdec7ee07,"['water', 'ethanol', 'nr', 'pica', 'pain', 'bi..."
36455,110f5cd40aa0ff4381f7abc3cabbc629f9794e66,"['vaccinia', 'aved', 'ethylcellulose', 'pica',..."


<b>Tokanization and vectorization of keywords</b>

In [None]:
# Define tokenizer for list comprehension stored as string
def list_comprehension_str_tokenizer(list_comprehension_str):
    # Generates a list of strings from a list comprehension in string format
    # Example: String literal "['word1','word2']"  will be converted to list ['word1','word2']
    return list_comprehension_str[1:-2].replace("\'","").split(", ")

In [None]:
# Plot and check distribution of words
l = np.array(list(map(lambda x: len(list_comprehension_str_tokenizer(x)), kw_df['keyword_match'])))
print("# of documents with 0 keywords:", sum(l==0))
#Plot partial histogram
plt.hist(l, bins = list(range(0,100,5)))

In [None]:
# Create vectorizer. More stop words can be added.
kw_vectorizer = CountVectorizer(tokenizer=list_comprehension_str_tokenizer,
                             stop_words=['tic','ether','pica','rigi','nr','thc','orf',
                                         'elm','fits','aids','water','agar','aved',
                                         '', 'at', 'g', 'i', 'th', 'v'])


In [None]:
# NOTE: The texts variable is a list of strings. Get it either as a Dataframe column or as a list of strings.
kw_vectorized = kw_vectorizer.fit_transform(tqdm(kw_df['keyword_match'])) # Use tqdm(texts) if you want to see the progress.

In [None]:
# Check sizes of vectorized data
print("Type of vectorized data:", type(kw_vectorized))
print("Vectorized data (#documents, #features):", kw_vectorized.shape)
print("Vectorizer (#features):", len(kw_vectorizer.get_feature_names()))

In [None]:
# Dump vectorizer and vetorized data (OPTIONAL). Might save time next time. Just use pickle.load().
pickle.dump(kw_vectorizer, open('3.lda_visualization_fulltext-keyword-vectorizer.pkl', "wb")) # Writing binary file
pickle.dump(kw_vectorized, open('3.lda_visualization_fulltext-keyword-vectorized.pkl', "wb")) # Writing binary file

In [None]:
# Load vectorizer and vectorized data
#kw_vectorizer = pickle.load(open('3.lda_visualization_fulltext-keyword-vectorizer.pkl', "rb"))
#kw_vectorized = pickle.load(open('3.lda_visualization_fulltext-keyword-vectorized.pkl', "rb"))

# Check whether all features are loaded
#print("# Features:", len(kw_vectorizer.get_feature_names()))

In [None]:
# Most frequent keywords
word_count = pd.DataFrame({'word': kw_vectorizer.get_feature_names(), 'count': np.asarray(kw_vectorized.sum(axis=0))[0]})
word_count.sort_values('count', ascending=False).set_index('word')[:20].sort_values('count', ascending=True).plot(kind='barh')

<b>Merge vectorized keywords with vectorized entities from abstracts (Obtained from 1.abstrac_entity_vectorization.ipynb)</b>

In [None]:
# Convert vectorized data to data frame
kw_vec_mat = kw_vectorized.todense()
kw_vec_df = pd.DataFrame(kw_vec_mat, index=kw_df['paper_id'], columns=kw_vectorizer.get_feature_names())
kw_vec_df = kw_vec_df.reset_index()
print("DataFrame shape:", kw_vec_df.shape)

In [38]:
# Read vectorized entities from abstract and corresponding dataframe
ab_vectorizer = pickle.load(open('1.abstrac_entity_vectorization_vectorizer.pkl', "rb"))
ab_vectorized = pickle.load(open('1.abstrac_entity_vectorization_vectorized.pkl', "rb"))
ab_df_full = pd.read_csv("1.abstrac_entity_vectorization_abstract-entities.csv")

In [39]:
# Convert vectorized data to data frame
ab_vec_mat = ab_vectorized.todense()
ab_vec_df = pd.DataFrame(ab_vec_mat, index=ab_df_full['paper_id'], columns=ab_vectorizer.get_feature_names())
ab_vec_df = ab_vec_df.reset_index()
print("DataFrame shape:", ab_vec_df.shape)

DataFrame shape: (36957, 790)


In [40]:
# Get common columns
kw_cols = set(kw_vec_df.columns)
ab_cols = set(ab_vec_df.columns)
kw_ab_intersect_cols = kw_cols.intersection(ab_cols)
list(kw_ab_intersect_cols)

['rotavirus',
 'adenovirus',
 'immune',
 'dengue',
 'human',
 'medical',
 'paper_id',
 'cough',
 'transcription',
 'air',
 'rhinovirus',
 'ifn',
 'influenza',
 'apoptosis',
 'parainfluenza']

In [41]:
# Drop common columns from keywords dataframe, except for paper_id
# Manual input needed here. Copy the column names from the above cell and paste here. Remove paper_id.
drop_cols = ['rotavirus',
 'adenovirus',
 'immune',
 'dengue',
 'human',
 'medical',
 'cough',
 'transcription',
 'air',
 'rhinovirus',
 'ifn',
 'influenza',
 'apoptosis',
 'parainfluenza']

kw_vec_df.drop(drop_cols, axis=1, inplace=True)

In [42]:
# Merge dataframes
vec_df = kw_vec_df.merge(ab_vec_df, how="left", on="paper_id")

In [43]:
print(vec_df.shape)

(36457, 5837)


**Get all texts in the list format required by Gensim.**

In [44]:
# Get list of words for each document
def getVectorized(catch_array):
    columnanmes = vec_df.columns[2:]
    catch_array = catch_array[2:]
    new_array = []
    for idx, val in enumerate(catch_array):
        if catch_array[idx]>0:
            #new_array.append(catch_array[idx]*[columnanmes[idx]])
            new_array.append(catch_array[idx]*[columnanmes[idx]])
    return [item for sublist in new_array for item in sublist if item]

In [45]:
vec_df['vectorized_array'] = vec_df.apply(lambda x : getVectorized(x.values), axis=1)
# Confirm that the conversion worked
#print(vec_df['vectorized_array'][9])

In [46]:
# Combine all lists of words as a single list of lists
texts = list(vec_df['vectorized_array'])

In [47]:
# Create dictionary and corpus for Gensim LDA model
np.random.seed(42)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print(dictionary)
print("Length of corpus:", len(corpus))

Dictionary(5835 unique tokens: ['acne', 'alcohol', 'alcohol dependence', 'breast cancer', 'cell cycle']...)
Length of corpus: 36457


**Topic modeling using LDA**

In [50]:
def lda_gridsearch(dictionary, corpus, texts, limit, start=2, step=3):
    # Function to perform grid search on LDA model parameters (currently for number of topics)
    # Returns coherence score of tested models
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit+step, step):
        print("Building model for", num_topics, "topics.")
        #Use following for standard gensim model
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics, 
                                                random_state=42,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                minimum_probability=0,
                                                per_word_topics=True)
        #Use following for mallet model (if installed)
        #model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [75]:
# Grid search LDA model parameters (currently, number of topics)
start=2; limit=30; step=2;
model_list, coherence_values = lda_gridsearch(dictionary=dictionary, corpus=corpus, texts=texts,
                                                    start=start, limit=limit, step=step) # Inclusive of start and limit

In [76]:
# Plot coherence scores
x = range(start, limit+step, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [55]:
# Choose number of topics based on grid search results
ntopics = 10

In [56]:
# Build LDA model. You can try to give more number of passes.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=ntopics, 
                                            random_state=42,
                                            update_every=1,
                                            chunksize=100,
                                            passes=50,
                                            alpha='auto',
                                            minimum_probability=0,
                                            per_word_topics=True)

In [57]:
# Save model / Load model
pickle.dump(lda_model, open('3.lda_visualization_nTopics'+str(lda_model.num_topics)+'.pkl', "wb")) # Writing binary file
#lda_model = pickle.load(open('3.lda_visualization_nTopics'+str(lda_model.num_topics)+'.pkl', "wb")) # Writing binary file

In [58]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.538123283423558

Coherence Score:  0.5934183841010918


In [59]:
# Print the Keyword in the topics
# NOTE: A human understable name needs to be created for each topic.
pprint(lda_model.print_topics())

[(0,
  '0.074*"cell" + 0.056*"infection" + 0.041*"response" + 0.027*"immune" + '
  '0.023*"mouse" + 0.021*"expression" + 0.018*"effect" + 0.018*"symptom" + '
  '0.017*"treatment" + 0.016*"increase"'),
 (1,
  '0.058*"vaccine" + 0.050*"day" + 0.050*"antibody" + 0.042*"strain" + '
  '0.024*"serum" + 0.023*"pedv" + 0.021*"animal" + 0.018*"porcine" + '
  '0.018*"pig" + 0.018*"group"'),
 (2,
  '0.049*"protein" + 0.047*"virus" + 0.027*"cell" + 0.026*"viral" + '
  '0.024*"rna" + 0.023*"gene" + 0.020*"host" + 0.016*"sequence" + '
  '0.013*"replication" + 0.013*"human"'),
 (3,
  '0.034*"drug" + 0.023*"hepatitis c" + 0.021*"domain" + 0.017*"ethanol" + '
  '0.016*"hepatitis c virus" + 0.015*"glucose" + 0.015*"calcium" + '
  '0.015*"compound" + 0.014*"rosin" + 0.014*"serine"'),
 (4,
  '0.083*"disease" + 0.063*"health" + 0.034*"population" + 0.033*"infectious" '
  '+ 0.029*"public" + 0.028*"country" + 0.026*"human" + 0.023*"pandemic" + '
  '0.022*"risk" + 0.020*"global"'),
 (5,
  '0.045*"model" + 0.

In [None]:
# Another way to print topics
# Print only two topics and represent by top 6 words
#pprint(lda_model.print_topics(num_topics=2, num_words=6))

In [None]:
# OPTIONAL: Visualize the topics. This can take too long for large models.
#pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
#vis

In [60]:
# Save doc-topic distribution
doc_topic_lda = lda_model[corpus]
doc_topic_df = pd.DataFrame(doc_topic_lda)

In [61]:
# Check doc-topic distribution
print(doc_topic_df.shape)
doc_topic_df.head()

(36457, 3)


Unnamed: 0,0,1,2
0,"[(0, 0.01544548), (1, 0.010459503), (2, 0.0200...","[(0, [3]), (1, [6, 3]), (2, [6]), (3, [3]), (4...","[(0, [(3, 0.9997795)]), (1, [(3, 0.3732544), (..."
1,"[(0, 0.0666787), (1, 0.0041270824), (2, 0.0094...","[(6, [7, 4, 6, 3, 2]), (22, [8, 0]), (23, [7, ...","[(6, [(2, 0.01743876), (3, 0.086157136), (4, 0..."
2,"[(0, 0.056981664), (1, 0.0041263695), (2, 0.31...","[(7, [4, 6]), (25, [5, 9, 2]), (33, [4, 8, 0])...","[(7, [(4, 0.7389295), (6, 0.2608259)]), (25, [..."
3,"[(0, 0.03264456), (1, 0.0055407207), (2, 0.713...","[(25, [5, 2]), (54, [6, 7]), (108, [0]), (109,...","[(25, [(2, 0.29172263), (5, 0.7080283)]), (54,..."
4,"[(0, 0.2987733), (1, 0.062128287), (2, 0.05205...","[(10, [6, 3, 7, 2]), (14, [6, 3, 0]), (33, [4,...","[(10, [(2, 0.018720211), (3, 0.24642244), (6, ..."


<h3>Plot topics for selected document</h3>
<b>Bubble chart is as follows:
<ul>
    <li>X-axis shows top N topics for a chosen document.</li>
    <li>Y-axis denotes how much importance of the topic in the document. (Higher the y-value, more the importance of the topic in this document.)</li>
    <li>Radius of the bubble denotes in how many other documents this topic is important. The importance can be set using a threshold. (Larger the radius, higher chances of finding a similar document. Or, smaller the radius, more unique is the document.)</li>
</ul>
</b>

In [68]:
# Get number of documents for all topics beyond a selected threshold.
# This number will be used to assign radius of bubble.
topic_doc_cnt_threshod = 0.5
topic_doc_cnt = {i:0 for i in range(0,lda_model.num_topics)}
for topicid in range(0,lda_model.num_topics):
    for docid in doc_topic_df.index:
        for item in doc_topic_df.iloc[docid][0]:
            if item[0] == topicid and item[1]>topic_doc_cnt_threshod:
                topic_doc_cnt[topicid] += 1
print(topic_doc_cnt)

{0: 327, 1: 67, 2: 3110, 3: 266, 4: 224, 5: 379, 6: 5025, 7: 1147, 8: 703, 9: 7}


In [77]:
# Choose docment id to view
docid = 2

doc_topic_weight = {item[0]:item[1] for item in doc_topic_df.iloc[docid][0]}
for topicid in range(0,lda_model.num_topics):
    if topicid not in doc_topic_weight:
        doc_topic_weight[topicid] = 0.0

fig = go.Figure(data=[go.Scatter(
    #x=[1, 3.2, 5.4, 7.6, 9.8, 12.5],
    x=["Topic"+str(i) for i in range(0,lda_model.num_topics)],
    y=[doc_topic_weight[i] for i in sorted(doc_topic_weight.keys())],
    mode='markers',
    marker=dict(
        #color=[120, 125, 130, 135, 140, 145],
        size=[0.03*topic_doc_cnt[i] for i in sorted(topic_doc_cnt.keys())],
        #showscale=False
        )
)])
fig.show()
print("Selected document id:", docid)


Selected document id: 2
