## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, HdpModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, preprocess_string

from tqdm import tqdm_notebook as tqdm

## Read in data

In [2]:
# Read in articles
articles = pd.read_csv('NLT_data/merged_data.csv')

In [3]:
articles.shape

(37562, 10)

In [4]:
articles.head(2)

Unnamed: 0,paper_id,source,title,abstract,publish_time,authors,journal,url,discussion,text_body
0,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,http://europepmc.org/articles/pmc125340?pdf=re...,,The genetic information of RNA viruses is orga...
1,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,http://europepmc.org/articles/pmc125375?pdf=re...,,Carcinoembryonic antigen (CEA; CD66e) was init...


In [5]:
# Read in topics 
topics = pd.read_csv('df_dominant_topic.csv')

In [6]:
topics.shape

(37562, 5)

In [7]:
topics.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,18.0,0.5541,"protein, proteins, rna, binding, membrane, str...","['nidovirus', 'subgenomic', 'mrnas', 'contain'..."
1,1,18.0,0.2898,"protein, proteins, rna, binding, membrane, str...","['ceacam1', 'member', 'carcinoembryonic', 'ant..."
2,2,18.0,0.3064,"protein, proteins, rna, binding, membrane, str...","['hepatitis', 'c', 'virus', 'hcv', 'important'..."
3,3,18.0,0.5376,"protein, proteins, rna, binding, membrane, str...","['key', 'enzyme', 'coronavirus', 'polyprotein'..."
4,4,18.0,0.4824,"protein, proteins, rna, binding, membrane, str...","['arteri', 'corona', 'toro', 'roniviruses', 'e..."


## Concatenate topics to original dataframe

In [8]:
# Column-wise join: with axis=1 pandas lines rows up by index label, so this
# relies on both frames carrying the same default 0..N-1 index from read_csv.
# NOTE(review): assumes row i of the topics file describes row i of the
# articles file — confirm against whatever produced df_dominant_topic.csv.
articles_with_top = pd.concat([topics, articles], axis=1)

In [9]:
articles_with_top.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,paper_id,source,title,abstract,publish_time,authors,journal,url,discussion,text_body
0,0,18.0,0.5541,"protein, proteins, rna, binding, membrane, str...","['nidovirus', 'subgenomic', 'mrnas', 'contain'...",b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,http://europepmc.org/articles/pmc125340?pdf=re...,,The genetic information of RNA viruses is orga...
1,1,18.0,0.2898,"protein, proteins, rna, binding, membrane, str...","['ceacam1', 'member', 'carcinoembryonic', 'ant...",e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,http://europepmc.org/articles/pmc125375?pdf=re...,,Carcinoembryonic antigen (CEA; CD66e) was init...
2,2,18.0,0.3064,"protein, proteins, rna, binding, membrane, str...","['hepatitis', 'c', 'virus', 'hcv', 'important'...",00b1d99e70f779eb4ede50059db469c65e8c1469,PMC,Synthesis of a novel hepatitis C virus protein...,Hepatitis C virus (HCV) is an important human ...,2001-07-16,"Xu, Zhenming; Choi, Jinah; Yen, T.S.Benedict; ...",EMBO J,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Previous studies have demonstrated that the lo...,Hepatitis C virus (HCV) is a positive-stranded...
3,3,18.0,0.5376,"protein, proteins, rna, binding, membrane, str...","['key', 'enzyme', 'coronavirus', 'polyprotein'...",cf584e00f637cbd8f1bb35f3f09f5ed07b71aeb0,PMC,Structure of coronavirus main proteinase revea...,The key enzyme in coronavirus polyprotein proc...,2002-07-01,"Anand, Kanchan; Palm, Gottfried J.; Mesters, J...",The EMBO Journal,http://europepmc.org/articles/pmc126080?pdf=re...,,Transmissible gastroenteritis virus (TGEV) bel...
4,4,18.0,0.4824,"protein, proteins, rna, binding, membrane, str...","['arteri', 'corona', 'toro', 'roniviruses', 'e...",dde02f11923815e6a16a31dd6298c46b109c5dfa,PMC,Discontinuous and non-discontinuous subgenomic...,"Arteri-, corona-, toro- and roniviruses are ev...",2002-12-01,"van Vliet, A.L.W.; Smits, S.L.; Rottier, P.J.M...",The EMBO Journal,http://europepmc.org/articles/pmc136939?pdf=re...,,Positive (+)-strand RNA viruses have developed...


## Drop duplicate rows

In [10]:
# Keep only the first row for each distinct title. The index is not reset, so
# the surviving rows keep their original labels and label != row position from
# here on (relevant wherever rows are later fetched by position).
articles_with_top_dropped = articles_with_top.drop_duplicates(subset='title')

In [11]:
articles_with_top_dropped.shape # 354 rows removed

(37208, 15)

In [12]:
# Save out as csv
articles_with_top_dropped.to_csv('articles_with_topics.csv', index=False)

## Train a Doc2Vec model
Taken from Clay's code in `gensim_doc2vec_recommender.ipynb` notebook.

In [13]:
articles_with_top_dropped.isnull().sum()

Document_No               0
Dominant_Topic            0
Topic_Perc_Contrib        0
Keywords                  0
Text                      0
paper_id                  0
source                    0
title                     0
abstract               4923
publish_time              0
authors                 397
journal                3344
url                     103
discussion            21649
text_body                 0
dtype: int64

### Cleaning

In [14]:
# Work on an independent copy so that changes made to `df` during cleaning do
# not modify articles_with_top_dropped.
df = articles_with_top_dropped.copy()

In [15]:
# Back-fill missing abstracts: use the discussion section first and, where that
# is missing as well, fall back to the title.
df['abstract'] = df['abstract'].fillna(value=df['discussion']).fillna(value=df['title'])

In [16]:
# Function for cleaning text
def preprocess(text):
    """Lower-case and tokenize every document in ``text``.

    Each document is lower-cased and run through gensim's ``preprocess_string``
    with a fixed filter chain: strip tags, strip punctuation, collapse repeated
    whitespace, remove stopwords. A notebook progress bar tracks the loop.

    Parameters
    ----------
    text : iterable of str
        Raw documents. Every item must support ``.lower()``.

    Returns
    -------
    list
        One ``preprocess_string`` result per input document, in input order.
    """
    # Local import: the `tqdm_notebook` alias bound at the top of the notebook
    # is deprecated and emits a warning that itself names `tqdm.notebook.tqdm`
    # as the replacement.
    from tqdm.notebook import tqdm

    # The filter chain is the same for every document, so build it once
    # instead of on each loop iteration.
    custom_filters = [strip_tags, strip_punctuation, strip_multiple_whitespaces,
                      remove_stopwords]
    return [preprocess_string(doc.lower(), custom_filters) for doc in tqdm(text)]

In [17]:
raw_data = df['abstract']

In [18]:
data = preprocess(raw_data)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=37208.0), HTML(value='')))




### Model training

In [19]:
def create_tagged_document(list_of_list_of_words):
    """Lazily wrap each token list in a ``TaggedDocument``.

    Yields one ``TaggedDocument`` per entry of ``list_of_list_of_words``. Its
    tag list holds a single integer: the entry's 0-based position in the input.
    """
    position = 0
    for tokens in list_of_list_of_words:
        yield TaggedDocument(tokens, [position])
        position += 1

In [20]:
# Materialize the generator into a list: the tagged corpus is iterated more
# than once (vocabulary building, then training), and a generator could only
# be consumed a single time.
data_tagged = list(create_tagged_document(data))

In [21]:
# Untrained Doc2Vec model: 200-dimensional vectors, 20 training epochs, and
# words seen fewer than 3 times in the corpus are left out of the vocabulary.
# No corpus is passed here; the vocabulary is built and the model trained in
# the cells that follow.
model = Doc2Vec(vector_size = 200, min_count = 3, epochs = 20)

In [22]:
model.build_vocab(data_tagged)

In [39]:
%%time
model.train(data_tagged, total_examples = model.corpus_count, epochs = model.epochs)

CPU times: user 4min 44s, sys: 9.06 s, total: 4min 53s
Wall time: 2min 2s


In [40]:
model.save('doc2vec_updated.model')

In [41]:
# Grab most similar articles of the input word and its most similar words
def most_sim_docs(word, n_articles, topn=3):
    """Return the articles closest to ``word`` and to its nearest vocabulary words.

    The query terms are ``word`` itself followed by the ``topn`` words the
    model ranks as most similar to it (``most_similar_cosmul``). For each term
    a document vector is inferred and the nearest training documents are
    looked up; the matching rows are taken from ``df`` by position.

    Relies on the notebook-level ``model`` (the trained Doc2Vec model) and
    ``df`` (the frame whose row order matches the tagged training corpus).

    Parameters
    ----------
    word : str
        Query word passed to ``model.wv.most_similar_cosmul``.
        NOTE(review): gensim is expected to raise ``KeyError`` for a word that
        is not in the vocabulary — confirm for the installed version.
    n_articles : int
        Number of articles to retrieve per query term.
    topn : int, default 3
        Number of additional similar words to query besides ``word``.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_articles`` rows of ``df`` per query term, in query order,
        plus a ``similarity_percentage`` column holding the similarity score
        reported by gensim. An article retrieved by several terms appears
        once per term.
    """
    # Input word with topn of its most similar keywords
    keywords = [word] + [similar for similar, _ in model.wv.most_similar_cosmul(word, topn=topn)]

    positions = []
    sims = []
    for term in keywords:
        # Per the original author's note, the inferred vector is random and
        # differs between calls, so rankings can change from run to run.
        new_vec = model.infer_vector([term])
        # Request n_articles neighbours explicitly: slicing the default result
        # capped the output at gensim's default topn of 10. The slice is kept
        # so n_articles == 0 still yields no rows.
        neighbours = model.docvecs.most_similar([new_vec], topn=max(n_articles, 1))[0:n_articles]
        for tag, similarity in neighbours:
            positions.append(tag)
            sims.append(similarity)

    # Tags are the 0-based corpus positions assigned when the documents were
    # tagged, hence the positional lookup. One take keeps df's column dtypes.
    new_df = df.iloc[positions].copy()
    new_df['similarity_percentage'] = sims

    return new_df

In [42]:
protein = most_sim_docs('protein', 2)

In [45]:
protein.iloc[1]['Keywords']

'protein, proteins, rna, binding, membrane, structure, sequence, cell, domain, structural'

In [46]:
protein

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,paper_id,source,title,abstract,publish_time,authors,journal,url,discussion,text_body,similarity_percentage
14831,14831,18.0,0.4355,"protein, proteins, rna, binding, membrane, str...","['inference', 'accomplished', 'assigning', 'pe...",6747c4bf9d71daeab63b36eada34498f0346bf4a,PMC,Protein Analysis by Shotgun/Bottom-up Proteomics,Protein Analysis by Shotgun/Bottom-up Proteomics,2013-02-26,"Zhang, Yaoyang; Fonslow, Bryan R.; Shan, Bing;...",Chemical Reviews,http://europepmc.org/articles/pmc3751594?pdf=r...,,inference is accomplished by assigning peptide...,0.860158
12585,12585,18.0,0.6381,"protein, proteins, rna, binding, membrane, str...","['number', 'approaches', 'taken', 'elucidate',...",fc4839e258edd3040fe5bf354a6bf1b0ebcff19b,PMC,Genetic and Molecular Biological Analysis of P...,Genetic and Molecular Biological Analysis of P...,2006-01-01,"Masters, Paul S.; Kuo, Lili; Ye, Rong; Hurst, ...",The Nidoviruses,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,,A number of approaches have been taken to eluc...,0.803823
14831,14831,18.0,0.4355,"protein, proteins, rna, binding, membrane, str...","['inference', 'accomplished', 'assigning', 'pe...",6747c4bf9d71daeab63b36eada34498f0346bf4a,PMC,Protein Analysis by Shotgun/Bottom-up Proteomics,Protein Analysis by Shotgun/Bottom-up Proteomics,2013-02-26,"Zhang, Yaoyang; Fonslow, Bryan R.; Shan, Bing;...",Chemical Reviews,http://europepmc.org/articles/pmc3751594?pdf=r...,,inference is accomplished by assigning peptide...,0.810082
18304,18304,18.0,0.6319,"protein, proteins, rna, binding, membrane, str...","['palmitoylation', ""tn)'nstoylation"", '1ypiati...",0f1791125afe17384fcaef1cd0aa40f82b5269ba,Elsevier,Fatty acylation of proteins,Fatty acylation of proteins,1989-12-06,"Schmidt, Michael F.G.",Biochimica et Biophysica Acta (BBA) - Reviews ...,https://doi.org/10.1016/0304-4157(89)90013-0,,"palmitoylation, tn)'nstoylation and $1ypiation...",0.808822
14831,14831,18.0,0.4355,"protein, proteins, rna, binding, membrane, str...","['inference', 'accomplished', 'assigning', 'pe...",6747c4bf9d71daeab63b36eada34498f0346bf4a,PMC,Protein Analysis by Shotgun/Bottom-up Proteomics,Protein Analysis by Shotgun/Bottom-up Proteomics,2013-02-26,"Zhang, Yaoyang; Fonslow, Bryan R.; Shan, Bing;...",Chemical Reviews,http://europepmc.org/articles/pmc3751594?pdf=r...,,inference is accomplished by assigning peptide...,0.719687
21863,21863,13.0,0.5115,"care, health, patients, medical, patient, risk...","['april', '15', '2020', '1', '918', '138', 'co...",fcb6109e808c165aef9bdcacb8e45184c75d21e0,Elsevier,Focusing on health-care providers' experiences...,Focusing on health-care providers' experiences...,2020-04-29,"Xiong, Yang; Peng, Lingli",The Lancet Global Health,https://doi.org/10.1016/s2214-109x(20)30214-x,,"As of April 15, 2020, there have been 1 918 13...",0.678215
9758,9758,3.0,0.1879,"al, cells, cell, acid, 2, found, 1, activity, ...","['avian', 'infectious', 'bronchitis', 'virus',...",12ce8410d1ca0fa6f438c4f22bd92af6f8b78f04,PMC,The polypeptide composition of avian infectiou...,Avian infectious bronchitis virus grownin ovo ...,1975-01-01,"Bingham, R. W.",Arch Virol,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,,The eoronaviruses have been classified as a se...,0.722517
14831,14831,18.0,0.4355,"protein, proteins, rna, binding, membrane, str...","['inference', 'accomplished', 'assigning', 'pe...",6747c4bf9d71daeab63b36eada34498f0346bf4a,PMC,Protein Analysis by Shotgun/Bottom-up Proteomics,Protein Analysis by Shotgun/Bottom-up Proteomics,2013-02-26,"Zhang, Yaoyang; Fonslow, Bryan R.; Shan, Bing;...",Chemical Reviews,http://europepmc.org/articles/pmc3751594?pdf=r...,,inference is accomplished by assigning peptide...,0.706015


## Mapping topic names to topic numbers

In [64]:
# Display names are 1-based ("Topic 1" ... "Topic 25"); each one maps to the
# float id one below its number (0.0 ... 24.0).
topic_list = [f"Topic {i}" for i in range(1, 26)]
topic_dict = dict(zip(topic_list, map(float, range(len(topic_list)))))
topic_dict

{'Topic 1': 0.0,
 'Topic 2': 1.0,
 'Topic 3': 2.0,
 'Topic 4': 3.0,
 'Topic 5': 4.0,
 'Topic 6': 5.0,
 'Topic 7': 6.0,
 'Topic 8': 7.0,
 'Topic 9': 8.0,
 'Topic 10': 9.0,
 'Topic 11': 10.0,
 'Topic 12': 11.0,
 'Topic 13': 12.0,
 'Topic 14': 13.0,
 'Topic 15': 14.0,
 'Topic 16': 15.0,
 'Topic 17': 16.0,
 'Topic 18': 17.0,
 'Topic 19': 18.0,
 'Topic 20': 19.0,
 'Topic 21': 20.0,
 'Topic 22': 21.0,
 'Topic 23': 22.0,
 'Topic 24': 23.0,
 'Topic 25': 24.0}

In [70]:
# Search articles by topic name.
# The name -> id mapping in this notebook is `topic_dict`; the previously used
# `dic` is never defined here, so the cell raised NameError on a fresh kernel.
# The boolean mask is built from the same frame that is being filtered.
articles_with_top_dropped.loc[articles_with_top_dropped['Dominant_Topic'] == topic_dict['Topic 4']]

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,paper_id,source,title,abstract,publish_time,authors,journal,url,discussion,text_body
399,399,3.0,0.3540,"al, cells, cell, acid, 2, found, 1, activity, ...","['background', 'describe', 'new', 'technical',...",0e7e8aa9e5d952e8486bc9599764a9fcddd7f579,PMC,Improved production of human type II procollag...,BACKGROUND: Here we describe a new technical s...,2008-03-27,"Ruottinen, Maria; Bollok, Monika; Kögler, Mart...",BMC Biotechnol,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Experiments at the scale of shake flasks are g...,The methylotrophic yeast Pichia pastoris is a ...
616,616,3.0,0.1495,"al, cells, cell, acid, 2, found, 1, activity, ...","['background', 'phaseolus', 'vulgaris', 'commo...",8013bbf01944aa3f25d0faf16386ea51c7586066,PMC,Generation of Phaseolus vulgaris ESTs and inve...,BACKGROUND: Phaseolus vulgaris (common bean) i...,2009-04-27,"Thibivilliers, Sandra; Joshi, Trupti; Campbell...",BMC Plant Biol,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,,"Common bean, Phaseolus vulgaris, represents a ..."
660,660,3.0,0.3158,"al, cells, cell, acid, 2, found, 1, activity, ...","['background', 'aims', 'tuberization', 'mechan...",9966ccef050ac0dc4d794363dbd1f865442ef0aa,PMC,IbMADS1 (Ipomoea batatas MADS-box 1 gene) is I...,BACKGROUND AND AIMS: The tuberization mechanis...,2008-05-07,"Ku, Amy Tsu; Huang, Yi-Shiuan; Wang, Yu-Shu; M...",Annals of Botany,https://academic.oup.com/aob/article-pdf/102/1...,A global gene analysis by cDNA-AFLP (Fig. 1) w...,Tubers are among the principal food crops in t...
870,870,3.0,0.2695,"al, cells, cell, acid, 2, found, 1, activity, ...","['background', 'hepatitis', 'c', 'virus', 'hcv...",b9b85a4893f9ed75881adcedbf142098bd7edc93,PMC,Thermal stability and inactivation of hepatiti...,BACKGROUND: Hepatitis C virus (HCV) is a blood...,2010-02-18,"Song, Hongshuo; Li, Jin; Shi, Shuang; Yan, Lin...",Virol J,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,"In this study, a detailed analysis was conduct...","Hepatitis C virus (HCV) is a small enveloped, ..."
1286,1286,3.0,0.2976,"al, cells, cell, acid, 2, found, 1, activity, ...","['scutellaria', 'species', 'lamiaceae', 'used'...",abf6f66d8e18b652d10944bb21e19debca5dc5e8,PMC,Analyzing Cytotoxic and Apoptogenic Properties...,The Scutellaria species (Lamiaceae) is used as...,2011-03-09,"Tayarani-Najaran, Zahra; Emami, Seyed Ahmad; A...",Evid Based Complement Alternat Med,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Natural products have long been used to preven...,Plant materials have served as medicines acros...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37454,37454,3.0,0.2834,"al, cells, cell, acid, 2, found, 1, activity, ...","['abstract', 'japanese', 'encephalitis', 'viru...",6a5ff06ef79286c7bf9d24bd03caffd7ca534393,Elsevier,Maturation of Japanese encephalitis virus glyc...,Abstract The Japanese encephalitis virus (JE) ...,1989-04-30,"Mason, Peter W.",Virology,https://doi.org/10.1016/0042-6822(89)90161-x,,Japanese encephalitis virus (JE) is one of the...
37462,37462,3.0,0.1380,"al, cells, cell, acid, 2, found, 1, activity, ...","['abstract', 'hotels', 'designed', 'provide', ...",59eadcaf2ef4085813b911b168d11035a9d2f026,Elsevier,Indoor air quality audit implementation in a h...,Abstract Hotels are designed to provide high l...,2011-08-31,"Asadi, Ehsan; Costa, J.J.; Gameiro da Silva, M...",Building and Environment,https://doi.org/10.1016/j.buildenv.2011.01.027,"In the course of this audit, some problems or ...",Hotels are designed to provide high overall co...
37526,37526,3.0,0.2536,"al, cells, cell, acid, 2, found, 1, activity, ...","['abstract', 'hybridisation-based', 'genosenso...",372516ceb5b840cbc46c7aa04ca7907d7596bce1,Elsevier,Genosensor on gold films with enzymatic electr...,Abstract A hybridisation-based genosensor was ...,2005-05-15,"Abad-Valle, Patricia; Fernández-Abedul, M. Ter...",Biosensors and Bioelectronics,https://doi.org/10.1016/j.bios.2004.10.019,,Biosensors have become a very important area o...
37546,37546,3.0,0.3374,"al, cells, cell, acid, 2, found, 1, activity, ...","['blindness', 'one', 'major', 'health', 'probl...",ec280cc8cac358abfae9c00b8b72eecd55c2fed0,Elsevier,"Retinal degenerations of hereditary, viral and...","Retinal degenerations of hereditary, viral and...",1994-12-31,"Chader, Gerald J.",Progress in Retinal and Eye Research,https://doi.org/10.1016/1350-9462(94)90005-1,,Blindness is one of the major health problems ...
