# Research Paper Recommender

### :Review article recommender using PubMed API and Key word exraction from article titles using TF-IDF

# Importing Packages and Functions

In [1]:
from Bio import Entrez
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import itertools

pd.set_option('display.max_colwidth', 1000)

In [2]:
def search(query):
    Entrez.email = 'sxxize@gmail.com'
    handle = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax='1000',
                            retmode='xml', 
                            term=query)
    results = Entrez.read(handle)
    return results

In [3]:
def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'sxxize@gmail.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

In [4]:
def Author_list(papers):
    paper_author_lst=[i['MedlineCitation']['Article']['AuthorList']\
                      for i in papers['PubmedArticle']]
    dfs=[pd.DataFrame(paper_author_lst[i]) for i in range(len(paper_author_lst))]
    names_dfs=pd.concat(dfs, axis=0, sort=True )
    author_count_df=names_dfs[['ForeName', 'LastName']]\
                    .groupby(['ForeName', 'LastName']).size()\
                    .reset_index(name='count').sort_values(by='count', ascending=False)
    top=author_count_df.head(10)
    google_url='https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q='
    name=top['ForeName']+' '+top['LastName']
    result=top.reset_index(drop=True).join(pd.DataFrame({'Google Scholar':[google_url+i for i in name.str.replace(' ', '+')+'+review&oq=']}))
    
    return result

In [5]:
def key_from_papers(papers):
    fetch_key_word_papers=[i['MedlineCitation']['KeywordList'] for i in papers['PubmedArticle']]
    lst_key_papers=list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(fetch_key_word_papers))))
    key_from_paper=pd.DataFrame({'key word from papers':lst_key_papers})
    return key_from_paper

In [6]:
def title_key(papers):
    titles=[i['MedlineCitation']['Article']['ArticleTitle'].lower()\
            .replace(',','').replace('.','').replace(':', '').replace('?','')\
            .replace('<sub>', '').replace('</sub>','').replace('<sup>','').replace('</sup>','')\
            .replace('<i>','').replace('</i>','')\
            .replace(search_word.lower(),'') for i in papers['PubmedArticle']]
    tfidf=TfidfVectorizer(ngram_range=(2,2),stop_words='english')
    X=tfidf.fit_transform(titles)
    tfidf_df=pd.DataFrame(X.todense(), columns=sorted(tfidf.vocabulary_))
    key_rank=tfidf_df.sum().sort_values(ascending=False)
    return key_rank


In [7]:
def Abstract_key(papers):
    abstract_key=[]
    for i in papers['PubmedArticle']:
        try:
            abstract_key.append(i['MedlineCitation']['Article']['Abstract']['AbstractText'][0].lower()\
            .replace(',','').replace('.','').replace(':', '').replace('?','')\
            .replace('<sub>', '').replace('</sub>','').replace('<sup>','').replace('</sup>','')\
            .replace('<i>','').replace('</i>','').replace(search_word.lower(),''))
        except:
            continue
    tfidf=TfidfVectorizer(ngram_range=(2,2),stop_words='english')
    X=tfidf.fit_transform(abstract_key)
    tfidf_df=pd.DataFrame(X.todense(), columns=sorted(tfidf.vocabulary_))
    key_rank=tfidf_df.sum().sort_values(ascending=False)
    return key_rank

In [8]:
search_word='bioactive'

In [9]:
results = search(search_word)
id_list = results['IdList']
papers = fetch_details(id_list)

# Author List

In [10]:
result=Author_list(papers)
result

Unnamed: 0,ForeName,LastName,count,Google Scholar
0,Aldo R,Boccaccini,34,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Aldo+R+Boccaccini+review&oq=
1,Francesco,Baino,11,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Francesco+Baino+review&oq=
2,Jiang,Chang,10,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Jiang+Chang+review&oq=
3,Mohamed N,Rahaman,10,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Mohamed+N+Rahaman+review&oq=
4,Julian R,Jones,9,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Julian+R+Jones+review&oq=
5,Chengtie,Wu,9,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Chengtie+Wu+review&oq=
6,Wenhai,Huang,9,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Wenhai+Huang+review&oq=
7,Robert G,Hill,7,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Robert+G+Hill+review&oq=
8,Hui,Wang,7,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Hui+Wang+review&oq=
9,Xiaofeng,Chen,7,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Xiaofeng+Chen+review&oq=


In [11]:
result['Google Scholar']

0    https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Aldo+R+Boccaccini+review&oq=
1      https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Francesco+Baino+review&oq=
2          https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Jiang+Chang+review&oq=
3    https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Mohamed+N+Rahaman+review&oq=
4       https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Julian+R+Jones+review&oq=
5          https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Chengtie+Wu+review&oq=
6         https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Wenhai+Huang+review&oq=
7        https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Robert+G+Hill+review&oq=
8             https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Hui+Wang+review&oq=
9        https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Xiaofeng+Chen+review&oq=
Name: Google Scholar, dtype: object

# Key Word List

In [12]:
key_paper_lst=key_from_papers(papers)

In [13]:
key_paper_lst['key word from papers'].value_counts().head(20)

Bioactive glass          78
bioactive glass          61
Bioactive compounds      57
bioactive compounds      57
Bioactive peptides       28
bioactive peptides       27
bone regeneration        19
Bioactivity              18
tissue engineering       14
Bone regeneration        14
bioactivity              13
Antioxidant activity     12
bioactive                12
Scaffold                 12
bioactive glasses        12
scaffolds                11
antioxidant activity     11
scaffold                 11
Mechanical properties    11
cytotoxicity             11
Name: key word from papers, dtype: int64

# Key Words from Title

In [14]:
key_title=title_key(papers)
key_title[:20]

tissue engineering       7.379735
bone regeneration        5.453781
bone tissue              5.438178
mesoporous glass         5.277637
antioxidant activity     5.028168
mass spectrometry        4.581099
derived peptides         4.422247
glass nanoparticles      4.404354
glass scaffolds          4.288476
compounds antioxidant    4.106274
extraction compounds     3.712952
scaffolds bone           3.527795
stem cells               3.463278
liquid chromatography    3.339055
natural products         3.333138
glass based              3.149574
containing glass         3.146186
mechanical properties    2.951971
composite scaffolds      2.787523
traditional chinese      2.636741
dtype: float64

# Key Words from Abstract

In [15]:
key_abstract=Abstract_key(papers)

In [18]:
key_abstract[:20]

tissue engineering       4.713979
antioxidant activity     4.362882
bone regeneration        4.331038
bone tissue              4.081523
mechanical properties    3.916979
body fluid               3.479124
anti inflammatory        3.348707
simulated body           3.312674
present study            3.296005
aim study                3.128448
stem cells               3.103424
sol gel                  3.087326
glass nanoparticles      2.968308
bone defects             2.843108
natural products         2.712479
fatty acids              2.708933
liquid chromatography    2.693396
mass spectrometry        2.618966
electron microscopy      2.532426
scanning electron        2.527249
dtype: float64

LDA
word Vec
pyldavis

pyldavis

textrank
lexrank