In [1]:
from Bio import Entrez
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Raise pandas' maximum displayed column width to 1000 characters;
# presumably so the long Google Scholar URLs shown later are not truncated.
pd.set_option('display.max_colwidth', 1000)

In [2]:
def search(query, retmax=1000):
    """Run a relevance-sorted PubMed search and return the parsed response.

    Parameters
    ----------
    query : str
        Search term, passed to ``Entrez.esearch`` as ``term``.
    retmax : int or str, optional
        Maximum number of IDs to request. Defaults to 1000, the value that
        used to be hard-coded, so existing ``search(query)`` calls behave
        the same.

    Returns
    -------
    The object produced by ``Entrez.read`` for the esearch response. The
    notebook reads the matching IDs from its ``'IdList'`` entry.
    """
    Entrez.email = 'sxxize@gmail.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=str(retmax),
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the handle even when parsing the response fails.
        handle.close()

In [3]:
def fetch_details(id_list):
    """Fetch the PubMed records for the given IDs and return them parsed.

    Parameters
    ----------
    id_list : iterable
        PubMed IDs. Each item is converted with ``str`` before joining, so
        integer IDs are accepted as well as strings.

    Returns
    -------
    The object produced by ``Entrez.read`` for the efetch response. The
    notebook reads the articles from its ``'PubmedArticle'`` entry.
    """
    ids = ','.join(str(pmid) for pmid in id_list)
    Entrez.email = 'sxxize@gmail.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the handle even when parsing the response fails.
        handle.close()

In [4]:
# Query term sent to PubMed; the title-keyword cell also strips it from
# article titles before vectorising.
search_word='fuel cell'

In [5]:
# Search PubMed for the configured term, then fetch the records for every
# ID the search returned. `papers` is the input to all later cells.
results = search(search_word)
id_list = results['IdList']
papers = fetch_details(id_list)

# Key Word List

In [6]:
# Keywords of the first fetched article. An article without keywords has an
# empty (or missing) KeywordList, so guard the [0] lookup instead of raising.
first_citation = papers['PubmedArticle'][0]['MedlineCitation']
keyword_groups = first_citation.get('KeywordList') or []
[str(keyword) for keyword in keyword_groups[0]] if keyword_groups else []

['Carbon brush cylindrical microbial fuel cell',
 'Internal resistance',
 'Microbial fuel cell',
 'P. aeruginosa',
 'Polarization curve',
 'Power overshoot']

# Author List

In [7]:
# Author lists, one per article. AuthorList is optional in PubMed records,
# so articles without one are skipped instead of raising KeyError.
paper_author_lst = [
    article['MedlineCitation']['Article']['AuthorList']
    for article in papers['PubmedArticle']
    if article['MedlineCitation']['Article'].get('AuthorList')
]

# One frame per article (one row per author), stacked into a single table.
dfs = [pd.DataFrame(authors) for authors in paper_author_lst]

names_dfs = pd.concat(dfs, axis=0, sort=True)

# Number of author rows per (ForeName, LastName) pair, most frequent first.
author_count_df = (
    names_dfs[['ForeName', 'LastName']]
    .groupby(['ForeName', 'LastName'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

top = author_count_df.head(10)

google_url = 'https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q='

name = top['ForeName'] + ' ' + top['LastName']

# Google Scholar query "<name> review" per author, spaces encoded as '+'.
scholar_links = (google_url + name.str.replace(' ', '+') + '+review&oq=').tolist()

# Links are attached positionally, so the index is reset before adding them.
result = top.reset_index(drop=True).assign(**{'Google Scholar': scholar_links})

In [8]:
# Ten most frequent authors with their paper counts and Google Scholar links.
result

Unnamed: 0,ForeName,LastName,count,Google Scholar
0,Carlo,Santoro,19,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Carlo+Santoro+review&oq=
1,Plamen,Atanassov,16,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Plamen+Atanassov+review&oq=
2,Alexey,Serov,16,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Alexey+Serov+review&oq=
3,Hong,Liu,13,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Hong+Liu+review&oq=
4,Ioannis,Ieropoulos,13,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Ioannis+Ieropoulos+review&oq=
5,John,Greenman,12,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=John+Greenman+review&oq=
6,Wei,Zhou,9,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Wei+Zhou+review&oq=
7,S K,Kamarudin,8,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=S+K+Kamarudin+review&oq=
8,San Ping,Jiang,8,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=San+Ping+Jiang+review&oq=
9,Booki,Min,8,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Booki+Min+review&oq=


In [9]:
# Print the link column as plain text. Reuse the column already built in
# `result` instead of rebuilding the same URLs a second time.
print(result[['Google Scholar']])

                                                                            Google Scholar
0       https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Carlo+Santoro+review&oq=
1    https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Plamen+Atanassov+review&oq=
2        https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Alexey+Serov+review&oq=
3            https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Hong+Liu+review&oq=
4  https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Ioannis+Ieropoulos+review&oq=
5       https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=John+Greenman+review&oq=
6            https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Wei+Zhou+review&oq=
7       https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=S+K+Kamarudin+review&oq=
8      https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=San+Ping+Jiang+review&oq=
9           https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Booki+Min+review&oq=

# Key Words from Title

In [10]:
# Substrings removed from every (lowercased) title before vectorising.
# NOTE(review): the original stripped only <sub> and <i>; <sup>, <b> and <u>
# are added as further inline tags PubMed titles may carry -- confirm.
strip_tokens = [
    ',', '.', ':', '?',
    '<sub>', '</sub>', '<sup>', '</sup>',
    '<i>', '</i>', '<b>', '</b>', '<u>', '</u>',
    # Titles are lowercased first, so the search term must be lowercased too
    # or a mixed-case term would never be removed.
    search_word.lower(),
]


def clean_title(title):
    """Lowercase a title and delete every substring in ``strip_tokens``."""
    cleaned = title.lower()
    for token in strip_tokens:
        cleaned = cleaned.replace(token, '')
    return cleaned


titles = [
    clean_title(article['MedlineCitation']['Article']['ArticleTitle'])
    for article in papers['PubmedArticle']
]

# Bigram TF-IDF over the cleaned titles, English stop words removed.
tfidf = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')

X = tfidf.fit_transform(titles)

# Columns are labelled with the vocabulary terms in sorted order.
tfidf_df = pd.DataFrame(X.toarray(), columns=sorted(tfidf.vocabulary_))

# Total TF-IDF weight per bigram across all titles, highest first.
key_rank = tfidf_df.sum().sort_values(ascending=False)

In [11]:
# Twenty bigrams with the highest summed TF-IDF score.
key_rank.head(20)

solid oxide                  14.399595
exchange membrane            10.642937
electricity generation       10.300425
proton exchange               9.668957
polymer electrolyte           8.670652
oxygen reduction              7.216754
microbial community           6.840843
performance microbial         6.668812
wastewater treatment          6.550377
single chamber                6.477291
direct methanol               6.390863
air cathode                   6.128711
chamber microbial             6.057837
high performance              5.963384
generation microbial          5.749137
temperature solid             5.208095
electrolyte membrane          5.097043
bioelectricity generation     4.863232
cathode microbial             4.855944
electricity production        4.520554
dtype: float64

In [12]:
# First twenty bigrams whose summed score lies strictly between the 75th and
# 80th percentiles of all scores.
lower_bound = key_rank.quantile(.75)
upper_bound = key_rank.quantile(.8)
in_band = (key_rank > lower_bound) & (key_rank < upper_bound)
key_rank[in_band].head(20)

characterization electricity    0.408083
structure long                  0.408083
phototrophic bacteria           0.408021
using anoxygenic                0.408021
insights photosynthetic         0.408021
anoxygenic phototrophic         0.408021
validation chromium             0.407935
chromium getters                0.407935
getters solid                   0.407935
development validation          0.407935
applications overview           0.407465
based nanohybrid                0.407465
nanohybrid proton               0.407465
study ceria                     0.406627
nanocomposite electrolytes      0.406627
carbonate nanocomposite         0.406627
ceria carbonate                 0.406627
temperature pem                 0.405939
membranes high                  0.405939
electrolysers critical          0.405939
dtype: float64

LDA
Word2Vec
pyLDAvis

pyLDAvis

TextRank
LexRank