In [1]:
from urllib.parse import quote_plus

from Bio import Entrez

In [2]:
import pprint

In [3]:
import pandas as pd

In [4]:
def search(query, retmax='1000'):
    """Run a relevance-sorted PubMed search and return the parsed response.

    Parameters
    ----------
    query : str
        Search expression sent to ESearch as ``term``.
    retmax : str or int, optional
        Maximum number of IDs to request. Defaults to ``'1000'``, the value
        that was previously hard-coded.

    Returns
    -------
    The object produced by ``Entrez.read``; the notebook reads the matching
    PubMed IDs from its ``'IdList'`` entry.
    """
    Entrez.email = 'sxxize@gmail.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=retmax,
                            retmode='xml',
                            term=query)
    # Close the response even when parsing raises, so the connection is not leaked.
    try:
        return Entrez.read(handle)
    finally:
        handle.close()

In [5]:
def fetch_details(id_list):
    """Fetch the full PubMed records for a collection of PubMed IDs.

    Parameters
    ----------
    id_list : iterable
        PubMed IDs, given as strings or integers (for example the
        ``'IdList'`` entry of a ``search`` result).

    Returns
    -------
    The object produced by ``Entrez.read`` for the EFetch response. When
    ``id_list`` is empty no request is sent and a dict with empty
    ``'PubmedArticle'`` and ``'PubmedBookArticle'`` lists is returned.
    """
    # Convert every ID to str so integer IDs do not break the join below.
    id_strings = [str(pmid) for pmid in id_list]
    if not id_strings:
        # Nothing to fetch: skip the request and hand back an empty result
        # with the two keys the notebook sees on a real response.
        return {'PubmedArticle': [], 'PubmedBookArticle': []}
    Entrez.email = 'sxxize@gmail.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=','.join(id_strings))
    # Close the response even when parsing raises, so the connection is not leaked.
    try:
        return Entrez.read(handle)
    finally:
        handle.close()

In [6]:
# Query term sent to PubMed; the same string is later stripped out of the titles.
search_word='cancer'

In [7]:
# Search PubMed, download the matching records, then list every title with a 1-based number.
results = search(search_word)
id_list = results['IdList']
papers = fetch_details(id_list)
for rank, article in enumerate(papers['PubmedArticle'], start=1):
    title = article['MedlineCitation']['Article']['ArticleTitle']
    print("%d) %s" % (rank, title))

1) Genetic variation as a modifier of association between therapeutic exposure and subsequent malignant neoplasms in cancer survivors.
2) Somatic and Germline <i>TP53</i> Alterations in Second Malignant Neoplasms from Pediatric Cancer Survivors.
3) Cancer risk among Holocaust survivors in Israel-A nationwide study.
4) Prevalence of Germline Mutations Associated With Cancer Risk in Patients With Intraductal Papillary Mucinous Neoplasms.
5) Early estimates of SEER cancer incidence, 2014.
6) Risk of subsequent myeloid neoplasms after radiotherapy treatment for a solid cancer among adults in the United States, 2000-2014.
7) Metachronous triple primary neoplasms with primary prostate cancer, lung cancer, and colon cancer: A case report.
8) Increased risk of second malignant neoplasms in adolescents and young adults with cancer.
9) Cancer incidence profile in sub-Saharan African-born blacks in the United States: Similarities and differences with US-born non-Hispanic blacks.
10) Psychological

In [8]:
# Top-level keys of the parsed fetch result.
papers.keys()

dict_keys(['PubmedBookArticle', 'PubmedArticle'])

In [9]:
# Keys of the first article record.
papers['PubmedArticle'][0].keys()

dict_keys(['MedlineCitation', 'PubmedData'])

In [10]:
# Keys of the first article's 'MedlineCitation' entry.
papers['PubmedArticle'][0]['MedlineCitation'].keys()

dict_keys(['OtherAbstract', 'OtherID', 'CitationSubset', 'SpaceFlightMission', 'KeywordList', 'GeneralNote', 'PMID', 'DateCompleted', 'DateRevised', 'Article', 'MedlineJournalInfo', 'ChemicalList', 'MeshHeadingList'])

# Keyword List

In [11]:
# Plain-string copy of the first keyword group attached to the first article.
first_citation = papers['PubmedArticle'][0]['MedlineCitation']
list(map(str, first_citation['KeywordList'][0]))

['cancer survivors',
 'gene-environment interactions',
 'genetic susceptibility',
 'second cancers',
 'therapeutic exposures']

# Author List

In [12]:
# Keys of the first article's 'Article' entry; 'AuthorList' is the one used below.
papers['PubmedArticle'][0]['MedlineCitation']['Article'].keys()

dict_keys(['Language', 'ELocationID', 'ArticleDate', 'Journal', 'ArticleTitle', 'Pagination', 'Abstract', 'AuthorList', 'GrantList', 'PublicationTypeList'])

In [13]:
# Author list of the second article (index 1), shown as a sample of the structure.
papers['PubmedArticle'][1]['MedlineCitation']['Article']['AuthorList']

ListElement([DictElement({'AffiliationInfo': [{'Identifier': [], 'Affiliation': 'Department of Radiation Oncology, University of California, San Francisco, California.'}], 'Identifier': [], 'LastName': 'Sherborne', 'ForeName': 'Amy L', 'Initials': 'AL'}, attributes={'ValidYN': 'Y'}), DictElement({'AffiliationInfo': [{'Identifier': [], 'Affiliation': 'Department of Radiation Oncology, University of California, San Francisco, California.'}], 'Identifier': [], 'LastName': 'Lavergne', 'ForeName': 'Vincent', 'Initials': 'V'}, attributes={'ValidYN': 'Y'}), DictElement({'AffiliationInfo': [{'Identifier': [], 'Affiliation': 'Department of Radiation Oncology, University of California, San Francisco, California.'}], 'Identifier': [], 'LastName': 'Yu', 'ForeName': 'Katharine', 'Initials': 'K'}, attributes={'ValidYN': 'Y'}), DictElement({'AffiliationInfo': [{'Identifier': [], 'Affiliation': 'Department of Radiation Oncology, University of California, San Francisco, California.'}], 'Identifier': []

In [14]:
# One author list per article, in the same order as papers['PubmedArticle'].
# 'AuthorList' is an optional element of a PubMed record, so fall back to an
# empty list instead of raising KeyError on an article without listed authors.
paper_author_lst = [
    article['MedlineCitation']['Article'].get('AuthorList', [])
    for article in papers['PubmedArticle']
]

In [15]:
# Number of author lists collected (one per article).
len(paper_author_lst)

1000

In [16]:
# Turn each article's author list into its own DataFrame.
dfs = [pd.DataFrame(author_list) for author_list in paper_author_lst]

In [17]:
# Stack the per-article author tables row-wise; sort=True orders the combined columns.
names_dfs = pd.concat(dfs, sort=True)

In [18]:
# Preview the first rows of the combined author table.
names_dfs.head()

Unnamed: 0,AffiliationInfo,CollectiveName,ForeName,Identifier,Initials,LastName,Suffix
0,"[{'Identifier': [], 'Affiliation': 'Department...",,Smita,[],S,Bhatia,
0,"[{'Identifier': [], 'Affiliation': 'Department...",,Amy L,[],AL,Sherborne,
1,"[{'Identifier': [], 'Affiliation': 'Department...",,Vincent,[],V,Lavergne,
2,"[{'Identifier': [], 'Affiliation': 'Department...",,Katharine,[],K,Yu,
3,"[{'Identifier': [], 'Affiliation': 'Department...",,Leah,[],L,Lee,


In [19]:
# Count the rows for each (ForeName, LastName) pair, most frequent first.
author_count_df = (
    names_dfs
    .groupby(['ForeName', 'LastName'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [20]:
# Histogram of the author counts; the trailing ';' hides the returned object's repr.
author_count_df.hist();

# Top Authors

In [21]:
# Keep the ten most frequent author names (author_count_df is sorted by count, descending).
TOP_N = 10
top = author_count_df.head(TOP_N)

In [22]:
# Display the top-author table.
top

Unnamed: 0,ForeName,LastName,count
4148,Leslie L,Robison,19
104,Ahmedin,Jemal,19
2346,Gregory T,Armstrong,15
7524,Young-Joo,Won,11
7578,Yutaka,Yasui,10
3984,Kyu-Won,Jung,9
2441,Hannah K,Weir,9
4869,Melissa M,Hudson,8
5774,Rebecca L,Siegel,8
5502,Paul L,Nguyen,8


In [23]:
# Google Scholar search URL prefix (scholar.google.co.kr, hl=ko); the query text is appended after 'q='.
google_url='https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q='

In [24]:
# Full display name for each top author: "<ForeName> <LastName>".
fore_names, last_names = top['ForeName'], top['LastName']
name = fore_names + ' ' + last_names

In [25]:
# Build one Google Scholar search link per top author ("<name> review").
# quote_plus turns spaces into '+' exactly as the old str.replace did, and also
# percent-encodes characters such as '&', '#' or non-ASCII letters that would
# otherwise corrupt the query string.
scholar_links = [
    google_url + quote_plus(full_name + ' review') + '&oq='
    for full_name in name
]
# Attach the links positionally: both sides use a fresh 0..n-1 index.
result = top.reset_index(drop=True).join(pd.DataFrame({'Google Scholar': scholar_links}))

In [26]:
# Widen pandas' column display limit so long values such as the Scholar URLs are not truncated.
pd.set_option('display.max_colwidth', 1000)

In [27]:
# Display the top authors together with their Google Scholar links.
result

Unnamed: 0,ForeName,LastName,count,Google Scholar
0,Leslie L,Robison,19,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Leslie+L+Robison+review&oq=
1,Ahmedin,Jemal,19,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Ahmedin+Jemal+review&oq=
2,Gregory T,Armstrong,15,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Gregory+T+Armstrong+review&oq=
3,Young-Joo,Won,11,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Young-Joo+Won+review&oq=
4,Yutaka,Yasui,10,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Yutaka+Yasui+review&oq=
5,Kyu-Won,Jung,9,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Kyu-Won+Jung+review&oq=
6,Hannah K,Weir,9,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Hannah+K+Weir+review&oq=
7,Melissa M,Hudson,8,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Melissa+M+Hudson+review&oq=
8,Rebecca L,Siegel,8,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Rebecca+L+Siegel+review&oq=
9,Paul L,Nguyen,8,https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Paul+L+Nguyen+review&oq=


In [28]:
# Plain-text listing of the Scholar links. Reuse the column already stored in
# `result` instead of rebuilding the URLs, so the two can never drift apart.
print(result[['Google Scholar']])

                                                                             Google Scholar
0     https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Leslie+L+Robison+review&oq=
1        https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Ahmedin+Jemal+review&oq=
2  https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Gregory+T+Armstrong+review&oq=
3        https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Young-Joo+Won+review&oq=
4         https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Yutaka+Yasui+review&oq=
5         https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Kyu-Won+Jung+review&oq=
6        https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Hannah+K+Weir+review&oq=
7     https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Melissa+M+Hudson+review&oq=
8     https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Rebecca+L+Siegel+review&oq=
9        https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q=Paul+L+Nguyen

# Keywords from Titles

In [29]:
def _clean_title(raw_title, term):
    """Lower-case a title and strip punctuation, inline markup and the search term.

    Parameters
    ----------
    raw_title : str
        Article title as returned by PubMed.
    term : str
        Search term to remove from the title; matched case-insensitively.

    Returns
    -------
    str
        The cleaned, lower-cased title.
    """
    cleaned = raw_title.lower()
    # Same fragments, in the same order, as the original replace chain.
    for fragment in (',', '.', ':', '?', '<sub>', '</sub>', '<i>', '</i>'):
        cleaned = cleaned.replace(fragment, '')
    # The title is lower-case by now, so the term must be lower-cased too;
    # otherwise a capitalised search word would never be removed.
    return cleaned.replace(term.lower(), '')


# Cleaned titles, one per article, used as the corpus for keyword extraction.
titles = [
    _clean_title(article['MedlineCitation']['Article']['ArticleTitle'], search_word)
    for article in papers['PubmedArticle']
]

In [30]:
# Preview only the first few cleaned titles rather than dumping the whole list.
titles[:10]

['genetic variation as a modifier of association between therapeutic exposure and subsequent malignant neoplasms in  survivors',
 'somatic and germline tp53 alterations in second malignant neoplasms from pediatric  survivors',
 ' risk among holocaust survivors in israel-a nationwide study',
 'prevalence of germline mutations associated with  risk in patients with intraductal papillary mucinous neoplasms',
 'early estimates of seer  incidence 2014',
 'risk of subsequent myeloid neoplasms after radiotherapy treatment for a solid  among adults in the united states 2000-2014',
 'metachronous triple primary neoplasms with primary prostate  lung  and colon  a case report',
 'increased risk of second malignant neoplasms in adolescents and young adults with ',
 ' incidence profile in sub-saharan african-born blacks in the united states similarities and differences with us-born non-hispanic blacks',
 'psychological distress associated with  screening a systematic review',
 'the rise of concurre

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
# TF-IDF over bigrams only (ngram_range=(2,2)), using the built-in 'english' stop-word list.
tfidf=TfidfVectorizer(ngram_range=(2,2),stop_words='english')

In [33]:
#tfidf.get_stop_words()

In [34]:
# Fit the vectorizer on the cleaned titles and compute their TF-IDF matrix.
X=tfidf.fit_transform(titles)

In [35]:
#tfidf.get_feature_names()

In [36]:
# Matrix dimensions; presumably (number of titles, number of distinct bigrams).
X.shape

(1000, 6985)

In [37]:
#print(X)

In [38]:
#tfidf.vocabulary_

In [39]:
# Dense TF-IDF table. Column labels are the vocabulary terms ordered by their
# column index in the fitted vectorizer.
feature_names = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
tfidf_df = pd.DataFrame(data=X.todense(), columns=feature_names)

In [40]:
# Total TF-IDF weight of each bigram across all titles, highest first.
bigram_totals = tfidf_df.sum(axis=0)
key_rank = bigram_totals.sort_values(ascending=False)

In [43]:
# The twenty highest-scoring bigrams.
key_rank.head(20)

population based        10.514490
united states            7.627839
cohort study             6.377511
malignant neoplasms      5.803577
systematic review        5.774280
based study              5.708662
survivors childhood      5.393320
incidence mortality      5.271867
long term                5.073028
meta analysis            5.028101
young adults             4.851765
human papillomavirus     4.826119
young adult              4.550059
quality life             4.053024
adult survivors          3.930960
survival patients        3.906444
adolescents young        3.855721
childhood survivor       3.674553
survivor study           3.616191
survival united          3.547047
dtype: float64

In [44]:
# Bigrams whose total score lies strictly between the 75th and 80th percentiles
# (first twenty, in key_rank's descending order).
lower_cut = key_rank.quantile(.75)
upper_cut = key_rank.quantile(.8)
in_band = (key_rank > lower_cut) & (key_rank < upper_cut)
key_rank[in_band].head(20)

pain symptoms              0.423574
low intermediate           0.423574
longitudinal regret        0.423574
decreased physical         0.423574
fatigue pain               0.423574
symptoms decreased         0.423574
regret treatment           0.423574
treatment low              0.423574
pancreatic nationwide      0.423362
radiotherapy unresected    0.423362
unresected pancreatic      0.423362
nationwide review          0.423362
sexual functioning         0.421667
year longitudinal          0.421667
functioning young          0.421667
seasonal influenza         0.421579
clinical effectiveness     0.421579
influenza vaccine          0.421579
effectiveness seasonal     0.421579
vaccine adult              0.421579
dtype: float64