<a href="https://colab.research.google.com/github/kev-iotairx/PubMed-data/blob/main/PubMed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### PubMed Central (PMC)

- Text-mine PubMed Central to retrieve scientific articles and their metadata (title, abstract, authors, year, journal, DOI, URL).

1.   https://www.ncbi.nlm.nih.gov/pmc/tools/textmining/
2.   https://www.ncbi.nlm.nih.gov/pmc/tools/oa-service/
3. https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
4. https://www.nlm.nih.gov/dataguide/eutilities/utilities.html#efetch



In [None]:
# Show the kernel's current working directory (the recorded output is '/content').
import os
os.getcwd()

'/content'

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so the
# MyDrive/PubMed folder referenced by later cells is reachable from the kernel.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!cd /content/drive/MyDrive/PubMed/

In [None]:
# One-time setup for a fresh runtime: uncomment to install the PubMed client libraries.
# `metapub` provides the PubMedFetcher imported in the next cell; `pymed` is not
# imported by any of the cells visible in this notebook.
#!pip install pymed
#!pip install metapub

In [None]:
import pandas as pd
from functools import reduce  # used to chain the pairwise merges in the final cell
from metapub import PubMedFetcher
# Shared fetcher used by every lookup cell below. cachedir points at the Drive
# project folder (presumably so metapub's cache survives runtime resets —
# confirm against the metapub documentation).
fetch = PubMedFetcher(cachedir='/content/drive/MyDrive/PubMed/')

In [None]:
# Search for the term "Diabetes" and keep the matching PMIDs.
# The call asks for at most 1000 ids (retmax) with pmc_only=True;
# since="2000" is presumably the lower date bound — confirm in the metapub docs.
pmids = fetch.pmids_for_query(
    query="Diabetes",
    since="2000",
    retmax=1000,
    pmc_only=True,
)
# Preview the first five ids.
pmids[:5]

['39083826', '39083820', '39083816', '39083815', '39082939']

Get titles from pmids

In [None]:
# Map each of the first five PMIDs to its article title.
titles = {
  pmid: fetch.article_by_pmid(pmid).title
  for pmid in pmids[:5]
}

In [None]:
# Turn the {pmid: title} dict into a two-column frame: orient="index" makes
# each key a row, reset_index() moves the keys into a regular column, and
# set_axis() labels the two resulting columns.
df_titles = (
    pd.DataFrame.from_dict(titles, orient="index")
    .reset_index()
    .set_axis(['pmid', 'title'], axis=1)
)
df_titles

Unnamed: 0,pmid,title
0,39083826,"Investigation of environmental ethics, spiritu..."
1,39083820,The effect of Orem self-care model on the impr...
2,39083816,Educational intervention on perceived stress a...
3,39083815,Educational intervention on perceived stress a...
4,39082939,A kidney-hypothalamus axis promotes compensato...


Get abstracts

In [None]:
# Map each of the first five PMIDs to its abstract text.
abstracts = {
  pmid: fetch.article_by_pmid(pmid).abstract
  for pmid in pmids[:5]
}

In [None]:
# Turn the {pmid: abstract} dict into a two-column frame: dict keys become
# rows (orient="index"), are moved into a column, and both columns are named.
df_abstract = (
    pd.DataFrame.from_dict(abstracts, orient="index")
    .reset_index()
    .set_axis(['pmid', 'abstracts'], axis=1)
)
df_abstract

Unnamed: 0,pmid,abstracts
0,39083826,OBJECTIVE: To investigate the relationship bet...
1,39083820,OBJECTIVE: to evaluate the association of Orem...
2,39083816,OBJECTIVE: To evaluate the effectiveness of Vi...
3,39083815,OBJECTIVE: To assess the effectiveness of an e...
4,39082939,The kidneys facilitate energy conservation thr...


Get authors

In [None]:
# Map each of the first five PMIDs to its author list.
authors = {
  pmid: fetch.article_by_pmid(pmid).authors
  for pmid in pmids[:5]
}

In [None]:
# Build the frame from (pmid, author-list) pairs rather than from_dict(orient="index"):
# each value here is a list of names, and from_dict would spread that list across
# separate columns instead of keeping it in a single 'author' cell.
df_authors = pd.DataFrame(list(authors.items()), columns=['pmid','author'])
df_authors

Unnamed: 0,pmid,author
0,39083826,"[Saeed Jadgal M, Bamri A, Fattahi Ardakani M, ..."
1,39083820,"[Jandaghian Bidgoli M, Jamalnia S, Pashmforosh..."
2,39083816,"[Guleria A, LastName K, Kumar Y, Sarin J]"
3,39083815,"[Alves de Araújo W, Cardoso Santos IS, Souza R..."
4,39082939,"[Faniyan TS, Zhang X, Morgan DA, Robles J, Bat..."


Get publication date

In [None]:
# Map each of the first five PMIDs to its publication year.
dates = {
  pmid: fetch.article_by_pmid(pmid).year
  for pmid in pmids[:5]
}

In [None]:
# Turn the {pmid: year} dict into a two-column frame: dict keys become
# rows (orient="index"), are moved into a column, and both columns are named.
df_year = (
    pd.DataFrame.from_dict(dates, orient="index")
    .reset_index()
    .set_axis(['pmid', 'year'], axis=1)
)
df_year

Unnamed: 0,pmid,year
0,39083826,2024
1,39083820,2024
2,39083816,2024
3,39083815,2024
4,39082939,2024


Get journal

In [None]:
# Map each of the first five PMIDs to the journal it was published in.
journal = {
  pmid: fetch.article_by_pmid(pmid).journal
  for pmid in pmids[:5]
}

# Echo the mapping so the cell reproduces its recorded output, matching the
# DOI and URL cells which also end by displaying their dict.
journal

{'39083826': 'Invest Educ Enferm',
 '39083820': 'Invest Educ Enferm',
 '39083816': 'Invest Educ Enferm',
 '39083815': 'Invest Educ Enferm',
 '39082939': 'Elife'}

In [None]:
# Turn the {pmid: journal} dict into a two-column frame: dict keys become
# rows (orient="index"), are moved into a column, and both columns are named.
df_journal = (
    pd.DataFrame.from_dict(journal, orient="index")
    .reset_index()
    .set_axis(['pmid', 'journal'], axis=1)
)
df_journal

Unnamed: 0,pmid,journal
0,39083826,Invest Educ Enferm
1,39083820,Invest Educ Enferm
2,39083816,Invest Educ Enferm
3,39083815,Invest Educ Enferm
4,39082939,Elife


Get DOI

In [None]:
# Map each of the first five PMIDs to its DOI, then display the mapping.
doi = {
  pmid: fetch.article_by_pmid(pmid).doi
  for pmid in pmids[:5]
}

doi

{'39083826': '10.17533/udea.iee.v42n1e14',
 '39083820': '10.17533/udea.iee.v42n1e08',
 '39083816': '10.17533/udea.iee.v42n1e04',
 '39083815': '10.17533/udea.iee.v42n1e03',
 '39082939': '10.7554/eLife.91540'}

In [None]:
# Turn the {pmid: doi} dict into a two-column frame: dict keys become
# rows (orient="index"), are moved into a column, and both columns are named.
df_doi = (
    pd.DataFrame.from_dict(doi, orient="index")
    .reset_index()
    .set_axis(['pmid', 'doi'], axis=1)
)
df_doi

Unnamed: 0,pmid,doi
0,39083826,10.17533/udea.iee.v42n1e14
1,39083820,10.17533/udea.iee.v42n1e08
2,39083816,10.17533/udea.iee.v42n1e04
3,39083815,10.17533/udea.iee.v42n1e03
4,39082939,10.7554/eLife.91540


Get URL

In [None]:
# Map each of the first five PMIDs to its article URL, then display the mapping.
url = {
  pmid: fetch.article_by_pmid(pmid).url
  for pmid in pmids[:5]
}

url

{'39083826': 'https://ncbi.nlm.nih.gov/pubmed/39083826',
 '39083820': 'https://ncbi.nlm.nih.gov/pubmed/39083820',
 '39083816': 'https://ncbi.nlm.nih.gov/pubmed/39083816',
 '39083815': 'https://ncbi.nlm.nih.gov/pubmed/39083815',
 '39082939': 'https://ncbi.nlm.nih.gov/pubmed/39082939'}

In [None]:
# Turn the {pmid: url} dict into a two-column frame: dict keys become
# rows (orient="index"), are moved into a column, and both columns are named.
df_url = (
    pd.DataFrame.from_dict(url, orient="index")
    .reset_index()
    .set_axis(['pmid', 'url'], axis=1)
)
df_url

Unnamed: 0,pmid,url
0,39083826,https://ncbi.nlm.nih.gov/pubmed/39083826
1,39083820,https://ncbi.nlm.nih.gov/pubmed/39083820
2,39083816,https://ncbi.nlm.nih.gov/pubmed/39083816
3,39083815,https://ncbi.nlm.nih.gov/pubmed/39083815
4,39082939,https://ncbi.nlm.nih.gov/pubmed/39082939


**Merge DataFrames**

In [None]:
# Combine the per-field frames into one table keyed on 'pmid'.
# Start from the titles frame and left-merge each remaining frame in turn,
# so every pmid present in df_titles is kept.
dfs = [df_titles, df_abstract, df_authors, df_year, df_journal, df_doi, df_url]

df = dfs[0]
for right in dfs[1:]:
  df = pd.merge(df, right, on='pmid', how='left')
df

Unnamed: 0,pmid,title,abstracts,author,year,journal,doi,url
0,39083826,"Investigation of environmental ethics, spiritu...",OBJECTIVE: To investigate the relationship bet...,"[Saeed Jadgal M, Bamri A, Fattahi Ardakani M, ...",2024,Invest Educ Enferm,10.17533/udea.iee.v42n1e14,https://ncbi.nlm.nih.gov/pubmed/39083826
1,39083820,The effect of Orem self-care model on the impr...,OBJECTIVE: to evaluate the association of Orem...,"[Jandaghian Bidgoli M, Jamalnia S, Pashmforosh...",2024,Invest Educ Enferm,10.17533/udea.iee.v42n1e08,https://ncbi.nlm.nih.gov/pubmed/39083820
2,39083816,Educational intervention on perceived stress a...,OBJECTIVE: To evaluate the effectiveness of Vi...,"[Guleria A, LastName K, Kumar Y, Sarin J]",2024,Invest Educ Enferm,10.17533/udea.iee.v42n1e04,https://ncbi.nlm.nih.gov/pubmed/39083816
3,39083815,Educational intervention on perceived stress a...,OBJECTIVE: To assess the effectiveness of an e...,"[Alves de Araújo W, Cardoso Santos IS, Souza R...",2024,Invest Educ Enferm,10.17533/udea.iee.v42n1e03,https://ncbi.nlm.nih.gov/pubmed/39083815
4,39082939,A kidney-hypothalamus axis promotes compensato...,The kidneys facilitate energy conservation thr...,"[Faniyan TS, Zhang X, Morgan DA, Robles J, Bat...",2024,Elife,10.7554/eLife.91540,https://ncbi.nlm.nih.gov/pubmed/39082939
