# Notebook for getting PubMed metadata for FactPICO abstracts

**Metadata**: PMID, PMCID, title

In [91]:
# import packages
from Bio import Entrez, Medline
import pandas as pd

# Contact address set once on the Entrez module, before any request is made,
# so that the E-utilities calls below identify who is issuing them.
Entrez.email = "yun.hy@northeastern.edu"

In [None]:
# Load the original FactPICO abstracts from CSV. The 'abstract' column is the
# text used further down to look up each PMID; head() previews the first rows.
df_abstracts = pd.read_csv('FactPico_115_abstracts.csv')
df_abstracts.head()

## Get PMIDs via API

In [None]:
def process_text(text):
    """Return the second line of an abstract, i.e. the text after the header line.

    The abstracts begin with a header line (for example
    ``ABSTRACT.BACKGROUND:``) followed by a newline, so the second line is the
    first piece of actual abstract text.

    Args:
        text: Full abstract text as a string.

    Returns:
        The second newline-separated line of the stripped text, or ``""`` when
        the text has fewer than two lines.
    """
    # Strip first so leading blank lines do not shift which line is "second".
    lines = text.strip().split('\n')
    # str.split always yields at least one element, so a plain truthiness
    # check on the list can never catch the one-line case; test the length.
    return lines[1] if len(lines) > 1 else ""

def get_pubmed_id_by_abstract(abstract_text):
    """Search PubMed for an abstract and return the PMID of the top hit.

    Only the second line of the abstract (see ``process_text``) is used, and
    it is wrapped in double quotes so that it is searched as one phrase.

    Args:
        abstract_text: Full abstract text as a string.

    Returns:
        The first ID in the search result's ``IdList``, or ``None`` when the
        search returns no IDs or there is no text to search for.
    """
    shorter_abstract_text = process_text(abstract_text)
    if not shorter_abstract_text.strip():
        # Nothing to search for: avoid sending an empty quoted phrase.
        return None

    handle = Entrez.esearch(db="pubmed", term=f'"{shorter_abstract_text}"', retmax=1)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    id_list = record["IdList"]
    return id_list[0] if id_list else None

In [None]:
# Look up a PMID for every abstract. Mapping over the 'abstract' column calls
# the lookup once per value without building a row object for each record,
# and the assignment also works when the frame has no rows.
df_abstracts['pmid'] = df_abstracts['abstract'].map(get_pubmed_id_by_abstract)

In [None]:
# Count the abstracts for which a PMID was found; Series.count() skips
# missing values, i.e. rows where the lookup returned None.
df_abstracts['pmid'].count()

In [None]:
# Save the abstracts together with the PMIDs found so far to a new CSV file;
# the PMIDs that are still missing will be retrieved manually from PubMed.
# index=False keeps the row index out of the file.
df_abstracts.to_csv('FactPico_115_abstracts_with_pmid.csv', index=False)

## Checking Correctness of PMIDs

In [98]:
# Reload the PMID-annotated abstracts from the CSV written above and preview
# the first rows.
df_abstracts_with_pmid = pd.read_csv('FactPico_115_abstracts_with_pmid.csv')
df_abstracts_with_pmid.head()

Unnamed: 0,id,abstract,data_split,pmid
0,1,ABSTRACT.BACKGROUND:\nMetabolic syndrome is a ...,TEST,15892894
1,2,ABSTRACT.\nThalassemia is the commonest single...,TEST,22131679
2,3,ABSTRACT.BACKGROUND:\nNasal insufflation of CO...,TEST,23574808
3,4,ABSTRACT.OBJECTIVE.\nThere are controversial f...,TEST,24665285
4,5,ABSTRACT.PURPOSE:\nTo evaluate the analgesic e...,TEST,24639945


In [None]:
def fetch_abstract_by_pmid(pmid=None):
    """Fetch the plain-text abstract record for a PMID from PubMed.

    Args:
        pmid: PubMed ID passed to ``Entrez.efetch`` as ``id``. The ``None``
            default is kept for signature compatibility; callers are expected
            to pass a real PMID.

    Returns:
        Whatever ``efetch`` returns for ``rettype="abstract"`` and
        ``retmode="text"``, read fully into memory.
    """
    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
    try:
        return handle.read()
    finally:
        # Close the connection even when reading the response fails.
        handle.close()

def compare_abstracts(df):
    """Pair each stored abstract with the abstract PubMed returns for its PMID.

    No comparison is computed here: the function only gathers both texts side
    by side so they can be checked afterwards.

    Args:
        df: DataFrame whose rows provide ``"pmid"`` and ``"abstract"`` values
            (a missing column yields ``None`` for that field).

    Returns:
        A new DataFrame with one row per input row and the columns ``pmid``,
        ``abstract_from_api`` and ``abstract_in_df``.
    """

    def _side_by_side(record):
        # One API request per row, keyed by that row's PMID.
        record_pmid = record.get("pmid")
        return {
            "pmid": record_pmid,
            "abstract_from_api": fetch_abstract_by_pmid(pmid=record_pmid),
            "abstract_in_df": record.get("abstract"),
        }

    return pd.DataFrame([_side_by_side(record) for _, record in df.iterrows()])


# Fetch PubMed's abstract for every PMID in the frame (one request per row)
# and collect it next to the abstract stored in the dataset.
result_df = compare_abstracts(df_abstracts_with_pmid)

# Write the side-by-side table to CSV so the PMIDs can be checked by reading
# both abstracts next to each other.
result_df.to_csv('FactPico_115_abstracts_comparison.csv', index=False)

## Get PMCIDs from PMIDs via API

In [100]:
# Re-read the PMID CSV after it has been manually verified and wrong PMIDs
# corrected (same path as before, so this replaces the earlier frame).
df_abstracts_with_pmid = pd.read_csv('FactPico_115_abstracts_with_pmid.csv')
df_abstracts_with_pmid.head()

Unnamed: 0,id,abstract,data_split,pmid
0,1,ABSTRACT.BACKGROUND:\nMetabolic syndrome is a ...,TEST,15892894
1,2,ABSTRACT.\nThalassemia is the commonest single...,TEST,22131679
2,3,ABSTRACT.BACKGROUND:\nNasal insufflation of CO...,TEST,23574808
3,4,ABSTRACT.OBJECTIVE.\nThere are controversial f...,TEST,24665285
4,5,ABSTRACT.PURPOSE:\nTo evaluate the analgesic e...,TEST,24639945


In [None]:
def get_pmcid_from_pmid(pmid):
    """Return the PMC ID linked to a PubMed ID, or ``None`` if there is none.

    Args:
        pmid: PubMed ID passed to ``Entrez.elink`` as ``id``.

    Returns:
        The ``Id`` of the first link in the ``pubmed_pmc`` link set, exactly
        as returned by the API (no prefix is added here), or ``None`` when the
        response contains no such link.
    """
    # Ask for links from the pubmed database to the pmc database.
    handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid)
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    if not records:
        return None

    for linkset in records[0]["LinkSetDb"]:
        if linkset["DbTo"] == "pmc" and linkset["LinkName"] == "pubmed_pmc":
            links = linkset["Link"]
            # Guard against a matching link set that carries no links.
            if links:
                return links[0]["Id"]

    return None

In [None]:
# Look up a PMCID for every PMID. The PMIDs are converted to strings first so
# that each request receives a string ID; mapping over the 'pmid' column then
# calls the lookup once per value and also works when the frame has no rows.
df_abstracts_with_pmid['pmid'] = df_abstracts_with_pmid['pmid'].astype(str)
df_abstracts_with_pmid['pmcid'] = df_abstracts_with_pmid['pmid'].map(get_pmcid_from_pmid)

In [107]:
# Count the rows for which a PMCID was found; Series.count() skips missing
# values, i.e. rows where the lookup returned None.
df_abstracts_with_pmid['pmcid'].count()

115

## Get titles from PMIDs via API

In [None]:
def get_title_from_pmid_medline(pmid):
    """Return the article title for a PMID, read from its MEDLINE record.

    Args:
        pmid: PubMed ID passed to ``Entrez.efetch`` as ``id``.

    Returns:
        The ``TI`` field of the first MEDLINE record in the response, or
        ``None`` when the response has no record or the record has no ``TI``.
    """
    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
    try:
        # The parser reads from the handle lazily, so the first record must be
        # pulled out before the handle is closed.
        first_record = next(iter(Medline.parse(handle)), None)
    finally:
        handle.close()

    if first_record is None:
        return None
    return first_record.get("TI")

In [None]:
# Look up the title for every PMID. Mapping over the 'pmid' column calls the
# lookup once per value and also works when the frame has no rows.
df_abstracts_with_pmid['title'] = df_abstracts_with_pmid['pmid'].map(get_title_from_pmid_medline)

In [110]:
# Count the rows for which a title was found; Series.count() skips missing
# values, i.e. rows where the lookup returned None.
df_abstracts_with_pmid['title'].count()

115

In [111]:
# Save the dataframe, now carrying the pmid, pmcid and title columns, to a new
# CSV file. index=False keeps the row index out of the file.
# NOTE(review): both counts above report 115, so no manual follow-up appears
# to be needed at this point - confirm.
df_abstracts_with_pmid.to_csv('FactPico_115_abstracts_with_pmid_pmcid_title.csv', index=False)