In [1]:
from Bio import Entrez
import pandas as pd
import time
import numpy as np
Entrez.email = "john.saxon2002@gmail.com"

In [None]:
def search_pubmed(query, max_results=99999):
    """Search PubMed and return article titles and IDs."""
    # Search PubMed
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    record = Entrez.read(handle)
    handle.close()
    
    # Extract PubMed IDs
    pubmed_ids = record["IdList"]
    if not pubmed_ids:
        print(f"No results found for query: {query}")
        return [], []
    
    # Fetch article details
    handle = Entrez.efetch(db="pubmed", id=",".join(pubmed_ids), retmode="xml")
    print("Fetching articles...")
    records = Entrez.read(handle)
    handle.close()

    # Extract details
    articles = []
    for article in records["PubmedArticle"]:
        medline = article["MedlineCitation"]
        article_data = medline["Article"]

        # Extract fields
        pmid = medline["PMID"]
        title = article_data["ArticleTitle"]
        authors = ", ".join([author["LastName"] + " " + author["ForeName"]
                            for author in article_data.get("AuthorList", []) 
                            if "LastName" in author and "ForeName" in author])
        journal = article_data["Journal"]["Title"]
        pub_date = article_data["Journal"]["JournalIssue"]["PubDate"]
        abstract = article_data.get("Abstract", {}).get("AbstractText", ["No abstract available"])[0]
        pub_type = ", ".join(article_data.get("PublicationTypeList", []))
        mesh_terms = ", ".join([term["DescriptorName"] for term in medline.get("MeshHeadingList", [])])
        if medline.get("KeywordList"):
            key_words = medline["KeywordList"][0]
        else:
            key_words = []
        data_bank_list = article_data.get("DataBankList", [])
        publication_type_list = article_data.get("PublicationTypeList", [])

        # Append to results
        articles.append({
            "PMID": pmid,
            "Title": title,
            "Authors": authors if authors else "No authors listed",
            "Journal": journal,
            "PublicationDate": pub_date,
            "Abstract": abstract,
            "PublicationType": pub_type,
            "MeSHTerms": mesh_terms,
            "KeyWords": key_words,
            "DataBankList": data_bank_list,
            "PublicationTypeList": publication_type_list
        })
    print(f"Fetched {len(articles)} articles successfully.")
    return articles, records

In [40]:
SEARCH_1 = '"leukemia, myeloid, acute"[MeSH Terms] AND hasdatabanklist AND ("2021/12/30"[Date - Publication] : "3000"[Date - Publication])'

In [41]:
articles, records = search_pubmed(SEARCH_1, max_results=99999)

Fetching articles...


In [42]:
articles_df = pd.DataFrame(articles)
articles_df["DataBankName"] = articles_df["DataBankList"].apply(lambda row: row[0].get("DataBankName"))
articles_df["Accession"] = articles_df["DataBankList"].apply(lambda row: row[0].get("AccessionNumberList")[0])

articles_df.to_csv("lit_search_articles_since_2022.csv", index=False)

In [43]:
articles_df["MeSHTerms"].str.extractall("(Aged|Adolescent|Child|Adult)").value_counts()

0         
Aged          221
Adult         123
Child          30
Adolescent     27
Name: count, dtype: int64

In [44]:
articles_df["DataBankName"].value_counts()

DataBankName
ClinicalTrials.gov    246
GEO                    11
ChiCTR                  6
figshare                5
Dryad                   3
ISRCTN                  3
RefSeq                  1
ANZCTR                  1
EudraCT                 1
Name: count, dtype: int64

In [45]:
pd.DataFrame(articles_df["MeSHTerms"].str.split(", ").explode().value_counts()).to_csv("mesh_terms.csv", index=True, header=False)

Add CT Data

In [51]:
import requests, json

# Function to format outcomes as markdown
def format_outcomes(outcomes):
    if not outcomes:
        return "None"
    md = []
    for outcome in outcomes:
        measure = outcome.get("measure", "No measure provided")
        description = outcome.get("description", "No description provided")
        time_frame = outcome.get("timeFrame", "No time frame provided")
        md.append(f"**{measure}**\n\n{description}\n\n*Time Frame:* {time_frame}\n")
    return "\n".join(md)

def get_clinical_trial(nct_id, fields= [
        "protocolSection.identificationModule.briefTitle",
        "protocolSection.identificationModule.officialTitle",
        "protocolSection.descriptionModule.briefSummary",
        "protocolSection.statusModule.overallStatus",
        "protocolSection.statusModule.startDateStruct.date",
        "protocolSection.statusModule.primaryCompletionDateStruct.date",
        "protocolSection.designModule.studyType",
        "protocolSection.designModule.phases",
        "protocolSection.designModule.enrollmentInfo.count",
        "protocolSection.outcomesModule.primaryOutcomes",
        "protocolSection.outcomesModule.secondaryOutcomes",
        "protocolSection.outcomesModule.otherOutcomes"
    ]):
    
    field_str = ",".join(fields)
    url = f"https://clinicaltrials.gov/api/v2/studies/{nct_id}?fields={field_str}"
    
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses (4xx or 5xx)
        data = response.json()

        # Extracting the required fields
        extracted_data = {}
        
        if data["protocolSection"].get("identificationModule") is None:
            print(f"No identification module found for NCT ID: {nct_id}")
            extracted_data = {
                "briefTitle": None,
                "officialTitle": None,
            }
        else:
            extracted_data["briefTitle"] = data["protocolSection"]["identificationModule"].get("briefTitle", None)
            extracted_data["officialTitle"] = data["protocolSection"]["identificationModule"].get("officialTitle", None)
        
        if data["protocolSection"].get("descriptionModule") is None:   
            print(f"No description module found for NCT ID: {nct_id}")
            extracted_data["summary"] = None
        else:
            extracted_data["summary"] = data["protocolSection"]["descriptionModule"].get("briefSummary", None)
        
        if data["protocolSection"].get("statusModule") is None:
            print(f"No status module found for NCT ID: {nct_id}")
            extracted_data["startDate"] = None
            extracted_data["completionDate"] = None
        else:
            extracted_data["startDate"] = data["protocolSection"]["statusModule"].get("startDateStruct", {}).get("date", None)
            extracted_data["completionDate"] = data["protocolSection"]["statusModule"].get("primaryCompletionDateStruct", {}).get("date", None)
        
        if data["protocolSection"].get("designModule") is None:
            print(f"No design module found for NCT ID: {nct_id}")
            extracted_data["phases"] = None
            extracted_data["numParticipants"] = None
        else:
            extracted_data["phases"] = ", ".join(data["protocolSection"]["designModule"].get("phases", []))
            extracted_data["numParticipants"] = str(data["protocolSection"]["designModule"]["enrollmentInfo"].get("count", None))
        
        if data["protocolSection"].get("outcomesModule") is None:
            print(f"No outcomes module found for NCT ID: {nct_id}")
            extracted_data["primaryOutcomes"] = None
            extracted_data["secondaryOutcomes"] = None
            extracted_data["otherOutcomes"] = None
        else:
            extracted_data["primaryOutcomes"] = format_outcomes(data["protocolSection"]["outcomesModule"].get("primaryOutcomes", []))
            extracted_data["secondaryOutcomes"] = format_outcomes(data["protocolSection"]["outcomesModule"].get("secondaryOutcomes", []))
            extracted_data["otherOutcomes"] = format_outcomes(data["protocolSection"]["outcomesModule"].get("otherOutcomes", []))

        return extracted_data  # Return extracted data
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return {
            "briefTitle": None,
            "officialTitle": None,
            "summary": None,
            "startDate":None,
            "completionDate": None,
            "phases": None,
            "numParticipants": None,
            "primaryOutcomes": None,
            "secondaryOutcomes": None,
            "otherOutcomes": None,
        }


In [52]:
df = articles_df.join(articles_df['Accession'].apply(get_clinical_trial).apply(pd.Series))

Error fetching data: 404 Client Error: Not Found for url: https://clinicaltrials.gov/api/v2/studies/10.5061/dryad.h9w0vt4t2?fields=protocolSection.identificationModule.briefTitle,protocolSection.identificationModule.officialTitle,protocolSection.descriptionModule.briefSummary,protocolSection.statusModule.overallStatus,protocolSection.statusModule.startDateStruct.date,protocolSection.statusModule.primaryCompletionDateStruct.date,protocolSection.designModule.studyType,protocolSection.designModule.phases,protocolSection.designModule.enrollmentInfo.count,protocolSection.outcomesModule.primaryOutcomes,protocolSection.outcomesModule.secondaryOutcomes,protocolSection.outcomesModule.otherOutcomes
Error fetching data: 404 Client Error: Not Found for url: https://clinicaltrials.gov/api/v2/studies/10.6084/m9.figshare.25013561.v1?fields=protocolSection.identificationModule.briefTitle,protocolSection.identificationModule.officialTitle,protocolSection.descriptionModule.briefSummary,protocolSection.s

In [55]:
df.to_csv("lit_search_articles_since_2022_with_trials.csv", index=False)

In [68]:
import re 

matches_df = pd.DataFrame({
    "primaryOutcomes": df["primaryOutcomes"].str.extractall(r"\b("+"|".join(["mrd", "minimal residual disease", "measurable residual disease"])+r"\b)", flags=re.IGNORECASE).value_counts(),
    "secondaryOutcomes": df["secondaryOutcomes"].str.extractall(r"\b("+"|".join(["mrd", "minimal residual disease", "measurable residual disease"])+r"\b)", flags=re.IGNORECASE).value_counts(),
    "otherOutcomes": df["otherOutcomes"].str.extractall(r"\b("+"|".join(["mrd", "minimal residual disease", "measurable residual disease"])+r"\b)", flags=re.IGNORECASE).value_counts()
})

In [69]:
matches_df

Unnamed: 0_level_0,primaryOutcomes,secondaryOutcomes,otherOutcomes
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MRD,15.0,94,4.0
Measurable residual disease,,5,
Minimal Residual Disease,1.0,24,
Minimal residual disease,1.0,3,2.0
measurable residual disease,2.0,3,
minimal residual disease,3.0,11,
