In [1]:
from time import sleep
import numpy as np
import pandas as pd

# Download articles from PubMed

In [2]:
from Bio import Entrez


# Example of search keyworkds used for PubMed
pm_keywords = 'breast cancer'


def Search_PubMed(pm_keywords):
    # number of papers
    num = 500
    Entrez.email = "yourEmail@gmail.com"


    # search for 500 papers
    search_results = Entrez.esearch(db="pubmed", term=pm_keywords, retmax=num)
    record = Entrez.read(search_results)
    id_list = record['IdList']
    handle = Entrez.efetch(db="pubmed", id=id_list, retmode="xml")
    articles = Entrez.read(handle)


    # Extract all articles we have scratched
    for article in articles['PubmedArticle']:
        title = article['MedlineCitation']['Article']['ArticleTitle']
    

    # Input all articles into a dataframe 
    article_list = []
    c = 0

    for article in articles['PubmedArticle']:
        try:
            title = article['MedlineCitation']['Article']['ArticleTitle']
            abstract = article['MedlineCitation']['Article']['Abstract']["AbstractText"][0]
            article_list.append({"Title": title, "Abstract": abstract})
            # print("Title:", title)
        except: 
            c+=1
            continue       
    print("# Errors: ", c)
    df = pd.DataFrame(article_list)
    return df

pm_df = Search_PubMed(pm_keywords)

# Errors:  26


## Download articles from ClinicalTrials.gov database

In [3]:
from pytrials.client import ClinicalTrials


# Example of search keyworkds used for PubMed
ct_keywords = "breast cancer"


def Search_PubMed(ct_keywords):
    # number of papers
    num = 500

    # Initialize the ClinicalTrials client
    ct = ClinicalTrials()


    # Specify the search fields
    fields = ["NCTId", "OfficialTitle", "DetailedDescription"]


    # search for papers
    trials = ct.get_study_fields(ct_keywords, fields, max_studies=num)


    # Input all articles into a dataframe 
    ct_list = []
    c = 0
    for trial in trials[1:]:
        try:
            title = trial[2]
            abstract = trial[3]
            ct_list.append({"Title": title, "Abstract": abstract})
            # print("Title:", title)
            
        except:
            c+=1
            continue
            
    print("# Errors: ", c)  
    df = pd.DataFrame(ct_list)
    return df

ct_df = Search_PubMed(ct_keywords)

# Errors:  0


In [4]:
## Merge two dataframes, each from a different database
df = pd.concat([pm_df, ct_df], ignore_index=True)
df.shape

(974, 2)

# OpenAI API

In [16]:
import openai

openai.api_key = 'Your-API-KEY'

In [18]:
# Generate a response to a prompt using an OpenAI model
def get_completion(prompt, model="gpt-4"):
    messages = [{"role": "user", "content": prompt}]
    for _ in range(10):
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=0, # this is the degree of randomness of the model's output
            )
            return response.choices[0].message["content"]
        except:
            print(">> get into error.., retrying..")
            sleep(1)
    assert False, "Unexpected error"

In [19]:
# Use GPT to condense the 'Abstract' of articles by summarizing its content
summarization = []
for i in range(df.shape[0]):
    title = df.iloc[i]["Title"]
    text = df.iloc[i]["Abstract"]
    
    prompt = f"""
    Please summarize the following academic article into a concise, \
    informative abstract. Focus on highlighting the key points, including the research question, \
    methodology, main findings, and conclusions. Ensure the summary is clear and accessible to readers \
    who may not be specialists in this field. Here's the article which is delimited by triple backticks: ```{text}``` 
    """
    
    summary = get_completion(prompt)
    summarization.append({"Title": title, "Summary": summary})
    
data = pd.DataFrame(summarization)
data.to_csv("Summarization.csv", index=False, encoding='utf-8')

>> get into error.., retrying..


NameError: name 'sleep' is not defined

# Experiment2: Generate the keywrods through summarize the content

In [20]:
text = """Lymphedema:
What it is: Swelling, typically in the arm or hand, due to lymph node removal or damage.
Management Strategies:
Physical therapy and exercises to promote lymph fluid drainage.
Wearing a prescribed compression garment.
Avoiding tight clothing and extreme temperatures in the affected limb."""

prompt = f"""
    Please summarize the following academic article into a few concise, \
    informative keywords. Focus on highlighting the key points, including the research question, \
    methodology, main findings, and conclusions. Ensure the keywords are clear and related to breast cancer treatment. \

    If the keywords are: "Molecular Subtypes" and "Treatment Response", \
    Your output should be: Molecular Subtypes[Title/Abstract] OR Treatment Response[Title/Abstract] AND "breast cancer"[Title]
    
    
    Here's the article which is delimited by triple backticks: ```{text}``` 
    """

keywords = get_completion(prompt)
print(keywords)

Lymphedema[Title/Abstract] OR "Swelling"[Title/Abstract] OR "lymph node removal"[Title/Abstract] OR "damage"[Title/Abstract] OR "Management Strategies"[Title/Abstract] OR "Physical therapy"[Title/Abstract] OR "exercises"[Title/Abstract] OR "lymph fluid drainage"[Title/Abstract] OR "compression garment"[Title/Abstract] OR "Avoiding tight clothing"[Title/Abstract] OR "extreme temperatures"[Title/Abstract] AND "breast cancer"[Title]


In [23]:
df = Search_PubMed(keywords)

Title: Evaluating the Surgical Outcome of Lymphovenous Anastomosis in Breast Cancer-Related Lymphedema Using Tc-99m Phytate Lymphoscintigraphy: Preliminary Results.
Title: Immune checkpoint inhibitors in breast cancer: development, mechanisms of resistance and potential management strategies.
Title: Immunogenic cell death-related classification reveals prognosis and effectiveness of immunotherapy in breast cancer.
Title: Novel Carrier-Free Nanodrug Enhances Photodynamic Effects by Blocking the Autophagy Pathway and Synergistically Triggers Immunogenic Cell Death for the Efficient Treatment of Breast Cancer.
Title: Anthocyanin Oligomers Induce Apoptosis and Autophagy by Inhibiting the mTOR Signaling Pathway in Human Breast Cancer Cells.
Title: Prevention of Breast Cancer-Related Lymphedema: An Up-to-Date Systematic Review of Different Surgical Approaches.
Title: Is the Absence of Manual Lymphatic Drainage-Based Treatment in Lymphedema after Breast Cancer Harmful? A Randomized Crossover 

In [None]:
# Use GPT to condense the 'Abstract' of articles by summarizing its content
summarization = []
for i in range(df.shape[0]):
    title = df.iloc[i]["Title"]
    text = df.iloc[i]["Abstract"]
    
    prompt = f"""
    Please summarize the following academic article into a concise, \
    informative abstract. Focus on highlighting the key points, including the research question, \
    methodology, main findings, and conclusions. Ensure the summary is clear and accessible to readers \
    who may not be specialists in this field. Here's the article which is delimited by triple backticks: ```{text}``` 
    """
    
    summary = get_completion(prompt)
    summarization.append({"Title": title, "Summary": summary})
    
data = pd.DataFrame(summarization)
data.to_csv("experiment2.csv", index=False, encoding='utf-8')