## Setup

In [1]:
# Imports
import warnings
warnings.filterwarnings('ignore')

from crewai import Agent, Task, Crew
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from crewai_tools import SerperDevTool, FileReadTool
from langchain.chat_models import ChatOpenAI

# Populate the process environment from a local .env file, then set the
# model-name variable (presumably the default model CrewAI agents use when no
# explicit llm is passed — confirm against the installed crewai version).
load_dotenv()
os.environ["OPENAI_MODEL_NAME"] = "gpt-4o-mini"

In [2]:
# Shared clients for the rest of the notebook. Both keys are looked up in the
# environment and will be None when the variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")
llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
serper = SerperDevTool(api_key=os.environ.get("SERPER_API_KEY"))

  warn_deprecated(


## Helper Functions

In [3]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, each page followed by a newline.

    Pages whose ``extract_text()`` result is falsy (``None`` or ``""``) are
    skipped, so the result is ``""`` when no page yields any text.

    Args:
        file_path: Path of the PDF file handed to ``PdfReader``.

    Returns:
        str: Concatenated page texts in page order.
    """
    extracted_pages = (page.extract_text() for page in PdfReader(file_path).pages)
    return "".join(page_text + "\n" for page_text in extracted_pages if page_text)

def chunk_text(text, max_words=1500):
    """Split ``text`` into consecutive pieces of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields (whitespace-separated tokens), and
    each piece is re-joined with single spaces, so original whitespace and line
    breaks are not preserved.

    Args:
        text: The text to split.
        max_words: Maximum number of words per piece.

    Returns:
        list[str]: The pieces in order; empty when ``text`` has no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(" ".join(window))
    return pieces

def read_and_summarize_papers(agent: Agent, folder_path: str):
    """Summarize every ``.pdf`` / ``.txt`` paper found directly in ``folder_path``.

    Each paper is read, split into 1500-word chunks, summarized chunk by chunk
    with the LLM, and the partial summaries are then merged into one structured
    summary per paper. Progress is printed as each file is processed.

    Papers are processed in sorted filename order so repeated runs produce the
    same ordering, extensions are matched case-insensitively, and papers with no
    extractable text (e.g. image-only PDFs) are skipped instead of asking the
    LLM to summarize an empty document.

    Args:
        agent: Not used by this function; kept so existing callers keep working.
        folder_path: Directory containing the papers.

    Returns:
        dict: Maps each summarized filename to its final structured summary.
        Skipped papers have no entry.
    """
    summaries = {}

    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    paper_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith((".pdf", ".txt"))
    )
    if not paper_files:
        # Nothing to summarize, so there is no need to build an LLM client.
        return summaries

    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    for filename in paper_files:
        file_path = os.path.join(folder_path, filename)
        print(f"\n🔍 Reading: {filename}")

        # Read the content
        if filename.lower().endswith(".pdf"):
            content = extract_text_from_pdf(file_path)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

        # Split content into chunks
        chunks = chunk_text(content, max_words=1500)
        if not chunks:
            # With zero chunks the combine prompt would contain no paper text
            # at all, and any "summary" produced from it would be invented.
            print(f"⚠️ Skipping {filename}: no extractable text found.\n")
            continue

        # Summarize each chunk separately
        partial_summaries = []
        for i, chunk in enumerate(chunks):
            prompt = f"""
You are summarizing part {i+1} of an academic paper. Follow this structure and do NOT make assumptions.

1. **AI Technique(s)**: What GenAI method(s) are used? Name models, frameworks, architectures.
2. **Healthcare Application**: What specific medical domain or task is this applied to?
3. **Methodology**: Detailed description of the model pipeline or process.
4. **Key Findings or Contributions**: What are the main results or insights?
5. **Limitations or Challenges**: Any weaknesses or barriers mentioned?

Text from the paper:
---------------------
{chunk}
"""
            partial_summaries.append(llm.predict(prompt))

        # Combine partial summaries into a full structured summary
        combined_prompt = f"""
You are generating a final structured summary of the paper '{filename}'. Use only the information in the following parts. Do not add anything new. Follow this format:

### {filename}
- **AI Technique(s)**:
- **Healthcare Application**:
- **Methodology**:
- **Key Findings or Contributions**:
- **Limitations or Challenges**:

Paper Parts:
---------------------
{chr(10).join(partial_summaries)}
"""
        summaries[filename] = llm.predict(combined_prompt)

        print(f"✅ Done summarizing: {filename}\n")

    return summaries


In [4]:
def append_to_summary_file(text, file_path="SAMK_outputs/summaries_output.txt"):
    """Append ``text`` followed by a blank line to ``file_path`` (UTF-8).

    The parent directory is created if it does not exist yet. The file is
    opened in append mode, so calling this repeatedly adds repeated entries.

    Args:
        text: The text to append.
        file_path: Destination file; may be a bare filename in the current
            working directory.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(text + "\n\n")

def parse_serper_response(response_str):
    """
    Parse the literal string output from Serper.

    Assumes segments are separated by '---' and each segment contains lines
    starting with "Title:" and "Snippet:". Lines are matched after stripping
    surrounding whitespace, and a leading "Search results:" label on a line is
    ignored, so a first result that shares its line with that label (the tool
    appears to emit "Search results: Title: ..." as its first line) still
    yields its title instead of an empty one.

    Args:
        response_str: Raw text returned by the Serper tool. ``None`` or an
            empty string yields an empty list.

    Returns:
        A list of tuples (title, snippet), one per segment that has at least
        one of the two fields. A missing field is the empty string.
    """
    if not response_str:
        return []

    header = "Search results:"
    papers = []
    for seg in response_str.split('---'):
        title = ""
        snippet = ""
        for line in seg.strip().split('\n'):
            line = line.strip()
            # Drop the label that precedes the first result on the same line.
            if line.startswith(header):
                line = line[len(header):].lstrip()
            if line.startswith("Title:"):
                title = line[len("Title:"):].strip()
            elif line.startswith("Snippet:"):
                snippet = line[len("Snippet:"):].strip()
        if title or snippet:
            papers.append((title, snippet))
    return papers

def get_additional_google_scholar_papers(query, serper, max_results=5):
    """Run a Serper search and format the top results as plain text.

    The raw tool output is parsed with ``parse_serper_response`` and the first
    ``max_results`` (title, snippet) pairs are rendered as
    "Title: ...\\nSnippet: ...\\n\\n" blocks.

    NOTE(review): despite the name, the notebook's captured output shows the
    tool doing a general internet search (results include non-scholarly pages),
    not a Google Scholar query — confirm whether that is intended.

    Args:
        query: Search query string.
        serper: Tool object exposing ``run(search_query=...)`` that returns the
            results as a literal string.
        max_results: Maximum number of results to include (default 5, the
            previously hard-coded limit). Values below 1 include none.

    Returns:
        str: The formatted results, or a fixed "no papers found" message when
        nothing could be parsed.
    """
    # Call serper; its output is a literal string.
    response_str = serper.run(search_query=query)
    top_hits = parse_serper_response(response_str)[:max(max_results, 0)]
    results_text = "".join(
        f"Title: {title}\nSnippet: {snippet}\n\n" for title, snippet in top_hits
    )
    if not results_text.strip():
        results_text = "No additional papers found from Google Scholar."
    return results_text

## Reading

In [5]:
# File-reading tool handed to the study agent below.
file_read_tool = FileReadTool()

# Agent describing the paper-summarization role. It is passed to
# read_and_summarize_papers() later in the notebook.
study_agent = Agent(
    role="Study Agent for Healthcare + GenAI Papers",
    goal=(
        "Read and summarize generative AI techniques applied to healthcare in each paper. "
        "Be as detailed as possible."
    ),
    backstory=" ".join([
        "I’m a research analyst with a strong understanding of AI and biomedical literature.",
        "My mission is to extract key insights from each research paper and highlight them.",
    ]),
    tools=[file_read_tool],
    openai_api_key=openai_api_key,
    verbose=True,
    allow_delegation=False,
)

In [6]:
if __name__ == "__main__":
    folder_path = "Papers"
    output_folder = "SAMK_outputs"

    # Create the output folder *before* the expensive summarization run: on a
    # fresh checkout the folder does not exist, and failing at the final
    # open(..., "w") would throw away every LLM call made so far.
    os.makedirs(output_folder, exist_ok=True)
    output_file_path = os.path.join(output_folder, "summaries_output.txt")

    print("🚀 Starting paper analysis...")
    summaries = read_and_summarize_papers(study_agent, folder_path)

    # Print all summaries
    #for fname, summ in summaries.items():
    #    print(f"\n📄 === {fname} Summary ===\n{summ}\n")

    # Save to file. Mode "w" replaces any previous content, so each run of this
    # cell starts the summaries file from scratch.
    with open(output_file_path, "w", encoding="utf-8") as f:
        for fname, summ in summaries.items():
            f.write(f"=== {fname} ===\n{summ}\n\n")

    print("✅ All summaries saved to summaries_output.txt")


🚀 Starting paper analysis...

🔍 Reading: Adaptive Reasoning Language Agents.pdf


  warn_deprecated(


✅ Done summarizing: Adaptive Reasoning Language Agents.pdf


🔍 Reading: Agents in Clinic.pdf
✅ Done summarizing: Agents in Clinic.pdf


🔍 Reading: Autonomous Agents 2024 in medicine.pdf
✅ Done summarizing: Autonomous Agents 2024 in medicine.pdf


🔍 Reading: LLM Agents in Medicine.pdf
✅ Done summarizing: LLM Agents in Medicine.pdf


🔍 Reading: MedAide.pdf
✅ Done summarizing: MedAide.pdf


🔍 Reading: Multimodal in healthcare.pdf
✅ Done summarizing: Multimodal in healthcare.pdf


🔍 Reading: Polaris LLM Constellation.pdf
✅ Done summarizing: Polaris LLM Constellation.pdf


🔍 Reading: Systematic Review LLM Apps.pdf
✅ Done summarizing: Systematic Review LLM Apps.pdf


🔍 Reading: Transformative impact of LLM in Medicine.pdf
✅ Done summarizing: Transformative impact of LLM in Medicine.pdf


🔍 Reading: yang-et-al-2024-application-of-large-language-models-in-disease-diagnosis-and-treatment.pdf
✅ Done summarizing: yang-et-al-2024-application-of-large-language-models-in-disease-diagnosis-and-treatm

In [7]:
# Read the saved file back and echo it, to eyeball what was written.
with open(output_file_path, encoding="utf-8") as f:
    contents = f.read()
print(contents)

=== Adaptive Reasoning Language Agents.pdf ===
### Adaptive Reasoning Language Agents.pdf
- **AI Technique(s)**: The paper utilizes large language models (LLMs), specifically mentioning the use of an adaptive LLM-based doctor agent, including GPT-4 and GPT-3.5. The architecture involves a sequence of actions generated by the LLM agent, guided by a reasoning process and an adaptation process to improve diagnostic accuracy.
  
- **Healthcare Application**: The application is focused on enhancing diagnostic accuracy in simulated clinical environments, specifically using the AgentClinic benchmark and the MedQA dataset to evaluate the performance of LLM agents in diagnosing patients through iterative interactions.

- **Methodology**: The model pipeline involves a simulated clinical environment with four main agents: the Doctor Agent (diagnosis), the Patient Agent (simulating patient behavior), the Measurement Agent (providing test results), and the Moderator Agent (evaluating diagnosis accu

In [8]:
# Topic that drives the follow-up literature search; it is echoed to the
# console and appended to the summaries file.
research_topic = (
    "Developing a Retrieval-Augmented Generation (RAG) LLM "
    "for retrieval of medical papers, "
    "enabling a centralized vector store "
    "to mass pull papers, articles, and journals."
)
print("Selected Research Topic:\n" + research_topic)
append_to_summary_file("Selected Research Topic:\n" + research_topic)

Selected Research Topic:
Developing a Retrieval-Augmented Generation (RAG) LLM for retrieval of medical papers, enabling a centralized vector store to mass pull papers, articles, and journals.


In [9]:
# Search for related work on the chosen topic, then show the results and
# append them to the summaries file.
gs_query = "RAG LLM retrieval of medical papers centralized vector store"
additional_papers = get_additional_google_scholar_papers(gs_query, serper)
# Format header and body as one string: passing them as two print() arguments
# inserted the default separator space after the newline, so the first result
# line was printed as " Title: ..." and differed from what is written to file.
print(f"Additional Google Scholar Papers:\n{additional_papers}")
append_to_summary_file(f"Additional Google Scholar Papers:\n{additional_papers}")

Using Tool: Search the internet
Additional Google Scholar Papers:
 Title: 
Snippet: RAG enhances LLM's capabilities by giving access to different information sources in real-time and seamlessly integrating them with processing.

Title: Developing Retrieval Augmented Generation (RAG) based LLM ...
Snippet: This paper presents an experience report on the development of Retrieval Augmented Generation (RAG) systems using PDF documents as the primary data source.

Title: Retrieval-augmented generation for generative artificial intelligence ...
Snippet: Retrieval-augmented generation (RAG) enables models to generate more reliable content by leveraging the retrieval of external knowledge.

Title: What is retrieval-augmented generation? - Red Hat
Snippet: Retrieval-augmented generation (RAG) links external resources to an LLM to enhance a generative AI model's output accuracy.

Title: Is LLM necessary for RAG if we can retreive answer from vector ...
Snippet: Can someone clarify why we need an

## Proposal Development

In [10]:
# Agent 2: Research Gap & Idea Generator
# Proposes research gaps and project ideas; delegation is enabled for it.
idea_generator = Agent(
    role="Research Gap & Idea Generator",
    goal="Identify gaps in existing research and generate innovative GenAI + Healthcare project ideas",
    backstory=(
        "Innovative AI researcher and US healthcare systems strategist. "
        "My aim is to propose solutions that can make an immediate, tangible impact in clinical workflows, "
        "patient care, and hospital operational efficiency. I leverage cutting-edge generative AI methods, "
        # Trailing space added: adjacent literals are concatenated verbatim, and
        # without it the prompt read "...decision support.Every concept...".
        "ranging from medical imaging enhancements to large-scale language model applications for real-time clinical decision support. "
        "Every concept I propose addresses a clear healthcare need and be feasible to pilot or deploy in real clinical environments."
    ),
    allow_delegation=True,
    verbose=True
)

# Agent 3: Critique & Decision Maker
# Scores the proposed ideas and picks one; delegation is enabled for it.
critic_agent = Agent(
    role="Research Critique & Decision Agent",
    goal="Critique all proposed research ideas and select the most impactful and feasible proposal.",
    backstory=(
        "I am a senior US healthcare and advanced AI expert. "
        "I ensure that every idea is not only innovative but also practical and medically relevant."
    ),
    verbose=True,
    allow_delegation=True,
)

# Agent 4: Proposal Writer
# Turns the selected idea into the final document; delegation is disabled.
writer_agent = Agent(
    role="Research Proposal Writer",
    goal="Write a formal, structured proposal for the selected GenAI+Healthcare research idea.",
    backstory=(
        "I specialize in crafting structured academic proposals. "
        "I transform ideas into polished documents with the right sections and citations."
    ),
    verbose=True,
    allow_delegation=False,
)

# ===============================
# TASKS
# ===============================

# Load the text that seeds the idea-generation prompt. This reads whatever
# file `output_file_path` currently points at, so the cell depends on the
# value left behind by earlier cells.
with open(output_file_path, encoding="utf-8") as f:
    summaries_text = f.read()

# Task 1: Idea generation based on long-form summaries
idea_task = Task(
    agent=idea_generator,
    description=(
        "Based on the following summaries of recent papers, identify 3–5 meaningful research gaps.\n"
        "Then propose 3 novel GenAI + Healthcare research directions that would fill those gaps.\n\n"
        "PAPER SUMMARIES:\n" + summaries_text
    ),
    expected_output="3 proposed GenAI + Healthcare research ideas with justification and associated research gaps.",
)

# Task 2: Critique and select final proposal
# The instruction sentences are joined with single spaces into one paragraph.
critique_task = Task(
    agent=critic_agent,
    description=" ".join([
        "Review the 3 research ideas proposed by the Research Gap & Idea Generator agent.",
        "Rate each idea on originality, feasibility, and impact (scale 1–5).",
        "Provide detailed feedback directly to the idea_generator agent to iteratively refine any suboptimal ideas before selecting the best one.",
        "Select the single best idea clearly justifying your decision.",
    ]),
    expected_output="Detailed feedback for refinement, final selected idea, and justification.",
)

# Task 3: Final proposal writing
# The instruction lines are joined with newlines, one instruction per line.
write_task = Task(
    agent=writer_agent,
    description="\n".join([
        "Write a complete research proposal for the selected idea.",
        "Your sections must be: Title, Abstract (150–250 words), Background & Literature Review,",
        "Problem Statement & Research Gap, Proposed Gen AI Approach, Expected Impact in Healthcare,",
        "Limitations or Ethical Considerations, and References.",
        "Cite from the paper summaries if applicable. Do not hallucinate information.",
    ]),
    expected_output="Structured, rigorous proposal with proper citations and sections.",
)

# ===============================
# CREW SETUP & RUN
# ===============================

# Bundle the three agents with their tasks, listed in pipeline order:
# idea generation -> critique -> proposal writing.
crew = Crew(
    tasks=[idea_task, critique_task, write_task],
    agents=[idea_generator, critic_agent, writer_agent],
    verbose=True,
)

if __name__ == "__main__":
    print("\n🚀 Running multi-agent research proposal workflow...")
    result = crew.kickoff()
    print("\n✅ Final Proposal:\n")
    print(result)

    # Optional: Save to file
    # Use a dedicated name for the proposal path. Rebinding `output_file_path`
    # here made any later re-run of the cells that read `output_file_path`
    # load the proposal instead of the paper summaries.
    os.makedirs(output_folder, exist_ok=True)
    proposal_file_path = os.path.join(output_folder, "final_proposal.txt")
    with open(proposal_file_path, "w", encoding="utf-8") as f:
        # str() is a no-op when kickoff() returns a plain string and keeps the
        # write working if it returns a result object instead.
        f.write(str(result))
    print("\n📄 Proposal saved to final_proposal.txt")


🚀 Running multi-agent research proposal workflow...
[1m[95m [DEBUG]: == Working Agent: Research Gap & Idea Generator[00m
[1m[95m [INFO]: == Starting Task: Based on the following summaries of recent papers, identify 3–5 meaningful research gaps.
Then propose 3 novel GenAI + Healthcare research directions that would fill those gaps.

PAPER SUMMARIES:
=== Adaptive Reasoning Language Agents.pdf ===
### Adaptive Reasoning Language Agents.pdf
- **AI Technique(s)**: The paper utilizes large language models (LLMs), specifically mentioning the use of an adaptive LLM-based doctor agent, including GPT-4 and GPT-3.5. The architecture involves a sequence of actions generated by the LLM agent, guided by a reasoning process and an adaptation process to improve diagnostic accuracy.
  
- **Healthcare Application**: The application is focused on enhancing diagnostic accuracy in simulated clinical environments, specifically using the AgentClinic benchmark and the MedQA dataset to evaluate the perfo