In [90]:
import os
import subprocess

from langchain.chat_models import ChatOpenAI
from PyPDF2 import PdfReader
from crewai_tools import SerperDevTool

In [91]:
REVISION_ROUNDS = 5

# Defining the Research Topic + Additional Papers

In [92]:
def append_to_summary_file(text, file_path="summaries/intermediate_steps.txt"):
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(text + "\n\n")

def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text

def parse_serper_response(response_str):
    """
    Parse the literal string output from Serper.
    Assumes segments are separated by '---' and each segment contains lines starting with "Title:" and "Snippet:".
    Returns a list of tuples (title, snippet).
    """
    segments = response_str.split('---')
    papers = []
    for seg in segments:
        seg = seg.strip()
        if not seg:
            continue
        title = ""
        snippet = ""
        for line in seg.split('\n'):
            if line.startswith("Title:"):
                title = line[len("Title:"):].strip()
            elif line.startswith("Snippet:"):
                snippet = line[len("Snippet:"):].strip()
        if title or snippet:
            papers.append((title, snippet))
    return papers

def get_additional_google_scholar_papers(query, serper):
    # Call serper; its output is a literal string.
    response_str = serper.run(search_query=query)
    parsed = parse_serper_response(response_str)
    results_text = ""
    for i, (title, snippet) in enumerate(parsed):
        if i >= 5:
            break
        results_text += f"Title: {title}\nSnippet: {snippet}\n\n"
    if not results_text.strip():
        results_text = "No additional papers found from Google Scholar."
    return results_text

In [93]:
def escape_latex(text):
    # Escape common characters (extend as needed).
    return text.replace("&", r"\&")

def iterative_revision(text, section_name):
    """Iteratively revise the given text for REVISION_ROUNDS rounds."""
    revised_text = text
    for round in range(REVISION_ROUNDS):
        prompt = (
            f"Revise the following {section_name} text to improve clarity, structure, and academic tone. "
            "Output only the final text (no additional commentary).\n\n"
            f"{revised_text}\n"
            f"Revision Round: {round+1}"
        )
        revised_text = llm.predict(prompt)
        append_to_summary_file(f"Revision Round {round+1} for {section_name}:\n{revised_text}")
    return revised_text

def generate_section_latex(section_name, text):
    """
    Convert the given section text into well-formatted LaTeX code.
    Instruct the LLM to make full use of LaTeX formatting options (e.g. enumerate, itemize, etc.).
    """
    prompt = (
        f"Convert the following text for the '{section_name}' section of a research proposal into well-formatted LaTeX code. "
        "Use advanced LaTeX formatting (e.g., \\section, \\subsection, \\textbf, \\begin{enumerate} ... \\end{enumerate}, etc.) as appropriate. "
        "Output only the LaTeX code.\n\nText:\n{text}"
    )
    latex_section = llm.predict(prompt)
    # Optionally, apply iterative revision on the LaTeX code
    latex_section = iterative_revision(latex_section, f"{section_name} LaTeX Section")
    return latex_section

In [94]:
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
serper = SerperDevTool(api_key=os.getenv("SERPER_API_KEY"))

In [95]:
research_topic = (
    "Developing a Retrieval-Augmented Generation (RAG) LLM for retrieval of medical papers, "
    "enabling a centralized vector store to mass pull papers, articles, and journals."
)
print(f"Selected Research Topic:\n{research_topic}")
append_to_summary_file(f"Selected Research Topic:\n{research_topic}")

Selected Research Topic:
Developing a Retrieval-Augmented Generation (RAG) LLM for retrieval of medical papers, enabling a centralized vector store to mass pull papers, articles, and journals.


In [96]:
gs_query = "RAG LLM retrieval of medical papers centralized vector store"
additional_papers = get_additional_google_scholar_papers(gs_query, serper)
print("Additional Google Scholar Papers:\n", additional_papers)
append_to_summary_file(f"Additional Google Scholar Papers:\n{additional_papers}")

Using Tool: Search the internet
Additional Google Scholar Papers:
 Title: 
Snippet: RAG enhances LLM's capabilities by giving access to different information sources in real-time and seamlessly integrating them with processing.

Title: Developing Retrieval Augmented Generation (RAG) based LLM ...
Snippet: This paper presents an experience report on the development of Retrieval Augmented Generation (RAG) systems using PDF documents as the primary data source.

Title: Retrieval-augmented generation for generative artificial intelligence ...
Snippet: Retrieval-augmented generation (RAG) enables models to generate more reliable content by leveraging the retrieval of external knowledge.

Title: Evaluating Medical Retrieval-Augmented Generation (RAG) with ...
Snippet: In this overview, we'll explore RAG's growing role in healthcare, focusing on its potential to transform applications like drug discovery and clinical trials.

Title: What is retrieval-augmented generation? - Red Hat
Snippet: R

# Co-Thinking For Proposal Development

In [97]:
prompt_brainstorm = (
    f"Brainstorm at least 3 innovative research ideas for applying Gen AI to develop a RAG LLM for retrieval of medical papers. "
    "Consider aspects like a centralized vector store, mass retrieval, and integration with academic databases. "
    "Provide brief descriptions for each idea."
)
ideas = llm.predict(prompt_brainstorm)
print("Brainstormed Ideas:\n", ideas)
append_to_summary_file(f"Brainstormed Ideas:\n{ideas}")

Brainstormed Ideas:
 1. **Centralized Vector Store with Dynamic Updating:**
   Develop a centralized vector store that continuously updates its embeddings based on the latest medical research papers. This system would use Gen AI to automatically parse new publications, extract key concepts, and update the vector representations in real-time. By integrating with academic databases like PubMed and arXiv, the system ensures that the most recent and relevant information is always available for retrieval. This dynamic updating mechanism would allow researchers to access the latest findings and trends in medical research efficiently.

2. **Mass Retrieval with Contextual Relevance Filtering:**
   Implement a mass retrieval system that leverages Gen AI to filter and rank medical papers based on contextual relevance to specific research queries. This system would use a combination of semantic search and contextual analysis to understand the nuances of a researcher's query and retrieve papers th

In [98]:
prompt_critique = (
    f"Critique the following research ideas for developing a RAG LLM for retrieval of medical papers. "
    "Evaluate each idea on feasibility, originality, and potential impact with scores (1–5) and brief justifications.\n\nIdeas:\n{ideas}\n"
)
idea_critique = llm.predict(prompt_critique)
print("Idea Critiques:\n", idea_critique)
append_to_summary_file(f"Idea Critiques:\n{idea_critique}")

Idea Critiques:
 To provide a critique of the research ideas for developing a Retrieval-Augmented Generation (RAG) Language Model (LLM) for the retrieval of medical papers, I would need to see the specific ideas you are referring to. Please provide the list of ideas so I can evaluate them based on feasibility, originality, and potential impact.


In [99]:
prompt_gaps = (
    f"Based on academic literature, identify the key research gaps related to developing a RAG LLM for retrieval of medical papers. "
    "Focus on limitations in current retrieval systems, vector store challenges, and integrating diverse academic sources."
)
research_gaps = llm.predict(prompt_gaps)
print("Identified Research Gaps:\n", research_gaps)
append_to_summary_file(f"Identified Research Gaps:\n{research_gaps}")

Identified Research Gaps:
 Developing a Retrieval-Augmented Generation (RAG) Language Model (LLM) for the retrieval of medical papers involves several complex challenges and research gaps. Based on academic literature, here are some key areas where further research is needed:

1. **Limitations in Current Retrieval Systems:**
   - **Precision and Recall Trade-offs:** Current retrieval systems often struggle to balance precision and recall, especially in the medical domain where the specificity of information is crucial. Research is needed to develop models that can maintain high precision without sacrificing recall.
   - **Contextual Understanding:** Many retrieval systems lack the ability to understand the nuanced context of medical queries, which can lead to irrelevant or incomplete results. Enhancing contextual understanding through improved natural language processing techniques is a significant research gap.
   - **Handling Ambiguity and Synonyms:** Medical terminology is complex a

In [100]:
prompt_structure = (
    f"Generate a draft research proposal structure for the topic:\n'{research_topic}'\n"
    "Incorporate the following:\n"
    f"Research Ideas:\n{ideas}\n\n"
    f"Research Gaps:\n{research_gaps}\n\n"
    "The structure should include the following sections:\n"
    "Title, Abstract (150–250 words), Background & Literature Review, Problem Statement & Research Gap, "
    "Proposed Gen AI Approach, Expected Impact in Healthcare, Limitations or Ethical Considerations, and References."
)
proposal_structure = llm.predict(prompt_structure)
print("Draft Proposal Structure:\n", proposal_structure)
append_to_summary_file(f"Draft Proposal Structure:\n{proposal_structure}")

Draft Proposal Structure:
 **Title:**
Developing a Retrieval-Augmented Generation (RAG) Language Model for Efficient Retrieval of Medical Papers Using a Centralized Vector Store

**Abstract:**
The rapid growth of medical literature presents a significant challenge for researchers seeking to stay informed about the latest developments. This research proposal outlines the development of a Retrieval-Augmented Generation (RAG) Language Model designed to enhance the retrieval of medical papers through a centralized vector store. The proposed system will dynamically update its embeddings with the latest research, leverage Gen AI for contextual relevance filtering, and offer a personalized research assistant with adaptive learning capabilities. By addressing key research gaps such as precision-recall trade-offs, contextual understanding, and integration of diverse academic sources, this project aims to improve the efficiency and accuracy of medical information retrieval. The expected impact i

# Conduct Academic Literature Review

In [101]:
papers_dir = "Papers"
paper_files = [f for f in os.listdir(papers_dir) if f.lower().endswith(".pdf")]

academic_summaries = ""
for paper in paper_files:
    paper_path = os.path.join(papers_dir, paper)
    text = extract_text_from_pdf(paper_path)
    words = text.split()
    summary_text = " ".join(words[:1000])
    prompt_summary = (
        f"Summarize the contributions and insights of the following academic paper excerpt related to Gen AI and medical paper retrieval:\n\n{summary_text}"
    )
    summary = llm.predict(prompt_summary)
    academic_summaries += f"Paper: {paper}\nSummary:\n{summary}\n\n"
academic_summaries += "Additional Google Scholar Papers:\n" + additional_papers
print("Academic Summaries:\n", academic_summaries)
append_to_summary_file(f"Academic Summaries:\n{academic_summaries}")

Academic Summaries:
 Paper: Systematic Review LLM Apps.pdf
Summary:
The paper titled "A Systematic Review of Testing and Evaluation of Healthcare Applications of Large Language Models (LLMs)" by Suhana Bedi et al. provides a comprehensive analysis of how LLMs are currently evaluated in healthcare settings. The key findings highlight that real patient care data is rarely used in LLM evaluations, with only 5% of studies utilizing such data. The predominant focus has been on assessing medical knowledge, such as answering medical licensing exam questions, while administrative tasks like billing code assignment and prescription writing are understudied. Most evaluations prioritize accuracy, with limited attention to fairness, bias, toxicity, robustness, and deployment considerations. The study also notes a lack of evaluations in specialized medical fields like nuclear medicine and medical genetics.

The paper emphasizes the need for more comprehensive and standardized evaluations of LLMs in

# Generate The Final Proposal

In [102]:
def generate_section(section_name, context):
    initial_prompt = (
        f"Write the {section_name} for a research proposal on the topic:\n'{research_topic}'.\nContext:\n{context}\n"
    )
    draft = llm.predict(initial_prompt)
    revised = iterative_revision(draft, section_name)
    append_to_summary_file(f"{section_name} Final Text:\n{revised}")
    return revised

In [103]:
title_context = "Generate a concise, catchy title that reflects using a RAG LLM for retrieval of medical papers."
abstract_context = (
    "Summarize the proposal in 150–250 words, including the motivation, methodology (centralized vector store and RAG LLM retrieval), "
    "and expected impact on academic research in healthcare."
)
bkg_context = (
    "Provide a comprehensive background and literature review on current methods in medical paper retrieval, "
    "their limitations, and how a RAG LLM could improve the process. Include insights from the academic summaries."
)
problem_context = (
    "Describe the problem and research gaps identified, focusing on limitations of current retrieval systems and challenges "
    "in building a centralized vector store for academic literature."
)
approach_context = (
    "Detail the proposed Gen AI approach, including the architecture of the RAG LLM, integration with a vector store, "
    "data processing, and experimental design."
)
impact_context = (
    "Discuss the expected impact on healthcare research, including improvements in literature retrieval efficiency, research speed, "
    "and data accessibility."
)
limits_context = (
    "Identify potential limitations and ethical considerations such as data privacy, biases, and scaling challenges."
)
references_context = (
    "List key academic references supporting the proposal, including the papers summarized and additional relevant citations."
)

In [104]:
proposal_sections = {}
proposal_sections["Title"] = generate_section("Title", title_context)
proposal_sections["Abstract"] = generate_section("Abstract", abstract_context)
# Escape ampersands in section titles for LaTeX formatting.
proposal_sections["Background \\& Literature Review"] = generate_section("Background & Literature Review", bkg_context)
proposal_sections["Problem Statement \\& Research Gap"] = generate_section("Problem Statement & Research Gap", problem_context)
proposal_sections["Proposed Gen AI Approach"] = generate_section("Proposed Gen AI Approach", approach_context)
proposal_sections["Expected Impact in Healthcare"] = generate_section("Expected Impact in Healthcare", impact_context)
proposal_sections["Limitations or Ethical Considerations"] = generate_section("Limitations or Ethical Considerations", limits_context)
proposal_sections["References"] = generate_section("References", references_context)

# Generate LaTeX File

In [105]:
latex_sections = {}
for sec, content in proposal_sections.items():
    latex_code = generate_section_latex(sec, content)
    latex_sections[sec] = latex_code

# Assemble final LaTeX document.
final_latex_document = r"""\documentclass[12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{lmodern}
\usepackage{hyperref}
\usepackage{geometry}
\geometry{margin=1in}
\title{%s}
\author{Generated by Unified Research Proposal Agent}
\date{\today}
\begin{document}
\maketitle
""" % (escape_latex(proposal_sections["Title"]).strip())

In [106]:
section_order = [
    "Abstract", "Background \\& Literature Review", "Problem Statement \\& Research Gap",
    "Proposed Gen AI Approach", "Expected Impact in Healthcare",
    "Limitations or Ethical Considerations", "References"
]

for sec in section_order:
    final_latex_document += "\n\\section*{%s}\n%s\n" % (sec, latex_sections[sec])

final_latex_document += "\n\\end{document}"

latex_file = "Assignment 2/final_proposal.tex"
os.makedirs(os.path.dirname(latex_file), exist_ok=True)
with open(latex_file, "w", encoding="utf-8") as f:
    f.write(final_latex_document)
print(f"Final LaTeX proposal written to {latex_file}")

Final LaTeX proposal written to Assignment 2/final_proposal.tex


In [112]:
subprocess.run(["pdflatex", "-output-directory", os.path.dirname(latex_file), latex_file])

This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) (preloaded format=pdflatex)
 \write18 enabled.
entering extended mode
(./Assignment 2/final_proposal.tex
LaTeX2e <2024-11-01>
L3 programming layer <2024-11-02>
(/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/base/article.cls
Document Class: article 2024/06/29 v1.4n Standard LaTeX document class
(/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/base/size12.clo)) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/base/inputenc.sty) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/lm/lmodern.sty) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/hyperref/hyperref.sty (/Users/kyler/Library/TinyTeX/texmf-dist/tex/generic/iftex/iftex.sty) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/graphics/keyval.sty) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/kvsetkeys/kvsetkeys.sty) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/generic/pdf

CompletedProcess(args=['pdflatex', '-output-directory', 'Assignment 2', 'Assignment 2/final_proposal.tex'], returncode=1)

# Generate Beamer Slide Deck

In [108]:
beamer_prompt = (
    "Generate a complete Beamer slide deck in LaTeX (maximum 10 slides) for the following research proposal. "
    "The slides should include a title slide, an overview of the proposal sections, and individual slides for each major section "
    "(Abstract, Background & Literature Review, Problem Statement & Research Gap, Proposed Gen AI Approach, Expected Impact in Healthcare, "
    "Limitations or Ethical Considerations, References) as well as a slide describing the system workflow and user interaction (with a diagram description) "
    "and a conclusion/future work slide. Use advanced LaTeX formatting options such as enumerate, itemize, bold, etc. Output only the LaTeX code.\n\n"
    "Proposal Sections:\n"
)
for sec, content in proposal_sections.items():
    beamer_prompt += f"\\textbf{{{sec}}}: {content}\n\n"

In [109]:
initial_slide_deck = llm.predict(beamer_prompt)
print("Initial Slide Deck LaTeX:\n", initial_slide_deck)
append_to_summary_file("Initial Slide Deck LaTeX:\n" + initial_slide_deck)

Initial Slide Deck LaTeX:
 ```latex
\documentclass{beamer}
\usepackage{graphicx}
\usepackage{amsmath}

\title{MedRAG: Transforming Medical Research Retrieval through AI-Enhanced Vector Stores}
\author{Research Proposal}
\date{\today}

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

\begin{frame}{Overview}
  \tableofcontents
\end{frame}

\section{Abstract}
\begin{frame}{Abstract}
  \textbf{Objective:} Develop a Retrieval-Augmented Generation (RAG) LLM for efficient retrieval of medical literature.\\
  \textbf{Motivation:} Address inefficiencies in accessing medical research.\\
  \textbf{Methodology:} Integrate RAG LLM with a centralized vector store.\\
  \textbf{Impact:} Transform academic research in healthcare by streamlining access to critical information.
\end{frame}

\section{Background \& Literature Review}
\begin{frame}{Background \& Literature Review}
  \textbf{Background:}
  \begin{itemize}
    \item Traditional retrieval systems rely on keyword-based search engines.


In [110]:
slide_deck_revised = iterative_revision(initial_slide_deck, "Slide Deck")
beamer_file = "Assignment 2/slide_deck.tex"
with open(beamer_file, "w", encoding="utf-8") as f:
    f.write(slide_deck_revised)
print(f"Slide deck LaTeX written to {beamer_file}")

Slide deck LaTeX written to Assignment 2/slide_deck.tex


In [113]:
subprocess.run(["pdflatex", "-output-directory", os.path.dirname(beamer_file), beamer_file])

This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) (preloaded format=pdflatex)
 \write18 enabled.
entering extended mode
(./Assignment 2/slide_deck.tex
LaTeX2e <2024-11-01>
L3 programming layer <2024-11-02>

! LaTeX Error: Missing \begin{document}.

See the LaTeX manual or LaTeX Companion for explanation.
Type  H <return>  for immediate help.
 ...                                              
                                                  
l.1 `
     ``latex
? 
! Emergency stop.
 ...                                              
                                                  
l.1 `
     ``latex
!  ==> Fatal error occurred, no output PDF file produced!
Transcript written on "Assignment 2/slide_deck.log".


CompletedProcess(args=['pdflatex', '-output-directory', 'Assignment 2', 'Assignment 2/slide_deck.tex'], returncode=1)