In [36]:
import os
import subprocess

from langchain.chat_models import ChatOpenAI
from PyPDF2 import PdfReader
from crewai_tools import SerperDevTool

In [37]:
REVISION_ROUNDS = 1

# Defining the Research Topic + Additional Papers

In [38]:
def append_to_summary_file(text, file_path="Assignment 2/summaries/intermediate_steps.txt"):
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(text + "\n\n")

def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text

def parse_serper_response(response_str):
    """
    Parse the literal string output from Serper.
    Assumes segments are separated by '---' and each segment contains lines starting with "Title:" and "Snippet:".
    Returns a list of tuples (title, snippet).
    """
    segments = response_str.split('---')
    papers = []
    for seg in segments:
        seg = seg.strip()
        if not seg:
            continue
        title = ""
        snippet = ""
        for line in seg.split('\n'):
            if line.startswith("Title:"):
                title = line[len("Title:"):].strip()
            elif line.startswith("Snippet:"):
                snippet = line[len("Snippet:"):].strip()
        if title or snippet:
            papers.append((title, snippet))
    return papers

def get_additional_google_scholar_papers(query, serper):
    # note serper output is a literal string.
    response_str = serper.run(search_query=query)
    parsed = parse_serper_response(response_str)
    results_text = ""
    for i, (title, snippet) in enumerate(parsed):
        if i >= 5:
            break
        results_text += f"Title: {title}\nSnippet: {snippet}\n\n"
    if not results_text.strip():
        results_text = "No additional papers found from Google Scholar."
    return results_text

def escape_latex(text):
    return text.replace("&", r"\&")

def iterative_revision(text, section_name):
    """Iteratively revise the given LaTeX text for REVISION_ROUNDS rounds.
    The LLM is instructed to output only the LaTeX code."""
    revised_text = text
    for round in range(REVISION_ROUNDS):
        prompt = (
            f"Revise the following {section_name} LaTeX code to improve clarity, structure, and formatting. "
            "Output only the final LaTeX code without any additional commentary.\n\n"
            f"{revised_text}\n"
            f"Revision Round: {round+1}"
        )
        revised_text = llm.predict(prompt)
        append_to_summary_file(f"Revision Round {round+1} for {section_name}:\n{revised_text}")
    return revised_text

In [39]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
serper = SerperDevTool(api_key=os.getenv("SERPER_API_KEY"))

In [40]:
research_topic = (
    "Developing a Retrieval-Augmented Generation (RAG) LLM for retrieval of medical papers, "
    "enabling a centralized vector store to mass pull papers, articles, and journals."
)
print(f"Selected Research Topic:\n{research_topic}")
append_to_summary_file(f"Selected Research Topic:\n{research_topic}")

Selected Research Topic:
Developing a Retrieval-Augmented Generation (RAG) LLM for retrieval of medical papers, enabling a centralized vector store to mass pull papers, articles, and journals.


In [41]:
gs_query = "RAG LLM retrieval of medical papers centralized vector store"
additional_papers = get_additional_google_scholar_papers(gs_query, serper)
print("Additional Google Scholar Papers:\n", additional_papers)
append_to_summary_file(f"Additional Google Scholar Papers:\n{additional_papers}")

Using Tool: Search the internet
Additional Google Scholar Papers:
 Title: 
Snippet: RAG enhances LLM's capabilities by giving access to different information sources in real-time and seamlessly integrating them with processing.

Title: Developing Retrieval Augmented Generation (RAG) based LLM ...
Snippet: This paper presents an experience report on the development of Retrieval Augmented Generation (RAG) systems using PDF documents as the primary data source.

Title: Retrieval-augmented generation for generative artificial intelligence ...
Snippet: Retrieval-augmented generation (RAG) enables models to generate more reliable content by leveraging the retrieval of external knowledge.

Title: Evaluating Medical Retrieval-Augmented Generation (RAG) with ...
Snippet: In this overview, we'll explore RAG's growing role in healthcare, focusing on its potential to transform applications like drug discovery and clinical trials.

Title: What is retrieval-augmented generation? - Red Hat
Snippet: R

# Co-Thinking For Proposal Development

In [42]:
prompt_brainstorm = (
    f"Brainstorm at least 3 innovative research ideas for applying Gen AI to develop a RAG LLM for retrieval of medical papers. "
    "Consider aspects like a centralized vector store, mass retrieval, and integration with academic databases. "
    "Provide brief descriptions for each idea."
)
ideas = llm.predict(prompt_brainstorm)
print("Brainstormed Ideas:\n", ideas)
append_to_summary_file(f"Brainstormed Ideas:\n{ideas}")

Brainstormed Ideas:
 1. Utilizing Gen AI to create a centralized vector store for medical papers: By leveraging Gen AI technology, researchers can develop a centralized vector store that stores embeddings of medical papers. This vector store can be used to efficiently retrieve relevant medical papers based on user queries, allowing for faster and more accurate information retrieval.

2. Implementing mass retrieval capabilities using Gen AI for RAG LLM: Gen AI can be used to enhance the mass retrieval capabilities of a RAG LLM for medical papers. By training the model on a large dataset of medical papers, researchers can improve the model's ability to retrieve relevant papers in bulk, making it easier for users to access a wide range of information quickly and efficiently.

3. Integrating Gen AI-powered RAG LLM with academic databases for enhanced search functionality: Researchers can integrate a Gen AI-powered RAG LLM with academic databases to provide users with enhanced search functi

In [43]:
prompt_critique = (
    f"Critique the following research ideas for developing a RAG LLM for retrieval of medical papers. "
    "Evaluate each idea on feasibility, originality, and potential impact with scores (1–5) and brief justifications.\n\nIdeas:\n{ideas}\n"
)
idea_critique = llm.predict(prompt_critique)
print("Idea Critiques:\n", idea_critique)
append_to_summary_file(f"Idea Critiques:\n{idea_critique}")

Idea Critiques:
 1. Developing a RAG LLM that incorporates domain-specific medical terminology and concepts to improve retrieval of medical papers.
Feasibility: 4 - Feasible as there are existing resources and tools for medical terminology and concepts that can be integrated into the model.
Originality: 3 - While incorporating domain-specific terminology is not entirely new, the focus on medical papers specifically adds a level of originality.
Potential Impact: 5 - Improving retrieval of medical papers can have a significant impact on healthcare research and decision-making.

2. Creating a RAG LLM that utilizes deep learning techniques to analyze the context and content of medical papers for more accurate retrieval.
Feasibility: 3 - Deep learning techniques can be complex and resource-intensive, but with the right expertise and resources, it is feasible.
Originality: 4 - Utilizing deep learning for context analysis in medical papers is a novel approach that can lead to more accurate re

In [44]:
prompt_gaps = (
    f"Based on academic literature, identify the key research gaps related to developing a RAG LLM for retrieval of medical papers. "
    "Focus on limitations in current retrieval systems, vector store challenges, and integrating diverse academic sources."
)
research_gaps = llm.predict(prompt_gaps)
print("Identified Research Gaps:\n", research_gaps)
append_to_summary_file(f"Identified Research Gaps:\n{research_gaps}")

Identified Research Gaps:
 1. Limited effectiveness of current retrieval systems: Many existing retrieval systems for medical papers rely on keyword-based search algorithms, which may not always capture the nuances of medical terminology and concepts. There is a need for more sophisticated retrieval systems that can better understand the context and semantics of medical literature.

2. Challenges with vector store implementation: While vector space models have shown promise in improving retrieval accuracy, there are still challenges in implementing and optimizing these models for large-scale medical document collections. Research is needed to address issues such as scalability, efficiency, and the integration of domain-specific knowledge into vector representations.

3. Integration of diverse academic sources: Medical literature is published across a wide range of academic sources, including journals, conference proceedings, and preprint repositories. Developing a RAG LLM that can effe

In [45]:
prompt_structure = (
    f"Generate a draft research proposal structure for the topic:\n'{research_topic}'\n"
    "Incorporate the following:\n"
    f"Research Ideas:\n{ideas}\n\n"
    f"Research Gaps:\n{research_gaps}\n\n"
    "The structure should include the following sections:\n"
    "Title, Abstract (150–250 words), Background & Literature Review, Problem Statement & Research Gap, "
    "Proposed Gen AI Approach, Expected Impact in Healthcare, Limitations or Ethical Considerations, and References."
)
proposal_structure = llm.predict(prompt_structure)
print("Draft Proposal Structure:\n", proposal_structure)
append_to_summary_file(f"Draft Proposal Structure:\n{proposal_structure}")

Draft Proposal Structure:
 Title: Developing a Retrieval-Augmented Generation (RAG) LLM for retrieval of medical papers using Gen AI technology

Abstract:
This research proposal aims to develop a Retrieval-Augmented Generation (RAG) LLM for the retrieval of medical papers by leveraging Gen AI technology. The proposed centralized vector store will store embeddings of medical papers to enable efficient retrieval based on user queries. By enhancing mass retrieval capabilities and integrating with academic databases, the RAG LLM aims to improve the accessibility and usability of medical literature for healthcare professionals and researchers. This research addresses the limitations of current retrieval systems and aims to bridge the gap in integrating diverse academic sources for more effective information retrieval in the medical field.

Background & Literature Review:
Existing retrieval systems for medical papers often rely on keyword-based search algorithms, which may not capture the nu

# Conduct Academic Literature Review

In [46]:
papers_dir = "Papers"
paper_files = [f for f in os.listdir(papers_dir) if f.lower().endswith(".pdf")]

academic_summaries = ""
for paper in paper_files:
    paper_path = os.path.join(papers_dir, paper)
    text = extract_text_from_pdf(paper_path)
    words = text.split()
    summary_text = " ".join(words[:1000])
    prompt_summary = (
        f"Summarize the contributions and insights of the following academic paper excerpt related to Gen AI and medical paper retrieval:\n\n{summary_text}"
    )
    summary = llm.predict(prompt_summary)
    academic_summaries += f"Paper: {paper}\nSummary:\n{summary}\n\n"
academic_summaries += "Additional Google Scholar Papers:\n" + additional_papers
print("Academic Summaries:\n", academic_summaries)
append_to_summary_file(f"Academic Summaries:\n{academic_summaries}")

Academic Summaries:
 Paper: Systematic Review LLM Apps.pdf
Summary:
The academic paper excerpt highlights the current evaluation of Large Language Models (LLMs) in healthcare applications. The findings show that evaluations of LLMs in healthcare are shallow and fragmented, with a focus on accuracy and a lack of consideration for real patient care data. The paper emphasizes the need for standardized evaluations across a broad range of healthcare tasks and specialties, including the use of real patient care data and consideration of dimensions such as fairness, bias, and toxicity. The paper also discusses the potential of LLMs in improving healthcare efficiency and patient outcomes, but notes that their performance in real-world settings is inconsistently evaluated. The authors call for future studies to establish standardized evaluation metrics and broaden testing to include administrative tasks and multiple medical specialties.

Paper: Transformative impact of LLM in Medicine.pdf
Summa

# Generate The Final Proposal

In [58]:
def generate_section(section_name, context):
    initial_prompt = (
        f"Write the {section_name} for a research proposal on the topic:\n'{research_topic}'.\nContext:\n{context}\n"
    )
    draft = llm.predict(initial_prompt)
    revision_prompt = (
        f"Revise the following {section_name} to improve clarity, structure, and academic tone:\n{draft}\n"
    )
    revised = llm.predict(revision_prompt)
    final_text = revised
    append_to_summary_file(f"{section_name} Draft:\n{draft}\n\nRevised {section_name}:\n{final_text}")
    return final_text

In [59]:
title_context = "Generate a concise, catchy title that reflects using a RAG LLM for retrieval of medical papers."
abstract_context = (
    "Summarize the proposal in 150–250 words, including motivation, methodology (centralized vector store and RAG LLM retrieval), "
    "and expected impact on academic research in healthcare."
)
bkg_context = (
    "Provide a comprehensive background and literature review on current methods in medical paper retrieval, "
    "their limitations, and how a RAG LLM could improve the process. Include insights from the academic summaries."
)
problem_context = (
    "Describe the problem and research gaps identified, focusing on limitations of current retrieval systems and the challenges "
    "of building a centralized vector store for academic literature."
)
approach_context = (
    "Detail the proposed Gen AI approach, including the architecture of the RAG LLM, integration with a vector store, "
    "data processing, and experimental design."
)
impact_context = (
    "Discuss the expected impact on healthcare research, including improvements in literature retrieval efficiency, research speed, "
    "and data accessibility."
)
limits_context = (
    "Identify potential limitations and ethical considerations such as data privacy, biases, and scaling challenges."
)
references_context = (
    "List key academic references supporting the proposal, including the papers summarized and additional relevant citations."
)

In [60]:
proposal_sections = {}
proposal_sections["Title"] = generate_section("Title", title_context)
proposal_sections["Abstract"] = generate_section("Abstract", abstract_context)
proposal_sections["Background \\& Literature Review"] = generate_section("Background & Literature Review", bkg_context)
proposal_sections["Problem Statement \\& Research Gap"] = generate_section("Problem Statement & Research Gap", problem_context)
proposal_sections["Proposed Gen AI Approach"] = generate_section("Proposed Gen AI Approach", approach_context)
proposal_sections["Expected Impact in Healthcare"] = generate_section("Expected Impact in Healthcare", impact_context)
proposal_sections["Limitations or Ethical Considerations"] = generate_section("Limitations or Ethical Considerations", limits_context)
proposal_sections["References"] = generate_section("References", references_context)

# Generate LaTeX File

In [61]:
latex_prompt = (
    "Generate a complete LaTeX document for a research proposal on the following topic:\n"
    f"{research_topic}\n\n"
    "The document must include the following sections with proper LaTeX formatting:\n"
    "Title, Abstract, Background & Literature Review, Problem Statement & Research Gap, "
    "Proposed Gen AI Approach, Expected Impact in Healthcare, Limitations or Ethical Considerations, and References.\n\n"
    "Use the following proposal sections:\n\n"
)
for sec, content in proposal_sections.items():
    latex_prompt += f"\\section*{{{sec}}}\n{content}\n\n"

final_latex = llm.predict(latex_prompt)
print("Initial LaTeX Proposal:\n", final_latex)
append_to_summary_file("Initial LaTeX Proposal:\n" + final_latex)

Initial LaTeX Proposal:
 \documentclass{article}
\usepackage{geometry}
\geometry{a4paper, margin=1in}
\usepackage{lipsum}

\begin{document}

\title{Utilizing RAG LLM Technology for Centralized Access to Revolutionize Medical Paper Retrieval}
\date{}
\maketitle

\section*{Abstract}
This research proposal aims to develop a Retrieval-Augmented Generation (RAG) Language Model (LLM) specifically designed for the retrieval of medical papers. The proposed model will utilize a centralized vector store to efficiently pull papers, articles, and journals in bulk. The motivation behind this proposal is to address the challenges faced by researchers in the healthcare field when searching for relevant literature, which can often be time-consuming and ineffective.

By harnessing the capabilities of the RAG LLM and a centralized vector store, researchers will be able to swiftly and accurately retrieve a large volume of medical literature. The methodology of this proposal involves training the RAG LLM 

In [62]:
final_latex_revised = iterative_revision(final_latex, "Final LaTeX Proposal")
latex_file = "Assignment 2/final_proposal.tex"
os.makedirs(os.path.dirname(latex_file), exist_ok=True)
with open(latex_file, "w", encoding="utf-8") as f:
    f.write(final_latex_revised)
print(f"Final LaTeX proposal written to {latex_file}")

Final LaTeX proposal written to Assignment 2/final_proposal.tex


In [63]:
subprocess.run(["pdflatex", "-output-directory", os.path.dirname(latex_file), latex_file])

This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) (preloaded format=pdflatex)
 \write18 enabled.
entering extended mode
(./Assignment 2/final_proposal.tex
LaTeX2e <2024-11-01>
L3 programming layer <2024-11-02>
(/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/base/article.cls
Document Class: article 2024/06/29 v1.4n Standard LaTeX document class
(/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/base/size10.clo)) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/geometry/geometry.sty (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/graphics/keyval.sty) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/generic/iftex/ifvtex.sty (/Users/kyler/Library/TinyTeX/texmf-dist/tex/generic/iftex/iftex.sty))) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/lipsum/lipsum.sty (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/l3packages/l3keys2e/l3keys2e.sty (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/l3kernel/expl3.sty (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/l3backe

CompletedProcess(args=['pdflatex', '-output-directory', 'Assignment 2', 'Assignment 2/final_proposal.tex'], returncode=1)

# Generate Beamer Slide Deck

In [64]:
beamer_prompt = (
    "Generate a complete Beamer slide deck in LaTeX (maximum 10 slides) for the following research proposal. "
    "The slides should include:\n"
    "1. A title slide (with title, author, date).\n"
    "2. An overview slide listing the proposal sections.\n"
    "3. A slide for the Abstract.\n"
    "4. A slide for Background & Literature Review.\n"
    "5. A slide for Problem Statement & Research Gap.\n"
    "6. A slide for Proposed Gen AI Approach.\n"
    "7. A slide for Expected Impact in Healthcare.\n"
    "8. A slide for Limitations or Ethical Considerations.\n"
    "9. A slide describing the system workflow and user interaction (include a description of a diagram).\n"
    "10. A conclusion/future work slide.\n\n"
    "Use the following proposal sections in your slides:\n\n"
)
for sec, content in proposal_sections.items():
    beamer_prompt += f"\\textbf{{{sec}}}: {content}\n\n"

In [65]:
slide_deck = llm.predict(beamer_prompt)
print("Initial Slide Deck LaTeX:\n", slide_deck)
append_to_summary_file("Initial Slide Deck LaTeX:\n" + slide_deck)

Initial Slide Deck LaTeX:
 \documentclass{beamer}
\usetheme{Madrid}
\usecolortheme{seagull}

\title{Utilizing RAG LLM Technology for Centralized Access to Revolutionize Medical Paper Retrieval}
\author{Author Name}
\date{\today}

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}{Overview}
\tableofcontents
\end{frame}

\section{Abstract}
\begin{frame}{Abstract}
This research proposal aims to develop a Retrieval-Augmented Generation (RAG) Language Model (LLM) specifically designed for the retrieval of medical papers. The proposed model will utilize a centralized vector store to efficiently pull papers, articles, and journals in bulk. The motivation behind this proposal is to address the challenges faced by researchers in the healthcare field when searching for relevant literature, which can often be time-consuming and ineffective.
\end{frame}

\section{Background \& Literature Review}
\begin{frame}{Background \& Literature Review}
Efficiently retrieving relevant medic

In [66]:
slide_deck_revised = iterative_revision(slide_deck, "Slide Deck")
beamer_file = "Assignment 2/slide_deck.tex"
with open(beamer_file, "w", encoding="utf-8") as f:
    f.write(slide_deck_revised)
print(f"Slide deck LaTeX written to {beamer_file}")

Slide deck LaTeX written to Assignment 2/slide_deck.tex


In [67]:
subprocess.run(["pdflatex", "-output-directory", os.path.dirname(beamer_file), beamer_file])

This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) (preloaded format=pdflatex)
 \write18 enabled.
entering extended mode
(./Assignment 2/slide_deck.tex
LaTeX2e <2024-11-01>
L3 programming layer <2024-11-02>
(/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/beamer/beamer.cls
Document Class: beamer 2024/01/06 v3.71 A class for typesetting presentations
(/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/beamer/beamerbasemodes.sty (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/etoolbox/etoolbox.sty) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/beamer/beamerbasedecode.sty)) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/generic/iftex/iftex.sty) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/beamer/beamerbaseoptions.sty (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/graphics/keyval.sty)) (/Users/kyler/Library/TinyTeX/texmf-dist/tex/latex/geometry/geometry.sty (/Users/kyler/Library/TinyTeX/texmf-dist/tex/generic/iftex/ifvtex.sty)) (/Users/kyler/Library/TinyTeX/te

CompletedProcess(args=['pdflatex', '-output-directory', 'Assignment 2', 'Assignment 2/slide_deck.tex'], returncode=1)