In [2]:
import yaml
import fitz
import torch
import gradio as gr
from PIL import Image
from langchain import hub
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma, FAISS
#from langchain_community.llms import HuggingFacePipeline
#from langchain.chains import ConversationalRetrievalChain
from langchain_community.document_loaders import PyPDFLoader, PDFMinerLoader
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough 
from langchain_core.output_parsers import StrOutputParser
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter,NLTKTextSplitter
from langchain_core.runnables import RunnableParallel
#TEST
from FakeLLM import FakePromptCopyLLM
import re  # For regular expressions
from nltk.corpus import stopwords  # You'll need to install NLTK first
import copy


In [3]:
# Load the PDF at ../documents/2312.10997.pdf with PDFMiner.
# `documents` is re-bound by later cells, so its contents depend on which cell ran last.
pdf_loader = PDFMinerLoader("../documents/2312.10997.pdf")
documents = pdf_loader.load()
def clean_text(text):
  """Re-join words that were hyphenated across a line break.

  Every hyphen that is immediately followed by a newline is deleted together
  with that newline. Nothing else is changed: lowercasing and stripping of
  non-alphanumeric characters are intentionally not applied here.
  """
  line_break_hyphen = re.compile(r'-\n')
  return line_break_hyphen.sub('', text)
def preprocess_documents(documents):
  """Clean the text of every document in place and hand back the same collection.

  Each item's `page_content` is replaced by `clean_text(page_content)`.
  The items are mutated directly (no copy is made); stop-word removal is not applied.
  """
  for document in documents:
    document.page_content = clean_text(document.page_content)
  return documents
# Clean a deep copy so the raw `documents` stay untouched and re-running this
# cell starts from the same input every time.
pre_documents = preprocess_documents(copy.deepcopy(documents))
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=64,
    add_start_index=True,
    length_function=len,
    # Tried in order: blank line, line break, after '.' or '?', after ',' or ';',
    # then a single space. Raw strings avoid invalid-escape warnings.
    separators=["\n\n", "\n", r"(?<=[.?])", r"(?<=[,;])", " "],
    # The look-behind entries are regular expressions. Without this flag the
    # splitter escapes every separator and searches for it as literal text,
    # so the punctuation-based rules would never match.
    is_separator_regex=True,
)
# Regex positive look-behind assertions (?<=...) split right after . ? , ;
all_splits = text_splitter.split_documents(pre_documents)
all_splits

[Document(page_content='Retrieval-Augmented Generation for Large Language Models: A Survey\n\nYunfan Gao 1 , Yun Xiong 2 , Xinyu Gao 2 , Kangxiang Jia 2 , Jinliu Pan 2 , Yuxi Bi 3 , Yi\nDai1 , Jiawei Sun1 , Qianyu Guo4 , Meng Wang 3 and Haofen Wang 1,3 ∗\n1 Shanghai Research Institute for Intelligent Autonomous Systems, Tongji University\n2 Shanghai Key Laboratory of Data Science, School of Computer Science, Fudan University\n3 College of Design and Innovation, Tongji University\n4 School of Computer Science, Fudan University\n\n4\n2\n0\n2', metadata={'source': '../documents/2312.10997.pdf', 'start_index': 0}),
 Document(page_content='4\n2\n0\n2\n\nn\na\nJ\n\n5\n\n]\nL\nC\n.\ns\nc\n[\n\n4\nv\n7\n9\n9\n0\n1\n.\n2\n1\n3\n2\n:\nv\ni\nX\nr\na\n\nAbstract\n\n(RAG)\n\nuntraceable', metadata={'source': '../documents/2312.10997.pdf', 'start_index': 502}),
 Document(page_content='Large Language Models (LLMs) demonstrate\nsignificant capabilities but face challenges such\nas hallucination, outda

In [4]:
# Embed the chunks currently held in `all_splits` and index them in Chroma.
# Earlier experiment (disabled): barlowtwins-CXR.pdf loaded with PDFMinerLoader, split with
# chunk_size=256 / chunk_overlap=32 / add_start_index=False, embedded with
# sentence-transformers/all-MiniLM-L6-v2.
# TODO: try e5-small-v2 or base v2.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")
# Use a dedicated collection: with no collection_name every in-memory Chroma store
# in this kernel writes to the same default collection, so chunks indexed elsewhere
# (e.g. by make_retriever, with a different embedding model) would be mixed into
# this retriever's results.
vectordb = Chroma.from_documents(all_splits, embeddings, collection_name="multilingual_e5_small_chunks")
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 5})


In [5]:
# Spot-check the similarity retriever (k=5) with a question about Naive RAG.
retriever.invoke("What is naive rag")

[Document(page_content='3.1 Naive RAG\nThe Naive RAG research paradigm represents the earliest\nmethodology, which gained prominence shortly after the\nwidespread adoption of ChatGPT. The Naive RAG follows a\ntraditional process that includes indexing, retrieval, and generation. It is also characterized as a “Retrieve-Read” framework [Ma et al., 2023a].', metadata={'source': '../documents/2312.10997.pdf', 'start_index': 12575}),
 Document(page_content='upon the foundation of Naive RAG by adding “Rewrite” and\n“Rerank” modules. However, on the whole, modular RAG\nenjoys greater diversity and flexibility.', metadata={'source': '../documents/2312.10997.pdf', 'start_index': 27158}),
 Document(page_content='Drawbacks in Naive RAG\nNaive RAG faces significant challenges in three key areas:\n“Retrieval,” “Generation,” and “Augmentation”.', metadata={'source': '../documents/2312.10997.pdf', 'start_index': 14950}),
 Document(page_content='3 RAG Framework\nThe RAG research paradigm is continuous

In [3]:
# Display whatever is currently bound to `documents`.
# NOTE(review): the saved output shows the BarlowTwins-CXR paper, not the PDF loaded at the top
# of the notebook, so this cell appears to have been executed out of order — confirm on a fresh run.
documents

[Document(page_content='BarlowTwins-CXR: Enhancing Chest X-Ray\nBased Abnormality Localization with\nSelf-Supervised Learning\n\nHaoyue Sheng1,2,3*, Linrui Ma1,2, Jean-Fran¸cois Samson3,\nDianbo Liu2,4\n\n1*D´epartement d’informatique et de recherche op´erationnelle, Universit´e\nde Montr´eal, 2920 chemin de la Tour, Montr´eal, H3T 1J4, QC, Canada.\n2Mila - Quebec AI Institute, 6666 Rue Saint-Urbain, Montr´eal, H2S\n3H1, QC, Canada.\n3Direction des ressources informationnelles, CIUSSS du\nCentre-Sud-de-l’ˆIle-de-Montr´eal, 400 Blvd. De Maisonneuve Ouest,\nMontr´eal, H3A 1L4, QC, Canada.\n4School of Medicine and College of Design and Engineering, National\nUniversity of Singapore, 21 Lower Kent Ridge Rd, Singapore, 119077,\nSG, Singapore.\n\n*Corresponding author(s). E-mail(s): haoyue.sheng@umontreal.ca;\nContributing authors: linrui.ma@umontreal.ca;\njean-francois.samson.ccsmtl@ssss.gouv.qc.ca; dianbo@nus.edu.sg;\n\nAbstract\n\nBackground: Chest X-ray imaging based abnormality localiza

In [5]:
# Re-split `documents` with a much smaller chunk_size (128) and chunk_overlap (16); no custom
# separators are passed. This re-binds the notebook-level `text_splitter` and `all_splits`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=16, add_start_index=True)
all_splits = text_splitter.split_documents(documents)
all_splits

[Document(page_content='BarlowTwins-CXR: Enhancing Chest X-Ray\nBased Abnormality Localization with\nSelf-Supervised Learning', metadata={'source': '../documents/barlowtwins-CXR.pdf', 'start_index': 0}),
 Document(page_content='Haoyue Sheng1,2,3*, Linrui Ma1,2, Jean-Fran¸cois Samson3,\nDianbo Liu2,4', metadata={'source': '../documents/barlowtwins-CXR.pdf', 'start_index': 101}),
 Document(page_content='1*D´epartement d’informatique et de recherche op´erationnelle, Universit´e', metadata={'source': '../documents/barlowtwins-CXR.pdf', 'start_index': 174}),
 Document(page_content='de Montr´eal, 2920 chemin de la Tour, Montr´eal, H3T 1J4, QC, Canada.', metadata={'source': '../documents/barlowtwins-CXR.pdf', 'start_index': 249}),
 Document(page_content='2Mila - Quebec AI Institute, 6666 Rue Saint-Urbain, Montr´eal, H2S\n3H1, QC, Canada.', metadata={'source': '../documents/barlowtwins-CXR.pdf', 'start_index': 319}),
 Document(page_content='3Direction des ressources informationnelles, CIUSSS d

In [4]:
# Load the BarlowTwins-CXR PDF with PyPDFLoader into separate names (`pdf_loader2`, `documents2`),
# presumably to compare its extracted text with the PDFMiner result — TODO confirm.
pdf_loader2 = PyPDFLoader("../documents/barlowtwins-CXR.pdf")
documents2 = pdf_loader2.load()
documents2

[Document(page_content='BarlowTwins-CXR: Enhancing Chest X-Ray\nBased Abnormality Localization with\nSelf-Supervised Learning\nHaoyue Sheng1,2,3*, Linrui Ma1,2, Jean-Fran¸ cois Samson3,\nDianbo Liu2,4\n1*D´ epartement d’informatique et de recherche op´ erationnelle, Universit´ e\nde Montr´ eal, 2920 chemin de la Tour, Montr´ eal, H3T 1J4, QC, Canada.\n2Mila - Quebec AI Institute, 6666 Rue Saint-Urbain, Montr´ eal, H2S\n3H1, QC, Canada.\n3Direction des ressources informationnelles, CIUSSS du\nCentre-Sud-de-l’ ˆIle-de-Montr´ eal, 400 Blvd. De Maisonneuve Ouest,\nMontr´ eal, H3A 1L4, QC, Canada.\n4School of Medicine and College of Design and Engineering, National\nUniversity of Singapore, 21 Lower Kent Ridge Rd, Singapore, 119077,\nSG, Singapore.\n*Corresponding author(s). E-mail(s): haoyue.sheng@umontreal.ca;\nContributing authors: linrui.ma@umontreal.ca;\njean-francois.samson.ccsmtl@ssss.gouv.qc.ca; dianbo@nus.edu.sg;\nAbstract\nBackground: Chest X-ray imaging based abnormality localiza

In [6]:
# Split the PyPDFLoader output (`documents2`) with chunk_size=1000 and no overlap.
# This re-binds the notebook-level `text_splitter` and `all_splits`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0, add_start_index=True)
all_splits = text_splitter.split_documents(documents2)
all_splits

[Document(page_content='BarlowTwins-CXR: Enhancing Chest X-Ray\nBased Abnormality Localization with\nSelf-Supervised Learning\nHaoyue Sheng1,2,3*, Linrui Ma1,2, Jean-Fran¸ cois Samson3,\nDianbo Liu2,4\n1*D´ epartement d’informatique et de recherche op´ erationnelle, Universit´ e\nde Montr´ eal, 2920 chemin de la Tour, Montr´ eal, H3T 1J4, QC, Canada.\n2Mila - Quebec AI Institute, 6666 Rue Saint-Urbain, Montr´ eal, H2S\n3H1, QC, Canada.\n3Direction des ressources informationnelles, CIUSSS du\nCentre-Sud-de-l’ ˆIle-de-Montr´ eal, 400 Blvd. De Maisonneuve Ouest,\nMontr´ eal, H3A 1L4, QC, Canada.\n4School of Medicine and College of Design and Engineering, National\nUniversity of Singapore, 21 Lower Kent Ridge Rd, Singapore, 119077,\nSG, Singapore.\n*Corresponding author(s). E-mail(s): haoyue.sheng@umontreal.ca;\nContributing authors: linrui.ma@umontreal.ca;\njean-francois.samson.ccsmtl@ssss.gouv.qc.ca; dianbo@nus.edu.sg;\nAbstract\nBackground: Chest X-ray imaging based abnormality localiza

In [9]:
# Re-bind `pdf_loader` / `documents` to the BarlowTwins-CXR PDF, this time read with PyPDFLoader.
pdf_loader = PyPDFLoader("../documents/barlowtwins-CXR.pdf")
documents = pdf_loader.load()

def clean_text(text):
  """Lowercase `text` and collapse each run of whitespace into a single space.

  The pattern matches any whitespace, so newlines and tabs are flattened to
  spaces as well, not just repeated spaces. Stripping of non-alphanumeric
  characters is left disabled.

  NOTE: this re-binds the name `clean_text` defined in an earlier cell, which
  only removed hyphens at line breaks.
  """
  whitespace_run = re.compile(r'\s+')
  return whitespace_run.sub(' ', text.lower())

def remove_stop_words(text, stop_words=None):
  """Remove stop words from `text`.

  Args:
    text: Input string; it is tokenised by splitting on whitespace.
    stop_words: Optional iterable of words to drop. When omitted, the English
      list from `nltk.corpus.stopwords` is used (the NLTK stopwords corpus
      must be available for that lookup to succeed).

  Returns:
    The remaining words joined by single spaces.

  Matching ignores case, so capitalised stop words such as "The" are removed
  too; the surviving words keep their original casing.
  """
  if stop_words is None:
    stop_words = stopwords.words('english')
  vocabulary = {word.lower() for word in stop_words}
  return " ".join(word for word in text.split() if word.lower() not in vocabulary)

def preprocess_documents(documents):
  """Run `clean_text` over every document's `page_content`, in place.

  The same collection that was passed in is returned; its items are mutated,
  so callers that need the raw text must pass a copy. Stop-word removal via
  `remove_stop_words` is not applied.
  """
  for document in documents:
    document.page_content = clean_text(document.page_content)
  return documents
# preprocess_documents mutates its input, so clean a deep copy and keep `documents` raw.
documents_copy = copy.deepcopy(documents)
preprocessed_documents = preprocess_documents(documents_copy)

In [13]:
# Level2
# Split with chunk_size=1000 and no overlap; re-binds `text_splitter` and `all_splits`.
# NOTE(review): this splits the raw `documents`, not `preprocessed_documents` — confirm that is intended.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0, add_start_index=True)
all_splits = text_splitter.split_documents(documents)
all_splits

[Document(page_content='BarlowTwins-CXR: Enhancing Chest X-Ray\nBased Abnormality Localization with\nSelf-Supervised Learning\nHaoyue Sheng1,2,3*, Linrui Ma1,2, Jean-Fran¸ cois Samson3,\nDianbo Liu2,4\n1*D´ epartement d’informatique et de recherche op´ erationnelle, Universit´ e\nde Montr´ eal, 2920 chemin de la Tour, Montr´ eal, H3T 1J4, QC, Canada.\n2Mila - Quebec AI Institute, 6666 Rue Saint-Urbain, Montr´ eal, H2S\n3H1, QC, Canada.\n3Direction des ressources informationnelles, CIUSSS du\nCentre-Sud-de-l’ ˆIle-de-Montr´ eal, 400 Blvd. De Maisonneuve Ouest,\nMontr´ eal, H3A 1L4, QC, Canada.\n4School of Medicine and College of Design and Engineering, National\nUniversity of Singapore, 21 Lower Kent Ridge Rd, Singapore, 119077,\nSG, Singapore.\n*Corresponding author(s). E-mail(s): haoyue.sheng@umontreal.ca;\nContributing authors: linrui.ma@umontreal.ca;\njean-francois.samson.ccsmtl@ssss.gouv.qc.ca; dianbo@nus.edu.sg;\nAbstract\nBackground: Chest X-ray imaging based abnormality localiza

In [None]:
def make_retriever(doc, chunk_size=1000, chunk_overlap=200,
                   model_name="sentence-transformers/all-MiniLM-L6-v2", k=3):
    """Split, embed and index documents, then return a similarity retriever.

    Args:
        doc: Documents to index; passed to `RecursiveCharacterTextSplitter.split_documents`.
        chunk_size: `chunk_size` given to the text splitter.
        chunk_overlap: `chunk_overlap` given to the text splitter.
        model_name: Model name given to `HuggingFaceEmbeddings`.
        k: Number of results requested per query (`search_kwargs={"k": k}`).

    Returns:
        The retriever produced by `Chroma.as_retriever(search_type="similarity", ...)`.
    """
    import uuid  # local import so this definition works without touching the import cell

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True
    )
    all_splits = text_splitter.split_documents(doc)
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # Give every call its own collection. With no collection_name, in-memory Chroma
    # stores created in one kernel share the default collection, so a retriever
    # would also return chunks indexed by earlier calls or other cells.
    vectordb = Chroma.from_documents(
        all_splits, embeddings, collection_name=f"retriever-{uuid.uuid4().hex}"
    )
    return vectordb.as_retriever(search_type="similarity", search_kwargs={"k": k})
# Build a retriever over the cleaned documents; splitting, embedding and indexing happen inside make_retriever.
retriever_pre = make_retriever(preprocessed_documents)

In [None]:
# Ask a broad question to see which chunks the retriever built from preprocessed text returns.
retriever_pre.invoke("What is the main topic of the document?")

In [3]:
# Display whatever is currently bound to `documents`.
# NOTE(review): the saved output shows the RAG survey paper, while the nearest loader cell above
# reads barlowtwins-CXR.pdf, so this output appears to come from a different run — confirm on a fresh kernel.
documents

[Document(page_content='Retrieval-Augmented Generation for Large Language Models: A Survey\nYunfan Gao1,Yun Xiong2,Xinyu Gao2,Kangxiang Jia2,Jinliu Pan2,Yuxi Bi3,Yi\nDai1,Jiawei Sun1,Qianyu Guo4,Meng Wang3and Haofen Wang1,3∗\n1Shanghai Research Institute for Intelligent Autonomous Systems, Tongji University\n2Shanghai Key Laboratory of Data Science, School of Computer Science, Fudan University\n3College of Design and Innovation, Tongji University\n4School of Computer Science, Fudan University\nAbstract\nLarge Language Models (LLMs) demonstrate\nsignificant capabilities but face challenges such\nas hallucination, outdated knowledge, and non-\ntransparent, untraceable reasoning processes.\nRetrieval-Augmented Generation (RAG) has\nemerged as a promising solution by incorporating\nknowledge from external databases. This enhances\nthe accuracy and credibility of the models, particu-\nlarly for knowledge-intensive tasks, and allows for\ncontinuous knowledge updates and integration of\ndomai

In [5]:
def make_retriever(doc, chunk_size=1000, chunk_overlap=200,
                   model_name="sentence-transformers/all-MiniLM-L6-v2", k=3):
    """Split, embed and index documents, then return a similarity retriever.

    Args:
        doc: Documents to index; passed to `RecursiveCharacterTextSplitter.split_documents`.
        chunk_size: `chunk_size` given to the text splitter.
        chunk_overlap: `chunk_overlap` given to the text splitter.
        model_name: Model name given to `HuggingFaceEmbeddings`.
        k: Number of results requested per query (`search_kwargs={"k": k}`).

    Returns:
        The retriever produced by `Chroma.as_retriever(search_type="similarity", ...)`.
    """
    import uuid  # local import so this definition works without touching the import cell

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True
    )
    all_splits = text_splitter.split_documents(doc)
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # Give every call its own collection. With no collection_name, in-memory Chroma
    # stores created in one kernel share the default collection, so a retriever
    # would also return chunks indexed by earlier calls or other cells.
    vectordb = Chroma.from_documents(
        all_splits, embeddings, collection_name=f"retriever-{uuid.uuid4().hex}"
    )
    return vectordb.as_retriever(search_type="similarity", search_kwargs={"k": k})
# Build a retriever over `preprocessed_documents` using make_retriever's built-in settings.
retriever_pre = make_retriever(preprocessed_documents)


In [6]:
# Broad "main topic" query against the retriever built from preprocessed text.
retriever_pre.invoke("What is the main topic of the document?")

[Document(page_content='dispelling ambiguity in entities and terms confirming factual accuracy maintaining context and updating outdated documents optimizing index structures involves adjusting the size of chunks to capture relevant context querying across multiple index paths and incorporating information from the graph structure to capture relevant context by leveraging relation- ships between nodes in a graph data index adding metadata information involves integrating refer- enced metadata such as dates and purposes into chunks for filtering purposes and incorporating metadata like chapters and subsections of references to improve retrieval efficiency alignment optimization addresses alignment issues and disparities between documents by introducing hypothetical questions liet al 2023d into documents to rectify align- ment issues and differences retrieval during the retrieval stage the primary focus is on identifying the appropriate context by calculating the similarity between the q

In [7]:
# Same intent with different wording; the saved output starts with the same chunk as the "main topic" query.
retriever_pre.invoke("What is the document about?")

[Document(page_content='dispelling ambiguity in entities and terms confirming factual accuracy maintaining context and updating outdated documents optimizing index structures involves adjusting the size of chunks to capture relevant context querying across multiple index paths and incorporating information from the graph structure to capture relevant context by leveraging relation- ships between nodes in a graph data index adding metadata information involves integrating refer- enced metadata such as dates and purposes into chunks for filtering purposes and incorporating metadata like chapters and subsections of references to improve retrieval efficiency alignment optimization addresses alignment issues and disparities between documents by introducing hypothetical questions liet al 2023d into documents to rectify align- ment issues and differences retrieval during the retrieval stage the primary focus is on identifying the appropriate context by calculating the similarity between the q

In [9]:
# Third phrasing of the same question; in the saved output it surfaces a different first chunk.
retriever_pre.invoke("What is the central theme of the document?")

[Document(page_content='alternates placing the best docu- ment at the beginning and end of the context window ad- ditionally approaches like cohereai rerank cohere 2023 bge-rerank7 and longllmlingua jiang et al 2023a re- calculate the semantic similarity between relevant text and the query addressing the challenge of interpreting vector-based simulated searches for semantic similarity prompt compression research indicates that noise in re- trieved documents adversely affects rag performance in post-processing the emphasis lies in compressing irrelevant context highlighting pivotal paragraphs and reducing the overall context length approaches such as selective context and llmlingua litman et al 2020 anderson et al 2022 utilize small language models to calculate prompt mu- tual information or perplexity estimating element impor- tance recomp xuet al 2023a addresses this by train- ing compressors at different granularities while long context xuet al 2023b and walking in the memory maze ch

In [16]:
# Rebuild the retriever from whatever `preprocessed_documents` currently holds.
retriever_pre = make_retriever(preprocessed_documents)

In [17]:
# Repeat the "main topic" query against the rebuilt retriever.
# The saved output now begins with BarlowTwins-CXR contribution/acknowledgement text.
retriever_pre.invoke("What is the main topic of the document?")

[Document(page_content='methodology analyzed data was responsible for experiments and results visualization and participated in manuscript drafting and revision lm assisted in developing the research methodology and contributed to the drafting and revision of the manuscript jfs collected and interpreted data and provided expertise in statistical analysis dl contributed to the study design offered statistical analysis expertise assisted in interpreting results and played a significant role in the critical revision of the manuscript all authors read and approved the final manuscript 97 acknowledgements the authors wish to express their gratitude to ciusss du centre-sud-de-l le-de- montr eal for the computational resources and support provided which were essential for the research conducted as part of the graduate internship program we are espe- cially thankful to our department director mathieu mailhot for his mentorship and to chen cheng for his collaborative efforts and valuable contri