In [1]:
import yaml
import fitz
import torch
import gradio as gr
from PIL import Image
from langchain import hub
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
#from langchain_community.llms import HuggingFacePipeline
#from langchain.chains import ConversationalRetrievalChain
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough 
from langchain_core.output_parsers import StrOutputParser
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnableParallel
#TEST
from FakeLLM import FakePromptCopyLLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pdf_loader = PyPDFLoader("../documents/2312.10997.pdf")
documents = pdf_loader.load()

In [3]:
documents

[Document(page_content='Retrieval-Augmented Generation for Large Language Models: A Survey\nYunfan Gao1,Yun Xiong2,Xinyu Gao2,Kangxiang Jia2,Jinliu Pan2,Yuxi Bi3,Yi\nDai1,Jiawei Sun1,Qianyu Guo4,Meng Wang3and Haofen Wang1,3∗\n1Shanghai Research Institute for Intelligent Autonomous Systems, Tongji University\n2Shanghai Key Laboratory of Data Science, School of Computer Science, Fudan University\n3College of Design and Innovation, Tongji University\n4School of Computer Science, Fudan University\nAbstract\nLarge Language Models (LLMs) demonstrate\nsignificant capabilities but face challenges such\nas hallucination, outdated knowledge, and non-\ntransparent, untraceable reasoning processes.\nRetrieval-Augmented Generation (RAG) has\nemerged as a promising solution by incorporating\nknowledge from external databases. This enhances\nthe accuracy and credibility of the models, particu-\nlarly for knowledge-intensive tasks, and allows for\ncontinuous knowledge updates and integration of\ndomai

In [4]:
import re  # For regular expressions
from nltk.corpus import stopwords  # You'll need to install NLTK first
import copy
def clean_text(text):
  text = text.lower()  # Convert to lowercase
  text = re.sub(r'[^a-z0-9\s-]', '', text)  # Remove non-alphanumeric characters (except space and dash)
  text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single spaces
  return text

def remove_stop_words(text):
  stop_words = set(stopwords.words('english'))
  return " ".join([word for word in text.split() if word not in stop_words])

def preprocess_documents(documents):
  for doc in documents:
    # Apply basic cleaning
    cleaned_text = clean_text(doc.page_content)
    # Optionally remove stop words
    # cleaned_text = remove_stop_words(cleaned_text)  # Uncomment if you want stop word removal
    doc.page_content = cleaned_text
  return documents
documents_copy = copy.deepcopy(documents)
preprocessed_documents = preprocess_documents(documents_copy)
preprocessed_documents

[Document(page_content='retrieval-augmented generation for large language models a survey yunfan gao1yun xiong2xinyu gao2kangxiang jia2jinliu pan2yuxi bi3yi dai1jiawei sun1qianyu guo4meng wang3and haofen wang13 1shanghai research institute for intelligent autonomous systems tongji university 2shanghai key laboratory of data science school of computer science fudan university 3college of design and innovation tongji university 4school of computer science fudan university abstract large language models llms demonstrate significant capabilities but face challenges such as hallucination outdated knowledge and non- transparent untraceable reasoning processes retrieval-augmented generation rag has emerged as a promising solution by incorporating knowledge from external databases this enhances the accuracy and credibility of the models particu- larly for knowledge-intensive tasks and allows for continuous knowledge updates and integration of domain-specific information rag synergistically mer

In [5]:
def make_retriever(doc):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
    all_splits = text_splitter.split_documents(doc)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = Chroma.from_documents(all_splits, embeddings)
    retriever=vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    return retriever
retriever_pre = make_retriever(preprocessed_documents)


In [6]:
retriever_pre.invoke("What is the main topic of the document?")

[Document(page_content='dispelling ambiguity in entities and terms confirming factual accuracy maintaining context and updating outdated documents optimizing index structures involves adjusting the size of chunks to capture relevant context querying across multiple index paths and incorporating information from the graph structure to capture relevant context by leveraging relation- ships between nodes in a graph data index adding metadata information involves integrating refer- enced metadata such as dates and purposes into chunks for filtering purposes and incorporating metadata like chapters and subsections of references to improve retrieval efficiency alignment optimization addresses alignment issues and disparities between documents by introducing hypothetical questions liet al 2023d into documents to rectify align- ment issues and differences retrieval during the retrieval stage the primary focus is on identifying the appropriate context by calculating the similarity between the q

In [7]:
retriever_pre.invoke("What is the document about?")

[Document(page_content='dispelling ambiguity in entities and terms confirming factual accuracy maintaining context and updating outdated documents optimizing index structures involves adjusting the size of chunks to capture relevant context querying across multiple index paths and incorporating information from the graph structure to capture relevant context by leveraging relation- ships between nodes in a graph data index adding metadata information involves integrating refer- enced metadata such as dates and purposes into chunks for filtering purposes and incorporating metadata like chapters and subsections of references to improve retrieval efficiency alignment optimization addresses alignment issues and disparities between documents by introducing hypothetical questions liet al 2023d into documents to rectify align- ment issues and differences retrieval during the retrieval stage the primary focus is on identifying the appropriate context by calculating the similarity between the q

In [9]:
retriever_pre.invoke("What is the central theme of the document?")

[Document(page_content='alternates placing the best docu- ment at the beginning and end of the context window ad- ditionally approaches like cohereai rerank cohere 2023 bge-rerank7 and longllmlingua jiang et al 2023a re- calculate the semantic similarity between relevant text and the query addressing the challenge of interpreting vector-based simulated searches for semantic similarity prompt compression research indicates that noise in re- trieved documents adversely affects rag performance in post-processing the emphasis lies in compressing irrelevant context highlighting pivotal paragraphs and reducing the overall context length approaches such as selective context and llmlingua litman et al 2020 anderson et al 2022 utilize small language models to calculate prompt mu- tual information or perplexity estimating element impor- tance recomp xuet al 2023a addresses this by train- ing compressors at different granularities while long context xuet al 2023b and walking in the memory maze ch

In [16]:
retriever_pre = make_retriever(preprocessed_documents)

In [17]:
retriever_pre.invoke("What is the main topic of the document?")

[Document(page_content='methodology analyzed data was responsible for experiments and results visualization and participated in manuscript drafting and revision lm assisted in developing the research methodology and contributed to the drafting and revision of the manuscript jfs collected and interpreted data and provided expertise in statistical analysis dl contributed to the study design offered statistical analysis expertise assisted in interpreting results and played a significant role in the critical revision of the manuscript all authors read and approved the final manuscript 97 acknowledgements the authors wish to express their gratitude to ciusss du centre-sud-de-l le-de- montr eal for the computational resources and support provided which were essential for the research conducted as part of the graduate internship program we are espe- cially thankful to our department director mathieu mailhot for his mentorship and to chen cheng for his collaborative efforts and valuable contri