In [193]:
 #! pip install langchain-openai langchain-community langchainhub gpt4all langchain-chroma chromadb langchain pypdf

In [194]:
# Clone books
# ! git clone https://github.com/aridiosilva/AI_Books.git

In [195]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# Source document: a single paper fetched straight from arXiv.
# To index a local folder of PDFs instead, swap in the directory loader:
# loader = DirectoryLoader("AI_Books/", glob="*.pdf", loader_cls=PyPDFLoader, use_multithreading=True)
pdf_source = "https://arxiv.org/pdf/2406.12566"
loader = PyPDFLoader(pdf_source, extract_images=False)

# Parse the PDF into the list of documents used by the rest of the notebook
pages = loader.load()

In [196]:
# Concatenate the text of every loaded page into one space-separated string
txt = ' '.join(page.page_content for page in pages)

In [197]:
# Quick sanity check on what was loaded: number of page documents and
# total character count of the joined text.
print("Pages: ", len(pages))
print("Text length: ", len(txt))

Pages:  16
Text lenght:  63473


In [198]:
# Split Text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking policy: pieces of up to 1000 characters, with 250 characters shared
# between neighbouring pieces; add_start_index records each chunk's offset in
# its source document's metadata.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=250,
    chunk_size=1000,
)

In [199]:
# Apply the splitter to the loaded page documents; `chunks` is what gets
# embedded and stored in the vector database further down.
chunks = text_splitter.split_documents(pages)

In [200]:
# Number of chunks produced by the splitter (displayed as the cell output)
len(chunks)

87

## Vectorstore

In [201]:
#! pip install chromadb

In [202]:
# Remove a previously persisted vector store directory.
# NOTE(review): this deletes `docs1/`, while the vector store below is
# persisted to './docs/' - confirm which directory is meant to be reset.
!rm -rf docs1/

In [203]:
#from langchain.vectorstores import Chroma


In [207]:
# Embedding with Azure OpenAI Ada
from langchain_chroma import Chroma
from langchain_community.embeddings import AzureOpenAIEmbeddings
from langchain_openai import AzureOpenAIEmbeddings
import os

# Azure OpenAI configuration is read from the environment so that no endpoint
# or key is stored in the notebook. Non-secret settings get a default only
# when they are not already configured; existing values are never overwritten.
os.environ.setdefault("OPENAI_API_TYPE", "azure")
os.environ.setdefault("OPENAI_API_VERSION", "2023-05-15")

# Fail fast with a clear message instead of sending requests with an empty
# endpoint or key.
missing_settings = [
    name for name in ("OPENAI_API_BASE", "OPENAI_API_KEY") if not os.environ.get(name)
]
if missing_settings:
    raise RuntimeError(
        "Set these environment variables before building the vector store: "
        + ", ".join(missing_settings)
    )

# Define your embedding function using Azure OpenAI
embedding_function = AzureOpenAIEmbeddings(deployment="text-embedding-ada-002")

# Set the persist directory
persist_directory = './docs/'

# Create the Chroma vector database from documents and save it.
# NOTE(review): the last recorded run of this cell ended with
# "OperationalError: attempt to write a readonly database". Presumably the
# persist directory was not writable or was still held by another Chroma
# client in the same kernel - confirm before relying on this cell.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_function,
    persist_directory=persist_directory
)

print(f"Saved chunks to {persist_directory}")

OperationalError: attempt to write a readonly database

## Prepare DB

In [None]:
# Open the persisted collection for querying, using the same embedding
# function that was used when the chunks were stored.
db = Chroma(embedding_function=embedding_function, persist_directory=persist_directory)

### Small test

In [115]:
# Sanity check: how many entries the underlying collection currently holds.
# Note that `_collection` is a private attribute of the Chroma wrapper.
stored_entries = db._collection.count()
print(stored_entries)

87


In [120]:
# Short keyword-style probe used to smoke-test retrieval
query_text = 'datasets used'

In [122]:
# k is the number of results we want to return.
# Relevance scores are used here rather than raw scores: with
# similarity_search_with_score the score is a distance (lower = closer), so a
# "score < threshold means no match" check would reject the best hits.
# Relevance scores are normalised so that higher means more similar, which is
# what the threshold test below assumes.
results = db.similarity_search_with_relevance_scores(query_text, k=3)

# results[0] is the top hit as a (document, relevance) pair; give up when
# nothing came back or even the top hit is only weakly related.
if not results or results[0][1] < 0.2:
    print("Unable to find matching results.")
else:
    # Show the retrieved chunks separated by a rule, plus where they came from
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {context_text}\nSources: {sources}"
    print(formatted_response)

Response: 4 Experiment
4.1 Datasets and Metrics
Datasets . We conduct our experiments on
two publicly available datasets that focus on
multi-document summarization and long-form
query-answer (QA) respectively, i.e.,WikiPas-
sageQA (Hayashi et al., 2021) and Wiki-
Asp (Hayashi et al., 2021). WikiPassageQA offers
human-annotated quality-evaluated questions and
long-form answers. We chose this dataset because
its answers are generally comprehensive, related
to various aspects of questions, and the answer
length is fairly long. WikiAsp dataset is devised
for generating aspect-based summaries of entities
from 20 domains. We follow (Jiang et al., 2023) to
convert it into open-domain QA settings. To sup-
port our experiments, we first operate some data
pre-processing to ensure that each piece of data con-
tains the question, ground truth answer, sub-aspects
of the question, and their sub-answers. The process
details and statistical information of datasets are
demonstrated in Appendix A.

---


## RAG

In [123]:
from langchain.prompts import ChatPromptTemplate
# from langchain_openai import ChatOpenAI
from langchain_openai import AzureChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
import os

# Chat model served from an Azure OpenAI deployment. The deployment name must
# be supplied through AZURE_OPENAI_DEPLOYMENT. For the API version, fall back
# to OPENAI_API_VERSION (the variable the embedding cell configures) when
# AZURE_OPENAI_API_VERSION is not set, so both cells can share one setting.
model = AzureChatOpenAI(
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT"],
    api_version=(
        os.environ.get("AZURE_OPENAI_API_VERSION")
        or os.environ.get("OPENAI_API_VERSION")
    ),
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

# NOTE(review): this buffer is not passed to `conversation` below, which gets
# its own ConversationBufferMemory(); it is kept in case other cells use it.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# Conversation wrapper around the chat model with its own default buffer
# memory; this is the object the RAG cell calls `predict` on.
conversation = ConversationChain(
    llm=model,
    verbose=False,
    memory=ConversationBufferMemory()
)

# Prompt skeleton: {context} receives the retrieved chunks and {question} the
# user query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
DO NOT give irrelevant information that is not in the context.
---
Answer the question based on the above context: {question}
"""

In [124]:
# Natural-language question for the full retrieve-then-answer run
query_text = (
    "What are the datasets that have been used "
    "for the training?"
)

In [125]:
# Retrieve the 3 best-matching chunks for the question; each entry is a
# (document, score) pair, and only the documents are used downstream.
results = db.similarity_search_with_score(query_text, k=3)

In [126]:
# Keep only the documents from the (document, score) pairs
retrieved_docs = [doc for doc, _score in results]

# Build the prompt: retrieved chunks separated by a rule, then the question
chunk_separator = "\n\n---\n\n"
context_text = chunk_separator.join(doc.page_content for doc in retrieved_docs)
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(question=query_text, context=context_text)

# Ask the model through the conversation chain
response_text = conversation.predict(input=prompt)

# Report the answer together with the source of every retrieved chunk
sources = [doc.metadata.get("source") for doc in retrieved_docs]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response: The datasets used for the training are WikiPassageQA and WikiAsp.
Sources: ['https://arxiv.org/pdf/2406.12566', 'https://arxiv.org/pdf/2406.12566', 'https://arxiv.org/pdf/2406.12566']
