In [2]:
# Standard library imports
import os

# Third-party library imports for arXiv API, document handling, and embedding
import arxiv
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize components
os.environ["TOKENIZERS_PARALLELISM"] = "false"
embedding_model = HuggingFaceEmbeddings()
metadata_vector_store = PineconeVectorStore.from_existing_index(embedding=embedding_model, index_name="arxiv-metadata")
chunks_vector_store = PineconeVectorStore.from_existing_index(embedding=embedding_model, index_name="test")

In [None]:
def extract_and_split_pdf_until_references(pdf_path):
    # Create the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
        is_separator_regex=False
    )

    # Load the PDF
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    # Initialize a list to store all the text content
    content = []
    found_references = False

    # Loop over each page and concatenate the text content until "References" is found
    for page in pages:
        if found_references:
            break
        page_text = page.page_content
        if "references" in page_text.lower():
            # Split the text at "References" and take the part before it
            content.append(page_text.split("References")[0])
            found_references = True
        else:
            content.append(page_text)

    # Join the list into a single string
    full_content = ''.join(content)

    # Split the content into chunks
    chunks = text_splitter.split_text(full_content)
    print(chunks)
    print(len(chunks))

    return chunks

def process_and_upload_chunks(document_id):
    print("Downloading Paper")
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[str(document_id)])))
    paper.download_pdf(filename=f"{document_id}.pdf")

    print("Processing & Uploading Paper")
    chunks = extract_and_split_pdf_until_references(f"{document_id}.pdf")

    # Generate embeddings for the chunks
    embeddings = embedding_model.embed_documents(chunks)

    # Add chunks to the vector store
    chunks_vector_store.add_texts(
        texts=chunks,
        embeddings=embeddings,
        metadatas=[{"document_id": document_id} for _ in chunks]
    )
    
    os.remove(f"{document_id}.pdf")
    print("Paper Uploaded. Please Proceed To Ask Your Question.")

def do_chunks_exist_already(document_id):
    filter = {"document_id": {"$eq": document_id}}
    test_query = chunks_vector_store.similarity_search(query="Chunks Existence Check", k=1, filter=filter)
    return bool(test_query)

def process_user_query(document_id):
    context = []
    user_query = input("Please enter your question:\n")
    filter = {"document_id": {"$eq": document_id}}
    search_results = chunks_vector_store.similarity_search(query=user_query, k=20, filter=filter)
    for doc in search_results:
        context.append(doc.page_content)
    return context, user_query

def query_openai_with_context(context, user_query):
    template = """Use The Following Context:
    Context: {context}
    To Answer The Following Question:
    {user_query}
    """
    prompt = ChatPromptTemplate.from_template(template)
    model = OpenAI()
    parser = StrOutputParser()
    chain = prompt | model | parser
    output = chain.invoke({"context": context, "user_query": user_query})
    return output

def select_document_from_results(search_results):
    if not search_results:
        print("No search results found.")
        return None
    print("Top search results based on content and metadata:\n")
    for i, doc in enumerate(search_results, start=1):
        page_content = doc.page_content
        document_id = doc.metadata['document_id']
        print(f"{i}: Research Paper Title & Author: {page_content}\n   Document ID: {document_id}\n")
    user_choice = int(input("Select a paper by entering its number: ")) - 1
    if 0 <= user_choice < len(search_results):
        selected_doc_id = search_results[user_choice].metadata['document_id']
        print(f"\nYou selected document ID: {selected_doc_id}")
        return selected_doc_id
    else:
        print("\nInvalid selection. Please run the process again and select a valid number.")
        return None

def main():
    initial_query = input("Enter the title or topic of the paper you're interested in: ")
    search_results = metadata_vector_store.similarity_search(query=initial_query, k=5)
    selected_doc_id = select_document_from_results(search_results)
    if selected_doc_id:
        if not do_chunks_exist_already(selected_doc_id):
            process_and_upload_chunks(selected_doc_id)
        context, user_query = process_user_query(selected_doc_id)
        response = query_openai_with_context(context, user_query)
        print("Response from AI:", response)

if __name__ == "__main__":
    main()

Top search results based on content and metadata:

1: Research Paper Title & Author: FaaF: Facts as a Function for the evaluation of RAG systems by Vasileios Katranidis, Gabor Barany
   Document ID: 2403.03888

2: Research Paper Title & Author: The Chronicles of RAG: The Retriever, the Chunk and the Generator by Paulo Finardi, Leonardo Avila, Rodrigo Castaldoni, Pedro Gengo, Celio Larcher, Marcos Piau, Pablo Costa, Vinicius Caridá
   Document ID: 2401.07883

3: Research Paper Title & Author: RAFT: Adapting Language Model to Domain Specific RAG by Tianjun Zhang, Shishir G. Patil, Naman Jain, Sheng Shen, Matei Zaharia, Ion Stoica, Joseph E. Gonzalez
   Document ID: 2403.10131

4: Research Paper Title & Author: Consequences of a Goedel's misjudgment by Giuseppe Raguni
   Document ID: 1503.03087

5: Research Paper Title & Author: Bohmian Zitterbewegung by Giuseppe Ragunì
   Document ID: 2106.05827


You selected document ID: 2401.07883


NotFoundError: Error code: 404 - {'error': {'message': 'This is a chat model and not supported in the v1/completions endpoint. Did you mean to use v1/chat/completions?', 'type': 'invalid_request_error', 'param': 'model', 'code': None}}