In [5]:
# Standard library imports
import os

# Third-party library imports for arXiv API, document handling, and embedding
import arxiv
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAI, OpenAIEmbeddings

# Initialize components
# Set before any embedding/chunking work runs.
# NOTE(review): presumably intended to silence tokenizer parallelism warnings — confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# One OpenAI embedding model shared by both vector stores and the semantic chunker.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
# Handle to the existing "arxiv-rag-metadata" Pinecone index; main() searches it to find candidate papers.
metadata_vector_store = PineconeVectorStore.from_existing_index(embedding=embedding_model, index_name="arxiv-rag-metadata")
# Handle to the existing "openai-chunks" Pinecone index; queried with a document_id metadata filter.
chunks_vector_store = PineconeVectorStore.from_existing_index(embedding=embedding_model, index_name="openai-chunks")
# Splitter applied to each page's text before the chunks are uploaded.
semantic_chunker = SemanticChunker(embeddings=embedding_model, buffer_size=1, add_start_index=False)


In [6]:
def process_and_upload_chunks(document_id):
    """Download an arXiv paper, chunk its text, and upload the chunks.

    The PDF is saved as ``<document_id>.pdf`` in the working directory, loaded
    with ``PyPDFLoader``, split with the module-level ``semantic_chunker``, and
    the resulting chunks are written to ``chunks_vector_store`` with
    ``{"document_id": <id>}`` metadata so they can be filtered per paper.

    Chunks are added to ``chunks_vector_store`` itself (rather than to a
    separately named index) so that the same index is used for writing here
    and for the filtered similarity searches performed elsewhere in this file.

    Args:
        document_id: arXiv identifier of the paper; coerced to ``str``.

    Raises:
        ValueError: if the arXiv search yields no paper for ``document_id``.
    """
    document_id = str(document_id)
    pdf_path = f"{document_id}.pdf"

    print("Downloading Paper")
    # Default of None avoids leaking a bare StopIteration when nothing matches.
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[document_id])), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for document ID {document_id!r}")

    try:
        paper.download_pdf(filename=pdf_path)
        loader = PyPDFLoader(pdf_path)
        print("Processing & Uploading Paper")
        pages = loader.load_and_split()

        chunks = []
        for page in pages:
            chunks.extend(semantic_chunker.split_text(page.page_content))
    finally:
        # Always clean up the temporary PDF, even if download/parsing failed.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

    if not chunks:
        # Nothing to embed (e.g. no extractable text); skip the upload call.
        print("No text could be extracted from the paper; nothing was uploaded.")
        return

    chunks_vector_store.add_texts(
        texts=chunks,
        metadatas=[{"document_id": document_id} for _ in chunks],
    )
    print("Paper Uploaded. Please Proceed To Ask Your Question.")

def do_chunks_exist_already(document_id):
    """Return True if the chunk store holds at least one chunk for the paper.

    Runs a one-result similarity search against ``chunks_vector_store``
    restricted to entries whose ``document_id`` metadata equals the given id;
    the query text itself is only a placeholder.

    Args:
        document_id: identifier of the paper; coerced to ``str``.

    Returns:
        bool: whether the filtered search returned anything.
    """
    doc_id_filter = {"document_id": {"$eq": str(document_id)}}
    hits = chunks_vector_store.similarity_search(
        query="Chunks Existence Check",
        k=1,
        filter=doc_id_filter,
    )
    return True if hits else False

def process_user_query(document_id):
    """Prompt for a question and fetch the matching chunks of one paper.

    Reads the question from standard input, then runs a similarity search
    (up to 5 results) on ``chunks_vector_store`` limited to entries whose
    ``document_id`` metadata equals the given id.

    Args:
        document_id: identifier of the paper; coerced to ``str``.

    Returns:
        tuple: ``(context, user_query)`` where ``context`` is the list of
        ``page_content`` values of the search results and ``user_query`` is
        the text the user typed.
    """
    doc_id = str(document_id)
    question = input("Please enter your question:\n")
    matches = chunks_vector_store.similarity_search(
        query=question,
        k=5,
        filter={"document_id": {"$eq": doc_id}},
    )
    passages = [match.page_content for match in matches]
    return passages, question

def _format_context(context):
    """Render the context as plain text: strings pass through, iterables are joined."""
    if context is None:
        return ""
    if isinstance(context, str):
        return context
    try:
        return "\n\n".join(str(passage) for passage in context)
    except TypeError:
        # Not iterable: fall back to its string form.
        return str(context)


def query_openai_with_context(context, user_query):
    """Ask the OpenAI model a question, grounded in the supplied context.

    Builds a prompt from a fixed template, pipes it through an ``OpenAI``
    model and a ``StrOutputParser``, and returns the parsed output.

    Args:
        context: the supporting text. May be a single string or an iterable
            of passages; passages are joined with blank lines so the prompt
            contains readable text instead of a Python list repr (brackets,
            quotes, and escaped newlines).
        user_query: the question to answer.

    Returns:
        The output of the prompt | model | parser chain.
    """
    template = """Use The Following Context:
    Context: {context}
    To Answer The Following Question:
    {user_query}
    """
    prompt = ChatPromptTemplate.from_template(template)
    model = OpenAI()
    parser = StrOutputParser()
    chain = prompt | model | parser
    output = chain.invoke({"context": _format_context(context), "user_query": user_query})
    return output

def select_document_from_results(search_results):
    """List the search results and let the user pick one by number.

    Prints each result's ``page_content`` and ``metadata['document_id']``
    with a 1-based index, then reads the user's choice from standard input.

    Args:
        search_results: sequence of result objects exposing ``page_content``
            and a ``metadata`` mapping containing ``'document_id'``.

    Returns:
        The selected document id as a ``str``, or ``None`` when there are no
        results or the entered value is not a valid result number
        (non-numeric input is treated as an invalid selection rather than
        raising ``ValueError``).
    """
    if not search_results:
        print("No search results found.")
        return None
    print("Top search results based on content and metadata:\n")
    for i, doc in enumerate(search_results, start=1):
        page_content = doc.page_content
        document_id = doc.metadata['document_id']
        print(f"{i}: Research Paper Title & Author: {page_content}\n   Document ID: {document_id}\n")
    raw_choice = input("Select a paper by entering its number: ")
    try:
        user_choice = int(raw_choice) - 1
    except ValueError:
        # Non-numeric or empty input: route to the invalid-selection branch below.
        user_choice = -1
    if 0 <= user_choice < len(search_results):
        selected_doc_id = search_results[user_choice].metadata['document_id']
        print(f"\nYou selected document ID: {selected_doc_id}")
        return str(selected_doc_id)
    else:
        print("\nInvalid selection. Please run the process again and select a valid number.")
        return None

def main():
    """Run the interactive paper question-answering flow end to end.

    Steps: ask for a title/topic, search ``metadata_vector_store`` for up to
    5 candidate papers, let the user choose one, upload the paper's chunks if
    none are found for it yet, ask for a question, and print the model's
    answer. Stops early when no paper was selected.
    """
    topic = input("Enter the title or topic of the paper you're interested in: ")
    candidates = metadata_vector_store.similarity_search(query=topic, k=5)
    chosen_id = select_document_from_results(candidates)

    # Nothing selected (no results or invalid choice): nothing more to do.
    if not chosen_id:
        return

    already_indexed = do_chunks_exist_already(chosen_id)
    if not already_indexed:
        process_and_upload_chunks(chosen_id)

    passages, question = process_user_query(chosen_id)
    answer = query_openai_with_context(passages, question)
    print("Response from AI:", answer)

# Entry point: start the interactive flow when this code is executed directly
# (i.e. when __name__ is "__main__"), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Top search results based on content and metadata:

1: Research Paper Title & Author: On the conformal bending of a closed Riemannian manifold by Rirong Yuan
   Document ID: 2210.06627

2: Research Paper Title & Author: On the partial uniform ellipticity and complete conformal metrics with prescribed curvature functions on manifolds with boundary by Rirong Yuan
   Document ID: 2011.08580

3: Research Paper Title & Author: Notes on conformal metrics of negative curvature on manifolds with boundary by Rirong Yuan
   Document ID: 2308.05979

4: Research Paper Title & Author: M-eigenvalues of Riemann Curvature Tensor of Conformally Flat Manifolds by Yun Miao, Liqun Qi, Yimin Wei
   Document ID: 1808.01882

5: Research Paper Title & Author: Conformal scalar curvature rigidity on Riemannian manifolds by Seongtag Kim
   Document ID: 1706.00460

