In [2]:
# Standard library imports
import os

# Third-party library imports for arXiv API, document handling, and embedding
import arxiv
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAI

# --- Shared components used by every function in this notebook ---

# Turn tokenizer parallelism off through the environment before the
# embedding model is constructed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
embedding_model = HuggingFaceEmbeddings()

# Both Pinecone indexes already exist and are queried with the same embedding model.
metadata_vector_store = PineconeVectorStore.from_existing_index(
    index_name="arxiv-metadata",
    embedding=embedding_model,
)
chunks_vector_store = PineconeVectorStore.from_existing_index(
    index_name="arxiv-project-chunks",
    embedding=embedding_model,
)

# Splitter that turns page text into chunks before upload.
semantic_chunker = SemanticChunker(
    embeddings=embedding_model,
    buffer_size=1,
    add_start_index=False,
)


In [7]:
def process_and_upload_chunks(document_id):
    """Download an arXiv paper, chunk its text, and upload the chunks to Pinecone.

    The PDF is saved as ``<document_id>.pdf`` in the working directory, loaded
    page by page, split with ``semantic_chunker``, and every chunk is stored in
    ``chunks_vector_store`` with ``{"document_id": document_id}`` as metadata.
    The temporary PDF is removed afterwards, even when a step fails.

    Args:
        document_id: arXiv identifier of the paper (converted to ``str`` for
            the arXiv lookup).

    Raises:
        ValueError: if arXiv returns no paper for ``document_id`` or if no text
            chunks could be extracted from the PDF.
    """
    pdf_path = f"{document_id}.pdf"

    print("Downloading Paper")
    search = arxiv.Search(id_list=[str(document_id)])
    # A default of None avoids a bare StopIteration when the ID is unknown.
    paper = next(arxiv.Client().results(search), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for document ID {document_id!r}")

    try:
        paper.download_pdf(filename=pdf_path)
        loader = PyPDFLoader(pdf_path)
        print("Processing & Uploading Paper")
        pages = loader.load_and_split()

        chunks = [
            chunk
            for page in pages
            for chunk in semantic_chunker.split_text(page.page_content)
        ]
        if not chunks:
            # Uploading nothing would still report success and leave the
            # question step without any context.
            raise ValueError(f"No text could be extracted from {pdf_path}")

        # Reuse the already-connected store instead of building a second one
        # through the from_texts classmethod.
        chunks_vector_store.add_texts(
            texts=chunks,
            metadatas=[{"document_id": document_id} for _ in chunks],
        )
    finally:
        # Always clean up the temporary PDF; the existence check keeps a failed
        # download from being masked by a FileNotFoundError here.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

    print("Paper Uploaded. Please Proceed To Ask Your Question.")

def do_chunks_exist_already(document_id):
    """Report whether the chunk index already holds entries for ``document_id``.

    Runs a one-result similarity search restricted to the given document ID
    and returns ``True`` when that search yields anything, ``False`` otherwise.
    """
    id_filter = {"document_id": {"$eq": document_id}}
    hits = chunks_vector_store.similarity_search(
        query="Chunks Existence Check",
        k=1,
        filter=id_filter,
    )
    return True if hits else False

def process_user_query(document_id):
    """Prompt for a question and fetch matching chunks of the selected paper.

    Returns:
        A ``(context, user_query)`` tuple: ``context`` is the list of
        ``page_content`` strings from up to five search results filtered to
        ``document_id``; ``user_query`` is the text the user typed.
    """
    user_query = input("Please enter your question:\n")
    matches = chunks_vector_store.similarity_search(
        query=user_query,
        k=5,
        filter={"document_id": {"$eq": document_id}},
    )
    return [match.page_content for match in matches], user_query

def query_openai_with_context(context, user_query):
    """Ask the OpenAI model to answer ``user_query`` using ``context``.

    Args:
        context: the retrieved text to ground the answer in. Either a single
            string or an iterable of chunks; chunks are joined with blank
            lines so the prompt contains plain text rather than the Python
            repr of a list.
        user_query: the question to answer.

    Returns:
        The model output after it has passed through ``StrOutputParser``.
    """
    template = """Use The Following Context:
    Context: {context}
    To Answer The Following Question:
    {user_query}
    """
    if isinstance(context, str):
        context_text = context
    else:
        # `context or []` tolerates None; str() tolerates non-string chunks.
        context_text = "\n\n".join(str(chunk) for chunk in (context or []))

    prompt = ChatPromptTemplate.from_template(template)
    model = OpenAI()
    parser = StrOutputParser()
    chain = prompt | model | parser
    return chain.invoke({"context": context_text, "user_query": user_query})

def select_document_from_results(search_results):
    """List the search results and let the user pick one by number.

    Each result is expected to expose ``page_content`` and a ``metadata``
    mapping containing ``'document_id'``.

    Args:
        search_results: sequence of result documents to choose from.

    Returns:
        The ``document_id`` of the chosen result, or ``None`` when there are no
        results or the user's entry is not a valid number in range.
    """
    if not search_results:
        print("No search results found.")
        return None

    print("Top search results based on content and metadata:\n")
    for i, doc in enumerate(search_results, start=1):
        page_content = doc.page_content
        document_id = doc.metadata['document_id']
        print(f"{i}: Research Paper Title & Author: {page_content}\n   Document ID: {document_id}\n")

    raw_choice = input("Select a paper by entering its number: ")
    try:
        user_choice = int(raw_choice) - 1
    except ValueError:
        # Non-numeric input is handled like an out-of-range number instead of
        # crashing with a traceback.
        user_choice = -1

    if 0 <= user_choice < len(search_results):
        selected_doc_id = search_results[user_choice].metadata['document_id']
        print(f"\nYou selected document ID: {selected_doc_id}")
        return selected_doc_id

    print("\nInvalid selection. Please run the process again and select a valid number.")
    return None

def main():
    """Run the interactive flow: find a paper, index it if needed, answer a question.

    Steps: ask for a title/topic, show the five closest metadata matches, let
    the user choose one, upload the paper's chunks when none are stored yet,
    then ask for a question and print the model's answer. Stops quietly when
    no document is selected.
    """
    topic = input("Enter the title or topic of the paper you're interested in: ")
    candidates = metadata_vector_store.similarity_search(query=topic, k=5)
    chosen_id = select_document_from_results(candidates)
    if not chosen_id:
        return

    # Only download and index the paper the first time it is requested.
    if not do_chunks_exist_already(chosen_id):
        process_and_upload_chunks(chosen_id)

    context, user_query = process_user_query(chosen_id)
    answer = query_openai_with_context(context, user_query)
    print("Response from AI:", answer)

# Start the interactive session when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Top search results based on content and metadata:

1: Research Paper Title & Author: Quantifying Quantumness and the Quest for Queens of Quantum by Olivier Giraud, Petr A. Braun, Daniel Braun
   Document ID: 1002.2158

2: Research Paper Title & Author: Demystifying Quantum Mechanics by Ana Elisa D. Barioni, Felipe B. Mazzi, Elsa Bifano Pimenta, Willian Vieira dos Santos, Marco A. P. Lima
   Document ID: 2106.02161

3: Research Paper Title & Author: A primer on quantum mechanics and its interpretations by Casey Blood
   Document ID: 1001.3080

4: Research Paper Title & Author: A glance beyond the quantum model by Miguel Navascues, Harald Wunderlich
   Document ID: 0907.0372

5: Research Paper Title & Author: Physics as quantum information processing by Giacomo Mauro D'Ariano
   Document ID: 1012.2597


You selected document ID: 1002.2158
Downloading Paper
Processing & Uploading Paper
Paper Uploaded. Please Proceed To Ask Your Question
Response from AI: 
Well, first of all, let me explai