In [None]:
# Install the third-party packages this notebook uses. `--upgrade` with no version
# pins installs whatever is newest at run time.
# NOTE(review): consider pinning versions so that a re-run is reproducible.
%pip install opendatasets openai unstructured[pdf] gradio langchain-openai aperturedb pandas langchain-community arxiv --upgrade --quiet

In [None]:
import getpass
import json
import os

import arxiv
import gradio as gr
import opendatasets as od
import pandas as pd
import requests
from langchain.callbacks.manager import (
    trace_as_chain_group,
)
from langchain.chains import (
    StuffDocumentsChain, LLMChain
)
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain.schema import HumanMessage, AIMessage
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.vectorstores import ApertureDB
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from unstructured.partition.auto import partition

In [None]:
# Download the Cornell-University/arxiv dataset from Kaggle via opendatasets.
# NOTE(review): presumably needs Kaggle credentials and writes into ./arxiv
# (a later cell reads arxiv/arxiv-metadata-oai-snapshot.json) — confirm.
dataset = 'https://www.kaggle.com/datasets/Cornell-University/arxiv'
od.download(dataset)

In [None]:
def fetch_paper_details(arxiv_id):
    """Download the PDF of an arXiv paper and parse it into document elements.

    The PDF is written to the current working directory and reused on later
    calls, so re-running the notebook does not download the same paper again.
    Delete the file to force a fresh download (e.g. after an interrupted run).

    Args:
        arxiv_id: arXiv identifier such as "0704.0001". Old-style identifiers
            that contain a slash (e.g. "hep-th/9901001") are accepted; the
            slash is replaced in the local file name only.

    Returns:
        The elements returned by ``partition`` for the downloaded PDF.

    Raises:
        ValueError: If the arXiv search returns no paper for ``arxiv_id``.
    """
    # A "/" in the identifier would otherwise be treated as a directory separator
    # and point the download at a directory that does not exist.
    pdf_filename = f"{arxiv_id.replace('/', '_')}.pdf"

    if not os.path.exists(pdf_filename):
        search = arxiv.Search(id_list=[arxiv_id])
        paper = next(arxiv.Client().results(search), None)
        if paper is None:
            raise ValueError(f"No arXiv paper found for id {arxiv_id!r}")
        paper.download_pdf(filename=pdf_filename)

    return partition(pdf_filename)

In [None]:
def _element_text(element):
    """Return the text of one parsed PDF element as a plain string."""
    text = element.text
    if isinstance(text, str):
        return text
    if isinstance(text, (list, tuple)):
        return "".join(str(part) for part in text)
    return "" if text is None else str(text)


papers = []
text_splitter = RecursiveCharacterTextSplitter(
    # Up to 5000 characters per chunk, with up to 200 characters shared between
    # consecutive chunks.
    chunk_size=5000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

sample = 2  # arXiv has over 1.7M articles; only the first `sample` entries are processed

# The snapshot is read one line at a time; each line is the JSON metadata of one paper.
with open("arxiv/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as file:
    # zip() stops after `sample` lines or at end of file, whichever comes first,
    # so a short file no longer feeds an empty string to json.loads.
    for _, line in zip(range(sample), file):
        if not line.strip():
            continue
        data = json.loads(line)

        arxiv_id = data.get("id", "")
        if not arxiv_id:
            continue

        # Download and parse the paper. Elements are separated by blank lines so
        # that words from adjacent elements do not run together and the splitter
        # has paragraph breaks to cut on.
        paper_details = "\n\n".join(
            _element_text(element) for element in fetch_paper_details(arxiv_id)
        )

        chunks = text_splitter.create_documents([paper_details])

        # One Document per chunk, each carrying the paper-level metadata.
        for idx, chunk in enumerate(chunks):
            papers.append(
                Document(
                    page_content=chunk.page_content,
                    id=f"{arxiv_id}_{idx + 1}",  # unique per chunk
                    metadata={
                        'title': data.get("title", ""),
                        'authors': data.get("authors", ""),
                        'submitter': data.get("submitter", ""),
                        'abstract': data.get("abstract", ""),
                        'paper_content': chunk.page_content
                    }
                )
            )

        print(f"{arxiv_id}: {len(chunks)} chunk(s)")

print(f"Processing complete. {len(papers)} chunk(s) are held in `papers`.")

In [None]:
# Show the number of chunks and a short excerpt of the first few, rather than
# dumping the full text of every chunk into the cell output.
len(papers), [(doc.id, doc.page_content[:200]) for doc in papers[:3]]

In [None]:
# Create an ApertureDB connection configuration with the `adb` CLI and make it the active one.
# NOTE(review): `--from-json` presumably prompts for the connection JSON — confirm against the aperturedb CLI docs.
!adb config create --active --from-json

In [None]:
# `ApertureDB` is imported with the other imports at the top of the notebook.

# Take the OpenAI key from the environment, or prompt for it once, so that the
# secret is never written into the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])

# Embed every chunk and store it in ApertureDB.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm.
vector_db = ApertureDB.from_documents(papers, embeddings)

In [None]:

# Retrieve context with an "mmr" search; k=1 means a single chunk is handed to
# the model as context for each question.
retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={"k": 1})

# Each retrieved document is rendered as just its text.
document_prompt = PromptTemplate(
    input_variables=["page_content"],
    template="{page_content}",
)
document_variable_name = "context"

# Take the OpenAI key from the environment, or prompt for it once, so that the
# secret is never written into the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

llm = ChatOpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    model="gpt-4o",
    temperature=0,
)

### Set up the chain that answers the question from the retrieved context

prompt_template = """Use the following pieces of context to answer user questions. If you don't know the answer, just say that you don't know, don't try to make up an answer.

--------------

{context}"""
system_prompt = SystemMessagePromptTemplate.from_template(prompt_template)
# Named separately from the search-query prompt below; both used to share the
# name `prompt`, which made it easy to wire the wrong one into a chain.
answer_prompt = ChatPromptTemplate(
    messages=[
        system_prompt,
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template("{question}"),
    ]
)
llm_chain = LLMChain(llm=llm, prompt=answer_prompt)
combine_docs_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_prompt=document_prompt,
    document_variable_name=document_variable_name,
    document_separator="---------"
)

### Set up a chain that controls how the search query for the vectorstore is generated

# This controls how the search query is generated.
# Takes `chat_history` and `question` as input variables.
template = """Combine the chat history and follow up question into a search query.

Chat History:

{chat_history}

Follow up question: {question}
"""
question_prompt = PromptTemplate.from_template(template)
question_generator_chain = LLMChain(llm=llm, prompt=question_prompt)


### Create our function to use

def _message_text(content):
    """Flatten the content of one chat message to plain text.

    Accepts ``None``, a string, a dict with a ``"text"`` entry, or a list/tuple
    of such parts. Parts without text contribute nothing.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        return str(content.get("text", ""))
    if isinstance(content, (list, tuple)):
        return "".join(_message_text(part) for part in content)
    return str(content)


def _history_turns(history):
    """Normalise chat history to a list of ``(role, text)`` pairs.

    Two layouts are understood: a list of ``[user, assistant]`` pairs, and a
    list of dicts with ``"role"`` and ``"content"`` keys. Empty turns (for
    example a missing reply) are dropped.
    """
    turns = []
    for entry in history or []:
        if isinstance(entry, dict):
            candidates = [(entry.get("role"), entry.get("content"))]
        else:
            human, ai = entry
            candidates = [("user", human), ("assistant", ai)]
        for role, content in candidates:
            text = _message_text(content)
            if role in ("user", "assistant") and text:
                turns.append((role, text))
    return turns


def qa_response(message, history):
    """Answer ``message`` using the retrieved paper chunks and the chat history.

    Args:
        message: The user's latest question.
        history: Earlier turns of the conversation, either as
            ``[user, assistant]`` pairs or as ``{"role", "content"}`` dicts.

    Returns:
        The text produced by ``combine_docs_chain`` for the question.
    """
    turns = _history_turns(history)

    # Plain-text transcript for the search-query chain. A blank line precedes
    # every user turn after the first, so exchanges stay visually separated.
    transcript_lines = []
    for role, text in turns:
        if role == "user" and transcript_lines:
            transcript_lines.append("")
        speaker = "Human" if role == "user" else "Assistant"
        transcript_lines.append(f"{speaker}: {text}")
    convo_string = "\n".join(transcript_lines)

    # The same history as LangChain message objects for the answering chain.
    messages = [
        HumanMessage(content=text) if role == "user" else AIMessage(content=text)
        for role, text in turns
    ]

    # Wrap all actual calls to chains in a trace group.
    with trace_as_chain_group("qa_response") as group_manager:

        # Generate search query.
        search_query = question_generator_chain.run(
            question=message,
            chat_history=convo_string,
            callbacks=group_manager
        )

        # Retrieve relevant docs. `invoke` replaces the deprecated
        # `get_relevant_documents`; callbacks are passed through the run config.
        docs = retriever.invoke(search_query, config={"callbacks": group_manager})

        # Answer question.
        return combine_docs_chain.run(
            input_documents=docs,
            chat_history=messages,
            question=message,
            callbacks=group_manager
        )

### Now we start the app!



In [None]:
# Serve `qa_response` as a chat UI.
# NOTE(review): with debug=True the cell presumably keeps running until interrupted — confirm against the Gradio `launch` docs.
gr.ChatInterface(qa_response).launch(debug=True)