In [18]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import UnstructuredPDFLoader
import os
import dotenv
from langchain.prompts import PromptTemplate

dotenv.load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
loader = UnstructuredPDFLoader("notebooks/ijgi-11-00628-v2.pdf")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(texts, embeddings)
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature = 0, model_name = "gpt-3.5-turbo-16k"), chain_type="stuff", retriever=docsearch.as_retriever(), chain_type_kwargs=chain_type_kwargs, return_source_documents = True)
query = """
Choose the availability of the data used in this study from the following options: data vailable via URL, data available upon request, data is not available, not mentioned. The URL need to be a link to the whole dataset used by the author via a data host service, such as Google Drive (https://drive.google.com/), Harvard Dataverse (https://dataverse.harvard.edu/), Figshare (https://figshare.com/). Other type of URLs (e.g., social media data provider) should not be considered. 
Example Answer: 
------
*XXX*
------
"""
qa.run(query)

Created a chunk of size 1584, which is longer than the specified 1000
Created a chunk of size 1348, which is longer than the specified 1000
Created a chunk of size 2246, which is longer than the specified 1000
Created a chunk of size 1456, which is longer than the specified 1000
Created a chunk of size 1319, which is longer than the specified 1000
Created a chunk of size 1044, which is longer than the specified 1000
Created a chunk of size 1133, which is longer than the specified 1000
Created a chunk of size 1327, which is longer than the specified 1000
Created a chunk of size 1048, which is longer than the specified 1000
Created a chunk of size 1047, which is longer than the specified 1000
Created a chunk of size 1073, which is longer than the specified 1000
Created a chunk of size 1210, which is longer than the specified 1000
Created a chunk of size 1062, which is longer than the specified 1000
Created a chunk of size 1377, which is longer than the specified 1000
Created a chunk of s

'The availability of the data used in this study is not mentioned.'