## **Imports**

In [1]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## **Load Data**

In [2]:
# Create a loader for the PDF report; the path is relative to the notebook's working directory.
load_data = UnstructuredPDFLoader("./estimate-global-impacts.pdf")

In [3]:
# Run the loader. `data` is used below as a sequence of documents, each exposing `.page_content`.
data = load_data.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [4]:
# Sanity check: how many documents were loaded and how long the first one is.
print(
    f'{len(data)} document(s) in data',
    f'{len(data[0].page_content)} characters in the document',
    sep='\n',
)

1 document(s) in data
301820 characters in the document


## **Split page content into chunks**

In [5]:
# Splitter configuration: chunk_size of 1000 with no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)

# Split the loaded document(s) into smaller documents that will be embedded individually.
texts = text_splitter.split_documents(data)

In [6]:
# Report how many chunk documents the splitter produced.
print(f'{len(texts)} documents')

315 documents


## **Convert documents into vectors as embeddings**

In [7]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [8]:
import os

# Credentials are read from the environment so that real keys never have to be
# typed into (and saved with) the notebook. Before starting the kernel, export:
#   OPENAI_API_KEY    - key from the OpenAI account
#   PINECONE_API_KEY  - key from the Pinecone account
#   PINECONE_API_ENV  - optional; Pinecone environment name
# When a variable is unset or empty, the original value is used unchanged, so
# the later cells keep working with the same names.
OPEN_API_KEY = os.environ.get('OPENAI_API_KEY') or '<INSERT OPENAPI KEY FROM OPENAI ACCOUNT>'
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') or '<INSERT PINECONE API KEY FROM PINECONE ACCOUNT>'
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV') or 'us-east4-gcp'

In [9]:
# Embedding client authenticated with the OpenAI key; it is passed to the Pinecone vector store below.
embeddings = OpenAIEmbeddings(openai_api_key=OPEN_API_KEY)

In [10]:
# Name of the Pinecone index used for this notebook's chunk embeddings.
index_name = "langchaintest1"

# Configure the Pinecone client with the account key and environment set above.
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

In [11]:
# Embed every chunk and store it in the Pinecone index.
# `from_documents` is used instead of `from_texts([t.page_content ...])` so that
# each chunk's metadata is stored alongside its text rather than being
# discarded; `index_name` is forwarded unchanged.
# NOTE(review): this cell embeds and uploads all chunks each time it runs -
# confirm whether re-running against an already populated index is acceptable.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [12]:
# First retrieval: look up the chunks most similar to the question.
query1 = "What is climate change?"
# NOTE(review): `include_metadata` is forwarded as an extra keyword argument -
# confirm the installed vector-store version actually accepts/uses it.
docs1 = docsearch.similarity_search(
    query1,
    include_metadata=True,
)

In [13]:
# Display the documents returned by the similarity search.
docs1

[Document(page_content='Martens, W.J.M. 1998. “Climate Change, Thermal Stress and Mortality Changes.” Social Science and\n\nMedicine 46(3):331-344.\n\nMartin, P.H. and M.G. Lefebvre. 1995. “Malaria and Climate: Sensitivity of Malaria Potential\n\nTransmission to Climate.” Ambio 24:200-207.\n\nMcMichael, A., A. Githeko, R. Akhtar, R. Caracavallo, D. Gubler, A. Haines, R.S. Kovats, P. Martens, J.\n\nPatz, and A. Sasaki. 2001. “Human Health.” In Climate Change 2001: Impacts, Adaptation and\n\nVulnerability, J. McCarthy, O. Canziani, N. Leary, D. Dokken, and K. White (eds.). New York:\n\nCambridge University Press, pp. 451-485.\n\nMendelsohn, R. 2001. Global Warming and the American Economy: A Regional Assessment of Climate\n\nChange Impacts. Northampton, MA: Edward Elgar.\n\nMendelsohn, R. and J. Neumann (eds.). 1999. The Impacts of Climate Change on the U.S. Economy.\n\nCambridge, UK: Cambridge University Press.\n\nMendelsohn, R. and M.E. Schlesinger. 1997. “Climate Response Functions”, 

In [22]:
# Collect the text of each retrieved document once, keeping first-seen order.
# dict.fromkeys drops repeated strings while preserving insertion order.
context = list(dict.fromkeys(hit.page_content for hit in docs1))

In [23]:
# Display the de-duplicated chunk texts.
context

['face  and  it  can  lead  to  double  counting.  Nor  is  it  always  clear  whether  winners,  those  who  somehow\n\nbenefit  from  climate  change,  should  offset  those  who  stand  to  lose  in  aggregation.  Similarly,  change  in\n\nland use or classification does not measure degree of impact and can allow for double counting as well.\n\nIn  this  study,  we  attempt  to  identify  the  marginal  benefits  associated  with  different  levels  of\n\nclimate change. We do so based on a survey of primarily sectoral studies that have attempted to quantify\n\nglobal impacts of climate change. Instead of converting impacts to a common metric such as dollars, we\n\nretain the different metrics reported by the authors. Clearly, this prohibits us from aggregating our results\n\nacross sectors. Our goal is not to develop a single estimate of global benefits across sectors. Rather, it is to',
 'marginal  benefits,  or  avoided  damages,  associated  with  controlling  climate  change  v

In [20]:
# Build the single context string directly from the search results.
# The previous form, `' '.join(context)`, rebound `context` from a list to a
# string, so running the cell a second time joined individual characters with
# spaces. Deriving the value from `docs1` gives the same string (unique chunk
# texts in first-seen order, separated by one space) and is safe to re-run.
context = ' '.join(dict.fromkeys(doc.page_content for doc in docs1))

In [21]:
# Display the space-joined context string.
context

'face  and  it  can  lead  to  double  counting.  Nor  is  it  always  clear  whether  winners,  those  who  somehow\n\nbenefit  from  climate  change,  should  offset  those  who  stand  to  lose  in  aggregation.  Similarly,  change  in\n\nland use or classification does not measure degree of impact and can allow for double counting as well.\n\nIn  this  study,  we  attempt  to  identify  the  marginal  benefits  associated  with  different  levels  of\n\nclimate change. We do so based on a survey of primarily sectoral studies that have attempted to quantify\n\nglobal impacts of climate change. Instead of converting impacts to a common metric such as dollars, we\n\nretain the different metrics reported by the authors. Clearly, this prohibits us from aggregating our results\n\nacross sectors. Our goal is not to develop a single estimate of global benefits across sectors. Rather, it is to marginal  benefits,  or  avoided  damages,  associated  with  controlling  climate  change  vary  

## **Query docs to obtain answer**

In [14]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [15]:
# Completion model client; temperature is set to 0 and the OpenAI key from the config cell is reused.
llm = OpenAI(
    openai_api_key=OPEN_API_KEY,
    temperature=0,
)

# Question-answering chain built on that model, using the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

In [18]:
# Ask the same question again and fetch the matching chunks to pass to the QA chain.
query2 = "What is climate change?"
docs2 = docsearch.similarity_search(
    query2,
    include_metadata=True,
)

In [19]:
# Run the QA chain over the retrieved chunks; the returned answer is displayed as the cell output.
chain.run(input_documents=docs2, question=query2)

' Climate change is a broad term that refers to the long-term changes in global or regional climate patterns. It can refer to changes in average weather conditions, such as temperature, precipitation, and wind patterns, as well as extreme weather events. It can also refer to changes in the frequency and intensity of extreme weather events, such as heat waves, droughts, floods, and hurricanes.'