### **Preparing Data**

In [1]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [2]:
# Location of the source PDF, kept in its own variable so the path is easy
# to spot and change.
pdf_path = "../langchain+openai+pinecone/estimate-global-impacts.pdf"
load_data = UnstructuredPDFLoader(pdf_path)

In [3]:
# Run the loader and keep what it returns in `data`; the splitting cell
# passes this value to `split_documents`.
data = load_data.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [4]:
# Chunking settings: pieces of up to 1000 characters with no overlap
# between neighbouring pieces.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded document(s) into chunks; `texts` is what gets indexed.
texts = text_splitter.split_documents(data)

In [5]:
import os

# Credentials are read from environment variables so that real keys never
# have to be pasted into (and saved with) the notebook. When a variable is
# unset or empty, the original placeholder text is kept, which makes it
# obvious which value still has to be provided.
OPEN_API_KEY = os.environ.get('OPENAI_API_KEY') or '<INSERT OPENAPI KEY FROM OPENAI ACCOUNT>'
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') or '<INSERT PINECONE API KEY FROM PINECONE ACCOUNT>'
# Pinecone environment passed to `pinecone.init`; defaults to the value the
# notebook was written against.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV') or 'us-east4-gcp'

In [6]:
# Embedding client constructed with the OpenAI key configured above; it is
# handed to the Pinecone vector store when the chunks are indexed.
embeddings = OpenAIEmbeddings(openai_api_key=OPEN_API_KEY)

In [7]:
# Name of the Pinecone index the chunks are written to and searched in.
index_name = "langchaintest1"

# Connect the Pinecone client using the key and environment configured above.
pinecone_settings = dict(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
pinecone.init(**pinecone_settings)

In [8]:
# Only the raw text of each chunk is sent to the vector store.
# NOTE(review): re-running this cell presumably uploads the same chunks
# again (the context-building cell filters duplicate passages) - confirm.
chunk_texts = [chunk.page_content for chunk in texts]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)

In [9]:
# First question; it is used both for retrieval here and for the manual
# BERT question-answering walkthrough later in the notebook.
query1 = "What is climate change?"
# Fetch the stored chunks most similar to the question.
# NOTE(review): whether `include_metadata` is honoured depends on the
# installed LangChain version - confirm.
docs1 = docsearch.similarity_search(query1, include_metadata=True)

In [10]:
# Display the documents returned by the similarity search.
docs1

[Document(page_content='Martens, W.J.M. 1998. “Climate Change, Thermal Stress and Mortality Changes.” Social Science and\n\nMedicine 46(3):331-344.\n\nMartin, P.H. and M.G. Lefebvre. 1995. “Malaria and Climate: Sensitivity of Malaria Potential\n\nTransmission to Climate.” Ambio 24:200-207.\n\nMcMichael, A., A. Githeko, R. Akhtar, R. Caracavallo, D. Gubler, A. Haines, R.S. Kovats, P. Martens, J.\n\nPatz, and A. Sasaki. 2001. “Human Health.” In Climate Change 2001: Impacts, Adaptation and\n\nVulnerability, J. McCarthy, O. Canziani, N. Leary, D. Dokken, and K. White (eds.). New York:\n\nCambridge University Press, pp. 451-485.\n\nMendelsohn, R. 2001. Global Warming and the American Economy: A Regional Assessment of Climate\n\nChange Impacts. Northampton, MA: Edward Elgar.\n\nMendelsohn, R. and J. Neumann (eds.). 1999. The Impacts of Climate Change on the U.S. Economy.\n\nCambridge, UK: Cambridge University Press.\n\nMendelsohn, R. and M.E. Schlesinger. 1997. “Climate Response Functions”, 

In [11]:
# Collect the text of every retrieved document once, keeping the order in
# which the passages were returned. `dict.fromkeys` drops repeated passages
# while preserving first-seen order.
context = list(dict.fromkeys(hit.page_content for hit in docs1))

In [12]:
# Collapse the list of unique passages into one space-separated string.
# `context` is rebound from a list to a str here, so the isinstance guard
# keeps the cell idempotent: joining an already-joined string would insert
# a space between every character if the cell were run a second time.
if not isinstance(context, str):
    context = ' '.join(context)  # str of context(s)

In [13]:
# Display the joined context string.
context

'Martens, W.J.M. 1998. “Climate Change, Thermal Stress and Mortality Changes.” Social Science and\n\nMedicine 46(3):331-344.\n\nMartin, P.H. and M.G. Lefebvre. 1995. “Malaria and Climate: Sensitivity of Malaria Potential\n\nTransmission to Climate.” Ambio 24:200-207.\n\nMcMichael, A., A. Githeko, R. Akhtar, R. Caracavallo, D. Gubler, A. Haines, R.S. Kovats, P. Martens, J.\n\nPatz, and A. Sasaki. 2001. “Human Health.” In Climate Change 2001: Impacts, Adaptation and\n\nVulnerability, J. McCarthy, O. Canziani, N. Leary, D. Dokken, and K. White (eds.). New York:\n\nCambridge University Press, pp. 451-485.\n\nMendelsohn, R. 2001. Global Warming and the American Economy: A Regional Assessment of Climate\n\nChange Impacts. Northampton, MA: Edward Elgar.\n\nMendelsohn, R. and J. Neumann (eds.). 1999. The Impacts of Climate Change on the U.S. Economy.\n\nCambridge, UK: Cambridge University Press.\n\nMendelsohn, R. and M.E. Schlesinger. 1997. “Climate Response Functions”, Ambio 28 (1999) 362-366

In [1]:
# Hand-written passage used as the question-answering context. Assigning it
# replaces whatever `context` held before this cell ran.
context_sentences = [
    "This study therefore surveyed the literature on global impacts of climate change in specific sectors.",
    "It focused on the literature that examined global impacts up to 2100 and to a limited extent "
    "delineated some regional impacts as well.",
    "It did not attempt to summarize the regional impact literature.",
    "We used the metrics as reported in these studies, such as number of people affected, production, and "
    "primary productivity, as indicators of global impacts.",
    "We used change in global mean temperature (GMT) as the primary indicator of climate change, "
    "recognising that climate change is far more complex than this.",
    "For example, potential changes in regional climate and climate variance associated with a particular "
    "change in GMT can vary widely and encompass not only changes in temperature but also changes in "
    "precipitation and other climatic variables.",
    "We examined different studies to see if they showed a consistent relationship between impacts and "
    "increases in GMT.",
    "In particular, we tried to determine whether damages rise monotonically with increasing GMT, whether "
    "there are thresholds below which there are virtually no impacts, or whether there is a parabolic "
    "relationship, i.e., positive impacts followed by a reversal in sign.",
]

# Sentences are separated by single spaces; the passage also ends with one
# trailing space.
context = " ".join(context_sentences) + " "

In [9]:
# Second question; it is tokenized on its own and put to the
# question-answering pipeline further down.
query2 = 'What are the factors of global impacts?'

### **Importing Libraries**

In [3]:
import torch
from transformers import BertForQuestionAnswering
# from transformers import BertTokenizer
from transformers import AutoTokenizer
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Model and tokenizer are loaded from the same pretrained checkpoint, so
# the checkpoint name is spelled out only once.
qa_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'

model = BertForQuestionAnswering.from_pretrained(qa_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)

Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 736kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [10]:
# Show the token ids the tokenizer produces for `query2` on its own.
tokenizer.encode(query2, truncation=True, padding=True)

[101, 2054, 2024, 1996, 5876, 1997, 3795, 14670, 1029, 102]

In [11]:
# Wrap the loaded model and tokenizer in a question-answering pipeline.
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [12]:
# Ask the second question against the current `context`; the cell output is
# the pipeline's answer.
nlp(question=query2, context=context)

{'score': 0.8394834399223328,
 'start': 359,
 'end': 422,
 'answer': 'number of people affected, production, and primary productivity'}

In [23]:
# Upper bound on the encoded length. 512 is the value the earlier
# (commented-out) version of this call used; it is assumed to be the
# model's maximum sequence length - TODO confirm against the checkpoint.
MAX_INPUT_TOKENS = 512

# Encode the question and the context together as one sequence pair.
# 'only_second' truncates only the context when the pair is too long, so
# the question always survives intact. Pairs that already fit are encoded
# exactly as before.
input_ids = tokenizer.encode(
    query1,
    context,
    truncation='only_second',
    max_length=MAX_INPUT_TOKENS,
)
print(f'The input has a total of {len(input_ids)} tokens.')

The input has a total of 312 tokens.


In [24]:
# Map every id back to its token string so the encoded input can be read.
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# Print one "token  id" row per position. The loop variable is named
# `token_id` rather than `id` so the builtin `id()` is not overwritten in
# the notebook's global namespace.
for token, token_id in zip(tokens, input_ids):
    print(f'{token:8}{token_id:8,}')

[CLS]        101
what       2,054
is         2,003
climate    4,785
change     2,689
?          1,029
[SEP]        102
mart      20,481
##ens      6,132
,          1,010
w          1,059
.          1,012
j          1,046
.          1,012
m          1,049
.          1,012
1998       2,687
.          1,012
“          1,523
climate    4,785
change     2,689
,          1,010
thermal    9,829
stress     6,911
and        1,998
mortality  13,356
changes    3,431
.          1,012
”          1,524
social     2,591
science    2,671
and        1,998
medicine   4,200
46         4,805
(          1,006
3          1,017
)          1,007
:          1,024
331       27,533
-          1,011
344       29,386
.          1,012
martin     3,235
,          1,010
p          1,052
.          1,012
h          1,044
.          1,012
and        1,998
m          1,049
.          1,012
g          1,043
.          1,012
le         3,393
##fe       7,959
##b        2,497
##vre     12,229
.          1,012
1995       2,

In [25]:
# Position of the first [SEP] token: everything up to and including it is
# segment A (ids 0), everything after it is segment B (ids 1).
sep_idx = input_ids.index(tokenizer.sep_token_id)
num_seg_a = sep_idx + 1
num_seg_b = len(input_ids) - num_seg_a

# One segment id per input position.
segment_ids = [int(position > sep_idx) for position in range(len(input_ids))]
assert len(segment_ids) == len(input_ids)

In [26]:
# Run the model on a batch of one. Gradient tracking is switched off
# because the result is only read, never back-propagated through.
with torch.no_grad():
    output = model(
        torch.tensor([input_ids]),
        token_type_ids=torch.tensor([segment_ids]),
    )

In [27]:
# Positions with the highest start and end scores delimit the answer span.
answer_start = output.start_logits.argmax()
answer_end = output.end_logits.argmax()

# A span is only usable when it does not end before it starts. The tokens
# are joined with plain spaces here, so word-piece fragments keep their
# "##" prefix in this first version of the answer.
answer = (
    " ".join(tokens[answer_start:answer_end+1])
    if answer_end >= answer_start
    else 'No answer to the question.'
)

print(f"\nQuestion:\n{query1.capitalize()}")
print(f"\nAnswer:\n{answer.capitalize()}")


Question:
What is climate change?

Answer:
[cls] what is climate change ? [sep]


In [28]:
# Rebuild the answer text from the predicted span, gluing continuation
# pieces (tokens prefixed with "##") onto the token before them and
# separating all other tokens with a space.
if answer_end >= answer_start:
    answer = tokens[answer_start]
    for piece in tokens[answer_start + 1:answer_end + 1]:
        if piece.startswith("##"):
            answer += piece[2:]
        else:
            answer += " " + piece
else:
    # Without this branch `answer` would silently keep whatever an earlier
    # cell left in it, or raise NameError on a fresh kernel.
    answer = "No answer to question."

# A span that begins at the [CLS] token is treated as "no answer".
if answer.startswith("[CLS]"):
    answer = "No answer to question."

In [29]:
# Show the question and the cleaned-up answer, each under its own heading.
print(
    f"\nQuestion:\n{query1.capitalize()}",
    f"\nAnswer:\n{answer.capitalize()}",
    sep="\n",
)


Question:
What is climate change?

Answer:
No answer to question.
