# **Project-P9 for CS60075: Natural Language Processing**
## Automated query processing from passages 
Using BERT Model

# Team Members (Group-3): 
Jaisaikrishnan

Shivam Bhosale

Mayank Agrawal


Arundhuti Nuskar

# **Importing libraries**

In [15]:
# Mount Google Drive at /content/drive so files stored there (the corpus read
# later in this notebook lives under that mount point) are visible to the runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm

In [17]:
!pip install transformers



In [18]:
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [19]:
# Single source of truth for the checkpoint: the tokenizer's vocabulary must
# match the one the model was trained with, so both are loaded from one name.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [20]:
from gensim.summarization.bm25 import BM25

In [21]:
import re
import nltk

# Sentence/word tokenizer data used by word_tokenize and sent_tokenize.
nltk.download('punkt')
from nltk.tokenize import word_tokenize,sent_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords
# Kept as a frozenset: the notebook only ever tests `word in stop_words`, and
# that test runs once per token, so hashing beats scanning a list every time.
stop_words = frozenset(stopwords.words('english'))

# WordNet data backs the lemmatizer used to normalise tokens.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **BERT Model**

In [22]:
class Bert:
    """Retrieve-then-read question answering over a plain-text corpus.

    The corpus is split into passages (one per non-blank line) and indexed with
    BM25.  For each question the best-scoring passage is selected and handed,
    together with the question, to the module-level ``model`` / ``tokenizer``
    pair, which predicts the answer span inside that passage.

    Attributes:
        corpus: The corpus text (with double newlines collapsed once).
        passage_list: The passages, in corpus order.
        bm25: BM25 index built over the tokenized passages.
        average_idf: Mean IDF over the BM25 vocabulary, computed once at build time.
    """

    # Returned whenever no usable answer span is predicted.
    NO_ANSWER = "Unable to find the answer to your question."

    def __init__(self, corpus):
        """Store ``corpus`` and build the passage list and BM25 index from it.

        Raises:
            ValueError: If the corpus contains no non-blank passage.
        """
        self.corpus = corpus
        self.bm25 = None
        self.passage_list = None  # List of all the paragraphs in the corpus
        self.average_idf = None

        self.preprocess()

    @staticmethod
    def tokenize(text):
        """Return the lemmatized, stop-word-free tokens of ``text``.

        The text is lower-cased and every character outside ``[a-z0-9_]`` is
        replaced by a space before word tokenization.
        """
        text = text.lower()
        text = re.sub('[^a-z0-9_]', ' ', text)
        text = re.sub(r'\s+', ' ', text)

        return [wnl.lemmatize(word) for word in word_tokenize(text) if word not in stop_words]

    def preprocess(self):
        """Split the corpus into passages and index them with BM25."""
        self.corpus = re.sub('\n\n', '\n', self.corpus)

        # Skip blank AND whitespace-only lines (e.g. a lone '\r' from CRLF
        # files): such a passage has no tokens and nothing for the reader to
        # extract, yet it could still be picked when all BM25 scores tie.
        passages = [p for p in self.corpus.split('\n') if p.strip()]
        if not passages:
            raise ValueError("corpus contains no non-empty passages")
        self.passage_list = passages

        # remove stopwords, lemmatize, ...
        passage_token_list = [Bert.tokenize(passage) for passage in passages]

        self.bm25 = BM25(passage_token_list)

        # The average IDF depends only on the index, so compute it once here
        # instead of once per question.
        idf_values = [float(v) for v in self.bm25.idf.values()]
        self.average_idf = sum(idf_values) / len(idf_values) if idf_values else 0.0

    def extract_passage(self, question):
        """Return the index in ``passage_list`` of the best BM25 match for ``question``.

        Ties are broken in favour of the higher passage index.
        """
        question_tokens = Bert.tokenize(question)

        # NOTE(review): the two-argument get_scores call is kept from the
        # original code; confirm it matches the installed gensim version.
        scores = self.bm25.get_scores(question_tokens, self.average_idf)

        # max over (score, index) selects the same element a full descending
        # sort of those pairs would put first, without sorting everything.
        return max(range(len(scores)), key=lambda i: (scores[i], i))

    def process_query(self, question_list):
        """Answer every question in ``question_list``.

        Args:
            question_list: Iterable of question strings.

        Returns:
            A list with one answer string per question, in input order.
            ``NO_ANSWER`` is used when the predicted end precedes the predicted
            start or when the predicted span starts at the ``[CLS]`` token.
        """
        answer_list = []

        for question in question_list:
            text = self.passage_list[self.extract_passage(question)]

            # Tokenize question and passage as a pair.  Only the passage is
            # truncated, and only when the pair would exceed the number of
            # positions the model supports (which would otherwise crash it).
            input_ids = tokenizer.encode(
                question,
                text,
                truncation="only_second",
                max_length=model.config.max_position_embeddings,
            )

            # String version of the token ids, used to rebuild the answer.
            tokens = tokenizer.convert_ids_to_tokens(input_ids)

            # Segment ids: 0 up to and including the first [SEP] (question),
            # 1 for everything after it (passage).
            sep_idx = input_ids.index(tokenizer.sep_token_id)
            num_seg_a = sep_idx + 1
            num_seg_b = len(input_ids) - num_seg_a
            segment_ids = [0] * num_seg_a + [1] * num_seg_b

            # Pure inference: no gradients needed, so do not build the graph.
            with torch.no_grad():
                output = model(
                    torch.tensor([input_ids]),
                    token_type_ids=torch.tensor([segment_ids]),
                )

            # Plain ints so they can be used directly for slicing.
            answer_start = int(torch.argmax(output.start_logits))
            answer_end = int(torch.argmax(output.end_logits))

            if answer_end >= answer_start:
                # Re-join word pieces: a '##' prefix marks a continuation of
                # the previous token, anything else starts a new word.
                answer = tokens[answer_start]
                for token in tokens[answer_start + 1:answer_end + 1]:
                    if token.startswith("##"):
                        answer += token[2:]
                    else:
                        answer += " " + token
            else:
                answer = self.NO_ANSWER

            if answer.startswith("[CLS]"):
                answer = self.NO_ANSWER

            answer_list.append(answer)

        return answer_list

# **Question Answering**

In [23]:
# Read the whole corpus up front.  The context manager closes the file handle
# as soon as the text is in memory, and the encoding is stated explicitly so
# the read does not depend on the runtime's locale.
with open("/content/drive/MyDrive/Colab Notebooks/NLP/NLP PROJECT/corpus.txt", "r", encoding="utf-8") as corpus_file:
    content = corpus_file.read()

bert_model = Bert(content)

# Interactive loop: answer one question per prompt until the user types "end".
i = 0
while True:
    i += 1
    question = input("Enter the question: ")
    if question == "end":
        print("Good Bye!")
        break

    print(f"Q{i}: {question}")

    # process_query works on a list of questions and returns a list of answers.
    answer = bert_model.process_query([question])

    print("Ans: ", answer[0])
    print("------------------------------------------------------------")

   

Enter the question: What is the name of the satellite that measured the amount of dust?
Q1: What is the name of the satellite that measured the amount of dust?
Ans:  calipso satellite
------------------------------------------------------------
Enter the question: What did the analysis from the sediment deposits indicate?
Q2: What did the analysis from the sediment deposits indicate?
Ans:  rainfall in the basin during the lgm was lower than for the present
------------------------------------------------------------
Enter the question: Where is the majority of the rainforest contained?
Q3: Where is the majority of the rainforest contained?
Ans:  brazil
------------------------------------------------------------
Enter the question: What is the name of the book written by Archeologist Betty Meggers?
Q4: What is the name of the book written by Archeologist Betty Meggers?
Ans:  amazonia : man and culture in a counterfeit paradise
-----------------------------------------------------------

# **Evaluation (F1 Score)**

In [24]:
def compute_f1(prediction, truth, tokenize=None):
    """Token-level F1 between a predicted answer and the reference answer.

    Args:
        prediction: Predicted answer string.
        truth: Reference answer string.
        tokenize: Optional callable mapping a string to a list of tokens.
            Defaults to ``Bert.tokenize``.

    Returns:
        The F1 score in [0, 1].  If either side has no tokens the score is 1
        when both are empty and 0 otherwise.
    """
    from collections import Counter

    if tokenize is None:
        tokenize = Bert.tokenize

    pred_tokens = tokenize(prediction)
    truth_tokens = tokenize(truth)

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Multiset intersection: precision and recall divide by token counts that
    # include repeats, so the overlap must count repeats as well.  A plain set
    # intersection would score two identical answers containing a repeated
    # word below 1.
    num_common = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

    # if there are no common tokens then f1 = 0
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [28]:
# Evaluation set: questions[k] is scored against ground_truth[k].
questions = ["What is the name of the satellite that measured the amount of dust?",
"What did the analysis from the sediment deposits indicate?",
"Where is the majority of the rainforest contained?",
"What is the name of the book written by Archeologist Betty Meggers?",
"What is the average plant biosmass?",
"When dating rocks, what is the absolute isotopic date applied to?",
"What are the three major types of rock?",
"What types of waves do seismologists use to image the interior of the Earth?",
"What principle relates to the formation of faults and the age of the sequences through which they cut?",
"Where do thrust faults form?"]

ground_truth = ["CALIPSO",
"rainfall in the basin during the LGM was lower than for the present",
"Brazil",
"Amazonia: Man and Culture in a Counterfeit Paradise",
"356 ± 47 tonnes per hectare",
"fossil sequences",
"igneous, sedimentary, and metamorphic",
"seismic waves",
"The principle of cross-cutting relationships",
"In the shallow crust"]

# The two lists are paired by position, so a missing entry would silently
# misalign every score after it.
assert len(questions) == len(ground_truth), "each question needs exactly one reference answer"

bert_model = Bert(content)
predictions = list(bert_model.process_query(questions))

# One F1 value per (prediction, reference) pair, in question order.
f1_scores = [compute_f1(prediction, truth)
             for prediction, truth in zip(predictions, ground_truth)]

f1_score = sum(f1_scores)
best_f1_score = max(f1_scores, default=0)

avg_f1_score = f1_score/len(predictions)

print("Average F1 score is: ", avg_f1_score)
print("Best F1 scores is: ", best_f1_score)

Average F1 score is:  0.8333333333333333
Best F1 scores is:  1.0
