# **Project-P9 for CS60075: Natural Language Processing**
## Automated query processing from passages
Using the rule-based Quarc model

# Team Members (Group-3): 
Jaisaikrishnan

Shivam Bhosale

Mayank Agrawal


Arundhuti Nuskar

# **Importing libraries**

In [None]:
import numpy as np
import re
import nltk
import spacy

# spaCy pipeline used throughout the notebook for part-of-speech tags
# (token.pos_) and named entities (doc.ents).
nlp = spacy.load("en_core_web_sm")

# Tokenizer data for word_tokenize / sent_tokenize.
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# Stop-word lists applied to questions and to corpus sentences respectively.
# Questions keep "this"; sentences keep "so", "because" and "from", which the
# rule classes below look for as keywords.
question_stop_words = [w for w in stop_words if not w in ["this"]]
sentence_stop_words = [w for w in stop_words if not w in ["so", "because", "from"]]

# WordNet data for the lemmatizer.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

# Stemmer applied after lemmatization in Quarc.remove_stop_words.
from nltk.stem.porter import PorterStemmer
porter_stemmer  = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Reading the corpus**

In [None]:
# Read the whole passage into memory. The context manager closes the file
# handle, and the explicit encoding keeps the accented characters in the
# passage (e.g. "Amazônia") from depending on the platform default.
with open("/content/corpus.txt", "r", encoding="utf-8") as corpus_file:
  corpus = corpus_file.read()

print(corpus)

The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 

# **Quarc Model**

In [None]:
class Quarc:
  """Shared preprocessing and scoring helpers for the rule-based Quarc models.

  The passage is split into sentences once, at construction time. Subclasses
  score every sentence against a question and return the best-scoring one.
  """

  def __init__(self, corpus):
    """Split ``corpus`` (the passage text) into sentences and preprocess them."""
    self.corpus = corpus

    # Lower-cased, punctuation-free sentences; stop words are still present.
    self.raw_sentence_list = [self.text_preprocess(sent) for sent in sent_tokenize(self.corpus)]

    # Same sentences with stop words dropped and the remaining words
    # lemmatized and stemmed.
    self.preprocessed_sentence_list = [
      self.remove_stop_words(sent, sentence_stop_words) for sent in self.raw_sentence_list
    ]

    # One score slot per sentence.
    self.sentence_score = [0] * len(self.raw_sentence_list)

    # Four Possible Point Values
    self.clue = 3
    self.good_clue = 4
    self.confident = 6
    self.slam_dunk = 20

  @staticmethod
  def text_preprocess(sentence):
    """Lower-case ``sentence`` and replace everything outside [a-z0-9_] by single spaces."""
    sentence = sentence.lower()
    sentence = re.sub('[^a-z0-9_]', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

  @staticmethod
  def tokenize(text):
    """Return the lemmatized, non-stop-word tokens of ``text``."""
    # Reuse text_preprocess so both entry points clean text identically.
    cleaned = Quarc.text_preprocess(text)
    return [wnl.lemmatize(word) for word in word_tokenize(cleaned) if word not in stop_words]

  @staticmethod
  def remove_stop_words(sentence, stop_word_list):
    """Drop the words in ``stop_word_list`` and return the rest, lemmatized then stemmed, space-joined."""
    return " ".join(
      porter_stemmer.stem(wnl.lemmatize(word))
      for word in word_tokenize(sentence)
      if word not in stop_word_list
    )

  @staticmethod
  def contains_name(text):
    """Return True when ``text`` has a proper-noun token and a PERSON entity.

    text ---> stopwords and punctuations are removed...
    """
    doc = nlp(text)
    if not any(token.pos_ == 'PROPN' for token in doc):
      return False
    return any(ent.label_ == 'PERSON' for ent in doc.ents)

  @staticmethod
  def contains_keyword(word_list, text):
    """Return True when any entry of ``word_list`` occurs in ``text``.

    text ---> stopwords and punctuations are removed...

    NOTE: when ``text`` is a string this is a substring test, so a short
    keyword such as "in" also matches inside longer words.
    """
    return any(word in text for word in word_list)

  def word_match(self, question, sentence):
    """Score the words shared by ``question`` and ``sentence``.

    Both arguments are expected with stop words already removed. Every shared
    token tagged as a verb adds ``self.confident`` (6); any other shared token
    adds ``self.clue`` (3).
    """
    sentence_words = set(sentence.split())

    # Shared words, de-duplicated and kept in question order. The previous
    # set-based join had no fixed order, so the text handed to the POS tagger
    # (and therefore the score) could differ from run to run.
    shared_words = [word for word in dict.fromkeys(question.split()) if word in sentence_words]

    score_word_match = 0
    for token in nlp(' '.join(shared_words)):
      if token.pos_ == 'VERB':
        score_word_match += self.confident # self.confident = 6 points
      else:
        score_word_match += self.clue # self.clue = 3 points
    return score_word_match


# **Who Model**

In [None]:
class Who(Quarc):
  """Quarc rules for "who" questions: favour sentences that mention a person."""

  def __init__(self, corpus):
    super().__init__(corpus)

  def get_answer(self, question):
    """Return the cleaned corpus sentence with the highest "who" score.

    ``question`` is the raw question text (stop words and punctuation
    included); it is cleaned and stop-word-filtered here, e.g.
    "Where are you going, this early in the morning?" ---> "go early morning".
    """
    query = self.remove_stop_words(self.text_preprocess(question), question_stop_words)

    # Score every preprocessed sentence against the cleaned question.
    scores = self.sentence_score
    for position, candidate in enumerate(self.preprocessed_sentence_list):
      scores[position] = self.get_score(query, candidate)

    top_score = max(scores)
    winner = scores.index(top_score)

    if top_score == 0:
      print("Alert: max_score is zero!!")

    # The answer is reported from the sentence list that still has stop words.
    return self.raw_sentence_list[winner]

  def get_score(self, question, sentence):
    """Sum of the four "who" rules for one sentence."""
    rule_scores = (
      self.who_rule_1(question, sentence),
      self.who_rule_2(question, sentence),
      self.who_rule_3(question, sentence),
      self.who_rule_4(sentence),
    )
    return sum(rule_scores)

  def who_rule_1(self, question, sentence):
    """Word overlap between question and sentence."""
    return self.word_match(question, sentence)

  def who_rule_2(self, question, sentence):
    """6 points when the question names nobody but the sentence does."""
    if self.contains_name(question):
      return 0
    if self.contains_name(sentence):
      return self.confident # self.confident = 6 points
    return 0

  def who_rule_3(self, question, sentence):
    """4 points when the question names nobody and the sentence contains "name"."""
    if self.contains_name(question):
      return 0
    if self.contains_keyword(["name"], sentence):
      return self.good_clue # self.good_clue = 4 points
    return 0

  def who_rule_4(self, sentence):
    """4 points when the sentence contains a person name."""
    return self.good_clue if self.contains_name(sentence) else 0 # self.good_clue = 4 points

  def extract_answer(self, text, question=None):
    """Return the first PERSON entity of ``text``, or ``text`` itself if there is none.

    Entities are only consulted when ``text`` has at least one proper-noun token.
    """
    parsed = nlp(text)
    if any(token.pos_ == 'PROPN' for token in parsed):
      for entity in parsed.ents:
        if entity.label_ == 'PERSON':
          return str(entity)
    return text


# **When Model**

In [None]:
class When(Quarc):
  """Quarc rules for "when" questions: favour sentences that mention a date or time."""

  # Keywords looked for in candidate sentences, in their dictionary form.
  LAST_SENTENCE_KEYWORDS = ["first", "last", "since", "ago"]
  START_SENTENCE_KEYWORDS = ["start", "begin", "since", "year", "during"]

  def __init__(self, corpus):
    super().__init__(corpus)
    # Candidate sentences are lemmatized and stemmed by remove_stop_words, so
    # the keywords go through the same pipeline; a keyword whose stem differs
    # from its dictionary form could otherwise never match.
    self._last_keywords = self._normalise_keywords(self.LAST_SENTENCE_KEYWORDS)
    self._start_keywords = self._normalise_keywords(self.START_SENTENCE_KEYWORDS)

  def _normalise_keywords(self, words):
    """Lemmatize and stem each keyword the same way corpus sentences are."""
    stems = (self.remove_stop_words(word, []) for word in words)
    # An empty string would match every sentence, so it is discarded.
    return [stem for stem in stems if stem]

  def get_answer(self, question):
    """Return the cleaned corpus sentence with the highest "when" score."""
    # Preprocess the question and remove the stopwords
    question = self.text_preprocess(question)
    question = self.remove_stop_words(question, question_stop_words)

    # Now, find the score of all the sentences...
    for i, sent in enumerate(self.preprocessed_sentence_list):
      self.sentence_score[i] = self.get_score(question, sent)

    max_index = self.sentence_score.index(max(self.sentence_score))

    if self.sentence_score[max_index] == 0:
      print("Alert: max_score is zero!!")

    # The answer is reported from the sentence list that still has stop words.
    return self.raw_sentence_list[max_index]

  def get_score(self, question, sentence):
    """Sum of the three "when" rules for one sentence."""
    score_1 = self.when_rule_1(question, sentence)
    score_2 = self.when_rule_2(question, sentence)
    score_3 = self.when_rule_3(question, sentence)

    return score_1+score_2+score_3

  @staticmethod
  def contains_time(sentence):
    """Return True when ``sentence`` has a DATE or TIME entity.

    TIME is accepted as well as DATE so that this check agrees with
    extract_answer, which returns either kind of entity.
    """
    doc = nlp(sentence)
    return any(ent.label_ in ("DATE", "TIME") for ent in doc.ents)

  def when_rule_1(self, question, sentence):
    """Word overlap plus 4 points, but only for sentences with a time expression."""
    if self.contains_time(sentence):
      return self.word_match(question, sentence) + self.good_clue # self.good_clue = 4 points
    return 0

  def when_rule_2(self, question, sentence):
    """20 points for a "last" question and a sentence with first/last/since/ago."""
    # The question reaches this rule with its stop words (including "the")
    # already removed, so the phrase "the last" can no longer occur in it;
    # the rule therefore looks for "last" alone.
    if self.contains_keyword(["last"], question) and self.contains_keyword(self._last_keywords, sentence):
      return self.slam_dunk # self.slam_dunk = 20
    return 0

  def when_rule_3(self, question, sentence):
    """20 points for a start/begin question and a sentence with a start-like keyword."""
    if self.contains_keyword(["start", "begin"], question) and self.contains_keyword(self._start_keywords, sentence):
      return self.slam_dunk # self.slam_dunk = 20
    return 0

  def extract_answer(self, text, question=None):
    """Return the first DATE or TIME entity of ``text``, or ``text`` itself if there is none."""
    doc = nlp(text)
    for ent in doc.ents:
      if ent.label_ == 'DATE' or ent.label_ == 'TIME':
        return str(ent)

    return text

# **Where Model**

In [None]:
class Where(Quarc):
  """Quarc rules for "where" questions: favour sentences that mention a place."""

  def __init__(self, corpus):
    super().__init__(corpus)

  def get_answer(self, question):
    """Return the cleaned corpus sentence with the highest "where" score."""
    query = self.remove_stop_words(self.text_preprocess(question), question_stop_words)

    # Score every preprocessed sentence against the cleaned question.
    scores = self.sentence_score
    for position, candidate in enumerate(self.preprocessed_sentence_list):
      scores[position] = self.get_score(query, candidate)

    top_score = max(scores)
    winner = scores.index(top_score)

    if top_score == 0:
      print("Alert: max_score is zero!!")

    # The answer is reported from the sentence list that still has stop words.
    return self.raw_sentence_list[winner]

  def get_score(self, question, sentence):
    """Sum of the three "where" rules for one sentence."""
    rule_scores = (
      self.where_rule_1(question, sentence),
      self.where_rule_2(sentence),
      self.where_rule_3(sentence),
    )
    return sum(rule_scores)

  @staticmethod
  def contains_loc(sentence):
    """Return True when ``sentence`` has a GPE entity."""
    return any(entity.label_ == "GPE" for entity in nlp(sentence).ents)

  def where_rule_1(self, question, sentence):
    """Word overlap between question and sentence."""
    return self.word_match(question, sentence)

  def where_rule_2(self, sentence):
    """4 points when the sentence contains one of the location prepositions."""
    prep_list = [
      "in", "at", "on", "above", "below", "near", "inside", "next to",
      "in between", "behind", "under", "inside", "beneath", "far from",
      "across",
    ]
    return self.good_clue if self.contains_keyword(prep_list, sentence) else 0 # self.good_clue = 4

  def where_rule_3(self, sentence):
    """6 points when the sentence contains a location entity."""
    return self.confident if self.contains_loc(sentence) else 0 # self.confident = 6

  def extract_answer(self, text, question=None):
    """Return the first LOC/FAC/GPE/ORG entity of ``text``, or ``text`` itself if there is none."""
    place_labels = ['LOC', 'FAC', 'GPE', 'ORG']
    return next(
      (str(entity) for entity in nlp(text).ents if entity.label_ in place_labels),
      text,
    )

# **Why Model**

In [None]:
class Why(Quarc):
  """Quarc rules for "why" questions.

  Sentences that share words with the question are the "best" sentences; a
  sentence scores for being one, for standing next to one, and for containing
  "want", "so" or "because".
  """

  def __init__(self, corpus):
    super().__init__(corpus)
    # Best Sentence for Why Rule
    self.best_sentences = []
    # Candidate sentences are lemmatized and stemmed by remove_stop_words, so
    # the causal keywords go through the same pipeline before being compared.
    # An empty result would match nothing useful and is discarded.
    normalised = (self.remove_stop_words(word, []) for word in ("so", "because"))
    self._causal_keywords = [word for word in normalised if word]

  def get_answer(self, question):
    """Return the cleaned corpus sentence with the highest "why" score."""
    # Preprocess the question and remove the stopwords
    question = self.text_preprocess(question)
    question = self.remove_stop_words(question, question_stop_words)

    # It will fetch all the best sentences which match the given question...
    self.find_best_sentences(question)

    # Now, find the score of all the sentences. The position is passed along
    # so that the neighbour rules look at the right neighbours even when the
    # same preprocessed sentence occurs more than once.
    for i, sent in enumerate(self.preprocessed_sentence_list):
      self.sentence_score[i] = self.get_score(question, sent, i)

    max_index = self.sentence_score.index(max(self.sentence_score))

    if self.sentence_score[max_index] == 0:
      print("Alert: max_score is zero!!")

    # The answer is reported from the sentence list that still has stop words.
    return self.raw_sentence_list[max_index]

  def get_score(self, question, sentence, index=None):
    """Sum of the five "why" rules for one sentence.

    ``index`` is the position of ``sentence`` in the preprocessed sentence
    list; when omitted, the first occurrence of ``sentence`` is used.
    """
    score_1 = self.why_rule_1(sentence)
    score_2 = self.why_rule_2(sentence, index)
    score_3 = self.why_rule_3(sentence, index)
    score_4 = self.why_rule_4(sentence)
    score_5 = self.why_rule_5(sentence)

    return score_1+score_2+score_3+score_4+score_5

  def find_best_sentences(self, question):
    """Rebuild ``self.best_sentences``: the sentences sharing a word with ``question``."""
    # Rebuilt from scratch so that matches from an earlier question on the
    # same instance do not leak into this one.
    self.best_sentences = [
      sentence for sentence in self.preprocessed_sentence_list
      if self.word_match(question, sentence) > 0
    ]

  def _position(self, sentence, index):
    """Position of ``sentence`` in the preprocessed list (first occurrence if ``index`` is None)."""
    if index is None:
      return self.preprocessed_sentence_list.index(sentence)
    return index

  def why_rule_1(self, sentence):
    """3 points when the sentence is itself a best sentence."""
    if sentence in self.best_sentences:
      return self.clue # self.clue = 3
    return 0

  def why_rule_2(self, sentence, index=None):
    """3 points when the following sentence is a best sentence."""
    cur_ind = self._position(sentence, index)
    sent_len = len(self.preprocessed_sentence_list)
    if cur_ind < sent_len-1 and self.preprocessed_sentence_list[cur_ind+1] in self.best_sentences:
      return self.clue # self.clue = 3
    return 0

  def why_rule_3(self, sentence, index=None):
    """4 points when the preceding sentence is a best sentence."""
    cur_ind = self._position(sentence, index)
    if cur_ind > 0 and self.preprocessed_sentence_list[cur_ind-1] in self.best_sentences:
      return self.good_clue # self.good_clue = 4
    return 0

  def why_rule_4(self, sentence):
    """4 points when the sentence contains "want"."""
    if self.contains_keyword(["want"], sentence):
      return self.good_clue # self.good_clue = 4
    return 0

  def why_rule_5(self, sentence):
    """4 points when the sentence contains the word "so" or "because"."""
    # Compared word by word: a substring test would find "so" inside most
    # longer words, and the keywords are used in their stemmed form because
    # that is the form the sentence words are in.
    words = sentence.split()
    if any(keyword in words for keyword in self._causal_keywords):
      return self.good_clue # self.good_clue = 4
    return 0

  def extract_answer(self, text, question=None):
    """Return the words of ``text`` that are neither in ``question`` nor stop words.

    With ``question`` omitted only stop words are removed. The result is
    joined with single spaces.
    """
    question_words = set(Quarc.text_preprocess(question).split()) if question else set()
    kept = [
      token for token in text.split()
      if token not in question_words and token not in stop_words
    ]
    return " ".join(kept)

# **DateLine Model**

In [None]:
class DateLine(Quarc):
  """Question-side rules that score how likely the answer is the story's dateline.

  Each rule inspects only the question text and returns its points (or 0).
  """

  # contains_keyword is a static method inherited from Quarc, so it has to be
  # reached through ``self``; the bare name is not defined at module level.

  def dateline_rule_1(self, question):
    """4 points when the question contains "happen"."""
    if self.contains_keyword(["happen"], question):
      return self.good_clue # self.good_clue = 4
    return 0

  def dateline_rule_2(self, question):
    """4 points when the question contains both "take" and "place"."""
    if self.contains_keyword(["take"], question) and self.contains_keyword(["place"], question):
      return self.good_clue # self.good_clue = 4
    return 0

  def dateline_rule_3(self, question):
    """20 points when the question contains "this"."""
    if self.contains_keyword(["this"], question):
      return self.slam_dunk # self.slam_dunk = 20
    return 0

  def dateline_rule_4(self, question):
    """20 points when the question contains "story"."""
    if self.contains_keyword(["story"], question):
      return self.slam_dunk # self.slam_dunk = 20
    return 0

# **What Model**

In [None]:
class What(Quarc):
  """Quarc rules for "what" questions."""

  def __init__(self, corpus):
    super().__init__(corpus)

  def get_answer(self, question):
    """Return the cleaned corpus sentence with the highest "what" score."""
    # Preprocess the question and remove the stopwords
    question = self.text_preprocess(question)
    question = self.remove_stop_words(question, question_stop_words)

    # Now, find the score of all the sentences...
    for i, sent in enumerate(self.preprocessed_sentence_list):
      self.sentence_score[i] = self.get_score(question, sent)

    max_index = self.sentence_score.index(max(self.sentence_score))

    if self.sentence_score[max_index] == 0:
      print("Alert: max_score is zero!!")

    # The answer is reported from the sentence list that still has stop words.
    return self.raw_sentence_list[max_index]

  def get_score(self, question, sentence):
    """Sum of "what" rules 1-4 for one sentence."""
    # NOTE(review): what_rule_5 is defined below but is not part of this sum;
    # confirm whether that is intentional.
    score_1 = self.what_rule_1(question, sentence)
    score_2 = self.what_rule_2(question, sentence)
    score_3 = self.what_rule_3(question, sentence)
    score_4 = self.what_rule_4(question, sentence)

    return score_1+score_2+score_3+score_4

  def what_rule_1(self, question, sentence):
    """Word overlap between question and sentence."""
    return self.word_match(question, sentence)

  def what_rule_2(self, question, sentence):
    """3 points when the question mentions a month and the sentence a relative day."""
    month_list = ["january", "jan", "february", "feb", "march", "mar", "april", "apr", "may", "june", "jun", "july", "jul",
                    "august", "aug", "september", "sept", "sep", "october", "oct", "november", "nov", "december", "dec"]
    # "yesterday" used to be listed twice; the second entry is "tomorrow",
    # which completes the today / yesterday / tomorrow / last night set of
    # the Quarc "what" rule.
    day_list = ["today", "yesterday", "tomorrow", "last night"]

    if self.contains_keyword(month_list, question) and self.contains_keyword(day_list, sentence):
      return self.clue # self.clue = 3 points
    return 0

  def what_rule_3(self, question, sentence):
    """4 points for a "kind" question and a sentence with "call" or "from"."""
    if self.contains_keyword(["kind"], question) and self.contains_keyword(["call", "from"], sentence):
      return self.good_clue # self.good_clue = 4 points
    return 0

  def what_rule_4(self, question, sentence):
    """3 points for a "name" question and a sentence with "name", "call" or "known"."""
    if self.contains_keyword(["name"], question) and self.contains_keyword(["name", "call", "known"], sentence):
      return self.clue # self.clue = 3 points
    return 0

  @staticmethod
  def get_head_pp(question):
    """Return the head word that follows "name" in ``question``, or None.

    name of the house? ---> "house"
    name of the creek? ---> "creek"

    Pattern: "name <ADJ> <PROPN|NOUN>". When the word after "name" is an
    adjective the word after that is returned; when it is a noun or proper
    noun it is returned itself. None is returned when "name" is not a word
    of the question, when the question ends too early, or when the next word
    has any other part of speech.
    """
    question_list = question.split()

    try:
      name_index = question_list.index('name')
      next_word = question_list[name_index+1]
      # String values are compared with ==/in; identity (`is`) against a
      # literal is not reliable, and `x is "PROPN" or "NOUN"` was always true.
      next_pos = nlp(next_word)[0].pos_

      if next_pos == "ADJ":
        return question_list[name_index+2]

      if next_pos in ("PROPN", "NOUN"):
        return next_word

      return None

    except (ValueError, IndexError):
      # ValueError: no "name" token; IndexError: nothing after it.
      return None

  def what_rule_5(self, question, sentence):
    """20 points when a "name" question's head word occurs in a sentence naming a person."""
    if self.contains_name(sentence) and self.contains_keyword(["name"], question):
      head_pp = self.get_head_pp(question)

      if head_pp is None:
        return 0

      if self.contains_keyword([head_pp], sentence):
        return self.slam_dunk # self.slam_dunk = 20 points
    return 0

  def extract_answer(self, text, question=None):
    """Return the words of ``text`` that are neither in ``question`` nor stop words.

    With ``question`` omitted only stop words are removed. The result is
    joined with single spaces.
    """
    question_words = set(Quarc.text_preprocess(question).split()) if question else set()
    kept = [
      token for token in text.split()
      if token not in question_words and token not in stop_words
    ]
    return " ".join(kept)

# **Question Answering**

In [None]:
# Dispatch table: wh-word found in the question -> model class that answers it.
wh_dict = {
  "who": Who,
  "what": What,
  "when": When,
  "where": Where,
  "why": Why,
}


In [None]:
def get_q_type(question):
  """Return the wh-word ("what", "who", "when", "where", "why") of ``question``.

  Matching is case-insensitive and on whole words, so words that merely
  contain a wh-word (for example "whole" or "somewhat") are not mistaken for
  one. "whom" and "whose" count as "who". When several wh-words occur, the
  first one in the order what, who, when, where, why wins. Returns None when
  the question contains none of them.
  """
  words = set(re.findall(r"[a-z]+", question.lower()))

  # Inflected forms of "who" are treated as "who".
  if "whom" in words or "whose" in words:
    words.add("who")

  for wh_word in ("what", "who", "when", "where", "why"):
    if wh_word in words:
      return wh_word

  return None

In [None]:
# Interactive loop: read a question, pick the model for its wh-word, print the
# best sentence and the answer extracted from it. Typing "end" stops the loop.
while True:
  question = input("Enter the question: ")
  if question == "end":
    print("Good Bye!")
    break

  q_type = get_q_type(question)

  # The class is looked up explicitly instead of wrapping the lookup and the
  # model construction in a catch-all handler, so that a real failure while
  # building the model is no longer reported as "not a WH question".
  model_var = wh_dict.get(q_type)
  if model_var is None:
    print("Only WH questions are allowed (who, what, when, where, why)")
    continue

  model = model_var(corpus)

  print("Q: ", question)

  ans_sent = model.get_answer(question)
  ans = model.extract_answer(ans_sent, question)

  print("A: ", ans_sent)  # ans_sent is the sentence containing the answer
  print("Ans: ", ans)     # ans is the extracted answer from the ans_sent (Sometimes, ans maybe the same as ans_sent)
  print("------------------------------------------------------------")

Enter the question: What is the name of the satellite that measured the amount of dust?
Q:  What is the name of the satellite that measured the amount of dust?
A:  nasa s calipso satellite has measured the amount of dust transported by wind from the sahara to the amazon an average 182 million tons of dust are windblown out of the sahara each year at 15 degrees west longitude across 1 600 miles 2 600 km over the atlantic ocean some dust falls into the atlantic then at 35 degrees west longitude at the eastern coast of south america 27 7 million tons 15 of dust fall over the amazon basin 132 million tons of dust remain in the air 43 million tons of dust are windblown and falls on the caribbean sea past 75 degrees west longitude 
Ans:  nasa calipso transported wind sahara amazon average 182 million tons windblown sahara year 15 degrees west longitude across 1 600 miles 2 600 km atlantic ocean falls atlantic 35 degrees west longitude eastern coast south america 27 7 million tons 15 fall ama

# **Accuracy**

In [None]:
def compute_f1(prediction, truth):
    """Return the token-level F1 score between ``prediction`` and ``truth``.

    Both strings are tokenized with ``Quarc.tokenize``. When either side has
    no tokens the score is 1 if both are empty and 0 otherwise.
    """
    from collections import Counter

    pred_tokens = Quarc.tokenize(prediction)
    truth_tokens = Quarc.tokenize(truth)

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if not pred_tokens or not truth_tokens:
        return int(pred_tokens == truth_tokens)

    # Multiset overlap: a token repeated k times on both sides counts k times.
    # Counting the overlap on sets while dividing by list lengths would score
    # two identical answers with a repeated token below 1.
    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

    # if there are no common tokens then f1 = 0
    if overlap == 0:
        return 0

    prec = overlap / len(pred_tokens)
    rec = overlap / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [None]:
# Evaluation set: each question is paired with the expected answer at the same
# position in ground_truth.
questions = [
  "What is the name of the satellite that measured the amount of dust?",
  "What did the analysis from the sediment deposits indicate?",
  "Where is the majority of the rainforest contained?",
  "What is the name of the book written by Archeologist Betty Meggers?",
  "What is the average plant biosmass?",
  "When dating rocks, what is the absolute isotopic date applied to?",
  "What are the three major types of rock?",
  "What types of waves do seismologists use to image the interior of the Earth?",
  "What principle relates to the formation of faults and the age of the sequences through which they cut?",
  "Where do thrust faults form?",
]

ground_truth = [
  "CALIPSO",
  "rainfall in the basin during the LGM was lower than for the present",
  "Brazil",
  "Amazonia: Man and Culture in a Counterfeit Paradise",
  "356 ± 47 tonnes per hectare",
  "fossil sequences",
  "igneous, sedimentary, and metamorphic",
  "seismic waves",
  "he principle of cross-cutting relationships",
  "In the shallow crust",
]

# Answer every question with a freshly built model of the matching type.
predictions = []
for question in questions:
  model = wh_dict[get_q_type(question)](corpus)
  ans_sent = model.get_answer(question)
  predictions.append(model.extract_answer(ans_sent, question))

# Accumulate the total and the best per-question F1.
f1_score = 0
best_f1_score = 0
for prediction, expected in zip(predictions, ground_truth):
  cur = compute_f1(prediction, expected)
  best_f1_score = max(cur, best_f1_score)
  f1_score += cur

avg_f1_score = f1_score/len(predictions)

print("Average F1 score: ", avg_f1_score)
print("Best F1 score: ", best_f1_score)

Average F1 score:  0.5253686635944701
Best F1 score:  1.0
