<a href="https://colab.research.google.com/github/ighoshsubho/NLP_Question_Generation/blob/main/NLP_True_False_Question_Generation_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Source passage: every period-terminated sentence becomes one statement to quiz on.
# It is stored as `source_text` so the built-in `str` type is not shadowed.
source_text = 'React is the best frontend language. React libraries are very user friendly. React renders the pages statically.'
# Split on periods and drop empty fragments rather than blindly slicing off the
# last character, so the passage is handled correctly with or without a final '.'.
ls = [sentence.strip() for sentence in source_text.split('.') if sentence.strip()]

In [None]:
!pip install allennlp==0.9.0
!pip install overrides==4.1.2

In [None]:
!python -m spacy download en_core_web_sm

## 1. Split a sentence at ending noun phrase or verb phrase


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")

In [None]:
# Use the first sentence, minus any trailing punctuation, as the parser input.
test_sentence = ls[0].rstrip('?:!.,;')
print (test_sentence)
# Constituency-parse the sentence and show the raw prediction.
parser_output = predictor.predict(sentence=test_sentence)
print (parser_output)

In [None]:
# Pull the parse stored under the "trees" key of the parser result; it is
# turned into an nltk Tree with Tree.fromstring in the next cell.
tree_string = parser_output["trees"]
print (tree_string)

In [None]:
from nltk import tokenize
from nltk.tree import Tree

# Rebuild an nltk Tree from the parse string returned by the parser.
tree = Tree.fromstring(tree_string)
print (tree)
# pretty_print() writes the diagram itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the diagram).
tree.pretty_print()

## Notations

S	sentence	

NP	noun phrase	

VP	verb phrase	

PP	prepositional phrase	

Det	determiner	

N	noun	

V	verb	

P	preposition	

VBD - Past Tense Verb

JJ - Adjective

etc

In [None]:
# Draw the whole tree, then its first, second and last top-level children.
tree.pretty_print()
temp1, temp2, temp3 = tree[0], tree[1], tree[-1]
for subtree in (temp1, temp2, temp3):
    subtree.pretty_print()

In [None]:
# split at right most nounphrase or verbphrase

def get_flattened(t):
    """Return the words under subtree `t` as one space-separated string.

    The leaves of each direct child of `t` are joined with spaces, and the
    per-child strings are joined with spaces again. Returns None when `t`
    is None.
    """
    if t is None:
        return None
    child_texts = []
    for child in t:
        child_texts.append(" ".join(child.leaves()))
    return " ".join(child_texts)

def get_right_most_VP_or_NP(parse_tree,last_NP = None,last_VP = None):
    """Follow the last child at every level and remember the deepest NP and VP.

    Starting at `parse_tree`, the walk repeatedly descends into the last
    child until it reaches a node with exactly one leaf. Each node labelled
    "NP" or "VP" met on the way replaces the previously remembered one.

    Returns a `(last_NP, last_VP)` tuple; an entry keeps the value passed in
    (None by default) when no node with that label was met.
    """
    node = parse_tree
    while len(node.leaves()) != 1:
        node = node[-1]
        tag = node.label()
        if tag == "NP":
            last_NP = node
        elif tag == "VP":
            last_VP = node
    return last_NP, last_VP


# Walk the right edge of the parse tree, then flatten both phrases to text.
last_nounphrase, last_verbphrase = get_right_most_VP_or_NP(tree)
last_nounphrase_flattened, last_verbphrase_flattened = (
    get_flattened(last_nounphrase),
    get_flattened(last_verbphrase),
)

# Show the subtrees first, then their flattened text.
print ("Original Sentence ",test_sentence)
for caption, subtree in (("last_nounphrase ", last_nounphrase), ("last_verbphrase ", last_verbphrase)):
    print (caption, subtree)
print ("\n ")
for caption, text in (("last_nounphrase ", last_nounphrase_flattened), ("last_verbphrase ", last_verbphrase_flattened)):
    print (caption, text)

In [None]:
import re

# sub_string - sipping coffee
# main_string - The old woman was sitting under a tree and sipping coffee
# compare like below
# Theoldwomanwassittingunderatreeandsippingcoffee  || sippingcoffee
# oldwomanwassittingunderatreeandsippingcoffee || sippingcoffee
# womanwassittingunderatreeandsippingcoffee || sippingcoffee
# ...............
# andsippingcoffee || sippingcoffee
# sippingcoffee || sippingcoffee
def get_termination_portion(main_string, sub_string):
    """Return the part of `main_string` that precedes the ending `sub_string`.

    Both strings are compared with all spaces removed, so differences in
    tokenisation spacing do not matter. Each word-aligned suffix of
    `main_string` is tried from longest to shortest; when one equals
    `sub_string`, the words before it are returned joined by single spaces.
    Returns None when no suffix matches.
    """
    target = sub_string.replace(" ", "")
    words = main_string.split()
    for start in range(len(words)):
        # Words produced by split() contain no whitespace, so joining them is
        # already the space-free form of this suffix.
        if "".join(words[start:]) == target:
            return " ".join(words[:start])
    return None

# Either phrase is None when the right edge of the parse tree contains no NP
# (or no VP); len(None) would raise TypeError, so only real strings compete.
candidate_phrases = [
    phrase
    for phrase in (last_nounphrase_flattened, last_verbphrase_flattened)
    if phrase is not None
]
if not candidate_phrases:
    raise ValueError("No ending noun phrase or verb phrase found in: " + test_sentence)
# Ties keep the noun phrase, matching max()'s first-wins behaviour.
longest_phrase_to_use = max(candidate_phrases, key = len)
print ("Ending phrase: ", longest_phrase_to_use)

# Turn the -LRB- / -RRB- bracket tokens back into literal parentheses so the
# phrase can be matched against the original sentence text.
longest_phrase_to_use = re.sub(r"-LRB- ", "(", longest_phrase_to_use)
longest_phrase_to_use = re.sub(r" -RRB-", ")", longest_phrase_to_use)


# split_sentence is None when the phrase is not a suffix of the sentence.
split_sentence = get_termination_portion(test_sentence, longest_phrase_to_use)
print ("Original sentence : ",test_sentence)
print ("Original sentence after splitting at ending phrase: ",split_sentence)

In [None]:
# split at the first noun phrase or verb phrase

# Parse the second sentence (trailing punctuation removed) the same way.
test_sentence2 = ls[1]
test_sentence2 = test_sentence2.rstrip('?:!.,;')
print (test_sentence2)
parser_output2 = predictor.predict(sentence=test_sentence2)
tree_string2 = parser_output2["trees"]

tree2 = Tree.fromstring(tree_string2)
# pretty_print() prints the diagram and returns None; wrapping it in print()
# would add a stray "None" line to the output.
tree2.pretty_print()

In [None]:
# SBAR stands for Subordinate Clause.
#  Penn Tree bank overview - http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.8216&rep=rep1&type=pdf

def get_first_VP_and_NP_and_sentence(parse_tree, first_NP=None, first_VP=None, first_sent=None):
    """Return the first NP, VP and S met while descending the last children.

    Starting at `parse_tree`, the walk repeatedly steps into the last child
    until it reaches a node with exactly one leaf. The first node labelled
    "NP", "VP" or "S" met on the way is kept for each label; later nodes with
    the same label are ignored.

    Returns a 3-tuple of flattened strings `(NP, VP, S)` produced by
    `get_flattened`; an entry is None when no node with that label was met.
    """
    node = parse_tree
    while len(node.leaves()) != 1:
        node = node[-1]
        tag = node.label()
        if tag == "NP" and not first_NP:
            first_NP = node
        elif tag == "VP" and not first_VP:
            first_VP = node
        elif tag == "S" and not first_sent:
            first_sent = node
    return get_flattened(first_NP), get_flattened(first_VP), get_flattened(first_sent)


# Collect the flattened NP / VP / S phrases found in the second sentence's parse tree.
first_nounphrase, first_verbphrase, first_sentence = get_first_VP_and_NP_and_sentence(tree2)

print("first_nounphrase: ",first_nounphrase)
print ("first_verbphrase: ",first_verbphrase)
print ("first_sentence: ",first_sentence)

In [None]:
# get_first_VP_and_NP_and_sentence returns None for a label it never met;
# len(None) would raise TypeError, so only real strings compete.
candidate_phrases = [
    phrase
    for phrase in (first_nounphrase, first_verbphrase)
    if phrase is not None
]
if not candidate_phrases:
    raise ValueError("No noun phrase or verb phrase found in: " + test_sentence2)
# Ties keep the noun phrase, matching max()'s first-wins behaviour.
longest_phrase_to_use = max(candidate_phrases, key = len)
print ("Ending phrase: ", longest_phrase_to_use)

# Turn the -LRB- / -RRB- bracket tokens back into literal parentheses so the
# phrase can be matched against the original sentence text.
longest_phrase_to_use = re.sub(r"-LRB- ", "(", longest_phrase_to_use)
longest_phrase_to_use = re.sub(r" -RRB-", ")", longest_phrase_to_use)


# split_sentence is None when the phrase is not a suffix of the sentence.
split_sentence = get_termination_portion(test_sentence2, longest_phrase_to_use)
print ("Original sentence : ",test_sentence2)
print ("Original sentence after splitting at ending phrase: ",split_sentence)

## 2. Generate alternate endings to a split sentence using GPT-2

In [None]:
!pip install transformers

In [None]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# Alternative checkpoint ("distilgpt2"), kept for reference:
# GPT2tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
# GPT2model = TFGPT2LMHeadModel.from_pretrained("distilgpt2",pad_token_id=GPT2tokenizer.eos_token_id)
# Load the "gpt2" tokenizer and TensorFlow LM-head model; the tokenizer's
# end-of-sequence token id is passed as the pad token id.
GPT2tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
GPT2model = TFGPT2LMHeadModel.from_pretrained("gpt2",pad_token_id=GPT2tokenizer.eos_token_id)

In [None]:
# Prompt for GPT-2: the whole first sentence is used as the prefix here.
# NOTE(review): the section title refers to a *split* sentence, but
# `split_sentence` is not used in this cell — confirm this is intended.
partial_sentence = ls[0]
# Encode the prompt as TensorFlow tensors (return_tensors='tf').
input_ids = GPT2tokenizer.encode(partial_sentence,return_tensors='tf')
print (input_ids)
# Length budget: number of whitespace-separated words in the prompt plus 40.
# NOTE(review): max_length is presumably measured in tokens, not words — TODO confirm.
maximum_length = len(partial_sentence.split())+40

In [None]:
# Sample 5 continuations with both top_k (30) and top_p (0.80) filtering enabled.
# NOTE(review): no random seed is set, so the generated sentences presumably
# differ from run to run — confirm whether reproducibility is needed.
sample_outputs = GPT2model.generate(
    input_ids, 
    do_sample=True, 
    max_length=maximum_length, 
    top_p=0.80, # 0.85 
    top_k=30,   #30
    repetition_penalty  = 10.0,
    num_return_sequences=5
)

In [None]:
import nltk
nltk.download('punkt')
from nltk import tokenize
# Decode every sampled sequence and keep only its first sentence.
generated_sentences = [
    tokenize.sent_tokenize(GPT2tokenizer.decode(token_ids, skip_special_tokens=True))[0]
    for token_ids in sample_outputs
]

for i, final_sentence in enumerate(generated_sentences):
    print (i,": ",final_sentence)

## 3. Filter sentences with BERT

In [None]:
# https://pypi.org/project/sentence-transformers/
!pip install sentence-transformers==0.4.0

In [None]:
from sentence_transformers import SentenceTransformer, util
BERT_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [None]:
# Candidate false statements: the sentences collected from GPT-2 above.
possible_false_sentences = generated_sentences


# The original statement the candidates are compared against.
original_sentence = ls[0]

print(original_sentence)

In [None]:
# Embed all candidate sentences, and the original sentence (wrapped in a
# one-element list), with the sentence-transformers model.
false_sentences_embeddings = BERT_model.encode(possible_false_sentences)
original_sentence_embedding = BERT_model.encode([original_sentence])

In [None]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# expose `scipy.spatial.distance` unless another package already loaded it.
import scipy.spatial.distance
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html
# Cosine distance between the original sentence and every candidate; row 0 is
# taken because only one original sentence was embedded.
distances = scipy.spatial.distance.cdist(original_sentence_embedding, false_sentences_embeddings, "cosine")[0]
print (distances)

In [None]:
# Pair each candidate's index with its distance and order the pairs by
# ascending distance (closest to the original sentence first).
results = sorted(enumerate(distances), key=lambda pair: pair[1])
print (results)

In [None]:
# Candidates in the order of `results`, i.e. by ascending distance.
dissimilar_sentences = [possible_false_sentences[idx] for idx, _ in results]
for sentence in dissimilar_sentences:
  print (sentence)

In [None]:
# reversed() returns a one-shot iterator; materialise it as a list so the
# result can still be indexed or iterated again after this loop has run.
false_sentences_list_final = list(reversed(dissimilar_sentences))
for sent in false_sentences_list_final:
  print (sent)

## 4. Saving the falsified statements and the true ones in a randomly ordered list, and feeding them to a T5 model for proper question generation

In [2]:
import getpass
import os

import requests

API_URL = "https://api-inference.huggingface.co/models/mrm8488/t5-base-finetuned-question-generation-ap"
# Never keep an access token in the notebook: read it from the HF_API_TOKEN
# environment variable, or ask for it interactively when the variable is unset.
# NOTE: any token that was ever committed with this notebook must be treated
# as compromised and revoked.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN") or getpass.getpass("Hugging Face API token: ")
headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

def query(payload):
    """POST `payload` as JSON to API_URL and return the decoded JSON reply.

    Raises requests.HTTPError for a non-2xx reply instead of handing an error
    payload back to the caller, and gives up after 60 seconds without a reply.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    response.raise_for_status()
    return response.json()

output = query({
    "inputs": f"answer: True context: {ls[2]}.",
})

# The first 10 characters of the generated text are dropped
# (presumably a "question: " prefix — verify against the model output).
print(output[0]['generated_text'][10:])

What is the default state of the page rendering?


In [5]:
# Generate one question per source sentence with the question-generation API.
# `query` is defined once in the previous cell, so it is not redefined on
# every loop iteration here.
final_true_output = []
for x in ls:
  output = query({
    "inputs": f"answer: True context: {x}.",
  })

  # Drop the first 10 characters of the generated text
  # (presumably a "question: " prefix — verify against the model output).
  final_true_output.append(output[0]['generated_text'][10:])

print(final_true_output)

['Is React the best frontend language?', 'Are React libraries user friendly?', 'What is the default state of the page rendering?']


## Performing all the steps sequentially for generating falsified statements:

In [7]:
import locale

# Show the encoding reported before the override below is installed.
print(locale.getpreferredencoding())

def getpreferredencoding(do_setlocale = True):
    """Replacement for locale.getpreferredencoding that always reports UTF-8.

    `do_setlocale` is accepted for signature compatibility and ignored.
    """
    return "UTF-8"

# Install the override so later callers of locale.getpreferredencoding() get "UTF-8".
locale.getpreferredencoding = getpreferredencoding

UTF-8


In [None]:
!pip install allennlp

In [None]:
import spacy
from allennlp.predictors.predictor import Predictor
from nltk import tokenize
from nltk.tree import Tree
import re
import tensorflow as tf
from transformers import GPT2Tokenizer, GPT2Model
import nltk
nltk.download('punkt')
from nltk import tokenize
from sentence_transformers import SentenceTransformer, util
import scipy

BERT_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

def get_flattened(t):
    """Flatten subtree `t` into a single space-separated string of its words.

    The leaves of every direct child are joined with spaces, then the
    per-child strings are joined with spaces. None is returned for None.
    """
    if t is None:
        return None
    return " ".join(" ".join(child.leaves()) for child in t)

def get_right_most_VP_or_NP(parse_tree,last_NP = None,last_VP = None):
    """Descend through the last child of each node, tracking the latest NP and VP.

    The descent stops at the first node that has exactly one leaf. Every
    node labelled "NP" or "VP" met on the way overwrites the remembered one.

    Returns `(last_NP, last_VP)`; an entry keeps the value passed in
    (None by default) when no node with that label was met.
    """
    current = parse_tree
    while True:
        if len(current.leaves()) == 1:
            return last_NP, last_VP
        current = current[-1]
        label = current.label()
        if label == "NP":
            last_NP = current
        elif label == "VP":
            last_VP = current

def get_termination_portion(main_string, sub_string):
    """Return the words of `main_string` that come before the ending `sub_string`.

    Comparison ignores spaces: each word-aligned suffix of `main_string`,
    longest first, is concatenated and compared with the space-free
    `sub_string`. Returns the preceding words joined by single spaces, or
    None when no suffix matches.
    """
    wanted = sub_string.replace(" ", "")
    tokens = main_string.split()
    cut = 0
    while cut < len(tokens):
        # Tokens from split() hold no whitespace, so the joined suffix is
        # already space-free.
        if "".join(tokens[cut:]) == wanted:
            return " ".join(tokens[:cut])
        cut += 1
    return None

def get_first_VP_and_NP_and_sentence(parse_tree, first_NP=None, first_VP=None, first_sent=None):
    """Find the first NP, VP and S along the chain of last children.

    From `parse_tree` the walk keeps moving to the last child until a node
    with exactly one leaf is reached. For each of the labels "NP", "VP" and
    "S" only the first node met is kept.

    Returns `(NP, VP, S)` as strings flattened by `get_flattened`; an entry
    is None when no node with that label was met.
    """
    current = parse_tree
    while len(current.leaves()) != 1:
        current = current[-1]
        label = current.label()
        if label == "NP" and not first_NP:
            first_NP = current
        elif label == "VP" and not first_VP:
            first_VP = current
        elif label == "S" and not first_sent:
            first_sent = current
    return get_flattened(first_NP), get_flattened(first_VP), get_flattened(first_sent)

# The pipeline loop encodes prompts with return_tensors='tf' and calls
# GPT2model.generate(), so the model must be the TensorFlow class with a
# language-modelling head. The bare GPT2Model has no LM head and cannot
# generate text.
from transformers import TFGPT2LMHeadModel

GPT2tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
# GPT-2 defines no pad token; reuse the end-of-sequence id for padding.
GPT2model = TFGPT2LMHeadModel.from_pretrained('gpt2-medium', pad_token_id=GPT2tokenizer.eos_token_id)

# Filled by the pipeline loop further down.
generated_sentences=[]
final_falsified_output = []

In [None]:
! pip install allennlp-models

In [None]:
# The pipeline loop reads parser_output["trees"], which only a constituency
# parser produces; a coreference-resolution model returns no parse tree.
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz")

In [None]:
!pip install allennlp-models==2.0.1.dev20210201

In [None]:
from allennlp_models.pretrained import load_predictor
# Load the spaCy pipeline once; it does not depend on the sentence being processed.
nlp = spacy.load("en_core_web_sm")

# For each source sentence: parse it, cut it before its first NP/VP found on
# the right edge of the parse tree, let GPT-2 write alternative endings, and
# keep the ending least similar to the original as the falsified statement.
# Sentences that cannot be split or yield no candidates are skipped, so
# final_falsified_output may be shorter than ls.
for x in ls:

  test_sentence = x
  test_sentence = test_sentence.rstrip('?:!.,;')

  # The parser's predict() takes the text as `sentence`
  # (as used in the earlier cells), not `premise`.
  parser_output = predictor.predict(sentence=test_sentence)

  tree_string = parser_output["trees"]

  tree = Tree.fromstring(tree_string)

  first_nounphrase, first_verbphrase, first_sentence = get_first_VP_and_NP_and_sentence(tree)

  # A phrase is None when its label was never met; len(None) would raise
  # TypeError, so only real strings compete for "longest".
  candidate_phrases = [phrase for phrase in (first_nounphrase, first_verbphrase) if phrase]
  if not candidate_phrases:
    print ("Skipping, no noun phrase or verb phrase found: ", test_sentence)
    continue
  longest_phrase_to_use = max(candidate_phrases, key = len)
  print ("Ending phrase: ", longest_phrase_to_use)

  # Turn the -LRB- / -RRB- bracket tokens back into literal parentheses.
  longest_phrase_to_use = re.sub(r"-LRB- ", "(", longest_phrase_to_use)
  longest_phrase_to_use = re.sub(r" -RRB-", ")", longest_phrase_to_use)
  split_sentence = get_termination_portion(test_sentence, longest_phrase_to_use)

  # None: the phrase is not a suffix of the sentence. "": the phrase is the
  # whole sentence. Either way there is no prefix to hand to GPT-2.
  if not split_sentence:
    print ("Skipping, sentence could not be split: ", test_sentence)
    continue

  partial_sentence = split_sentence
  input_ids = GPT2tokenizer.encode(partial_sentence,return_tensors='tf')
  maximum_length = len(partial_sentence.split())+40

  sample_outputs = GPT2model.generate(
    input_ids,
    do_sample=True,
    max_length=maximum_length,
    top_p=0.80, # 0.85
    top_k=30,   #30
    repetition_penalty  = 10.0,
    num_return_sequences=5
  )

  # Start from an empty list for every sentence; otherwise candidates
  # generated for earlier sentences would be ranked against this one.
  generated_sentences = []
  for i, sample_output in enumerate(sample_outputs):
    decoded_sentence = GPT2tokenizer.decode(sample_output, skip_special_tokens=True)
    sentences = tokenize.sent_tokenize(decoded_sentence)
    # An empty decode yields no sentences; skip it instead of indexing [0].
    if sentences:
      generated_sentences.append(sentences[0])

  if not generated_sentences:
    print ("Skipping, GPT-2 produced no usable sentence for: ", test_sentence)
    continue

  possible_false_sentences = generated_sentences

  original_sentence = x

  false_sentences_embeddings = BERT_model.encode(possible_false_sentences)
  original_sentence_embedding = BERT_model.encode([original_sentence])

  distances = scipy.spatial.distance.cdist(original_sentence_embedding, false_sentences_embeddings, "cosine")[0]

  # (index, distance) pairs ordered by ascending cosine distance.
  results = sorted(enumerate(distances), key=lambda pair: pair[1])

  dissimilar_sentences = [possible_false_sentences[idx] for idx, distance in results]

  # reversed() alone returns an iterator, which cannot be indexed; build a
  # list so element 0 is the candidate with the greatest distance.
  false_sentences_list_final = list(reversed(dissimilar_sentences))

  final_falsified_output.append(false_sentences_list_final[0])

In [None]:
print(final_falsified_output)

## Storing all the true and false statements in a list and shuffling them after pairing each with its truth value:

In [None]:
import random
# Pair every item with its truth label as a (label, text) tuple.
# zip("False", x) would instead pair the characters of the label with the
# characters of the text and yield a lazy zip object, not a labelled item.
Total_output = [("False", statement) for statement in final_falsified_output]
Total_output += [("True", statement) for statement in final_true_output]
# Shuffle in place so true and false items are interleaved randomly.
random.shuffle(Total_output)
print(Total_output)