# Question Generation

In [None]:
!pip install torch datasets pyarrow transformers tokenizers sentencepiece pytorch-lightning textblob nltk

In [None]:
import pandas as pd
import torch
import random
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
import copy

# Run-wide settings shared by the cells below.
SEED = 42
MAX_SAMPLES = 2000       # passed as `max_samples` when the datasets are built
ANSWER_THRESHOLD = 7     # default word-count cut-off of create_pandas_dataset
BATCH_SIZE = 4
MAX_EPOCHS = 2
LEARNING_RATE = 3e-4

# Seed both Python's and PyTorch's random number generators.
torch.manual_seed(SEED)
random.seed(SEED)

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Let pandas display up to 100 rows and 100 columns.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [None]:
def create_pandas_dataset(data, answer_threshold=ANSWER_THRESHOLD, verbose=False):
  """Flatten question-answering records into a context/answer/question frame.

  Only the first reference answer of each record is used. Records whose
  answer has ``answer_threshold`` words or more are dropped.

  Args:
    data: Iterable of records indexable by ``'context'``, ``'question'`` and
      ``'answers'`` (the latter holding a ``'text'`` list).
    answer_threshold: Word-count cut-off; answers at or above it are skipped.
    verbose: When true, also return how many records were skipped and kept.

  Returns:
    A DataFrame with columns ``context``, ``answer`` and ``question``; or the
    tuple ``(DataFrame, count_long, count_short)`` when ``verbose`` is true.
  """
  rows = []
  count_long = 0
  for val in tqdm(data):
    answer = val['answers']['text'][0]
    if len(answer.split()) >= answer_threshold:
      count_long += 1
      continue
    rows.append((val['context'], answer, val['question']))

  # Build the frame in one go: growing a DataFrame row by row through `.loc`
  # re-allocates it on every insert, which is very slow for large datasets.
  result_df = pd.DataFrame(rows, columns=['context', 'answer', 'question'])
  count_short = len(rows)

  if verbose:
    return (result_df, count_long, count_short)
  return result_df

In [None]:
# Load the 'train' and 'validation' splits of the 'squad' dataset and report
# how many records each one contains.
raw_train = load_dataset('squad', split='train')
raw_valid = load_dataset('squad', split='validation')
print(f"Total Train Samples:{len(raw_train)} , Total Validation Samples:{len(raw_valid)}")

In [None]:
# Convert both splits to DataFrames (long answers are filtered out by
# create_pandas_dataset) and print the resulting shapes.
df_train, df_validation = create_pandas_dataset(raw_train), create_pandas_dataset(raw_valid)
print(f"\n Total Train Samples:{df_train.shape} , Total Validation Samples:{df_validation.shape}")

In [None]:
# Persist the filtered splits as parquet files; QuestionGenerationDataset
# reads its data back from a parquet path.
df_train.to_parquet('train_squad.parquet')
df_validation.to_parquet('validation_squad.parquet')

In [None]:
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
)

In [None]:
# Load the pretrained 't5-large' tokenizer (with model_max_length set to 512)
# and the matching conditional-generation model.
t5_tokenizer = AutoTokenizer.from_pretrained('t5-large', model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-large')

In [None]:
class QuestionGenerationDataset(Dataset):
    """Pre-tokenized "context + answer -> question" pairs read from parquet.

    Every row of the parquet file is tokenized once at construction time and
    kept in memory; ``__getitem__`` only reshapes the stored tensors.

    Args:
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when asked for ``return_tensors="pt"``.
        filepath: Parquet file with ``context``, ``answer`` and ``question``
            columns.
        max_len_inp: Padded/truncated token length of the model input.
        max_len_out: Padded/truncated token length of the target question.
        max_samples: If given, only the first ``max_samples`` rows are used.
    """

    def __init__(self, tokenizer, filepath, max_len_inp=512, max_len_out=96, max_samples=None):
        self.path = filepath

        # Column names looked up in each parquet row.
        self.passage_column = "context"
        self.answer = "answer"
        self.question = "question"

        self.data = pd.read_parquet(self.path)
        if max_samples is not None:
            self.data = self.data.iloc[:max_samples, :]

        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self._build()

    def __len__(self):
        """Return the number of tokenized samples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of sample ``index`` as a dict.

        Keys: ``source_ids``, ``source_mask``, ``target_ids``, ``target_mask``
        and ``labels`` (a copy of ``target_ids`` with padding set to -100).
        """
        source = self.inputs[index]
        target = self.targets[index]

        # Each sample was encoded as a batch of one; drop only that leading
        # axis so a length-1 sequence is not collapsed to a scalar.
        source_ids = source["input_ids"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)
        src_mask = source["attention_mask"].squeeze(0)
        target_mask = target["attention_mask"].squeeze(0)

        # Mask padding through the attention mask instead of a hard-coded
        # pad id, so this works for any tokenizer. -100 marks the positions
        # that should be excluded from the loss.
        labels = target_ids.clone()
        labels[target_mask == 0] = -100

        return {
            "source_ids": source_ids,
            "source_mask": src_mask,
            "target_ids": target_ids,
            "target_mask": target_mask,
            "labels": labels,
        }

    def _build(self):
        """Tokenize every row of ``self.data`` into ``inputs`` / ``targets``."""
        # `total` lets tqdm show real progress; iterrows() has no length.
        for _, row in tqdm(self.data.iterrows(), total=len(self.data)):
            passage = row[self.passage_column]
            answer = row[self.answer]
            target = row[self.question]

            input_ = f"context: {passage}  answer: {answer}"
            target = f"question: {str(target)}"

            tokenized_inputs = self.tokenizer(
                [input_], max_length=self.max_len_input, padding='max_length',
                truncation=True, return_tensors="pt"
            )
            tokenized_targets = self.tokenizer(
                [target], max_length=self.max_len_output, padding='max_length',
                truncation=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [None]:
# Tokenize at most MAX_SAMPLES rows from each saved parquet split.
train_path = 'train_squad.parquet'
validation_path = 'validation_squad.parquet'
train_dataset = QuestionGenerationDataset(t5_tokenizer, train_path, max_samples=MAX_SAMPLES)
validation_dataset = QuestionGenerationDataset(t5_tokenizer, validation_path, max_samples=MAX_SAMPLES)

In [None]:
import pytorch_lightning as pl
from torch.optim import AdamW

class T5Tuner(pl.LightningModule):
    """LightningModule that fine-tunes a T5 model on pre-tokenized batches.

    Batches are dicts with ``source_ids``, ``source_mask``, ``target_mask``
    and ``labels`` entries.

    Args:
        t5model: Model called with ``input_ids``, ``attention_mask``,
            ``decoder_attention_mask`` and ``labels``.
        t5tokenizer: Tokenizer kept alongside the model.
        train_data: Dataset served by ``train_dataloader``.
        val_data: Dataset served by ``val_dataloader``.
        batchsize: Batch size used by both data loaders.
        lr: Learning rate handed to AdamW.
    """

    def __init__(self, t5model, t5tokenizer, train_data, val_data, batchsize=BATCH_SIZE, lr=LEARNING_RATE):
        super().__init__()
        self.model = t5model
        self.tokenizer = t5tokenizer
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batchsize
        self.lr = lr

    def forward(self, input_ids, attention_mask=None,
                decoder_attention_mask=None,
                lm_labels=None):
        """Run the wrapped model; ``lm_labels`` is forwarded as ``labels``."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _batch_loss(self, batch):
        """Run a forward pass on ``batch`` and return its loss."""
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )
        # The first element of the model output is used as the loss.
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the training loss."""
        loss = self._batch_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as ``val_loss``) and return the validation loss."""
        loss = self._batch_loss(batch)
        self.log("val_loss", loss)
        return loss

    def train_dataloader(self):
        """Return the training loader, reshuffled every epoch."""
        # Without shuffling every epoch would replay the samples in file order.
        return DataLoader(self.train_data, batch_size=self.batch_size,
                          shuffle=True, num_workers=2)

    def val_dataloader(self):
        """Return the validation loader (kept in dataset order)."""
        return DataLoader(self.val_data,
                          batch_size=self.batch_size,
                          num_workers=2)

    def configure_optimizers(self):
        """Optimize all parameters with AdamW at the configured learning rate."""
        return AdamW(self.parameters(), lr=self.lr, eps=1e-8)

In [None]:
# Wrap model, tokenizer and datasets in the Lightning module, then train for
# MAX_EPOCHS epochs on the accelerator named by DEVICE.
tuner = T5Tuner(t5_model, t5_tokenizer, train_dataset, validation_dataset)
trainer = pl.Trainer(max_epochs=MAX_EPOCHS, accelerator=DEVICE)
trainer.fit(tuner)

In [None]:
# Save the fine-tuned model and the tokenizer to local directories.
tuner.model.save_pretrained('t5_trained_model')
t5_tokenizer.save_pretrained('t5_tokenizer')

In [None]:
# Directories the fine-tuned model and tokenizer were saved to.
trained_model_path = 't5_trained_model'
trained_tokenizer = 't5_tokenizer'

In [None]:
# Reload the fine-tuned model and tokenizer from disk for inference.
model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
tokenizer = AutoTokenizer.from_pretrained(trained_tokenizer)

# POS Tagging for Answer Generation

In [None]:
import nltk
# Download NLTK resources; presumably these back TextBlob's noun-phrase
# extraction used below — confirm against the installed TextBlob/NLTK versions.
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from textblob import TextBlob

In [None]:
def find_possible_answers(context):
  """Return the noun phrases TextBlob finds in ``context`` as a plain list."""
  blob = TextBlob(context)
  return list(blob.noun_phrases)

# Wh-Question Generation

In [None]:
def generate_wh_question(sentence, answer, model, tokenizer,
                         max_input_length=256, max_question_length=72, num_beams=5):
  """Generate one question about ``sentence`` whose answer is ``answer``.

  Args:
    sentence: Context text the question should be about.
    answer: Answer span the question should target.
    model: Seq2seq model exposing ``generate``.
    tokenizer: Tokenizer matching ``model``.
    max_input_length: Token limit for the encoded prompt (longer prompts are
      truncated).
    max_question_length: ``max_length`` passed to ``generate``.
    num_beams: Beam width passed to ``generate``.

  Returns:
    The decoded question with the leading "question:" marker removed.
  """
  # NOTE(review): the training prompt in QuestionGenerationDataset._build has
  # two spaces before "answer:"; confirm the tokenizer normalizes whitespace.
  prompt = "context: {} answer: {}".format(sentence, answer)
  encoding = tokenizer(prompt, max_length=max_input_length, padding=False,
                       truncation=True, return_tensors="pt")

  # Put the inputs on the model's device so generation also works on GPU.
  device = getattr(model, "device", None)
  if device is not None:
    encoding = encoding.to(device)

  outs = model.generate(input_ids=encoding["input_ids"],
                        attention_mask=encoding["attention_mask"],
                        early_stopping=True,
                        num_beams=num_beams,
                        num_return_sequences=1,
                        no_repeat_ngram_size=2,
                        max_length=max_question_length)

  question = tokenizer.decode(outs[0], skip_special_tokens=True).strip()

  # Drop only the leading marker; a blanket replace() would also delete
  # "question:" if it occurred inside the generated question itself.
  prefix = "question:"
  if question.startswith(prefix):
    question = question[len(prefix):]
  return question.strip()

# Multiple-Choice Question Generation

In [None]:
def is_substring(s1, stringlist):
    """Tell whether ``s1`` contains, or is contained in, any string of ``stringlist``."""
    return any(s1 in other or other in s1 for other in stringlist)

def mc_is_possible(word_list):
  """Return True when ``word_list`` holds at least four non-overlapping words.

  Words are taken in order; a word is discarded when it contains, or is
  contained in, one that was already kept.
  """
  kept = []
  for candidate in word_list:
    if is_substring(candidate, kept):
      continue
    kept.append(candidate)
  return 4 <= len(kept)

def generate_answer_choices(correct_answer, answers):
  """Build four lettered options ("A. " to "D. ") that include the correct answer.

  Args:
    correct_answer: The option that must appear among the choices.
    answers: Candidate phrases to draw the three distractors from.

  Returns:
    A list of four strings such as ``"B. some phrase"``, with the correct
    answer at a random position; or ``[]`` when fewer than three distinct
    distractors are available.
  """
  # dict.fromkeys de-duplicates while keeping first-seen order, so a phrase
  # repeated in `answers` can never show up as two separate options.
  distractors = list(dict.fromkeys(a for a in answers if a != correct_answer))
  if len(distractors) < 3:
    return []

  random.shuffle(distractors)
  options = distractors[:3]

  # Insert the correct answer at a random slot among the three distractors.
  correct_idx = random.randint(0, 3)
  options.insert(correct_idx, correct_answer)

  return [f"{label}. {option}" for label, option in zip("ABCD", options)]

In [None]:
def generate_mc_question(sentence, answer, possible_answers, model, tokenizer):
  """Generate a question and, when possible, append lettered answer choices.

  Returns the bare question if no set of answer choices could be built from
  ``possible_answers``; otherwise the question followed by one choice per
  line.
  """
  stem = generate_wh_question(sentence, answer, model, tokenizer)
  options = generate_answer_choices(answer, possible_answers)
  if not len(options):
    return stem
  return '\n'.join([stem, *options])