In [None]:
!pip install pytorch_lightning
!pip install tqdm
!pip install --quiet transformers




In [None]:
# Attach Google Drive to the Colab runtime; the prediction/target files read
# later in this notebook live underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Setting up the environment

In [None]:
import json
import random

import numpy as np
import torch
from tqdm import tqdm

# To evaluate the model
import nltk.translate.bleu_score as bleu
from torchmetrics.functional.text.bert import bert_score
from torchmetrics.functional.text.rouge import rouge_score
from torchmetrics.functional.text import word_error_rate


In [None]:
# Set the same seed across all libraries: PyTorch, NumPy and Python's own
# `random` module. Previously only `random` was seeded, so NumPy/PyTorch
# randomness was left unseeded despite the stated intent.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Evaluation with NLP Metrics
We will end this notebook with an evaluation of the trained model on the prepared test set, using well-known NLP metrics that are suited to question generation: BLEU, BERTScore, ROUGE and Word Error Rate. Since these metrics compare the predicted outputs with the target outputs, the predicted outputs are first generated from the inputs in the test set.

These metrics were chosen based on [this paper](https://aclanthology.org/D18-1429.pdf), which elaborates on suitable evaluation metrics for question generation systems.


In [None]:
# Load the target and predicted outputs from Google Drive.
# The files contain JSON, so they are decoded explicitly as UTF-8 rather than
# with whatever default encoding the platform happens to use.
with open("/content/drive/MyDrive/LAMA/Training_8/predicted_outputs.txt", "r", encoding="utf-8") as file:
  predicted_outputs = json.load(file)

with open("/content/drive/MyDrive/LAMA/Training_8/target_outputs.txt", "r", encoding="utf-8") as file:
  target_outputs = json.load(file)

# The evaluation helpers pair predictions and targets by index, so fail early
# if the two files do not line up.
assert len(predicted_outputs) == len(target_outputs), "predictions and targets differ in length"


In [None]:
def evaluate_bleu_score(preds: list, targets: list, smoothing_function=None):
    """Return the mean sentence-level BLEU score over prediction/target pairs.

    Every prediction and target is split on whitespace and scored with
    ``bleu.sentence_bleu``, using the target as the single reference. The
    per-sentence scores are then averaged.

    Args:
        preds: Predicted sentences, one string per test example.
        targets: Reference sentences, aligned index-by-index with ``preds``.
        smoothing_function: Optional smoothing callable forwarded to
            ``sentence_bleu`` (for example ``bleu.SmoothingFunction().method1``).
            The default ``None`` leaves the scoring exactly as before, i.e.
            without explicit smoothing.

    Returns:
        The arithmetic mean of the per-sentence BLEU scores, or ``nan`` when
        there are no examples.

    Raises:
        AssertionError: If ``preds`` and ``targets`` have different lengths.
    """
    # raise error if the prediction and target lists are of different lengths
    assert len(preds) == len(targets), "preds and targets must have the same length"
    num_test = len(preds)

    # Nothing to average: return nan directly instead of letting np.mean warn
    # about an empty slice.
    if num_test == 0:
        return np.float64("nan")

    # Collect scores in a list; np.append would copy the whole array on every
    # iteration.
    sentence_scores = []
    for prediction, target in tqdm(zip(preds, targets), total=num_test):
        references = [target.split()]
        hypothesis = prediction.split()
        sentence_scores.append(
            bleu.sentence_bleu(
                references, hypothesis, smoothing_function=smoothing_function
            )
        )

    return np.mean(sentence_scores)

In [None]:
# Mean sentence-level BLEU over the first (SQuAD-labelled) prediction/target set.
bleu_score_squad = evaluate_bleu_score(predicted_outputs, target_outputs)

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
100%|██████████| 4892/4892 [00:00<00:00, 4980.32it/s]


In [None]:
# Report the averaged BLEU score with four decimal places.
print(f'The BLEU-Score is {bleu_score_squad:.4f}.')

The BLEU-Score is 0.4562.


In [None]:
def _scores_to_array(values):
  """Flatten whichever container ``bert_score`` returned into a float64 array."""
  if hasattr(values, "detach"):
    # Tensor-like output: bring it to host memory before handing it to NumPy.
    values = values.detach().cpu().numpy()
  return np.asarray(values, dtype=np.float64).ravel()


def evaluate_bert_score(preds: list, targets: list):
  """Return the mean BERTScore F1, precision and recall over all pairs.

  All pairs are scored with a single ``bert_score`` call. Calling it once per
  pair (as was done before) re-initialised the underlying model on every
  iteration, which is what made the evaluation take many hours.

  NOTE(review): this relies on ``bert_score`` returning one score per input
  pair for batched input; the averages do not depend on the order in which
  those scores come back. Confirm against the installed torchmetrics version.

  Args:
    preds: Predicted sentences, one string per test example.
    targets: Reference sentences, aligned index-by-index with ``preds``.

  Returns:
    A tuple ``(mean_f1, mean_precision, mean_recall)``; every entry is
    ``nan`` when there are no examples.

  Raises:
    AssertionError: If ``preds`` and ``targets`` have different lengths.
  """
  assert len(preds) == len(targets), "preds and targets must have the same length"

  # Nothing to score: skip loading the model and mirror the mean of an empty
  # collection.
  if len(preds) == 0:
    nan = np.float64("nan")
    return nan, nan, nan

  # One batched call; verbose=True keeps a progress bar like the old loop had.
  bert = bert_score(list(preds), list(targets), verbose=True)

  mean_f1 = np.mean(_scores_to_array(bert['f1']))
  mean_precision = np.mean(_scores_to_array(bert['precision']))
  mean_recall = np.mean(_scores_to_array(bert['recall']))
  return mean_f1, mean_precision, mean_recall


In [None]:
# BERTScore F1 / precision / recall, each averaged over the first prediction/target set.
bert_f1, bert_prec, bert_recall = evaluate_bert_score(predicted_outputs,target_outputs)

[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
 66%|██████▌   | 3226/4892 [6:35:11<3:39:53,  7.92s/it]Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 66%|██████▌   | 3227/4892 [6:35:18<3:37:54,  7.85s/it]Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_

In [None]:
# Report the three averaged BERTScore components with four decimal places.
print(f'The BERT F1 measure is {bert_f1:.4f}.')
print(f'The BERT precision value is {bert_prec:.4f}.')
print(f'The BERT recall value is {bert_recall:.4f}.')

The BERT F1 measure is 0.9828.
The BERT precision value is 0.9835.
The BERT recall value is 0.9820.


In [None]:
# Keep the BERTScore results under dataset-specific names; the summary
# dictionary built at the end of the notebook reads these `*_squad` variables.
bert_f1_squad = bert_f1
bert_prec_squad = bert_prec
bert_recall_squad = bert_recall

In [None]:
def evaluate_rouge_score(preds: list, targets: list):
  """Return the mean ROUGE-L F-measure, precision and recall over all pairs.

  Each prediction is scored against its own target with ``rouge_score``
  (``rouge_keys='rougeL'``) and the per-pair values are averaged.

  Args:
    preds: Predicted sentences, one string per test example.
    targets: Reference sentences, aligned index-by-index with ``preds``.

  Returns:
    A tuple ``(mean_fmeasure, mean_precision, mean_recall)``; every entry is
    ``nan`` when there are no examples.

  Raises:
    AssertionError: If ``preds`` and ``targets`` have different lengths.
  """
  assert len(preds) == len(targets), "preds and targets must have the same length"
  num_test = len(preds)

  # Nothing to average: return nan directly instead of letting np.mean warn
  # about an empty slice.
  if num_test == 0:
    nan = np.float64("nan")
    return nan, nan, nan

  # Plain lists are appended to in constant time; np.append would copy the
  # whole array on every iteration.
  fmeasures, precisions, recalls = [], [], []
  for prediction, target in tqdm(zip(preds, targets), total=num_test):
    rouge = rouge_score(prediction, target, rouge_keys='rougeL')
    fmeasures.append(float(rouge['rougeL_fmeasure']))
    precisions.append(float(rouge['rougeL_precision']))
    recalls.append(float(rouge['rougeL_recall']))

  return np.mean(fmeasures), np.mean(precisions), np.mean(recalls)

In [None]:
# ROUGE-L F-measure / precision / recall, each averaged over the first prediction/target set.
rouge_fmeasure_squad, rouge_prec_squad, rouge_recall_squad = evaluate_rouge_score(predicted_outputs, target_outputs)

100%|██████████| 4892/4892 [00:02<00:00, 2241.31it/s]


In [None]:
# Report the three averaged ROUGE-L components with four decimal places.
print(f'The ROUGE F-measure is {rouge_fmeasure_squad:.4f}.')
print(f'The ROUGE precision value is {rouge_prec_squad:.4f}.')
print(f'The ROUGE recall value is {rouge_recall_squad:.4f}.')

The ROUGE F-measure is 0.5138.
The ROUGE precision value is 0.5457.
The ROUGE recall value is 0.5066.


In [None]:
def evaluate_wer_score(preds: list, targets: list):
  """Return the mean per-sentence word error rate over all pairs.

  ``word_error_rate`` is evaluated separately for every prediction/target
  pair and the resulting rates are averaged, so this is a mean of
  sentence-level rates rather than one rate computed over the whole corpus.

  Args:
    preds: Predicted sentences, one string per test example.
    targets: Reference sentences, aligned index-by-index with ``preds``.

  Returns:
    The arithmetic mean of the per-sentence word error rates, or ``nan``
    when there are no examples.

  Raises:
    AssertionError: If ``preds`` and ``targets`` have different lengths.
  """
  assert len(preds) == len(targets), "preds and targets must have the same length"
  num_test = len(preds)

  # Nothing to average: return nan directly instead of letting np.mean warn
  # about an empty slice.
  if num_test == 0:
    return np.float64("nan")

  # Plain lists are appended to in constant time; np.append would copy the
  # whole array on every iteration.
  sentence_rates = []
  for prediction, target in tqdm(zip(preds, targets), total=num_test):
    sentence_rates.append(float(word_error_rate([prediction], [target])))

  return np.mean(sentence_rates)

In [None]:
# Mean per-sentence word error rate over the first prediction/target set.
wer_value_squad = evaluate_wer_score(predicted_outputs, target_outputs)

100%|██████████| 4892/4892 [00:00<00:00, 5018.74it/s]


In [None]:
# Report the averaged word error rate with four decimal places.
print(f'The word error rate is {wer_value_squad:.4f}.')

The word error rate is 0.6989.


We now repeat the process with the dataset from BioBert, which contains medical jargon that might not be included in the vocabulary of the T5 model.

In [None]:
# Load the target and predicted outputs from Google Drive.
# The files contain JSON, so they are decoded explicitly as UTF-8 rather than
# with whatever default encoding the platform happens to use.
with open("/content/drive/MyDrive/LAMA/Training_8/predicted_outputs_biobert.txt", "r", encoding="utf-8") as file:
  predicted_outputs_biobert = json.load(file)

with open("/content/drive/MyDrive/LAMA/Training_8/target_outputs_biobert.txt", "r", encoding="utf-8") as file:
  target_outputs_biobert = json.load(file)

# The evaluation helpers pair predictions and targets by index, so fail early
# if the two files do not line up.
assert len(predicted_outputs_biobert) == len(target_outputs_biobert), "predictions and targets differ in length"


In [None]:
# Mean sentence-level BLEU over the BioBert prediction/target set.
bleu_score_biobert = evaluate_bleu_score(predicted_outputs_biobert, target_outputs_biobert)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
100%|██████████| 209/209 [00:00<00:00, 5405.43it/s]


In [None]:
# Report the averaged BLEU score with four decimal places.
print(f'The BLEU-Score is {bleu_score_biobert:.4f}.')

The BLEU-Score is 0.3514.


In [None]:
# BERTScore F1 / precision / recall, each averaged over the BioBert prediction/target set.
bert_f1_biobert, bert_prec_biobert, bert_recall_biobert = evaluate_bert_score(predicted_outputs_biobert, target_outputs_biobert)

  "The argument `model_name_or_path` was not specified while it is required when default"
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 1/209 [00:09<31:52,  9.19s/it]Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bia

In [None]:
# Report the three averaged BERTScore components with four decimal places.
print(f'The BERT F1 measure is {bert_f1_biobert:.4f}.')
print(f'The BERT precision value is {bert_prec_biobert:.4f}.')
print(f'The BERT recall value is {bert_recall_biobert:.4f}.')

The BERT F1 measure is 0.9714.
The BERT precision value is 0.9740.
The BERT recall value is 0.9689.


In [None]:
# ROUGE-L F-measure / precision / recall, each averaged over the BioBert prediction/target set.
rouge_fmeasure_biobert, rouge_prec_biobert, rouge_recall_biobert = evaluate_rouge_score(predicted_outputs_biobert, target_outputs_biobert)

100%|██████████| 209/209 [00:00<00:00, 1141.37it/s]


In [None]:
# Report the three averaged ROUGE-L components with four decimal places.
print(f'The ROUGE F-measure is {rouge_fmeasure_biobert:.4f}.')
print(f'The ROUGE precision value is {rouge_prec_biobert:.4f}.')
print(f'The ROUGE recall value is {rouge_recall_biobert:.4f}.')

The ROUGE F-measure is 0.3122.
The ROUGE precision value is 0.3635.
The ROUGE recall value is 0.2922.


In [None]:
# Mean per-sentence word error rate over the BioBert prediction/target set.
wer_value_biobert = evaluate_wer_score(predicted_outputs_biobert, target_outputs_biobert)

100%|██████████| 209/209 [00:00<00:00, 4987.45it/s]


In [None]:
# Report the averaged word error rate with four decimal places.
print(f'The word error rate is {wer_value_biobert:.4f}.')

The word error rate is 0.8720.


In [None]:
evaluation_results = {
    "Bleu SQuAD":bleu_score_squad,
    "Bert F1 SQUAD":bert_f1_squad,
    "Bert Precision SQuAD":bert_prec_squad,
    "Bert Recall SQuAD":bert_recall_squad,
    "ROUGE F1 SQuAD":rouge_fmeasure_squad,
    "ROUGE Preicision SQuAD":rouge_prec_squad,
    "ROUGE Recall SQuAD":rouge_recall_squad,
    "WER SQuAD":wer_value_squad,
    "Bleu BioBert":bleu_score_biobert,
    "Bert F1 BioBert":bert_f1_biobert,
    "Bert Precision BioBert":bert_prec_biobert,
    "Bert Recall BioBert":bert_recall_biobert,
    "ROUGE F1 BioBert":rouge_fmeasure_biobert,
    "ROUGE Preicision BioBert":rouge_prec_biobert,
    "ROUGE Recall BioBert":rouge_recall_biobert,
    "WER BioBert":wer_value_biobert,
}

with open('eval_results.txt', 'w') as file:                                            # TO CHANGE
     file.write(json.dumps(evaluation_results))

# Download to Google Drive
%cp /content/eval_results.txt /content/drive/MyDrive/LAMA/Training_8                   # TO CHANGE