In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

from encoder import *
from AttnDecoder import * 
from seq2seq import *

from build_dataset import *
from inference import *
from tqdm import tqdm

In [5]:
# Test example:
hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
               'ensures', 'that', 'the', 'military', 'always',
               'obeys', 'the', 'commands', 'of', 'the', 'party']
hypothesis2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', 'interested', 'in', 'world', 'history']
hypotheses = [hypothesis1, hypothesis2]               

reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
               'ensures', 'that', 'the', 'military', 'will', 'forever',
               'heed', 'Party', 'commands']                  
reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
               'guarantees', 'the', 'military', 'forces', 'always',
               'being', 'under', 'the', 'command', 'of', 'the',
               'Party']               
reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
               'army', 'always', 'to', 'heed', 'the', 'directions',
               'of', 'the', 'party']              
ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', 'because', 'he', 'read', 'the', 'book']                
references = [[reference1, reference2, reference3], [ref2a]]

In [12]:
sentence_bleu([reference1], hypothesis1) 

0.41180376356915777

In [6]:
preds_list = [['My', 'full', 'pytorch', 'test'], ['Another', 'Sentence'], ['This', 'is', 'another', 'sentence']]
labels_list = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']], [['No', 'Match']], [['Another', 'Different', 'One']] ]

In [7]:
# NLTK - modified_precision
# compute the BLEU score across all sentences for n in [1, 4]
all_bleu_1 = []
all_bleu_2 = []
all_bleu_3 = []
all_bleu_4 = []

for idx in range(len(labels_list)):
    all_bleu_1.append(modified_precision(labels_list[idx], preds_list[idx], n=(idx+1)))
    all_bleu_2.append(modified_precision(labels_list[idx], preds_list[idx], n=(idx+2)))
    all_bleu_3.append(modified_precision(labels_list[idx], preds_list[idx], n=(idx+3)))
    all_bleu_4.append(modified_precision(labels_list[idx], preds_list[idx], n=(idx+4)))

print(np.mean(all_bleu_1), np.mean(all_bleu_2), np.mean(all_bleu_3), np.mean(all_bleu_4))
# print(round(np.mean(all_bleu_1),4), round(np.mean(all_bleu_2),4), round(np.mean(all_bleu_3),4), round(np.mean(all_bleu_4),4))

1/3 1/3 1/3 1/3


In [8]:
# NLTK - modified_precision
# compute the BLEU score across all sentences for n in [1, 4]
all_bleu_1 = []
all_bleu_2 = []
all_bleu_3 = []
all_bleu_4 = []

for idx in range(len(references)):
    all_bleu_1.append(modified_precision(references[idx], hypotheses[idx], n=(idx+1)))
    all_bleu_2.append(modified_precision(references[idx], hypotheses[idx], n=(idx+2)))
    all_bleu_3.append(modified_precision(references[idx], hypotheses[idx], n=(idx+3)))
    all_bleu_4.append(modified_precision(references[idx], hypotheses[idx], n=(idx+4)))

print(float(np.mean(all_bleu_1)), float(np.mean(all_bleu_2)), float(np.mean(all_bleu_3)), float(np.mean(all_bleu_4)))
# print(round(np.mean(all_bleu_1),4), round(np.mean(all_bleu_2),4), round(np.mean(all_bleu_3),4), round(np.mean(all_bleu_4),4))

0.9222222222222223 0.6274509803921569 0.46875 0.2761904761904762


In [7]:
# NLTK - sentence_bleu:
# compute the BLEU score across all sentences for n in [1, 4]
all_bleu_1 = []
all_bleu_2 = []
all_bleu_3 = []
all_bleu_4 = []

weights1 = [1.0/1.0]
weights2 = [1.0/2.0, 1.0/2.0]
weights3 = [1.0/3.0, 1.0/3.0, 1.0/3.0]
weights4 = [1.0/4.0, 1.0/4.0, 1.0/4.0, 1.0/4.0]

for idx in range(len(references)):
    all_bleu_1.append(sentence_bleu(references[idx], hypotheses[idx], weights1))
    all_bleu_2.append(sentence_bleu(references[idx], hypotheses[idx], weights2))
    all_bleu_3.append(sentence_bleu(references[idx], hypotheses[idx], weights3))
    all_bleu_4.append(sentence_bleu(references[idx], hypotheses[idx], weights4))

print(np.mean(all_bleu_1), np.mean(all_bleu_2), np.mean(all_bleu_3), np.mean(all_bleu_4))

0.9722222222222222 0.8470196452752219 0.7337526821183125 0.6223247442490669


In [10]:
# Torchmetrics - BLEUScore
# compute the BLEU score across all sentences for n in [1, 4]
all_bleu_1 = []
all_bleu_2 = []
all_bleu_3 = []
all_bleu_4 = []

metric1 = BLEUScore(n_gram=1)
metric2 = BLEUScore(n_gram=2)
metric3 = BLEUScore(n_gram=3)
metric4 = BLEUScore(n_gram=4)
for idx in range(len(labels_list)):
    all_bleu_1.append(metric1(preds_list[idx], labels_list[idx]))
    all_bleu_2.append(metric2(preds_list[idx], labels_list[idx]))
    all_bleu_3.append(metric3(preds_list[idx], labels_list[idx]))
    all_bleu_4.append(metric4(preds_list[idx], labels_list[idx]))

print(np.mean(all_bleu_1), np.mean(all_bleu_2), np.mean(all_bleu_3), np.mean(all_bleu_4))

NameError: name 'BLEUScore' is not defined

In [30]:
# compute the METEOR score:
all_meteor = []

for idx in range(len(labels_list)):
    all_meteor.append(meteor_score(labels_list[idx], preds_list[idx]))

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - '/Users/soomink/nltk_data'
    - '/opt/homebrew/Caskroom/miniconda/base/envs/nlp/nltk_data'
    - '/opt/homebrew/Caskroom/miniconda/base/envs/nlp/share/nltk_data'
    - '/opt/homebrew/Caskroom/miniconda/base/envs/nlp/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# from torchmetrics.text.rouge import ROUGEScore
preds = "My name is John"
target = "Is your name John"
rouge = ROUGEScore()
print(rouge(preds, target).)

{'rouge1_fmeasure': tensor(0.7500), 'rouge1_precision': tensor(0.7500), 'rouge1_recall': tensor(0.7500), 'rouge2_fmeasure': tensor(0.), 'rouge2_precision': tensor(0.), 'rouge2_recall': tensor(0.), 'rougeL_fmeasure': tensor(0.5000), 'rougeL_precision': tensor(0.5000), 'rougeL_recall': tensor(0.5000), 'rougeLsum_fmeasure': tensor(0.5000), 'rougeLsum_precision': tensor(0.5000), 'rougeLsum_recall': tensor(0.5000)}
