In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

from encoder import *
from AttnDecoder import * 
from seq2seq import *

from build_dataset import *
from inference import *
from tqdm import tqdm

In [5]:
# Test example:
hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
               'ensures', 'that', 'the', 'military', 'always',
               'obeys', 'the', 'commands', 'of', 'the', 'party']
hypothesis2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', 'interested', 'in', 'world', 'history']
hypotheses = [hypothesis1, hypothesis2]               

reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
               'ensures', 'that', 'the', 'military', 'will', 'forever',
               'heed', 'Party', 'commands']                  
reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
               'guarantees', 'the', 'military', 'forces', 'always',
               'being', 'under', 'the', 'command', 'of', 'the',
               'Party']               
reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
               'army', 'always', 'to', 'heed', 'the', 'directions',
               'of', 'the', 'party']              
ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', 'because', 'he', 'read', 'the', 'book']                
references = [[reference1, reference2, reference3], [ref2a]]

In [12]:
sentence_bleu([reference1], hypothesis1)

0.41180376356915777

In [6]:
preds_list = [['My', 'full', 'pytorch', 'test'], ['Another', 'Sentence'], ['This', 'is', 'another', 'sentence']]
labels_list = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']], [['No', 'Match']], [['Another', 'Different', 'One']] ]

In [35]:
# NLTK - modified_precision
# compute the BLEU score across all sentences for n in [1, 4]
all_bleu_1 = []
all_bleu_2 = []
all_bleu_3 = []
all_bleu_4 = []

for idx in range(len(references)):
    all_bleu_1.append(modified_precision(labels_list[idx], preds_list[idx], n=1))
    all_bleu_2.append(modified_precision(labels_list[idx], preds_list[idx], n=2))
    all_bleu_3.append(modified_precision(labels_list[idx], preds_list[idx], n=3))
    all_bleu_4.append(modified_precision(labels_list[idx], preds_list[idx], n=4))

print(float(np.mean(all_bleu_1)), float(np.mean(all_bleu_2)), float(np.mean(all_bleu_3)), float(np.mean(all_bleu_4)))
# print(round(np.mean(all_bleu_1),4), round(np.mean(all_bleu_2),4), round(np.mean(all_bleu_3),4), round(np.mean(all_bleu_4),4))

0.5 0.5 0.5 0.5


In [8]:
# NLTK - modified_precision
# compute the BLEU score across all sentences for n in [1, 4]
all_bleu_1 = []
all_bleu_2 = []
all_bleu_3 = []
all_bleu_4 = []

for idx in range(len(references)):
    all_bleu_1.append(modified_precision(references[idx], hypotheses[idx], n=(idx+1)))
    all_bleu_2.append(modified_precision(references[idx], hypotheses[idx], n=(idx+2)))
    all_bleu_3.append(modified_precision(references[idx], hypotheses[idx], n=(idx+3)))
    all_bleu_4.append(modified_precision(references[idx], hypotheses[idx], n=(idx+4)))

print(float(np.mean(all_bleu_1)), float(np.mean(all_bleu_2)), float(np.mean(all_bleu_3)), float(np.mean(all_bleu_4)))
# print(round(np.mean(all_bleu_1),4), round(np.mean(all_bleu_2),4), round(np.mean(all_bleu_3),4), round(np.mean(all_bleu_4),4))

0.9222222222222223 0.6274509803921569 0.46875 0.2761904761904762


In [7]:
# NLTK - sentence_bleu:
# compute the BLEU score across all sentences for n in [1, 4]
all_bleu_1 = []
all_bleu_2 = []
all_bleu_3 = []
all_bleu_4 = []

weights1 = [1.0/1.0]
weights2 = [1.0/2.0, 1.0/2.0]
weights3 = [1.0/3.0, 1.0/3.0, 1.0/3.0]
weights4 = [1.0/4.0, 1.0/4.0, 1.0/4.0, 1.0/4.0]

for idx in range(len(references)):
    all_bleu_1.append(sentence_bleu(references[idx], hypotheses[idx], weights1))
    all_bleu_2.append(sentence_bleu(references[idx], hypotheses[idx], weights2))
    all_bleu_3.append(sentence_bleu(references[idx], hypotheses[idx], weights3))
    all_bleu_4.append(sentence_bleu(references[idx], hypotheses[idx], weights4))

print(np.mean(all_bleu_1), np.mean(all_bleu_2), np.mean(all_bleu_3), np.mean(all_bleu_4))

0.9722222222222222 0.8470196452752219 0.7337526821183125 0.6223247442490669


In [22]:
# Torchmetrics - BLEUScore
# compute the BLEU score across all sentences for n in [1, 4]
from torchmetrics import BLEUScore

all_bleu_1 = []
all_bleu_2 = []
all_bleu_3 = []
all_bleu_4 = []

metric1 = BLEUScore(n_gram=1)
metric2 = BLEUScore(n_gram=2)
metric3 = BLEUScore(n_gram=3)
metric4 = BLEUScore(n_gram=4)
for idx in range(len(references)):
    all_bleu_1.append(metric1(hypotheses[idx], references[idx]))
    all_bleu_2.append(metric2(hypotheses[idx], references[idx]))
    all_bleu_3.append(metric3(hypotheses[idx], references[idx]))
    all_bleu_4.append(metric4(hypotheses[idx], references[idx]))

print(np.mean(all_bleu_1), np.mean(all_bleu_2), np.mean(all_bleu_3), np.mean(all_bleu_4))

0.8333334 0.0 0.0 0.0


In [27]:
from torchtext.data.metrics import bleu_score

all_bleu_1 = []
all_bleu_2 = []
all_bleu_3 = []
all_bleu_4 = []

weights1 = [1.0/1.0]
weights2 = [1.0/2.0, 1.0/2.0]
weights3 = [1.0/3.0, 1.0/3.0, 1.0/3.0]
weights4 = [1.0/4.0, 1.0/4.0, 1.0/4.0, 1.0/4.0]

for idx in range(len(labels_list)):
    all_bleu_1.append(bleu_score(preds_list[idx], labels_list[idx], max_n=1, weights=weights1))
    all_bleu_2.append(bleu_score(preds_list[idx], labels_list[idx], max_n=2, weights=weights2))
    all_bleu_3.append(bleu_score(preds_list[idx], labels_list[idx], max_n=3, weights=weights3))
    all_bleu_4.append(bleu_score(preds_list[idx], labels_list[idx], max_n=4, weights=weights4))

print(np.mean(all_bleu_1), np.mean(all_bleu_2), np.mean(all_bleu_3), np.mean(all_bleu_4))

AssertionError: The length of candidate and reference corpus should be the same

In [31]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/soominkim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/soominkim/nltk_data...


True

In [34]:
# compute the METEOR score:
all_meteor = []
for idx in range(len(labels_list)):
    all_meteor.append(meteor_score([labels_list[idx]], preds_list[idx]))

print(np.mean(all_meteor))

0.38449260752688175


In [65]:
from torchmetrics.text.rouge import ROUGEScore

preds_list = [['My', 'full', 'pytorch', 'test'], ['Another', 'Sentence'], ['This', 'is', 'another', 'sentence']]
labels_list = [['My', 'full', 'pytorch', 'test'], ['Completely', 'Different'], ['No', 'Match']]

all_rougeL = []
rouge = ROUGEScore(rouge_keys='rougeL')
for idx in range(len(labels_list)):
    all_rougeL.append(rouge(' '.join(preds_list[idx]), ' '.join(labels_list[idx])).get('rougeL_recall'))

# return np.mean(all_rougeL)

print(np.mean(all_rougeL))

# preds = ["My name is John"]
# target = ["Is your name John"]
# rouge = ROUGEScore(rouge_keys='rougeL')
# print(rouge(preds, target))

0.33333334


In [None]:
# how to plot attention scores (matrix that shows attention score mappping from words in target to source)