In [1]:
import nltk
import torch
import tqdm as notebook_tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel

# Put the local 'data' directory on NLTK's resource search path, then fetch
# the NLTK packages this notebook requests into that same directory.
nltk.data.path.append('data')
for nltk_package in ('punkt', 'wordnet'):
    nltk.download(nltk_package, download_dir='data')

# Pre-trained BERT: the tokenizer and the encoder are both loaded from the
# same checkpoint name so their vocabularies match.
model_name = 'bert-base-uncased'
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertModel.from_pretrained(model_name),
)

[nltk_data] Downloading package punkt to data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def preprocess_sentence(sentence):
    """Lowercase ``sentence`` and split it into tokens with ``word_tokenize``.

    Args:
        sentence: The raw text to prepare; must support ``.lower()``.

    Returns:
        Whatever ``word_tokenize`` returns for the lowercased text (the
        notebook prints it as a list of token strings).
    """
    lowered = sentence.lower()
    tokens = word_tokenize(lowered)
    return tokens

def calculate_similarity(reference, candidate):
    """Score ``candidate`` against one ``reference`` with BLEU and METEOR.

    Args:
        reference: The reference sentence, passed to both metrics as the
            only entry of a one-element reference list.
        candidate: The sentence being evaluated, in the same form as
            ``reference`` (callers in this notebook pass token lists).

    Returns:
        A ``(bleu, meteor)`` tuple. BLEU is computed with
        ``SmoothingFunction().method4`` as the smoothing function.
    """
    references = [reference]
    smoothing = SmoothingFunction().method4
    bleu = sentence_bleu(references, candidate, smoothing_function=smoothing)
    meteor = meteor_score(references, candidate)
    return bleu, meteor

In [3]:
def get_bert_embedding(sentence):
    """Return the BERT ``[CLS]`` embedding of ``sentence`` as a NumPy array.

    The sentence is encoded with the module-level ``tokenizer`` and run
    through the module-level ``model`` with gradients disabled.

    Args:
        sentence: The raw text (a ``str``) to embed.

    Returns:
        A 2-D NumPy array with a single row: the hidden state at the first
        position (the ``[CLS]`` token) of the model's first output.
    """
    # Let the tokenizer add [CLS]/[SEP] itself and truncate to the model's
    # maximum length. A hand-built id list is never truncated, so very long
    # input would overflow the position embeddings and crash the forward pass.
    encoded = tokenizer(sentence, return_tensors='pt', truncation=True)
    with torch.no_grad():
        outputs = model(**encoded)
    # outputs[0] holds the per-token hidden states; index 0 along the
    # sequence axis is the [CLS] token.
    cls_embedding = outputs[0][:, 0, :]
    return cls_embedding.numpy()

def get_bert_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embeddings.

    Args:
        embedding1: First embedding, in the 2-D form accepted by
            ``cosine_similarity`` (e.g. the output of ``get_bert_embedding``).
        embedding2: Second embedding, in the same form.

    Returns:
        The entry at row 0, column 0 of the pairwise similarity matrix, i.e.
        the similarity between the first row of each input.
    """
    pairwise = cosine_similarity(embedding1, embedding2)
    first_pair = pairwise[0][0]
    return first_pair

In [4]:
import json

# This is just a test file for doing the similarity tests.
# Read it inside a context manager so the file handle is always closed, and
# decode it explicitly as UTF-8: the Spanish text contains accented characters
# that a platform-default encoding may not decode correctly.
with open('data/test.json', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

# Raw sentences (kept as strings for the BERT embedding cells).
english = test_data['english']
spanish_translation = test_data['spanish']
gpt_english = test_data['gpt_english']

# Lowercased, tokenized versions for the BLEU / METEOR cells.
processed_english = preprocess_sentence(english)
processed_spanish_translation = preprocess_sentence(spanish_translation)
processed_gpt_english = preprocess_sentence(gpt_english)

print('Spanish:', processed_spanish_translation)
print('English:', processed_english)
print('GPT English:', processed_gpt_english)

Spanish: ['registro', 'genealógico', 'de', 'jesucristo', ',', 'hijo', 'de', 'david', 'y', 'de', 'abraham']
English: ['this', 'is', 'the', 'genealogy', 'of', 'jesus', 'the', 'messiah', 'the', 'son', 'of', 'david', ',', 'the', 'son', 'of', 'abraham']
GPT English: ['genealogical', 'record', 'of', 'jesus', 'christ', ',', 'son', 'of', 'david', 'and', 'of', 'abraham', '.']


In [5]:
def _report_scores(heading, candidate_tokens):
    """Score ``candidate_tokens`` against ``processed_english`` and print the result.

    Prints ``heading`` followed by the BLEU and METEOR values, and returns the
    ``(bleu, meteor)`` pair from ``calculate_similarity``.
    """
    scores = calculate_similarity(processed_english, candidate_tokens)
    print(heading)
    print("BLEU score: {} METEOR score: {}".format(*scores))
    return scores


# Sanity check: the reference scored against itself.
bleu_score, meteor_score_value = _report_scores("--- Sanity check ---", processed_english)

# Check similarity between gpt english translation and actual english version
bleu_score, meteor_score_value = _report_scores("--- GPT English ---", processed_gpt_english)

# Checking with google translation
google_english = preprocess_sentence(test_data['google_english'])
bleu_score, meteor_score_value = _report_scores("--- Google English ---", google_english)

--- Sanity check ---
BLEU score: 1.0 METEOR score: 0.999898229187869
--- GPT English ---
BLEU score: 0.10871733577564763 METEOR score: 0.4990963855421686
--- Google English ---
BLEU score: 0.09203291586838737 METEOR score: 0.35606744956338454


In [6]:
# Embed the raw (untokenized) sentences; get_bert_embedding does its own tokenization.
english_embedding = get_bert_embedding(english)
gpt_english_embedding = get_bert_embedding(gpt_english)

# Reference English vs. the GPT translation.
similarity = get_bert_similarity(english_embedding, gpt_english_embedding)
print("BERT Similarity between English and GPT English:", similarity)

# Sanity check: an embedding compared with itself. The label used to repeat
# "English and GPT English", which mislabelled this self-comparison.
similarity = get_bert_similarity(english_embedding, english_embedding)
print("BERT Similarity between English and itself (sanity check):", similarity)

BERT Similarity between English and GPT English: 0.753445
BERT Similarity between English and GPT English: 0.99999994


In [8]:
# More testing: two short sentences that differ only in their first word.
test_reference_text = "Hello, how are you doing?"
test_candidate_text = "Hi, how are you doing?"

# Token-overlap metrics on the tokenized sentences.
test_english = preprocess_sentence(test_reference_text)
test_gpt_english = preprocess_sentence(test_candidate_text)
bleu_score, meteor_score_value = calculate_similarity(test_english, test_gpt_english)
print("--- Test English ---")
print("BLEU score: {} METEOR score: {}".format(bleu_score, meteor_score_value))

# Embedding similarity on the raw sentences.
test_english_embedding = get_bert_embedding(test_reference_text)
test_gpt_english_embedding = get_bert_embedding(test_candidate_text)
similarity = get_bert_similarity(test_english_embedding, test_gpt_english_embedding)
print("BERT Similarity between Test English and Test GPT English:", similarity)

--- Test English ---
BLEU score: 0.8091067115702212 METEOR score: 0.9985422740524781
BERT Similarity between Test English and Test GPT English: 0.9887735
