In [5]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter
import math

def rouge_n(reference, candidate, n):
    ref_ngrams = set(ngrams(word_tokenize(reference.lower()), n))
    cand_ngrams = set(ngrams(word_tokenize(candidate.lower()), n))
    
    overlap = len(ref_ngrams.intersection(cand_ngrams))
    total = len(ref_ngrams)
    
    return overlap / total if total > 0 else 0

def rouge_l(reference, candidate):
    ref_tokens = word_tokenize(reference.lower())
    cand_tokens = word_tokenize(candidate.lower())
    
    m, n = len(ref_tokens), len(cand_tokens)
    lcs_table = [[0] * (n + 1) for _ in range(m + 1)]
    
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if ref_tokens[i-1] == cand_tokens[j-1]:
                lcs_table[i][j] = lcs_table[i-1][j-1] + 1
            else:
                lcs_table[i][j] = max(lcs_table[i-1][j], lcs_table[i][j-1])
    
    lcs_length = lcs_table[m][n]
    return lcs_length / len(ref_tokens)

def bleu_score(reference, candidate, max_n=4):
    ref_tokens = word_tokenize(reference.lower())
    cand_tokens = word_tokenize(candidate.lower())
    
    precisions = []
    for n in range(1, max_n + 1):
        ref_ngrams = Counter(ngrams(ref_tokens, n))
        cand_ngrams = Counter(ngrams(cand_tokens, n))
        
        total_ngrams = sum(cand_ngrams.values())
        matching_ngrams = sum((ref_ngrams & cand_ngrams).values())
        
        precisions.append(matching_ngrams / total_ngrams if total_ngrams > 0 else 0)
    
    bp = min(1, math.exp(1 - len(ref_tokens) / len(cand_tokens))) if len(cand_tokens) > 0 else 0
    
    return bp * math.exp(sum(math.log(p) for p in precisions if p > 0) / max_n)

def compare_metrics(reference, candidate):
    rouge_1 = rouge_n(reference, candidate, 1)
    rouge_2 = rouge_n(reference, candidate, 2)
    rouge_l_score = rouge_l(reference, candidate)
    bleu = bleu_score(reference, candidate)
    
    print(f"ROUGE-1: {rouge_1:.4f}")
    print(f"ROUGE-2: {rouge_2:.4f}")
    print(f"ROUGE-L: {rouge_l_score:.4f}")
    print(f"BLEU: {bleu:.4f}")

# Example usage
reference = "The cat is on the mat."
candidate = "There is a cat on the mat."

compare_metrics(reference, candidate)

ROUGE-1: 1.0000
ROUGE-2: 0.5000
ROUGE-L: 0.7143
BLEU: 0.3826


In [9]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.squeeze(0).numpy()

def bertscore_f1(reference, candidate):
    # Get BERT embeddings
    ref_embeddings = get_bert_embeddings(reference)
    cand_embeddings = get_bert_embeddings(candidate)
    
    # Compute cosine similarity
    similarity_matrix = cosine_similarity(ref_embeddings, cand_embeddings)
    
    # Compute precision and recall
    precision = similarity_matrix.max(axis=0).mean()
    recall = similarity_matrix.max(axis=1).mean()
    
    # Compute F1 score
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    return {"precision": precision, "recall": recall, "f1": f1}

# Example usage
reference = "The cat is sitting on the mat."
candidate = "A feline is resting on the floor covering."

scores = bertscore_f1(reference, candidate)
print(f"BERTScore - Precision: {scores['precision']:.4f}, Recall: {scores['recall']:.4f}, F1: {scores['f1']:.4f}")

# Compare with a less semantically similar candidate
candidate2 = "The dog is running in the park."
scores2 = bertscore_f1(reference, candidate2)
print(f"BERTScore - Precision: {scores2['precision']:.4f}, Recall: {scores2['recall']:.4f}, F1: {scores2['f1']:.4f}")

BERTScore - Precision: 0.6806, Recall: 0.7502, F1: 0.7137
BERTScore - Precision: 0.7620, Recall: 0.7814, F1: 0.7716


In [11]:
import numpy as np

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def greedy_matching(reference_embeddings, candidate_embeddings, threshold=0):
    matches = []
    unmatched_candidate_indices = list(range(len(candidate_embeddings)))
    
    for i, ref_embedding in enumerate(reference_embeddings):
        best_match = -1
        best_score = -float('inf')
        
        for j in unmatched_candidate_indices:
            cand_embedding = candidate_embeddings[j]
            score = cosine_similarity(ref_embedding, cand_embedding)
            
            if score > best_score and score > threshold:
                best_match = j
                best_score = score
        
        if best_match != -1:
            matches.append((i, best_match, best_score))
            unmatched_candidate_indices.remove(best_match)
    
    return matches

# Example usage
reference = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])  # 3 reference "words"
candidate = np.array([[1, 1, 0.9], [3, 3, 3.1], [4, 4, 4]])  # 3 candidate "words"

matches = greedy_matching(reference, candidate, threshold=0.9)

print("Matches (reference_index, candidate_index, similarity_score):")
for match in matches:
    print(f"Reference word {match[0]} matched with candidate word {match[1]}, score: {match[2]:.4f}")

print("\nUnmatched reference words:", set(range(len(reference))) - set(m[0] for m in matches))
print("Unmatched candidate words:", set(range(len(candidate))) - set(m[1] for m in matches))


Matches (reference_index, candidate_index, similarity_score):
Reference word 0 matched with candidate word 2, score: 1.0000
Reference word 1 matched with candidate word 1, score: 0.9999
Reference word 2 matched with candidate word 0, score: 0.9988

Unmatched reference words: set()
Unmatched candidate words: set()
