In [1]:
# Import NLTK's sentence-level BLEU scorer (NLTK must already be installed; this cell does not install it)
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Example pair: the candidate omits the second "the" of the reference
reference = "the cat is on the mat"
candidate = "the cat is on mat"

# Whitespace tokenization of both sentences
ref_tokens, cand_tokens = reference.split(), candidate.split()
# ref_tokens  -> ['the', 'cat', 'is', 'on', 'the', 'mat']
# cand_tokens -> ['the', 'cat', 'is', 'on', 'mat']

# sentence_bleu takes a LIST of tokenized references (here just one) and a
# single tokenized hypothesis; the default weights cover 1- to 4-grams.
bleu_score = sentence_bleu(references=[ref_tokens], hypothesis=cand_tokens)
print(f"BLEU score: {bleu_score:.3f}")


BLEU score: 0.579


In [5]:
from collections import Counter

# Example pair: the reference text and a shorter candidate summary
reference = "the cat is on the mat"
candidate = "the cat is mat"

# Whitespace tokenization of both sentences
ref_tokens, cand_tokens = reference.split(), candidate.split()

# Unigram frequency tables for each side
ref_counts, cand_counts = Counter(ref_tokens), Counter(cand_tokens)

# Clipped unigram overlap: each word is credited at most as many times as it
# occurs in BOTH texts. Counter intersection (&) keeps the per-word minimum.
overlap = sum((cand_counts & ref_counts).values())

# Precision = overlap / candidate length; recall = overlap / reference length.
# An empty token list yields 0.0 rather than a division by zero.
if cand_tokens:
    precision = overlap / len(cand_tokens)
else:
    precision = 0.0

if ref_tokens:
    recall = overlap / len(ref_tokens)
else:
    recall = 0.0

# F1 is the harmonic mean of precision and recall (0.0 when both are zero)
f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"ROUGE-1 F1: {f1:.2f}")


Precision: 1.00
Recall: 0.67
ROUGE-1 F1: 0.80


In [6]:
import nltk
# Fetch the WordNet corpus into NLTK's data directory. NLTK's meteor_score,
# used later in this notebook, consults WordNet for its synonym-matching stage.
nltk.download('wordnet')  # Ensure WordNet is available for synonym matching


[nltk_data] Downloading package wordnet to /root/nltk_data...


METEOR score: 0.304


In [8]:
from nltk.translate.meteor_score import meteor_score

# Human-written reference sentence and the model-generated hypothesis
reference = "The boy quickly ran to school to avoid being late"
hypothesis = "The kid hurried to the school so he wouldn't be late"

# meteor_score works on pre-tokenized input: a list of reference token lists
# (a single reference here) plus one hypothesis token list.
# The resulting score lies between 0 and 1.
score = meteor_score(
    references=[reference.split()],
    hypothesis=hypothesis.split(),
)
print(f"METEOR score: {score:.3f}")



METEOR score: 0.368


In [9]:
import math

def average_negative_log2(token_probs):
    """Return the average negative log2-probability (bits per token).

    Args:
        token_probs: Iterable of probabilities the model assigned to each
            token of a sequence; every value must lie in the interval (0, 1].

    Returns:
        The mean of -log2(p) over all supplied probabilities, as a float.

    Raises:
        ValueError: If no probabilities are given, or if any value is
            outside (0, 1] (log2 is undefined at 0 and a value above 1 is
            not a probability).
    """
    token_probs = list(token_probs)
    # Explicit checks instead of `assert`: assertions are stripped under
    # `python -O`, and an empty list would otherwise divide by zero.
    if not token_probs:
        raise ValueError("Need at least one probability to compute perplexity")
    if not all(0 < p <= 1 for p in token_probs):
        raise ValueError("Probabilities must be > 0 and <= 1")
    return -sum(math.log2(p) for p in token_probs) / len(token_probs)


# Suppose our model predicts the following probabilities for each token in a sequence:
probs = [0.1, 0.5, 0.2, 0.8]  # Example probabilities P_hat(y_j) for j=1..4

# Average negative log2-probability of the sequence (validated inside the helper)
avg_neg_log2 = average_negative_log2(probs)

# Perplexity is 2 raised to that average, i.e. the inverse geometric mean
# of the token probabilities
perplexity = 2 ** avg_neg_log2
print(f"Perplexity: {perplexity:.2f}")


Perplexity: 3.34


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ground-truth labels and model predictions for five examples
# (1 = positive class, 0 = negative class)
y_true = [1, 0, 1, 1, 0]
y_pred = [1, 0, 0, 1, 0]

# Each scikit-learn metric takes the true labels and the predicted labels;
# they are passed by keyword here so the argument order cannot be mixed up.
acc = accuracy_score(y_true=y_true, y_pred=y_pred)
prec = precision_score(y_true=y_true, y_pred=y_pred)
rec = recall_score(y_true=y_true, y_pred=y_pred)
f1 = f1_score(y_true=y_true, y_pred=y_pred)

# Report all four metrics rounded to two decimals
print(f"Accuracy:  {acc:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall:    {rec:.2f}")
print(f"F1-score:  {f1:.2f}")


Accuracy:  0.80
Precision: 1.00
Recall:    0.67
F1-score:  0.80
