This notebook collects some candidate evaluation metrics so that we can include a discussion section on evaluation.

In [None]:
!pip3 install textstat rouge-score bert-score spacy scispacy
!python -m spacy download en_core_web_sm

In [None]:
import textstat
import spacy
from rouge_score import rouge_scorer
import bert_score

# Shared spaCy pipeline used by the entity-extraction cell below.
# This loads the general-purpose small English model; for medical text a
# scispacy model may be worth trying instead.
nlp = spacy.load(name="en_core_web_sm")

## 1️⃣ Semantic Coverage (ROUGE + BERTScore)

In [None]:
# NOTE(review): `source_text` and `generated_text` are not defined in the cells
# shown here — they must already exist in the kernel before this cell runs.

# ROUGE: the source is the target and the generated text is the prediction.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)
rouge_scores = scorer.score(target=source_text, prediction=generated_text)

# BERTScore: one candidate (generated text) scored against one reference (source).
P, R, F1 = bert_score.score(
    cands=[generated_text],
    refs=[source_text],
    lang="en",
    rescale_with_baseline=True,
)

print("\n🔍 Semantic Coverage:")
print("ROUGE-1 Recall:", rouge_scores['rouge1'].recall)
print("BERTScore F1:", F1[0].item())

## 2️⃣ Information Reliability (Entity Overlap Heuristic)

In [None]:
def extract_entities(text):
    """Return the distinct, lower-cased entity strings found in ``text``.

    The text is run through the module-level ``nlp`` pipeline, and the text of
    every item in the resulting ``doc.ents`` is lower-cased and collected into
    a set, so duplicates and case variants collapse to one entry.
    """
    return {entity.text.lower() for entity in nlp(text).ents}

# Entity sets for the source and for the generated text.
source_ents = extract_entities(source_text)
gen_ents = extract_entities(generated_text)

# Heuristic: fraction of generated-text entities that also occur in the source.
# The denominator is floored at 1, so an output with no entities scores 0.
intersection = source_ents & gen_ents
reliability_score = len(intersection) / (len(gen_ents) or 1)

print("\n✅ Information Reliability:")
print("Extracted Entities in Source:", source_ents)
print("Entities in Generated Output:", gen_ents)
print("Reliability Score:", round(reliability_score, 2))  # heuristic

## 3️⃣ Readability (textstat)

In [None]:
# Readability metrics for the generated text, computed with textstat.
# Each entry pairs the printed label with the textstat function that produces it.
print("\n📘 Readability Stats:")
readability_metrics = (
    ("Flesch-Kincaid Grade Level:", textstat.flesch_kincaid_grade),
    ("Reading Ease Score:", textstat.flesch_reading_ease),
    ("SMOG Index:", textstat.smog_index),
)
for label, metric in readability_metrics:
    print(label, metric(generated_text))