<a href="https://colab.research.google.com/github/jayanthbagare/MedicalDocumentSimilarity/blob/main/DocumentSimilarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Required libraries
# pip install sentence-transformers scikit-learn nltk spacy textstat numpy scipy

# Download required models (run once)
# python -m spacy download en_core_web_sm

import nltk
# Run these once to download required data
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [2]:
import nltk
import spacy
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import textstat
import re

class DocumentAnalyzer:
    """Loads the NLP models once and segments raw text into analysis units.

    Attributes:
        nlp: spaCy pipeline ("en_core_web_sm") used for sentence segmentation.
        sentence_model: SentenceTransformer ('all-MiniLM-L6-v2'), exposed so
            other analyzers can reuse the loaded model.
    """

    # A paragraph break is a blank line: two newlines that may be separated
    # by whitespace. This covers "\n\n", "\r\n\r\n" and "\n   \n".
    _PARAGRAPH_BREAK = re.compile(r'\n\s*\n')

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

    def preprocess_document(self, text):
        """
        Preprocesses a document into its component parts

        Args:
            text (str): Raw document text

        Returns:
            dict: Contains raw_text, sentences, paragraphs, and spacy doc object
                ('doc_object'). For blank input both 'sentences' and
                'paragraphs' are empty lists.
        """
        # Clean and segment the document using spaCy
        doc = self.nlp(text)

        # Extract sentences using spaCy's sentence segmentation, dropping
        # whitespace-only spans so they cannot reach the embedding model
        stripped = (sent.text.strip() for sent in doc.sents)
        sentences = [s for s in stripped if s]

        # Extract paragraphs by splitting on blank lines.
        # A document without paragraph breaks yields a single paragraph;
        # blank text yields no paragraphs at all (not [''], which would
        # bypass the "empty paragraph list" guards used downstream).
        paragraphs = [
            p.strip() for p in self._PARAGRAPH_BREAK.split(text) if p.strip()
        ]

        return {
            'raw_text': text,
            'sentences': sentences,
            'paragraphs': paragraphs,
            'doc_object': doc
        }

In [3]:
class SemanticAnalyzer:
    """Embedding-based similarity at document, paragraph and sentence level.

    Args:
        sentence_model: Object exposing ``encode(list_of_str)`` that returns
            one embedding row per input string (a SentenceTransformer in
            this notebook).
    """

    def __init__(self, sentence_model):
        self.sentence_model = sentence_model

    def document_level_similarity(self, doc1, doc2):
        """
        Computes overall semantic similarity between two documents

        Args:
            doc1, doc2: Preprocessed document dictionaries

        Returns:
            float: Cosine similarity between the two document embeddings.
                Cosine similarity lies in [-1, 1].
        """
        # Encode entire documents as single embeddings.
        # NOTE(review): sentence-transformer models have a maximum input
        # length, so very long documents may be truncated here - confirm.
        emb1 = self.sentence_model.encode([doc1['raw_text']])
        emb2 = self.sentence_model.encode([doc2['raw_text']])

        # Calculate cosine similarity
        return float(cosine_similarity(emb1, emb2)[0][0])

    def paragraph_level_similarity(self, doc1, doc2):
        """
        Aligns paragraphs between documents to find best semantic matches

        Uses Hungarian algorithm to find optimal paragraph-to-paragraph alignment

        Returns:
            dict: Always contains
                'average_alignment_score' (float, mean similarity of the
                    aligned pairs; 0.0 when either document has no paragraphs),
                'alignment_pairs' (list of (doc1_index, doc2_index, score)),
                'similarity_matrix' (array of shape
                    (len(doc1 paragraphs), len(doc2 paragraphs))).
        """
        paragraphs1 = doc1['paragraphs']
        paragraphs2 = doc2['paragraphs']

        # Handle empty paragraph lists. The result carries the same keys as
        # the normal path so callers can read 'similarity_matrix' safely.
        if not paragraphs1 or not paragraphs2:
            return {
                'average_alignment_score': 0.0,
                'alignment_pairs': [],
                'similarity_matrix': np.zeros((len(paragraphs1), len(paragraphs2)))
            }

        # Generate embeddings for all paragraphs
        p1_embeddings = self.sentence_model.encode(paragraphs1)
        p2_embeddings = self.sentence_model.encode(paragraphs2)

        # Create similarity matrix (each cell is cosine similarity)
        similarity_matrix = cosine_similarity(p1_embeddings, p2_embeddings)

        # Find optimal alignment using Hungarian algorithm
        # We negate the matrix because linear_sum_assignment finds minimum cost
        from scipy.optimize import linear_sum_assignment
        row_indices, col_indices = linear_sum_assignment(-similarity_matrix)

        # Extract the similarity scores for the optimal alignment
        aligned_scores = similarity_matrix[row_indices, col_indices]

        return {
            'average_alignment_score': float(np.mean(aligned_scores)),
            'alignment_pairs': list(zip(row_indices, col_indices, aligned_scores)),
            'similarity_matrix': similarity_matrix
        }

    def sentence_level_patterns(self, doc1, doc2):
        """
        Compares the overall semantic "style" of sentences between documents

        Method: Computes mean embedding for all sentences in each document,
        then compares these mean embeddings

        Returns:
            float: Cosine similarity of the two sentence-embedding centroids;
                0.0 when either document has no sentences
        """
        # Handle empty sentence lists
        if not doc1['sentences'] or not doc2['sentences']:
            return 0.0

        # Generate embeddings for all sentences
        s1_embeddings = self.sentence_model.encode(doc1['sentences'])
        s2_embeddings = self.sentence_model.encode(doc2['sentences'])

        # Compute centroid (mean) of sentence embeddings for each document
        s1_mean = np.mean(s1_embeddings, axis=0)
        s2_mean = np.mean(s2_embeddings, axis=0)

        # Compare the centroids
        return float(cosine_similarity([s1_mean], [s2_mean])[0][0])

In [4]:
class StructuralAnalyzer:
    """Scores structural similarity between two preprocessed documents.

    Covers document organization, discourse markers, POS patterns and
    paragraph roles.

    Args:
        nlp_model: spaCy pipeline; stored on the instance as ``nlp``.
    """

    def __init__(self, nlp_model):
        self.nlp = nlp_model

    def document_organization_similarity(self, doc1, doc2):
        """
        Compare high-level document structure metrics

        Analyzes:
        - Number of paragraphs and sentences
        - Average lengths
        - Length distributions
        """
        features = {}

        for name, doc in [('doc1', doc1), ('doc2', doc2)]:
            # Calculate basic structural metrics
            paragraph_lengths = [len(p.split()) for p in doc['paragraphs']]
            sentence_lengths = [len(s.split()) for s in doc['sentences']]

            features[name] = {
                'paragraph_count': len(doc['paragraphs']),
                'sentence_count': len(doc['sentences']),
                'avg_paragraph_length': np.mean(paragraph_lengths) if paragraph_lengths else 0,
                'avg_sentence_length': np.mean(sentence_lengths) if sentence_lengths else 0,
                'paragraph_length_distribution': paragraph_lengths
            }

        # Calculate structural similarity
        structure_score = self._compare_structural_features(features['doc1'], features['doc2'])
        return structure_score

    def discourse_pattern_similarity(self, doc1, doc2):
        """Analyze discourse markers and transitions"""
        discourse_markers = [
            'however', 'therefore', 'furthermore', 'moreover', 'nevertheless',
            'consequently', 'additionally', 'meanwhile', 'similarly', 'conversely',
            'in contrast', 'on the other hand', 'as a result', 'for example',
            'in conclusion', 'to summarize', 'first', 'second', 'finally'
        ]

        def extract_discourse_features(text):
            text_lower = text.lower()
            marker_counts = {marker: text_lower.count(marker) for marker in discourse_markers}
            total_markers = sum(marker_counts.values())

            return {
                'total_discourse_markers': total_markers,
                'marker_density': total_markers / len(text.split()),
                'marker_distribution': marker_counts
            }

        d1_features = extract_discourse_features(doc1['raw_text'])
        d2_features = extract_discourse_features(doc2['raw_text'])

        return self._compare_discourse_features(d1_features, d2_features)

    def syntactic_pattern_similarity(self, doc1, doc2):
        """Compare syntactic structures using POS patterns"""
        def extract_pos_patterns(doc_object):
            # Extract POS tag sequences for sentences
            pos_patterns = []
            for sent in doc_object.sents:
                pos_sequence = [token.pos_ for token in sent if not token.is_space]
                pos_patterns.append(tuple(pos_sequence))

            # Count pattern frequencies
            from collections import Counter
            pattern_counts = Counter(pos_patterns)

            return {
                'pos_patterns': pattern_counts,
                'avg_sentence_complexity': np.mean([len(p) for p in pos_patterns]),
                'unique_patterns': len(pattern_counts)
            }

        p1 = extract_pos_patterns(doc1['doc_object'])
        p2 = extract_pos_patterns(doc2['doc_object'])

        return self._compare_syntactic_features(p1, p2)

    def paragraph_role_similarity(self, doc1, doc2):
        """Identify and compare paragraph functional roles"""
        def classify_paragraph_role(paragraph):
            paragraph_lower = paragraph.lower()

            # Simple heuristic-based classification
            if any(word in paragraph_lower for word in ['introduction', 'begin', 'start', 'overview']):
                return 'introduction'
            elif any(word in paragraph_lower for word in ['conclusion', 'summary', 'end', 'finally']):
                return 'conclusion'
            elif any(word in paragraph_lower for word in ['example', 'instance', 'case study']):
                return 'example'
            elif any(word in paragraph_lower for word in ['argument', 'claim', 'assert', 'maintain']):
                return 'argument'
            else:
                return 'body'

        d1_roles = [classify_paragraph_role(p) for p in doc1['paragraphs']]
        d2_roles = [classify_paragraph_role(p) for p in doc2['paragraphs']]

        # Compare role sequences
        from difflib import SequenceMatcher
        role_similarity = SequenceMatcher(None, d1_roles, d2_roles).ratio()

        return {
            'role_sequence_similarity': role_similarity,
            'doc1_structure': d1_roles,
            'doc2_structure': d2_roles
        }

    def _compare_structural_features(self, f1, f2):
        """
        Compare numerical structural features between two documents

        Method:
        1. Normalize differences between basic metrics (paragraphs, sentences, lengths)
        2. Compare paragraph length distributions using Jensen-Shannon divergence
        3. Combine with weighted average
        """
        # Avoid division by zero errors
        def safe_relative_diff(a, b):
            if max(a, b) == 0:
                return 0.0
            return abs(a - b) / max(a, b)

        # Calculate normalized differences (lower = more similar)
        para_diff = safe_relative_diff(f1['paragraph_count'], f2['paragraph_count'])
        sent_diff = safe_relative_diff(f1['sentence_count'], f2['sentence_count'])
        avg_para_diff = safe_relative_diff(f1['avg_paragraph_length'], f2['avg_paragraph_length'])
        avg_sent_diff = safe_relative_diff(f1['avg_sentence_length'], f2['avg_sentence_length'])

        # Compare paragraph length distributions
        distribution_similarity = 0.5  # Default if no valid distributions

        if f1['paragraph_length_distribution'] and f2['paragraph_length_distribution']:
            from scipy.spatial.distance import jensenshannon

            # Create histograms with same bins
            all_lengths = f1['paragraph_length_distribution'] + f2['paragraph_length_distribution']
            if all_lengths:
                max_length = max(all_lengths)
                min_length = min(all_lengths)

                if max_length > min_length:
                    # Create histogram bins
                    bins = np.linspace(min_length, max_length, 10)

                    # Calculate histograms
                    hist1, _ = np.histogram(f1['paragraph_length_distribution'], bins=bins)
                    hist2, _ = np.histogram(f2['paragraph_length_distribution'], bins=bins)

                    # Normalize to probabilities
                    hist1_norm = hist1 / np.sum(hist1) if np.sum(hist1) > 0 else np.ones_like(hist1) / len(hist1)
                    hist2_norm = hist2 / np.sum(hist2) if np.sum(hist2) > 0 else np.ones_like(hist2) / len(hist2)

                    # Calculate Jensen-Shannon divergence (0 = identical, 1 = completely different)
                    js_distance = jensenshannon(hist1_norm, hist2_norm)
                    distribution_similarity = 1 - js_distance

        # Combine all features with equal weighting
        # Convert differences to similarities (1 - difference)
        structure_score = (
            (1 - para_diff) * 0.2 +
            (1 - sent_diff) * 0.2 +
            (1 - avg_para_diff) * 0.2 +
            (1 - avg_sent_diff) * 0.2 +
            distribution_similarity * 0.2
        )

        return max(0.0, min(1.0, structure_score))  # Clamp to [0,1]

    def _compare_discourse_features(self, d1, d2):
        """
        Compare discourse marker usage between documents

        Args:
            d1, d2: Discourse feature dictionaries

        Returns:
            float: Combined similarity score for discourse patterns
        """
        # Compare marker density (markers per word)
        max_density = max(d1['marker_density'], d2['marker_density'], 0.001)  # Avoid division by zero
        density_diff = abs(d1['marker_density'] - d2['marker_density'])
        density_similarity = 1 - min(density_diff / max_density, 1.0)

        # Compare distribution of specific markers
        all_markers = set(d1['marker_distribution'].keys()) | set(d2['marker_distribution'].keys())

        if not all_markers:
            distribution_sim = 1.0  # Both have no markers
        else:
            # Create vectors for all markers
            d1_vector = [d1['marker_distribution'].get(marker, 0) for marker in all_markers]
            d2_vector = [d2['marker_distribution'].get(marker, 0) for marker in all_markers]

            # Calculate cosine similarity if both have markers
            d1_sum = sum(d1_vector)
            d2_sum = sum(d2_vector)

            if d1_sum == 0 and d2_sum == 0:
                distribution_sim = 1.0  # Both have no markers
            elif d1_sum == 0 or d2_sum == 0:
                distribution_sim = 0.0  # One has markers, other doesn't
            else:
                distribution_sim = cosine_similarity([d1_vector], [d2_vector])[0][0]

        return (density_similarity + distribution_sim) / 2

    def _compare_syntactic_features(self, p1, p2):
        """
        Compare syntactic patterns between documents

        Args:
            p1, p2: Dictionaries with POS pattern features

        Returns:
            float: Combined similarity score for syntactic patterns
        """
        # Compare sentence complexity (average POS tags per sentence)
        max_complexity = max(p1['avg_sentence_complexity'], p2['avg_sentence_complexity'], 1.0)
        complexity_diff = abs(p1['avg_sentence_complexity'] - p2['avg_sentence_complexity'])
        complexity_similarity = 1 - min(complexity_diff / max_complexity, 1.0)

        # Compare POS pattern overlap (Jaccard similarity)
        set1 = set(p1['pos_patterns'].keys())
        set2 = set(p2['pos_patterns'].keys())

        if not set1 and not set2:
            pattern_overlap = 1.0  # Both have no patterns
        elif not set1 or not set2:
            pattern_overlap = 0.0  # One has patterns, other doesn't
        else:
            intersection = len(set1 & set2)
            union = len(set1 | set2)
            pattern_overlap = intersection / union  # Jaccard similarity

        return (complexity_similarity + pattern_overlap) / 2

In [5]:
class CombinedSimilarityAnalyzer:
    """Runs the semantic and structural analyzers and blends their scores.

    Construction creates a DocumentAnalyzer (which loads the spaCy pipeline
    and the sentence-transformer model) and shares those models with the
    semantic and structural analyzers.
    """

    # Keys that analyze_similarity reads from the ``weights`` argument
    _WEIGHT_KEYS = ('semantic', 'structural')

    def __init__(self):
        self.doc_analyzer = DocumentAnalyzer()
        self.semantic_analyzer = SemanticAnalyzer(self.doc_analyzer.sentence_model)
        self.structural_analyzer = StructuralAnalyzer(self.doc_analyzer.nlp)

    def analyze_similarity(self, text1, text2, weights=None):
        """
        Comprehensive similarity analysis combining semantic and structural measures

        Progress messages are printed to stdout while the analysis runs.

        Args:
            text1, text2: Input documents as strings
            weights: Dict with keys 'semantic', 'structural' (default: 60% semantic, 40% structural)

        Returns:
            dict: 'overall_similarity', 'semantic_score', 'structural_score',
                'detailed_scores' (raw per-measure results) and
                'document_stats' (paragraph / sentence / word counts)

        Raises:
            ValueError: If ``weights`` lacks a required key or its values do
                not sum to 1.0 (within 0.01)
        """
        if weights is None:
            weights = {'semantic': 0.6, 'structural': 0.4}

        # Validate weights before any model inference, so a malformed dict
        # fails immediately instead of with a KeyError after all the work
        missing = [key for key in self._WEIGHT_KEYS if key not in weights]
        if missing:
            raise ValueError(f"Weights missing required key(s): {', '.join(missing)}")
        if not (0.99 <= sum(weights.values()) <= 1.01):  # Allow small floating point errors
            raise ValueError("Weights must sum to 1.0")

        # Preprocess documents
        print("Preprocessing documents...")
        doc1 = self.doc_analyzer.preprocess_document(text1)
        doc2 = self.doc_analyzer.preprocess_document(text2)

        print(f"Doc1: {len(doc1['paragraphs'])} paragraphs, {len(doc1['sentences'])} sentences")
        print(f"Doc2: {len(doc2['paragraphs'])} paragraphs, {len(doc2['sentences'])} sentences")

        # Semantic Analysis
        print("Analyzing semantic similarity...")
        semantic_scores = {
            'document_similarity': self.semantic_analyzer.document_level_similarity(doc1, doc2),
            'paragraph_alignment': self.semantic_analyzer.paragraph_level_similarity(doc1, doc2),
            'sentence_patterns': self.semantic_analyzer.sentence_level_patterns(doc1, doc2)
        }

        # Structural Analysis
        print("Analyzing structural similarity...")
        structural_scores = {
            'organization': self.structural_analyzer.document_organization_similarity(doc1, doc2),
            'discourse_patterns': self.structural_analyzer.discourse_pattern_similarity(doc1, doc2),
            'syntactic_patterns': self.structural_analyzer.syntactic_pattern_similarity(doc1, doc2),
            'paragraph_roles': self.structural_analyzer.paragraph_role_similarity(doc1, doc2)
        }

        # Calculate weighted final scores (fixed sub-weights; each group sums to 1.0)
        semantic_final = (
            semantic_scores['document_similarity'] * 0.5 +
            semantic_scores['paragraph_alignment']['average_alignment_score'] * 0.3 +
            semantic_scores['sentence_patterns'] * 0.2
        )

        structural_final = (
            structural_scores['organization'] * 0.3 +
            structural_scores['discourse_patterns'] * 0.25 +
            structural_scores['syntactic_patterns'] * 0.25 +
            structural_scores['paragraph_roles']['role_sequence_similarity'] * 0.2
        )

        # Final combined score
        overall_similarity = (
            semantic_final * weights['semantic'] +
            structural_final * weights['structural']
        )

        return {
            'overall_similarity': overall_similarity,
            'semantic_score': semantic_final,
            'structural_score': structural_final,
            'detailed_scores': {
                'semantic': semantic_scores,
                'structural': structural_scores
            },
            'document_stats': {
                'doc1': {
                    'paragraphs': len(doc1['paragraphs']),
                    'sentences': len(doc1['sentences']),
                    'words': len(doc1['raw_text'].split())
                },
                'doc2': {
                    'paragraphs': len(doc2['paragraphs']),
                    'sentences': len(doc2['sentences']),
                    'words': len(doc2['raw_text'].split())
                }
            }
        }

In [6]:
# Demo: compare two short texts on the same topic and print every score.

# Initialize analyzer (constructing it loads the spaCy pipeline and the
# sentence-transformer model, so this is the slow step)
analyzer = CombinedSimilarityAnalyzer()

# Example documents (you can also load from files)
# Paragraphs are separated by blank lines; the first line of each text acts
# as a title and is treated as its own paragraph.
doc1_text = """
Introduction to Machine Learning

Machine learning is a subset of artificial intelligence that focuses on algorithms and statistical models. These systems can automatically improve their performance on a specific task through experience.

The field encompasses various approaches including supervised learning, unsupervised learning, and reinforcement learning. Each approach addresses different types of problems and data scenarios.

Applications of machine learning are widespread, from recommendation systems to autonomous vehicles. The technology continues to evolve rapidly with new techniques emerging regularly.
"""

doc2_text = """
Understanding Machine Learning Fundamentals

Machine learning represents a branch of AI that emphasizes the development of algorithms capable of learning from data. These systems enhance their performance automatically as they process more information.

There are several key methodologies in this field: supervised techniques, unsupervised methods, and reinforcement-based approaches. Each methodology serves distinct problem domains and data types.

Machine learning applications span numerous industries, including e-commerce recommendations and self-driving cars. The field advances quickly with continuous innovation in methodologies.
"""

# Analyze similarity with default weights (60% semantic, 40% structural)
# analyze_similarity also prints its own progress messages while it runs.
results = analyzer.analyze_similarity(doc1_text, doc2_text)

# Print results: the blended score and its two components
print(f"Overall Similarity: {results['overall_similarity']:.3f}")
print(f"Semantic Score: {results['semantic_score']:.3f}")
print(f"Structural Score: {results['structural_score']:.3f}")

# Detailed breakdown of the individual measures behind each component
print("\nDetailed Analysis:")
semantic = results['detailed_scores']['semantic']
structural = results['detailed_scores']['structural']

# paragraph_alignment and paragraph_roles are dicts; the others are plain scores
print(f"Document-level semantic: {semantic['document_similarity']:.3f}")
print(f"Paragraph alignment: {semantic['paragraph_alignment']['average_alignment_score']:.3f}")
print(f"Sentence patterns: {semantic['sentence_patterns']:.3f}")
print(f"Organization structure: {structural['organization']:.3f}")
print(f"Discourse patterns: {structural['discourse_patterns']:.3f}")
print(f"Syntactic patterns: {structural['syntactic_patterns']:.3f}")
print(f"Paragraph roles: {structural['paragraph_roles']['role_sequence_similarity']:.3f}")

# Document statistics (paragraph, sentence and whitespace-separated word counts)
print(f"\nDocument Statistics:")
stats = results['document_stats']
print(f"Doc1: {stats['doc1']['paragraphs']} paragraphs, {stats['doc1']['sentences']} sentences, {stats['doc1']['words']} words")
print(f"Doc2: {stats['doc2']['paragraphs']} paragraphs, {stats['doc2']['sentences']} sentences, {stats['doc2']['words']} words")

# Try different weights - emphasize structure more
# (this re-runs the full analysis; only the final blend uses the weights)
print(f"\nWith different weights (40% semantic, 60% structural):")
results_structural = analyzer.analyze_similarity(doc1_text, doc2_text,
                                               weights={'semantic': 0.4, 'structural': 0.6})
print(f"Overall Similarity: {results_structural['overall_similarity']:.3f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Preprocessing documents...
Doc1: 4 paragraphs, 6 sentences
Doc2: 4 paragraphs, 6 sentences
Analyzing semantic similarity...


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Analyzing structural similarity...
Overall Similarity: 0.835
Semantic Score: 0.862
Structural Score: 0.796

Detailed Analysis:
Document-level semantic: 0.882
Paragraph alignment: 0.799
Sentence patterns: 0.903
Organization structure: 0.934
Discourse patterns: 1.000
Syntactic patterns: 0.463
Paragraph roles: 0.750

Document Statistics:
Doc1: 4 paragraphs, 6 sentences, 78 words
Doc2: 4 paragraphs, 6 sentences, 78 words

With different weights (40% semantic, 60% structural):
Preprocessing documents...
Doc1: 4 paragraphs, 6 sentences
Doc2: 4 paragraphs, 6 sentences
Analyzing semantic similarity...


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Analyzing structural similarity...
Overall Similarity: 0.822


  return forward_call(*args, **kwargs)


In [7]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def topic_structure_similarity(doc1, doc2, n_topics=5):
    """Compare topic distributions

    Fits one LDA model on the paragraphs of both documents, averages the
    per-paragraph topic distributions within each document, and compares the
    two averages with cosine similarity.

    Args:
        doc1, doc2: Preprocessed document dictionaries providing 'paragraphs'
        n_topics: Number of LDA topics to fit

    Returns:
        float: Cosine similarity of the two mean topic distributions;
            0.0 when either document has no paragraphs
    """
    # Without paragraphs there is nothing to average: the mean of an empty
    # slice is NaN and cosine_similarity rejects NaN input.
    if not doc1['paragraphs'] or not doc2['paragraphs']:
        return 0.0

    combined_paragraphs = doc1['paragraphs'] + doc2['paragraphs']

    vectorizer = CountVectorizer(max_features=100, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(combined_paragraphs)

    # Fixed random_state keeps the topic assignment reproducible
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    topic_distributions = lda.fit_transform(doc_term_matrix)

    # Split back into documents
    split_at = len(doc1['paragraphs'])
    doc1_topics = topic_distributions[:split_at]
    doc2_topics = topic_distributions[split_at:]

    # Compare each document's average topic mix
    d1_progression = np.mean(doc1_topics, axis=0)
    d2_progression = np.mean(doc2_topics, axis=0)

    return cosine_similarity([d1_progression], [d2_progression])[0][0]

In [8]:
def rhetorical_structure_similarity(doc1, doc2):
    """Analyze argument flow and rhetorical patterns

    For each document, computes the fraction of sentences that contain a
    question mark, a claim cue, an evidence cue or a transition cue, then
    averages the per-feature relative agreement over the features that occur
    in at least one document.

    Args:
        doc1, doc2: Preprocessed document dictionaries providing 'paragraphs'

    Returns:
        float: Similarity in [0, 1]. 0.0 when either document has no
            paragraphs; 1.0 when both documents have text but neither shows
            any of the tracked features (their profiles are identical).
    """
    if not doc1['paragraphs'] or not doc2['paragraphs']:
        return 0.0

    # Cue words are matched as substrings of the lower-cased sentence
    cue_words = {
        'claim_density': ['argue', 'claim', 'assert', 'contend'],
        'evidence_density': ['evidence', 'data', 'study', 'research'],
        'transition_density': ['however', 'therefore', 'thus', 'moreover'],
    }

    def extract_rhetorical_features(paragraphs):
        features = {
            'question_density': 0,
            'claim_density': 0,
            'evidence_density': 0,
            'transition_density': 0
        }

        total_sentences = 0
        for para in paragraphs:
            sentences = nltk.sent_tokenize(para)
            total_sentences += len(sentences)

            for sent in sentences:
                sent_lower = sent.lower()
                if '?' in sent:
                    features['question_density'] += 1
                for key, words in cue_words.items():
                    if any(word in sent_lower for word in words):
                        features[key] += 1

        # Normalize by sentence count
        if total_sentences == 0:
            return dict.fromkeys(features, 0)
        return {key: count / total_sentences for key, count in features.items()}

    f1 = extract_rhetorical_features(doc1['paragraphs'])
    f2 = extract_rhetorical_features(doc2['paragraphs'])

    # Calculate feature similarity over features present in either document
    similarities = [
        1 - abs(f1[key] - f2[key]) / max(f1[key], f2[key])
        for key in f1
        if f1[key] + f2[key] > 0
    ]

    if not similarities:
        # Neither document uses any tracked feature: the profiles are
        # identical, so a document compared with itself must not score 0.
        return 1.0

    return float(np.mean(similarities))

Similarity Score Ranges:

0.9-1.0: Nearly identical documents \
0.7-0.9: High similarity (same topic, similar approach) \
0.5-0.7: Moderate similarity (related content, different presentation) \
0.3-0.5: Low similarity (some overlap, mostly different) \
0.0-0.3: Very different documents

In [9]:
import re
import numpy as np
from collections import Counter
from typing import Dict, List, Tuple, Any
import json
from dataclasses import dataclass
from datetime import datetime

# You'll need to install these packages:
# pip install sentence-transformers nltk scikit-learn spacy
# python -m spacy download en_core_web_sm

# Import the third-party NLP stack and load the shared models. A missing
# package is reported with install instructions instead of a traceback.
try:
    from sentence_transformers import SentenceTransformer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import nltk
    import spacy

    # Download required NLTK data
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    # Load spaCy model (en_core_web_sm is a general-purpose English pipeline,
    # not a medical one). A missing model raises OSError, not ImportError,
    # so print the download hint here and let the error propagate unchanged.
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        print("spaCy model 'en_core_web_sm' is not installed.")
        print("Run: python -m spacy download en_core_web_sm")
        raise

    # Load sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')

except ImportError as e:
    print(f"Please install required packages: {e}")
    print("Run: pip install sentence-transformers nltk scikit-learn spacy")
    print("Then: python -m spacy download en_core_web_sm")

@dataclass
class EvaluationResult:
    """Container for the scores produced by one evaluation.

    All metric fields are floats, followed by a free-form details dict.
    Field order is part of the positional constructor signature.

    NOTE(review): the code that fills this in is outside the visible section,
    so the meaning and value range of each score is inferred from its name
    only - confirm against the evaluator.
    """
    # Text-similarity style metrics (presumably generated vs. reference text)
    semantic_similarity: float
    bleu_score: float
    rouge_l_score: float
    jaccard_similarity: float
    # Domain-specific quality scores
    medical_entity_overlap: float
    factual_consistency_score: float
    completeness_score: float
    clinical_relevance_score: float
    coherence_score: float
    # Aggregate score; how it is derived is not visible here
    overall_score: float
    # Arbitrary supporting details keyed by string
    detailed_analysis: Dict[str, Any]

class MedicalNoteEvaluator:
    def __init__(self):
        self.medical_patterns = {
            'vital_signs': r'(?:BP|blood pressure|pulse|heart rate|temperature|temp|respiratory rate|O2 sat|oxygen saturation)[:\s]*(\d+[/\-\d]*)',
            'medications': r'(?:prescribed|medication|drug|med)[:\s]*([A-Za-z]+(?:\s+\d+(?:mg|mcg|g|ml))?)',
            'diagnoses': r'(?:diagnosis|diagnosed|condition|assessment)[:\s]*([A-Za-z\s]+)',
            'symptoms': r'(?:symptom|complaint|presents with|reports)[:\s]*([A-Za-z\s]+)',
            'procedures': r'(?:procedure|treatment|surgery|operation)[:\s]*([A-Za-z\s]+)',
            'follow_up': r'(?:follow.?up|return|next visit|appointment)[:\s]*([A-Za-z0-9\s]+)',
            'allergies': r'(?:allerg|adverse reaction)[:\s]*([A-Za-z\s]+)',
            'family_history': r'(?:family history|fh)[:\s]*([A-Za-z\s]+)'
        }

    def preprocess_text(self, text: str) -> str:
        """Clean and normalize text for comparison."""
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text.strip())
        # Normalize medical abbreviations
        medical_abbrevs = {
            r'\bpt\b': 'patient',
            r'\bpts\b': 'patients',
            r'\bw/\b': 'with',
            r'\bw/o\b': 'without',
            r'\bc/o\b': 'complains of',
            r'\bs/p\b': 'status post',
            r'\bh/o\b': 'history of'
        }
        for abbrev, full in medical_abbrevs.items():
            text = re.sub(abbrev, full, text, flags=re.IGNORECASE)
        return text.lower()

    def extract_medical_entities(self, text: str) -> Dict[str, List[str]]:
        """Collect medical entities from *text*.

        The result holds one list per key of ``self.medical_patterns`` (regex
        captures, stripped, empties dropped), plus ``'named_entities'`` as
        ``(text, label)`` tuples from the module-level spaCy pipeline ``nlp``
        and ``'medical_terms'`` for every known keyword found as a
        case-insensitive substring.
        """
        # Substring lookup list, kept in reporting order.
        medical_keywords = [
            'hypertension', 'diabetes', 'pneumonia', 'infection', 'fever',
            'pain', 'headache', 'nausea', 'vomiting', 'diarrhea', 'fatigue',
            'chest pain', 'shortness of breath', 'cough', 'rash', 'swelling'
        ]

        # Regex pass: one entry per configured category.
        entities = {
            category: [
                hit.strip()
                for hit in re.findall(pattern, text, re.IGNORECASE)
                if hit.strip()
            ]
            for category, pattern in self.medical_patterns.items()
        }

        # spaCy pass: keep both the surface form and the label.
        parsed = nlp(text)
        entities['named_entities'] = [(ent.text, ent.label_) for ent in parsed.ents]

        # Keyword pass: lower-case once, then test each keyword.
        lowered = text.lower()
        entities['medical_terms'] = [term for term in medical_keywords if term in lowered]

        return entities

    def calculate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Return the cosine similarity of the two texts.

        Embeddings from the module-level sentence-transformer ``model`` are
        tried first; on any exception the texts are compared through a
        TF-IDF vectorization fitted on just these two documents.
        """
        pair = [text1, text2]
        try:
            vectors = model.encode(pair)
            return float(cosine_similarity([vectors[0]], [vectors[1]])[0][0])
        except Exception:
            # Best-effort fallback: lexical TF-IDF cosine similarity.
            tfidf = TfidfVectorizer().fit_transform(pair)
            return float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0])

    def calculate_bleu_score(self, reference: str, candidate: str) -> float:
        """Calculate BLEU score for text similarity."""
        def get_ngrams(text: str, n: int) -> List[Tuple]:
            words = text.split()
            return [tuple(words[i:i+n]) for i in range(len(words)-n+1)]

        ref_words = reference.split()
        cand_words = candidate.split()

        if len(cand_words) == 0:
            return 0.0

        # Calculate precision for 1-4 grams
        precisions = []
        for n in range(1, 5):
            ref_ngrams = get_ngrams(reference, n)
            cand_ngrams = get_ngrams(candidate, n)

            if not cand_ngrams:
                precisions.append(0.0)
                continue

            ref_counter = Counter(ref_ngrams)
            cand_counter = Counter(cand_ngrams)

            overlap = sum((ref_counter & cand_counter).values())
            precision = overlap / len(cand_ngrams)
            precisions.append(precision)

        # Geometric mean of precisions
        if any(p == 0 for p in precisions):
            return 0.0

        bleu = np.exp(np.mean(np.log(precisions)))

        # Brevity penalty
        bp = min(1.0, np.exp(1 - len(ref_words) / len(cand_words)))

        return bleu * bp

    def calculate_rouge_l(self, reference: str, candidate: str) -> float:
        """Calculate ROUGE-L score based on longest common subsequence."""
        def lcs_length(x: List[str], y: List[str]) -> int:
            m, n = len(x), len(y)
            dp = [[0] * (n + 1) for _ in range(m + 1)]

            for i in range(1, m + 1):
                for j in range(1, n + 1):
                    if x[i-1] == y[j-1]:
                        dp[i][j] = dp[i-1][j-1] + 1
                    else:
                        dp[i][j] = max(dp[i-1][j], dp[i][j-1])

            return dp[m][n]

        ref_words = reference.split()
        cand_words = candidate.split()

        if not ref_words or not cand_words:
            return 0.0

        lcs_len = lcs_length(ref_words, cand_words)

        if lcs_len == 0:
            return 0.0

        precision = lcs_len / len(cand_words)
        recall = lcs_len / len(ref_words)

        if precision + recall == 0:
            return 0.0

        f1_score = 2 * precision * recall / (precision + recall)
        return f1_score

    def calculate_jaccard_similarity(self, text1: str, text2: str) -> float:
        """Calculate Jaccard similarity coefficient."""
        words1 = set(text1.split())
        words2 = set(text2.split())

        intersection = len(words1.intersection(words2))
        union = len(words1.union(words2))

        return intersection / union if union > 0 else 0.0

    def calculate_medical_entity_overlap(self, entities1: Dict, entities2: Dict) -> float:
        """Calculate overlap of extracted medical entities."""
        total_overlap = 0
        total_entities = 0

        for category in self.medical_patterns.keys():
            set1 = set(entities1.get(category, []))
            set2 = set(entities2.get(category, []))

            if set1 or set2:
                intersection = len(set1.intersection(set2))
                union = len(set1.union(set2))
                overlap = intersection / union if union > 0 else 0
                total_overlap += overlap
                total_entities += 1

        return total_overlap / total_entities if total_entities > 0 else 0.0

    def assess_factual_consistency(self, text1: str, text2: str, entities1: Dict, entities2: Dict) -> float:
        """Assess factual consistency between documents."""
        consistency_score = 0.0
        checks = 0

        # Check vital signs consistency
        vitals1 = entities1.get('vital_signs', [])
        vitals2 = entities2.get('vital_signs', [])
        if vitals1 and vitals2:
            vital_overlap = len(set(vitals1).intersection(set(vitals2))) / len(set(vitals1).union(set(vitals2)))
            consistency_score += vital_overlap
            checks += 1

        # Check medication consistency
        meds1 = set(entities1.get('medications', []))
        meds2 = set(entities2.get('medications', []))
        if meds1 or meds2:
            med_overlap = len(meds1.intersection(meds2)) / len(meds1.union(meds2)) if meds1.union(meds2) else 1.0
            consistency_score += med_overlap
            checks += 1

        # Check diagnosis consistency
        diag1 = set(entities1.get('diagnoses', []))
        diag2 = set(entities2.get('diagnoses', []))
        if diag1 or diag2:
            diag_overlap = len(diag1.intersection(diag2)) / len(diag1.union(diag2)) if diag1.union(diag2) else 1.0
            consistency_score += diag_overlap
            checks += 1

        return consistency_score / checks if checks > 0 else 0.5

    def assess_completeness(self, text1: str, text2: str, entities1: Dict, entities2: Dict) -> float:
        """Assess completeness of information coverage."""
        essential_categories = ['vital_signs', 'medications', 'diagnoses', 'symptoms', 'follow_up']

        coverage1 = sum(1 for cat in essential_categories if entities1.get(cat))
        coverage2 = sum(1 for cat in essential_categories if entities2.get(cat))

        # Both should ideally cover similar essential categories
        min_coverage = min(coverage1, coverage2)
        max_coverage = max(coverage1, coverage2)

        if max_coverage == 0:
            return 1.0  # Both empty

        return min_coverage / max_coverage

    def assess_clinical_relevance(self, text1: str, text2: str) -> float:
        """Assess clinical relevance and medical appropriateness."""
        clinical_keywords = [
            'patient', 'diagnosis', 'treatment', 'medication', 'symptom',
            'examination', 'assessment', 'plan', 'history', 'vital',
            'laboratory', 'imaging', 'follow-up', 'prognosis', 'therapy'
        ]

        def count_clinical_terms(text: str) -> int:
            count = 0
            for keyword in clinical_keywords:
                count += len(re.findall(r'\b' + keyword + r'\b', text, re.IGNORECASE))
            return count

        count1 = count_clinical_terms(text1)
        count2 = count_clinical_terms(text2)

        # Both should have similar clinical term density
        total_words1 = len(text1.split())
        total_words2 = len(text2.split())

        if total_words1 == 0 or total_words2 == 0:
            return 0.0

        density1 = count1 / total_words1
        density2 = count2 / total_words2

        # Similarity in clinical term density
        avg_density = (density1 + density2) / 2
        density_diff = abs(density1 - density2)

        return max(0.0, 1.0 - (density_diff / max(avg_density, 0.1)))

    def assess_coherence(self, text1: str, text2: str) -> float:
        """Assess logical flow and coherence of medical notes."""
        # Check for proper medical note structure
        structure_patterns = [
            r'(?:chief complaint|cc|presenting complaint)',
            r'(?:history of present illness|hpi)',
            r'(?:physical examination|exam|pe)',
            r'(?:assessment|impression|diagnosis)',
            r'(?:plan|treatment|management)'
        ]

        def has_medical_structure(text: str) -> float:
            found_sections = 0
            for pattern in structure_patterns:
                if re.search(pattern, text, re.IGNORECASE):
                    found_sections += 1
            return found_sections / len(structure_patterns)

        structure1 = has_medical_structure(text1)
        structure2 = has_medical_structure(text2)

        # Both should have similar structural completeness
        avg_structure = (structure1 + structure2) / 2
        structure_diff = abs(structure1 - structure2)

        coherence_score = avg_structure * (1 - structure_diff)

        return min(1.0, coherence_score)

    def extract_numerical_values(self, text: str) -> List[str]:
        """Extract numerical values that might be medically significant."""
        # Pattern for numbers with units (vital signs, lab values, etc.)
        number_pattern = r'\d+(?:\.\d+)?(?:\s*(?:mg|mcg|g|ml|mmHg|bpm|°F|°C|%|units?))?'
        return re.findall(number_pattern, text, re.IGNORECASE)

    def compare_numerical_consistency(self, text1: str, text2: str) -> float:
        """Compare numerical values for consistency."""
        nums1 = set(self.extract_numerical_values(text1))
        nums2 = set(self.extract_numerical_values(text2))

        if not nums1 and not nums2:
            return 1.0  # No numerical values in either

        if not nums1 or not nums2:
            return 0.0  # One has numbers, other doesn't

        intersection = len(nums1.intersection(nums2))
        union = len(nums1.union(nums2))

        return intersection / union

    def evaluate_similarity(self, document1: str, document2: str) -> EvaluationResult:
        """Run every metric on the two notes and bundle the scores.

        Text-similarity metrics and entity extraction work on the
        preprocessed text; clinical relevance, coherence, numerical
        consistency and the length statistics work on the raw documents. The
        overall score is a fixed weighting of factual consistency (0.40),
        completeness (0.25), clinical relevance (0.20) and coherence (0.15).

        Returns:
            An ``EvaluationResult`` whose ``detailed_analysis`` carries the
            entity comparison, length comparison, numerical consistency and
            a copy of the text-similarity metrics.
        """
        # Normalized text and the entities extracted from it.
        clean1 = self.preprocess_text(document1)
        clean2 = self.preprocess_text(document2)
        found1 = self.extract_medical_entities(clean1)
        found2 = self.extract_medical_entities(clean2)

        # Generic text-similarity metrics.
        text_metrics = {
            'semantic': self.calculate_semantic_similarity(clean1, clean2),
            'bleu': self.calculate_bleu_score(clean1, clean2),
            'rouge_l': self.calculate_rouge_l(clean1, clean2),
            'jaccard': self.calculate_jaccard_similarity(clean1, clean2)
        }

        # Medical-specific metrics.
        entity_overlap = self.calculate_medical_entity_overlap(found1, found2)
        factual = self.assess_factual_consistency(document1, document2, found1, found2)
        complete = self.assess_completeness(document1, document2, found1, found2)
        relevance = self.assess_clinical_relevance(document1, document2)
        coherent = self.assess_coherence(document1, document2)
        numeric_agreement = self.compare_numerical_consistency(document1, document2)

        # Weighted overall score; only the four medical assessments count.
        weights = {
            'factual_consistency': 0.40,
            'completeness': 0.25,
            'clinical_relevance': 0.20,
            'coherence': 0.15
        }
        overall = (
            factual * weights['factual_consistency'] +
            complete * weights['completeness'] +
            relevance * weights['clinical_relevance'] +
            coherent * weights['coherence']
        )

        words1 = len(document1.split())
        words2 = len(document2.split())

        detailed_analysis = {
            'entity_comparison': {
                'document1_entities': found1,
                'document2_entities': found2,
                'common_entities': self._find_common_entities(found1, found2),
                'unique_to_doc1': self._find_unique_entities(found1, found2),
                'unique_to_doc2': self._find_unique_entities(found2, found1)
            },
            'length_comparison': {
                'doc1_length': words1,
                'doc2_length': words2,
                # max(..., 1) guards against an empty second document.
                'length_ratio': words1 / max(words2, 1)
            },
            'numerical_consistency': numeric_agreement,
            'similarity_metrics': text_metrics
        }

        return EvaluationResult(
            semantic_similarity=text_metrics['semantic'],
            bleu_score=text_metrics['bleu'],
            rouge_l_score=text_metrics['rouge_l'],
            jaccard_similarity=text_metrics['jaccard'],
            medical_entity_overlap=entity_overlap,
            factual_consistency_score=factual,
            completeness_score=complete,
            clinical_relevance_score=relevance,
            coherence_score=coherent,
            overall_score=overall,
            detailed_analysis=detailed_analysis
        )

    def _find_common_entities(self, entities1: Dict, entities2: Dict) -> Dict:
        """Find entities common to both documents."""
        common = {}
        for category in entities1.keys():
            if category in entities2:
                set1 = set(entities1[category]) if isinstance(entities1[category], list) else set()
                set2 = set(entities2[category]) if isinstance(entities2[category], list) else set()
                common[category] = list(set1.intersection(set2))
        return common

    def _find_unique_entities(self, entities1: Dict, entities2: Dict) -> Dict:
        """Find entities unique to the first document."""
        unique = {}
        for category in entities1.keys():
            if category in entities2:
                set1 = set(entities1[category]) if isinstance(entities1[category], list) else set()
                set2 = set(entities2[category]) if isinstance(entities2[category], list) else set()
                unique[category] = list(set1 - set2)
            else:
                unique[category] = entities1[category]
        return unique

    def generate_report(self, result: EvaluationResult, doc1_name: str = "Document 1", doc2_name: str = "Document 2") -> str:
        """Generate a comprehensive evaluation report."""

        def score_interpretation(score: float) -> str:
            if score >= 0.9: return "Excellent"
            elif score >= 0.8: return "Very Good"
            elif score >= 0.7: return "Good"
            elif score >= 0.6: return "Fair"
            elif score >= 0.5: return "Poor"
            else: return "Very Poor"

        report = f"""
MEDICAL NOTE SIMILARITY EVALUATION REPORT
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
=========================================

EXECUTIVE SUMMARY
-----------------
Overall Similarity Score: {result.overall_score:.3f} ({score_interpretation(result.overall_score)})

The two medical notes show {score_interpretation(result.overall_score).lower()} similarity in conveying the same medical information effectively.

DETAILED METRICS
----------------

1. FACTUAL CONSISTENCY: {result.factual_consistency_score:.3f} ({score_interpretation(result.factual_consistency_score)})
   - Measures agreement on medical facts (diagnoses, medications, vital signs)

2. COMPLETENESS: {result.completeness_score:.3f} ({score_interpretation(result.completeness_score)})
   - Evaluates coverage of essential medical information categories

3. CLINICAL RELEVANCE: {result.clinical_relevance_score:.3f} ({score_interpretation(result.clinical_relevance_score)})
   - Assesses appropriate use of medical terminology and concepts

4. COHERENCE: {result.coherence_score:.3f} ({score_interpretation(result.coherence_score)})
   - Evaluates logical structure and medical note organization

SIMILARITY METRICS
------------------
• Semantic Similarity (AI-based): {result.semantic_similarity:.3f}
• BLEU Score (n-gram overlap): {result.bleu_score:.3f}
• ROUGE-L Score (sequence similarity): {result.rouge_l_score:.3f}
• Jaccard Similarity (word overlap): {result.jaccard_similarity:.3f}
• Medical Entity Overlap: {result.medical_entity_overlap:.3f}

DOCUMENT ANALYSIS
-----------------
Length Comparison:
• {doc1_name}: {result.detailed_analysis['length_comparison']['doc1_length']} words
• {doc2_name}: {result.detailed_analysis['length_comparison']['doc2_length']} words
• Length Ratio: {result.detailed_analysis['length_comparison']['length_ratio']:.2f}

Numerical Consistency: {result.detailed_analysis['numerical_consistency']:.3f}

ENTITY ANALYSIS
---------------
"""

        # Add common entities
        common_entities = result.detailed_analysis['entity_comparison']['common_entities']
        if any(common_entities.values()):
            report += "\nCommon Medical Entities:\n"
            for category, entities in common_entities.items():
                if entities:
                    # Handle list of tuples for named_entities
                    if category == 'named_entities':
                        entity_strings = [f"{text} ({label})" for text, label in entities]
                        report += f"• {category.replace('_', ' ').title()}: {', '.join(entity_strings)}\n"
                    else:
                        report += f"• {category.replace('_', ' ').title()}: {', '.join(entities)}\n"

        # Add unique entities
        unique1 = result.detailed_analysis['entity_comparison']['unique_to_doc1']
        unique2 = result.detailed_analysis['entity_comparison']['unique_to_doc2']

        if any(entities for entities in unique1.values() if entities):
            report += f"\nUnique to {doc1_name}:\n"
            for category, entities in unique1.items():
                if entities:
                     # Handle list of tuples for named_entities
                    if category == 'named_entities':
                        entity_strings = [f"{text} ({label})" for text, label in entities]
                        report += f"• {category.replace('_', ' ').title()}: {', '.join(entity_strings)}\n"
                    else:
                        report += f"• {category.replace('_', ' ').title()}: {', '.join(entities)}\n"

        if any(entities for entities in unique2.values() if entities):
            report += f"\nUnique to {doc2_name}:\n"
            for category, entities in unique2.items():
                if entities:
                     # Handle list of tuples for named_entities
                    if category == 'named_entities':
                        entity_strings = [f"{text} ({label})" for text, label in entities]
                        report += f"• {category.replace('_', ' ').title()}: {', '.join(entity_strings)}\n"
                    else:
                        report += f"• {category.replace('_', ' ').title()}: {', '.join(entities)}\n"

        # Recommendations
        report += "\nRECOMMENDATIONS\n---------------\n"

        if result.overall_score >= 0.8:
            report += "✓ The documents show high similarity and likely convey the same medical message effectively.\n"
        elif result.overall_score >= 0.6:
            report += "⚠ The documents show moderate similarity. Review differences in key medical entities.\n"
        else:
            report += "⚠ The documents show low similarity. Significant differences detected that may affect medical accuracy.\n"

        if result.factual_consistency_score < 0.7:
            report += "• Review factual inconsistencies in medical data (vital signs, medications, diagnoses)\n"

        if result.completeness_score < 0.7:
            report += "• One document may be missing essential medical information categories\n"

        if result.clinical_relevance_score < 0.7:
            report += "• Check for appropriate use of medical terminology and clinical concepts\n"

        if result.coherence_score < 0.7:
            report += "• Review document structure and logical flow of medical information\n"

        return report

# Usage Example and Testing Function
def evaluate_medical_notes(document1: str, document2: str, doc1_name: str = "Document 1", doc2_name: str = "Document 2"):
    """
    Compare two medical notes and produce both the scores and a text report.

    Args:
        document1: First medical note text
        document2: Second medical note text
        doc1_name: Label for the first document in the report
        doc2_name: Label for the second document in the report

    Returns:
        Tuple of (EvaluationResult, formatted_report)
    """
    note_evaluator = MedicalNoteEvaluator()
    scores = note_evaluator.evaluate_similarity(document1, document2)
    return scores, note_evaluator.generate_report(scores, doc1_name, doc2_name)

# Example usage and test function
def run_example():
    """Evaluate two built-in sample notes, print the report and return it.

    The first note is written out in full prose and the second uses clinical
    shorthand for the same encounter.

    Returns:
        Tuple of (EvaluationResult, formatted_report)
    """

    longhand_note = """
    Chief Complaint: Patient presents with chest pain and shortness of breath.

    History of Present Illness:
    45-year-old male with history of hypertension presents with acute onset chest pain
    starting 2 hours ago. Pain is crushing, substernal, radiates to left arm.
    Associated with diaphoresis and nausea.

    Vital Signs: BP 150/95, HR 105, RR 22, Temp 98.6°F, O2 sat 96%

    Physical Examination:
    Cardiovascular: Tachycardic, regular rhythm, no murmurs
    Pulmonary: Clear to auscultation bilaterally

    Assessment: Acute coronary syndrome, rule out myocardial infarction

    Plan:
    - EKG and cardiac enzymes
    - Aspirin 325mg, Nitroglycerin PRN
    - Cardiology consultation
    - Serial cardiac monitoring
    """

    shorthand_note = """
    CC: 45 y/o male c/o acute chest pain and SOB

    HPI: Patient with HTN history presents with sudden onset crushing chest pain
    beginning 2 hours prior to arrival. Pain is substernal with radiation to L arm,
    accompanied by sweating and nausea.

    Vitals: Blood pressure 150/95 mmHg, pulse 105 bpm, respiratory rate 22,
    temperature 98.6 degrees F, oxygen saturation 96%

    PE:
    CV: Tachycardic rate, regular, no murmurs appreciated
    Lungs: CTAB

    A&P: Acute coronary syndrome, r/o MI
    - Obtain EKG and cardiac biomarkers
    - Start ASA 325mg, nitroglycerin as needed
    - Cardiology consult
    - Continuous cardiac monitoring
    """

    outcome = evaluate_medical_notes(longhand_note, shorthand_note, "LLM Model A", "LLM Model B")
    report_text = outcome[1]

    print("SAMPLE EVALUATION RESULTS:")
    print("=" * 50)
    print(report_text)

    return outcome[0], report_text

# Script entry point: run the built-in demo, then print a copy-paste snippet
# showing how to evaluate notes loaded from files. The snippet below is only
# printed, never executed.
if __name__ == "__main__":
    # Run example to demonstrate functionality
    run_example()

    print("\n" + "="*50)
    print("TO USE WITH YOUR DOCUMENTS:")
    print("="*50)
    print("""
# Load your documents
with open('note1.txt', 'r') as f:
    doc1 = f.read()

with open('note2.txt', 'r') as f:
    doc2 = f.read()

# Evaluate similarity
result, report = evaluate_medical_notes(doc1, doc2, "Model A Output", "Model B Output")

# Print results
print(report)

# Access individual metrics
print(f"Overall Score: {result.overall_score}")
print(f"Semantic Similarity: {result.semantic_similarity}")
print(f"Medical Entity Overlap: {result.medical_entity_overlap}")
""")

  return forward_call(*args, **kwargs)


SAMPLE EVALUATION RESULTS:

MEDICAL NOTE SIMILARITY EVALUATION REPORT
Generated: 2025-08-08 06:58:14

EXECUTIVE SUMMARY
-----------------
Overall Similarity Score: 0.455 (Very Poor)

The two medical notes show very poor similarity in conveying the same medical information effectively.

DETAILED METRICS
----------------

1. FACTUAL CONSISTENCY: 0.300 (Very Poor)
   - Measures agreement on medical facts (diagnoses, medications, vital signs)

2. COMPLETENESS: 0.667 (Fair)
   - Evaluates coverage of essential medical information categories

3. CLINICAL RELEVANCE: 0.481 (Very Poor)
   - Assesses appropriate use of medical terminology and concepts

4. COHERENCE: 0.480 (Very Poor)
   - Evaluates logical structure and medical note organization

SIMILARITY METRICS
------------------
• Semantic Similarity (AI-based): 0.772
• BLEU Score (n-gram overlap): 0.000
• ROUGE-L Score (sequence similarity): 0.375
• Jaccard Similarity (word overlap): 0.220
• Medical Entity Overlap: 0.200

DOCUMENT ANALYSIS