In [1]:

import os
import sys
# Resolve the project root (the directory above the notebook's working
# directory) only once per kernel session. Without this guard, re-running the
# cell would start from the already-changed working directory and climb one
# level higher on every execution.
# NOTE(review): assumes the notebook lives exactly one level below the project
# root — confirm against the repository layout.
if "parent_dir" not in globals():
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)

# Put the project root at the front of the import path so project-local
# packages win name lookups; drop any earlier entry first so repeated runs do
# not stack duplicates.
if parent_dir in sys.path:
    sys.path.remove(parent_dir)
sys.path.insert(0, parent_dir)

# Set the parent directory as the current directory so relative paths used
# later in the notebook resolve from the project root.
os.chdir(parent_dir)

In [9]:
import numpy as np
from collections import Counter
import re

def _reduce_or_default(values, reducer, default=0.0):
    """Apply ``reducer`` to ``values``; return ``default`` when ``values`` is empty."""
    return reducer(values) if values else default


def _percentile_summary(values, points=(25, 50, 75, 90)):
    """Map "<p>%" labels to the p-th percentile of ``values`` (0.0 each when empty)."""
    if not values:
        return {f"{p}%": 0.0 for p in points}
    return {f"{p}%": np.percentile(values, p) for p in points}


def compute_dataset_statistics(data):
    """
    Compute statistics about a dataset of clinical notes.

    Notes whose "clinical_text" is missing or empty are skipped when collecting
    per-note measurements, but they are still counted in "total_notes". All
    "per note" means/medians/percentiles are therefore taken over the notes
    that actually contain text.

    Args:
        data: Dictionary of note_id -> {"clinical_text": text_content, ...}

    Returns:
        Dictionary containing dataset statistics:
          - "total_notes": number of entries in ``data`` (including skipped ones).
          - "total_chars", "mean_chars_per_note", "median_chars_per_note",
            "min_chars", "max_chars": character-length statistics.
          - "total_words", "mean_words_per_note", "median_words_per_note",
            "min_words", "max_words": token-count statistics, where a token is
            a lower-cased ``\\w+`` regex match.
          - "total_unique_words": size of the corpus vocabulary.
          - "mean_unique_words_per_note", "median_unique_words_per_note".
          - "total_sentences", "mean_sentences_per_note",
            "median_sentences_per_note": approximate sentence counts.
          - "mean_word_length": mean length over the *distinct* words of the
            corpus (each vocabulary entry counted once, not weighted by
            frequency).
          - "vocabulary_richness": corpus-level type-token ratio.
          - "char_count_percentiles", "word_count_percentiles": dicts with the
            25th/50th/75th/90th percentiles.
          - "top_10_words": the ten most frequent words with their counts.

        When there is nothing to measure (empty ``data``, only empty texts, or
        no word tokens at all) the affected statistics are reported as 0 / 0.0
        instead of raising or producing NaN.
    """
    # Initialize counters and lists for collecting stats
    total_notes = len(data)
    all_word_counts = []
    all_char_counts = []
    all_sentence_counts = []
    all_unique_word_counts = []
    word_frequencies = Counter()

    # Process each note (the note id itself is not needed for the statistics)
    for note_data in data.values():
        text = note_data.get("clinical_text", "")

        # Skip if text is missing or empty
        if not text:
            continue

        # Character count
        all_char_counts.append(len(text))

        # Word count: lower-cased runs of word characters (regex tokenisation,
        # so punctuation is dropped and e.g. "x-ray" yields two tokens)
        words = re.findall(r'\b\w+\b', text.lower())
        all_word_counts.append(len(words))

        # Update word frequency counter
        word_frequencies.update(words)

        # Unique word count
        all_unique_word_counts.append(len(set(words)))

        # Approximate sentence count (split on ., !, ?); decimals and
        # abbreviations are split too, so this is only an estimate
        sentences = re.split(r'[.!?]+', text)
        all_sentence_counts.append(sum(1 for s in sentences if s.strip()))

    total_words = sum(all_word_counts)
    vocabulary_size = len(word_frequencies)

    # Calculate statistics; every reducer is guarded so that an empty
    # collection yields 0 / 0.0 rather than an exception or NaN
    stats = {
        "total_notes": total_notes,

        # Character statistics
        "total_chars": sum(all_char_counts),
        "mean_chars_per_note": _reduce_or_default(all_char_counts, np.mean),
        "median_chars_per_note": _reduce_or_default(all_char_counts, np.median),
        "min_chars": _reduce_or_default(all_char_counts, min, 0),
        "max_chars": _reduce_or_default(all_char_counts, max, 0),

        # Word statistics
        "total_words": total_words,
        "mean_words_per_note": _reduce_or_default(all_word_counts, np.mean),
        "median_words_per_note": _reduce_or_default(all_word_counts, np.median),
        "min_words": _reduce_or_default(all_word_counts, min, 0),
        "max_words": _reduce_or_default(all_word_counts, max, 0),

        # Unique words
        "total_unique_words": vocabulary_size,
        "mean_unique_words_per_note": _reduce_or_default(all_unique_word_counts, np.mean),
        "median_unique_words_per_note": _reduce_or_default(all_unique_word_counts, np.median),

        # Sentence statistics
        "total_sentences": sum(all_sentence_counts),
        "mean_sentences_per_note": _reduce_or_default(all_sentence_counts, np.mean),
        "median_sentences_per_note": _reduce_or_default(all_sentence_counts, np.median),

        # Average word length over the distinct vocabulary entries
        "mean_word_length": _reduce_or_default(
            [len(word) for word in word_frequencies], np.mean
        ),

        # Vocabulary richness (type-token ratio for the entire corpus)
        "vocabulary_richness": vocabulary_size / total_words if total_words else 0.0,
    }

    # Add distribution data
    stats["char_count_percentiles"] = _percentile_summary(all_char_counts)
    stats["word_count_percentiles"] = _percentile_summary(all_word_counts)

    # Top 10 most frequent words
    stats["top_10_words"] = dict(word_frequencies.most_common(10))

    return stats

# Example usage: load the dataset, compute its statistics, and print a
# human-readable report.
# NOTE(review): the captured output below this cell shows that the guard passes
# when the cell is executed, so this block runs on every execution of the cell.
if __name__ == "__main__":
    # Project-local helper; it can only be imported once the project root has
    # been placed on sys.path by the setup cell.
    # Presumably returns the parsed JSON as a dict of note_id -> record — confirm.
    from utils.data import read_json_file
    
    # Relative path: resolved against the current working directory, which the
    # setup cell changes to the parent directory.
    data = read_json_file("data/dataset/mine_hpo.json")
    stats = compute_dataset_statistics(data)
    
    # Print formatted statistics
    print("\n=== Dataset Statistics ===\n")
    
    # Corpus-level totals; the ":," format spec adds thousands separators.
    print(f"Total Clinical Notes: {stats['total_notes']}")
    print(f"Total Words: {stats['total_words']:,}")
    print(f"Total Characters: {stats['total_chars']:,}")
    print(f"Total Sentences: {stats['total_sentences']:,}")
    print(f"Unique Words: {stats['total_unique_words']:,}")
    
    # Per-note central tendencies, each shown with one decimal place.
    print("\n--- Per Note Statistics ---")
    print(f"Words per note: Mean={stats['mean_words_per_note']:.1f}, Median={stats['median_words_per_note']:.1f}")
    print(f"Characters per note: Mean={stats['mean_chars_per_note']:.1f}, Median={stats['median_chars_per_note']:.1f}")
    print(f"Sentences per note: Mean={stats['mean_sentences_per_note']:.1f}, Median={stats['median_sentences_per_note']:.1f}")
    print(f"Unique words per note: Mean={stats['mean_unique_words_per_note']:.1f}, Median={stats['median_unique_words_per_note']:.1f}")
    
    # Only the word-count percentiles are reported here; the character-count
    # percentiles are computed as well but not printed.
    print("\n--- Word Count Percentiles ---")
    percentiles = stats["word_count_percentiles"]
    for name, value in percentiles.items():
        print(f"{name}: {value:.1f}")
        
    print("\n--- Top 10 Most Frequent Words ---")
    for word, count in stats["top_10_words"].items():
        print(f"{word}: {count:,}")
    
    print(f"\nVocabulary Richness (type-token ratio): {stats['vocabulary_richness']:.4f}")
    # mean_word_length is averaged over the distinct words of the corpus (each
    # vocabulary entry once), not over every token occurrence.
    print(f"Average word length: {stats['mean_word_length']:.2f} characters")


=== Dataset Statistics ===

Total Clinical Notes: 116
Total Words: 33,824
Total Characters: 216,187
Total Sentences: 2,187
Unique Words: 5,591

--- Per Note Statistics ---
Words per note: Mean=291.6, Median=271.5
Characters per note: Mean=1863.7, Median=1738.0
Sentences per note: Mean=18.9, Median=17.0
Unique words per note: Mean=178.2, Median=164.5

--- Word Count Percentiles ---
25%: 203.0
50%: 271.5
75%: 372.8
90%: 439.0

--- Top 10 Most Frequent Words ---
and: 1,318
the: 1,299
of: 1,208
was: 811
a: 783
with: 582
in: 465
to: 459
were: 307
for: 260

Vocabulary Richness (type-token ratio): 0.1653
Average word length: 7.58 characters
