## Setup

First, let's import the three calculators and the reference-data loaders. The reference data itself is loaded in the sections below, where it is first needed.

In [None]:
from entroprisal import (
    TokenEntropisalCalculator,
    CharacterEntropisalCalculator,
    RestOfWordEntropisalCalculator,
)
from entroprisal.utils import load_4grams, load_google_books_words
import pandas as pd

## 1. Token-Level Entropy and Surprisal

Token-level metrics use n-gram frequencies to calculate information content.

In [None]:
# Settings for the token-level calculator, named so they are easy to tune.
NGRAM_VARIANT = "aw"  # "aw" = all words variant
MIN_FREQUENCY = 100  # forwarded to the calculator as `min_frequency`

# Reference 4-gram data, then the calculator built on top of it.
ngrams = load_4grams(NGRAM_VARIANT)
token_calc = TokenEntropisalCalculator(ngrams, min_frequency=MIN_FREQUENCY)

print("Token calculator initialized!")

In [None]:
# Score one pre-tokenized sentence and list every metric that comes back.
tokens = "the quick brown fox jumps over the lazy dog".split()
metrics = token_calc.calculate_metrics(tokens)

print("Metrics for token sequence:")
for metric_name, metric_value in metrics.items():
    # Four decimal places keeps the listing compact and aligned.
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Batch mode: one token list per input sentence.
token_sequences = [
    sentence.split()
    for sentence in (
        "the cat sat on the mat",
        "a quick brown fox",
        "hello world this is a test",
    )
]

results_df = token_calc.calculate_batch(token_sequences)
print("\nBatch processing results:")
# Leave the frame as the last expression so the notebook renders it as a table.
results_df

In [None]:
# Get detailed analysis showing per-token metrics for the `tokens` sequence.
# NOTE(review): from the usage below, the result looks like a mapping from
# n-gram order to a DataFrame of per-n-gram rows -- confirm against the API docs.
detailed = token_calc.get_detailed_ngram_analysis(tokens)

print("\nDetailed analysis for trigrams (n=3):")
if 3 in detailed and len(detailed[3]) > 0:
    # Jupyter only auto-renders a cell's final *top-level* expression. A bare
    # expression nested inside an `if` body is evaluated and then discarded,
    # so the table has to be shown with an explicit display() call.
    display(detailed[3].head(10))

## 2. Character-Level Entropy and Surprisal

Character-level metrics analyze transition probabilities and information content between characters within words.

In [None]:
# Word-frequency reference table; the calculators in the following sections
# are built from it.
words_df = load_google_books_words()

word_count = len(words_df)
print(f"Loaded {word_count:,} words from Google Books")
# Preview the first rows only; the full table is too large to display.
words_df.head()

In [None]:
# Build the character-level calculator from the word-frequency table.
char_calc = CharacterEntropisalCalculator(words_df)
print("Character calculator initialized!")

In [None]:
# Score a raw sentence; unlike the token calculator, this one is handed the
# untokenized string.
text = "The quick brown fox jumps over the lazy dog"
char_metrics = char_calc.calculate_metrics(text)

print("Character-level metrics:")
for metric_name, metric_value in char_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Several raw strings scored in one call.
texts = [
    "Simple text with common words",
    "More complex vocabulary requires careful analysis",
    "Short text",
]
char_results = char_calc.calculate_batch(texts)

print("\nBatch character entropy results:")
# Final expression of the cell, so the notebook renders it as a table.
char_results

In [None]:
# Direct lookups for individual contexts of one, two, and three characters.
print("Entropy lookups:")
entropy_q = char_calc.get_character_entropy("q")
entropy_t = char_calc.get_character_entropy("t")
entropy_th = char_calc.get_bigraph_entropy("th")
entropy_qu = char_calc.get_bigraph_entropy("qu")
entropy_the = char_calc.get_trigraph_entropy("the")
print(f"  Character 'q': {entropy_q:.4f}")
print(f"  Character 't': {entropy_t:.4f}")
print(f"  Bigraph 'th': {entropy_th:.4f}")
print(f"  Bigraph 'qu': {entropy_qu:.4f}")
print(f"  Trigraph 'the': {entropy_the:.4f}")

# Surprisal lookups take the context first and the following symbol second.
# NOTE(review): '#' appears to stand for the word boundary -- confirm against
# the API docs.
print("\nSurprisal lookups:")
surprisal_q_u = char_calc.get_character_surprisal("q", "u")
surprisal_t_h = char_calc.get_character_surprisal("t", "h")
surprisal_th_end = char_calc.get_bigraph_surprisal("th", "#")
print(f"  'u' after 'q': {surprisal_q_u:.4f}")
print(f"  'h' after 't': {surprisal_t_h:.4f}")
print(f"  'th' at the end of a word: {surprisal_th_end:.4f}")

## 3. Rest-of-Word Entropy and Surprisal (Character-Level, Bidirectional)

Rest-of-word metrics analyze character-level entropy and surprisal for predicting the remaining characters of a word from its beginning (left-to-right) or end (right-to-left).

In [None]:
# The rest-of-word calculator is built from the same word-frequency table as
# the character calculator.
word_calc = RestOfWordEntropisalCalculator(words_df)
print("Rest-of-word calculator initialized!")

In [None]:
# Score a raw sentence with the rest-of-word calculator and list each metric.
text = "The quick brown fox jumps over the lazy dog"
word_metrics = word_calc.calculate_metrics(text)

print("Word-level bidirectional metrics:")
for metric_name, metric_value in word_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Three contrasting inputs scored side by side.
texts = [
    "Simple words",
    "Complex multisyllabic terminology",
    "The cat sat on the mat",
]
word_results = word_calc.calculate_batch(texts)

print("\nBatch word entropy results:")
# Final expression of the cell, so the notebook renders it as a table.
word_results

In [None]:
# Reference-corpus frequency for a mix of everyday, very long, and made-up words.
words_to_check = ["the", "quick", "antidisestablishmentarianism", "xyz"]

print("Word frequencies in reference corpus:")
for candidate in words_to_check:
    # NOTE(review): the `:,` format assumes get_word_frequency returns a number
    # even for a word such as 'xyz' that may be missing from the corpus -- confirm.
    print(f"  '{candidate}': {word_calc.get_word_frequency(candidate):,}")

## 4. Combining Multiple Metrics

You can combine metrics from different calculators for comprehensive text analysis.

In [None]:
# Analyze the same texts with the character-level and rest-of-word calculators
# and join the two result tables. The token-level calculator is not part of
# this cell.
sample_texts = [
    "The cat sat on the mat.",
    "A quick brown fox jumps over the lazy dog.",
    "Complex linguistic analysis requires sophisticated tools.",
]

# Character metrics, tagged with the source text so rows can be matched up.
# .assign() returns a new frame rather than mutating the batch result in place.
char_df = char_calc.calculate_batch(sample_texts).assign(text=sample_texts)

# Rest-of-word metrics, tagged the same way.
word_df = word_calc.calculate_batch(sample_texts).assign(text=sample_texts)

# Join on the text column; metric columns that share a name in both frames
# get the "_char" / "_word" suffix. validate="one_to_one" makes the merge fail
# loudly if a text is ever repeated in `sample_texts`, instead of silently
# multiplying rows.
combined = pd.merge(
    char_df,
    word_df,
    on="text",
    suffixes=("_char", "_word"),
    validate="one_to_one",
)

print("Combined metrics:")
combined

## Summary

This notebook demonstrated:
1. Token-level entropy and surprisal calculation
2. Character-level transition entropy and surprisal
3. Bidirectional rest-of-word entropy and surprisal
4. Combining multiple metrics

For more information, see the [README.md](../README.md) and API documentation.