In [6]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import utils
from pathlib import Path
from pprint import pprint
from importlib import reload

In [7]:
# Active corpus. Alternative files that can be swapped in here:
#   "data/all_en.txt", "data/all_de.txt", "data/all_fr.txt"
corpus_path = "data/alice.txt"
raw_text = utils.load_text(corpus_path)

# Sanity check: show the opening 200 characters of the loaded text.
text_preview = raw_text[:200]
print(text_preview)

Alice's Adventures in Wonderland

                            CHAPTER I

                      Down the Rabbit-Hole

  Alice was beginning to get very tired of sitting by her sister
on the bank, and o


In [8]:
# Re-import utils so edits made to the module since the first import take effect.
reload(utils)

# Tokenisation settings
level = "word"  # "char" or "word"
keep_punct = True
order = 3  # handed to the n-gram builder and the text generator in later cells

# Clean the raw text and turn it into a token list
print(f"Processing: Level={level}, Punct={keep_punct}")
tokens = utils.clean_text(raw_text, keep_punctuation=keep_punct, level=level)
tokens[:10]

Processing: Level=word, Punct=True


['alice',
 's',
 'adventures',
 'in',
 'wonderland',
 '\n',
 'chapter',
 'i',
 '\n',
 'down']

In [9]:
reload(utils)

# Build the model: for every context, count the tokens observed right after it
counts = utils.build_ngram_counts(tokens, order)

# Number of entries to preview. These are simply the first entries in
# iteration order, not the most frequent ones.
top_k = 2
counts_preview = list(counts.items())[:top_k]
print("Sample counts:")
pprint(counts_preview)

# Summary statistics of the counted model
utils.print_model_statistics(counts, order)

Sample counts:
[(('alice', 's', 'adventures'), Counter({'in': 1})),
 (('s', 'adventures', 'in'), Counter({'wonderland': 1}))]

----- Model Statistics -----
Order of N-gram Model (N)            : 3
Number of Unique Contexts            : 25,993
Total Observed N-grams (Transitions) : 33,448
Unique (Context, Token) N-grams      : 30,962
Conditional Vocab Size (Next Tokens) : 2,570


In [10]:
# Turn the per-context counts into per-context probability tables
model = utils.normalize_to_probs(counts)
print("Normalized to proba:")
model_preview = list(model.items())[:top_k]
pprint(model_preview)

# Context that is followed by the largest number of distinct tokens
context = max(model, key=lambda ctx: len(model[ctx]))

# Next-token probabilities for that context, highest probability first
next_token_probs = model[context].items()
token_table = pd.DataFrame(next_token_probs, columns=["token", "proba"])
df = token_table.sort_values(by="proba", ascending=False)
print(f"\nMost likely token after {context} :")
print(df)

Normalized to proba:
[(('alice', 's', 'adventures'), {'in': 1.0}),
 (('s', 'adventures', 'in'), {'wonderland': 1.0})]

Most likely token after (',', '\n', 'and') :
        token   proba
25        the  0.0682
0        then  0.0455
18          a  0.0455
10          ,  0.0455
35  concluded  0.0227
34       will  0.0227
33       said  0.0227
32       were  0.0227
31       that  0.0227
30      there  0.0227
21       have  0.0227
29         it  0.0227
28     looked  0.0227
27     people  0.0227
26  unlocking  0.0227
36       took  0.0227
24       kept  0.0227
23       beat  0.0227
37       went  0.0227
22     argued  0.0227
19     rushed  0.0227
20       your  0.0227
1       alice  0.0227
2          oh  0.0227
3        four  0.0227
4        rome  0.0227
5       began  0.0227
6        pour  0.0227
7     welcome  0.0227
8         was  0.0227
9       would  0.0227
11       left  0.0227
12     asking  0.0227
13        why  0.0227
14         \n  0.0227
15     called  0.0227
16         be  0.0227


In [11]:
# Calculate Entropy (Model based)
# The value is computed from the raw n-gram counts (no normalised model needed)
# and is reported in bits.
h_model = utils.calculate_entropy_from_counts(counts)
print(f"Model Entropy: {h_model:.4f} bits")

# Reference copy of the entropy computation, kept as a comment for readers.
# NOTE(review): presumably mirrors utils.calculate_entropy_from_counts; verify against utils.py.
# It uses math.log2, so it needs `import math` if it is ever uncommented
# (`math` is not in this notebook's import cell).
#
# def calculate_entropy_from_counts(counts):
#     """
#     H = - Sum_ctx ( P(ctx) * Sum_x ( P(x|ctx) * log2 P(x|ctx) ) )
#     """
#     # total number of observed (context, next token) transitions
#     total_observations = sum(sum(counter.values()) for counter in counts.values())

#     entropy = 0.0
#     for context, counter in counts.items():
#         ctx_count = sum(counter.values())
#         # empirical probability of this context
#         p_ctx = ctx_count / total_observations

#         # entropy of the next-token distribution for this context
#         h_cond = 0.0
#         for token, count in counter.items():
#             p_x_given_ctx = count / ctx_count
#             h_cond -= p_x_given_ctx * math.log2(p_x_given_ctx)

#         # weight the per-context entropy by how often the context occurs
#         entropy += p_ctx * h_cond

#     return entropy

# The entropy rate (or source entropy) is a fundamental concept in information theory:
# it measures the average amount of uncertainty, or information, per symbol (token)
# of a stochastic process such as text.
# For a source that generates a sequence of symbols (e.g., characters or words),
# the entropy rate, typically denoted $H(\mathcal{X})$, describes how "compressible" the source is.
    # Low entropy rate: the sequence is highly predictable (redundant).
    # Knowing the past tokens tells you a lot about the next one.

    # High entropy rate: the sequence is less predictable (closer to random).
    # Knowing the past doesn't help much in predicting the next token.

# -> What it is and what it is not:
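# ✅ It is the conditional entropy H(X_t | X_{t-order}, ..., X_{t-1}) of your empirical n-gram distribution
# (with whatever tokenization and handling of boundaries you used). With order = 3 the contexts are 3-tuples,
# so the quantity is H(X_t | X_{t-3}, X_{t-2}, X_{t-1}): the next token given the three preceding tokens.
# ❌ It is not the entropy of English in general, and it is not the unconditional entropy of the corpus.
# ❌ It is not a guaranteed measure of how well you’ll predict new text.
# For held-out data you’d want cross-entropy / perplexity on a test set, because the training-set conditional
# entropy can be optimistically low, especially if there are many rare contexts: a context seen only once
# contributes zero entropy. In the alice.txt run, 25,993 contexts share only 33,448 observed transitions,
# so most contexts were seen exactly once.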

Model Entropy: 0.4964 bits


In [12]:
# Calculate Entropy (Sequence based - NLL)
# h_seq = utils.calculate_nll(tokens, model, order)
# print(f"Sequence NLL Entropy: {h_seq:.4f} bits")

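# When calculate_nll is evaluated on the training sequence itself (i.e., the same tokens the counts were built from
# are passed as its sequence argument), it gives the same numerical result as calculate_entropy_from_counts,
# provided the NLL is averaged per predicted token, uses log base 2, and covers the same transitions:
# $$H_{\text{model}} = L_{\text{train}}$$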

In [13]:
# Generate Text
# Character-level runs request more tokens (400) than word-level runs (50).
gen_len = 50 if level != "char" else 400
generated_tokens = utils.generate_text(model, order, gen_len)

# Character tokens are concatenated directly; any other level is joined with single spaces.
separator = "" if level == "char" else " "
gen_text = separator.join(generated_tokens)

print("\n--- Generated Text ---")
print(gen_text)


--- Generated Text ---
not , i ll write one but i m not a serpent , i say again repeated the pigeon , but in a more 
 subdued tone , and she felt that it ought to be ashamed of yourself , said alice , a good deal to me , said the duchess .


In [14]:
# --- Chapter Analysis (Extension E3) ---
# Goal (README): "Evaluate non-stationarity: entropy per chapter."
#
# Two ways to measure the entropy of a chapter:
#   1. Self-entropy: build a model from the chapter alone and compute its entropy
#      (simple interpretation: "this chapter is more complex").
#   2. Cross-entropy: score the chapter (NLL / surprisal) under a model built from the
#      whole text, which shows how much the chapter deviates from the global average.
# This cell implements method 1 with the fixed configuration in `target_config`
# (character level, punctuation kept, order 1).
print("Running Chapter Analysis...")
chapters = utils.split_chapters(raw_text)
print(f"Found {len(chapters)} chapters.")

target_config = {"level": "char", "keep_punct": True, "order": 1}
k = target_config["order"]  # loop-invariant, so read it once

# Chapter-local names are used on purpose: rebinding `tokens` / `counts` here would
# silently replace the whole-corpus variables that the earlier cells work with, and
# re-running one of those cells afterwards would then operate on the last chapter only.
chapter_results = []
for chapter_no, chapter_text in enumerate(chapters, start=1):
    chapter_tokens = utils.clean_text(
        chapter_text,
        level=target_config["level"],
        keep_punctuation=target_config["keep_punct"],
    )
    if not chapter_tokens:
        # Nothing to count. The chapter number is still its position in `chapters`,
        # so skipped chapters leave a gap rather than renumbering the rest.
        continue

    # Entropy is computed from the raw counts, so no normalisation step is needed.
    chapter_counts = utils.build_ngram_counts(chapter_tokens, k)
    h_chapter = utils.calculate_entropy_from_counts(chapter_counts)

    chapter_results.append(
        {"chapter": chapter_no, "entropy": h_chapter, "length": len(chapter_tokens)}
    )

df_chap = pd.DataFrame(chapter_results)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a fresh checkout may not have it).
results_path = Path("results/chapter_entropy.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
df_chap.to_csv(results_path, index=False)
print("Chapter entropy saved to results/chapter_entropy.csv")

Running Chapter Analysis...
Found 12 chapters.
Chapter entropy saved to results/chapter_entropy.csv
