In [89]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import utils
from pathlib import Path
from pprint import pprint
from importlib import reload

In [90]:
# Path of the corpus to analyse. To switch corpus, uncomment exactly one of the
# alternatives below (the last uncommented assignment wins).
corpus_path = "data/alice.txt"
# corpus_path = "data/all_en.txt"
# corpus_path = "data/all_de.txt"
# corpus_path = "data/all_fr.txt"
# corpus_path = "data/cn/hongloumeng.txt"
# Load the corpus through the project helper. The result is sliced and printed
# below, so it is presumably a single str -- confirm against utils.load_text.
raw_text = utils.load_text(corpus_path)

# Sanity check: print the first 200 characters of the loaded text.
print(raw_text[:200])

Alice's Adventures in Wonderland

                            CHAPTER I

                      Down the Rabbit-Hole

  Alice was beginning to get very tired of sitting by her sister
on the bank, and o


In [98]:
# Tokenisation / model settings read by the cells that follow.
level = "char"  # token granularity: "char" or "word"
keep_punct = False  # forwarded to utils.clean_text as keep_punctuation
order = 1  # n-gram order handed to utils.build_ngram_counts and utils.generate_text

# Preprocess
# Re-import utils so edits made to the module on disk are picked up without
# restarting the kernel.
reload(utils)
print(f"Processing: Level={level}, Punct={keep_punct}")
tokens = utils.clean_text(raw_text, level=level, keep_punctuation=keep_punct)
# Last expression of the cell: display the first 10 tokens as a quick check.
tokens[:10]

Processing: Level=char, Punct=False


['a', 'l', 'i', 'c', 'e', 's', ' ', 'a', 'd', 'v']

In [99]:
# Re-import utils so on-disk edits to the module take effect in this kernel.
reload(utils)

# Number of entries to preview. Note: the preview slice takes the FIRST `top_k`
# entries in dict iteration order; it does not rank contexts by frequency.
top_k = 2

# Build Model
# The printed sample shows `counts` mapping each context tuple to a Counter of
# the tokens that follow it.
counts = utils.build_ngram_counts(tokens, order)
print("Sample counts:")
pprint(list(counts.items())[:top_k])

# Print some statistics
utils.print_model_statistics(counts, order)

Sample counts:
[(('a',),
  Counter({'n': 1607,
           't': 1167,
           'l': 929,
           's': 900,
           'i': 716,
           'r': 704,
           ' ': 679,
           'd': 442,
           'y': 257,
           'b': 214,
           'm': 183,
           'v': 168,
           'g': 160,
           'c': 157,
           'k': 125,
           'p': 118,
           'u': 76,
           'w': 75,
           'f': 63,
           'h': 25,
           'j': 12,
           'z': 5,
           'x': 4,
           'o': 3,
           'a': 1})),
 (('l',),
  Counter({'i': 855,
           'e': 730,
           'l': 680,
           ' ': 670,
           'y': 436,
           'd': 334,
           'o': 322,
           'a': 308,
           'f': 145,
           'k': 59,
           's': 52,
           't': 48,
           'w': 15,
           'u': 14,
           'v': 12,
           'p': 12,
           'm': 8,
           'r': 4,
           'b': 4,
           'c': 1,
           'g': 1}))]

----- Model Statisti

In [100]:
# Turn the raw n-gram counts into per-context probability tables.
model = utils.normalize_to_probs(counts)
print("Normalized to proba:")
pprint(list(model.items())[:top_k])

# Pick the context with the largest number of distinct continuations
# (ties resolve to the first such context in iteration order).
context = max(model, key=lambda ctx: len(model[ctx]))

# Tabulate that context's continuation probabilities, most probable first.
df = pd.DataFrame(model[context].items(), columns=["token", "proba"])
df = df.sort_values(by="proba", ascending=False)
print(f"\nMost likely token after {context} :")
print(df)

Normalized to proba:
[(('a',),
  {' ': 0.07724687144482366,
   'a': 0.00011376564277588168,
   'b': 0.02434584755403868,
   'c': 0.017861205915813423,
   'd': 0.05028441410693971,
   'f': 0.007167235494880546,
   'g': 0.01820250284414107,
   'h': 0.002844141069397042,
   'i': 0.08145620022753129,
   'j': 0.0013651877133105802,
   'k': 0.01422070534698521,
   'l': 0.10568828213879408,
   'm': 0.020819112627986348,
   'n': 0.18282138794084188,
   'o': 0.00034129692832764505,
   'p': 0.013424345847554038,
   'r': 0.08009101251422071,
   's': 0.10238907849829351,
   't': 0.13276450511945392,
   'u': 0.008646188850967008,
   'v': 0.01911262798634812,
   'w': 0.008532423208191127,
   'x': 0.0004550625711035267,
   'y': 0.029237770193401593,
   'z': 0.0005688282138794084}),
 (('l',),
  {' ': 0.14225053078556263,
   'a': 0.0653927813163482,
   'b': 0.0008492569002123143,
   'c': 0.00021231422505307856,
   'd': 0.07091295116772824,
   'e': 0.15498938428874734,
   'f': 0.03078556263269639,
   'g

In [101]:
# Calculate Entropy (Model based)
# Computed directly from the raw n-gram counts; the commented-out reference
# implementation below spells out the formula (context-weighted conditional
# entropy of the next token, in bits).
h_model = utils.calculate_entropy_from_counts(counts)
print(f"Model Entropy: {h_model:.4f} bits")

# def calculate_entropy_from_counts(counts):
#     """
#     H = - Sum_ctx ( P(ctx) * Sum_x ( P(x|ctx) * log2 P(x|ctx) ) )
#     """
#     total_observations = sum(sum(counter.values()) for counter in counts.values())

#     entropy = 0.0
#     for context, counter in counts.items():
#         ctx_count = sum(counter.values())
#         p_ctx = ctx_count / total_observations

#         h_cond = 0.0
#         for token, count in counter.items():
#             p_x_given_ctx = count / ctx_count
#             h_cond -= p_x_given_ctx * math.log2(p_x_given_ctx)

#         entropy += p_ctx * h_cond

#     return entropy

# The Entropy Rate (or Source Entropy) is a fundamental concept in information theory
# that measures the average amount of uncertainty or information per symbol (token) in a stochastic process (like text).
# For a source that generates a sequence of symbols (e.g., characters or words),
# the entropy rate, typically denoted $H(\mathcal{X})$, describes how "compressible" the source is.
#   - Low Entropy Rate: The sequence is highly predictable (redundant).
#     Knowing the past tokens tells you a lot about the next one.

#   - High Entropy Rate: The sequence is less predictable (closer to random).
#     Knowing the past doesn't help much in predicting the next token.

# -> What it is and what it is not:
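# ✅ It is the conditional entropy H(X_t | X_{t-order}, ..., X_{t-1}) of your empirical n-gram distribution; with order = 1 as configured above this is H(X_t | X_{t-1}), i.e. a one-token context (with whatever tokenization and handling of boundaries you used).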
# ❌ It is not the entropy of English in general, and it is not the unconditional entropy of the corpus.
# ❌ It is not a guaranteed measure of how well you’ll predict new text. 
# For held-out data you’d want cross-entropy / perplexity on a test set, because the training-set conditional entropy can be optimistically low 
# (especially if there are many rare contexts).

Model Entropy: 3.2261 bits


In [95]:
# Calculate Entropy (Sequence based - NLL)
# h_seq = utils.calculate_nll(tokens, model, order)
# print(f"Sequence NLL Entropy: {h_seq:.4f} bits")

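# When both quantities are computed on the same sequence (i.e., the training sequence is passed as the sequence argument to calculate_nll),
# they give the same numerical result, provided the NLL is averaged over exactly the (context, token) pairs that were counted
# (i.e., the first `order` tokens and any sequence boundaries are handled identically in both functions):
# $$H_{\text{model}} = L_{\text{train}}$$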

In [96]:
# Sample a sequence of `gen_len` tokens from the model.
# NOTE(review): whether the sample is reproducible depends on how
# utils.generate_text draws its random numbers -- confirm, and seed it if
# repeatable output is needed.
gen_len = 200
generated_tokens = utils.generate_text(model, order, gen_len)

# Character tokens are concatenated directly; any other level is space-joined.
gen_text = ("" if level == "char" else " ").join(generated_tokens)

print("\n--- Generated Text ---")
print(gen_text)


--- Generated Text ---
ic and was pard the came the see had of shed meaning said the were you tell added the fear about her all refree in sting tone or you know she melay want direconting here was gave of the ran tong at shaps


In [None]:
# --- Chapter Analysis (Extension E3) ---
# README: "Evaluate non-stationarity: entropy per chapter."
# "Non-stationarity" means the statistics change over the course of the text.
#
# Two ways to measure the entropy of a chapter:
#   1. Self-entropy: build the n-gram counts from the chapter alone and take
#      their conditional entropy (interpretation: "this chapter is more complex").
#   2. Cross-entropy: score the chapter's NLL under a model trained on the whole
#      text (shows how much the chapter deviates from the global average).
# This cell implements method 1 for the single configuration in `target_config`.
print("Running Chapter Analysis...")
chapters = utils.split_chapters(raw_text)
print(f"Found {len(chapters)} chapters.")

target_config = {"level": "char", "keep_punct": True, "order": 1}

chapter_results = []
for chapter_number, chapter_text in enumerate(chapters, start=1):
    # Use chapter-local names: the notebook-level `tokens` / `counts` describe
    # the full corpus and are read by other cells, so overwriting them here
    # would silently corrupt those cells when they are re-run.
    chapter_tokens = utils.clean_text(
        chapter_text,
        level=target_config["level"],
        keep_punctuation=target_config["keep_punct"],
    )

    # Skip chapters that yield no tokens; their chapter number is left out of
    # the results rather than renumbering the remaining chapters.
    if not chapter_tokens:
        continue

    chapter_counts = utils.build_ngram_counts(chapter_tokens, target_config["order"])
    # Entropy is computed straight from the counts, so no normalised model is needed.
    h_chapter = utils.calculate_entropy_from_counts(chapter_counts)

    chapter_results.append(
        {
            "chapter": chapter_number,
            "entropy": h_chapter,
            "length": len(chapter_tokens),
        }
    )

# Explicit columns keep the CSV header stable even if no chapter produced tokens.
df_chap = pd.DataFrame(chapter_results, columns=["chapter", "entropy", "length"])

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists (matters on a fresh checkout).
output_path = Path("results/chapter_entropy.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_chap.to_csv(output_path, index=False)
print("Chapter entropy saved to results/chapter_entropy.csv")

Running Chapter Analysis...
Found 12 chapters.
Chapter entropy saved to results/chapter_entropy.csv
