In [51]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import utils
from pathlib import Path
from pprint import pprint
from importlib import reload

In [85]:
# Corpus to analyse. Other corpora that have been used with this notebook:
#   "data/alice.txt", "data/all_en.txt", "data/all_de.txt", "data/all_fr.txt"
corpus_path = "data/cn/hongloumeng.txt"
raw_text = utils.load_text(corpus_path)

# Sanity check: show the first 200 characters of the loaded text
preview = raw_text[:200]
print(preview)

紅樓夢

第一回　甄士隱夢幻識通靈　賈雨村風塵怀閨秀
-----------------------------------------------------------------
此開卷第一回也．作者自云：因曾歷過一番夢幻之后，故將真事隱去，
而借"通靈"之說，撰此《石頭記》一書也．故曰"甄士隱"云云．但書中所記
何事何人？自又云：“今風塵碌碌，一事無成，忽念及當日所有之女子，一
一細考較去


In [86]:
# Settings shared by the cells below
level = "char"  # char or word
keep_punct = False
order = 3  # passed to utils.build_ngram_counts / utils.generate_text further down

# Preprocess (reload first so edits to utils.py are picked up without a kernel restart)
reload(utils)
print("Processing: Level={}, Punct={}".format(level, keep_punct))
clean_kwargs = {"level": level, "keep_punctuation": keep_punct}
tokens = utils.clean_text(raw_text, **clean_kwargs)
tokens[:10]

Processing: Level=char, Punct=False


['d', ' ', 'd', ' ', 'z', ' ', 'b', ' ', 'o', ' ']

In [87]:
reload(utils)

# Number of sample entries to display in this and the following cell
top_k = 2

# Build the n-gram count table from the cleaned tokens
counts = utils.build_ngram_counts(tokens, order)

print("Sample counts:")
sample_counts = list(counts.items())[:top_k]
pprint(sample_counts)

# Summary statistics of the count table
utils.print_model_statistics(counts, order)

Sample counts:
[(('d', ' ', 'd'), Counter({'i': 10, 'o': 7, ' ': 1, 'e': 1})),
 ((' ', 'd', ' '), Counter({'z': 1, 'f': 1, 'p': 1, 'b': 1, 'w': 1, 't': 1}))]

----- Model Statistics -----
Order of N-gram Model (N)            : 3
Number of Unique Contexts            : 1,770
Total Observed N-grams (Transitions) : 17,603
Unique (Context, Token) N-grams      : 3,957
Conditional Vocab Size (Next Tokens) : 27


In [80]:
# Turn the raw counts into per-context next-token probabilities
model = utils.normalize_to_probs(counts)
print("Normalized to proba:")
sample_probs = list(model.items())[:top_k]
pprint(sample_probs)

# Context that has the largest number of distinct next tokens
context = max(model, key=lambda ctx: len(model[ctx]))

# Next-token probabilities for that context, most probable first
next_token_probs = model[context]
df = pd.DataFrame(next_token_probs.items(), columns=["token", "proba"])
df = df.sort_values(by="proba", ascending=False)
print(f"\nMost likely token after {context} :")
print(df)

Normalized to proba:
[(('a', 'l', 'i'),
  {'a': 0.0025, 'c': 0.9851, 's': 0.0025, 't': 0.0025, 'v': 0.0074}),
 (('l', 'i', 'c'), {'a': 0.0025, 'e': 0.995, 'k': 0.0025})]

Most likely token after ('e', 'r', ' ') :
   token   proba
6      a  0.1408
5      t  0.1318
1      s  0.1028
11     h  0.0950
3      w  0.0827
0      i  0.0804
2      o  0.0503
4      f  0.0469
12     l  0.0302
10     c  0.0279
16     e  0.0279
8      b  0.0235
15     d  0.0201
7      m  0.0190
14     y  0.0190
21     p  0.0156
9      g  0.0145
18     v  0.0145
22     n  0.0145
17     q  0.0101
20     r  0.0101
13     k  0.0089
19     u  0.0078
23     x  0.0034
24     j  0.0022


In [81]:
# Entropy of the model, computed directly from the n-gram counts
h_model = utils.calculate_entropy_from_counts(counts)
print("Model Entropy: {:.4f} bits".format(h_model))

# def calculate_entropy_from_counts(counts):
#     """
#     H = - Sum_ctx ( P(ctx) * Sum_x ( P(x|ctx) * log2 P(x|ctx) ) )
#     """
#     total_observations = sum(sum(counter.values()) for counter in counts.values())

#     entropy = 0.0
#     for context, counter in counts.items():
#         ctx_count = sum(counter.values())
#         p_ctx = ctx_count / total_observations

#         h_cond = 0.0
#         for token, count in counter.items():
#             p_x_given_ctx = count / ctx_count
#             h_cond -= p_x_given_ctx * math.log2(p_x_given_ctx)

#         entropy += p_ctx * h_cond

#     return entropy

# The entropy rate (or source entropy) is a fundamental concept in information theory:
# it measures the average amount of uncertainty (information) per symbol (token) produced by a stochastic process such as text.
# For a source that generates a sequence of symbols (e.g., characters or words),
# the entropy rate, typically denoted $H(\mathcal{X})$, describes how "compressible" the source is.
    # Low entropy rate: the sequence is highly predictable (redundant).
    # Knowing the past tokens tells you a lot about the next one.

    # High entropy rate: the sequence is less predictable (closer to random).
    # Knowing the past tokens doesn't help much in predicting the next one.

# -> What it is and what it is not:
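# ✅ It is the conditional entropy H(X_t | X_{t-order}, ..., X_{t-1}) of your empirical n-gram distribution
# (with order = 3 the contexts in `counts` are 3 tokens long, e.g. ('d', ' ', 'd')), with whatever tokenization and handling of boundaries you used.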
# ❌ It is not the entropy of English in general, and it is not the unconditional entropy of the corpus.
# ❌ It is not a guaranteed measure of how well you’ll predict new text. 
# For held-out data you’d want cross-entropy / perplexity on a test set, because the training-set conditional entropy can be optimistically low 
# (especially if there are many rare contexts).

Model Entropy: 1.8276 bits


In [57]:
# Calculate Entropy (Sequence based - NLL)
# h_seq = utils.calculate_nll(tokens, model, order)
# print(f"Sequence NLL Entropy: {h_seq:.4f} bits")

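# When calculate_nll is evaluated on the same token sequence that the counts were built from (i.e., the training sequence is passed as its sequence argument),
# the average negative log-likelihood equals the model entropy computed from the counts, up to floating-point/rounding error in the stored probabilities
# (assuming calculate_nll averages -log2 P(token | context) over the same transitions; verify against utils):
# $$H_{\text{model}} = L_{\text{train}}$$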

In [82]:
# Sample a token sequence from the model and render it as a string
gen_len = 200
generated_tokens = utils.generate_text(model, order, gen_len)

# Character tokens are concatenated directly; word tokens are separated by spaces
joiner = "" if level == "char" else " "
gen_text = joiner.join(generated_tokens)

print("\n--- Generated Text ---")
print(gen_text)


--- Generated Text ---
boves fining suchnestears faceand to they spers and puttener v adven that evided its and not lar oppeaking oldinah she king in a number alice these against if your sir footmany othis look off were said a


In [59]:
# --- Chapter Analysis (Extension E3) ---
# Goal (README): "Evaluate non-stationarity: entropy per chapter."
# Two ways to measure this:
#   1. Self-entropy: build counts on each chapter alone and compute the entropy of those counts.
#   2. Cross-entropy: score each chapter (NLL) under a model built on the whole text,
#      which shows how much the chapter deviates from the global statistics.
# This cell implements method 1 with the fixed configuration in `target_config`.
print("Running Chapter Analysis...")
chapters = utils.split_chapters(raw_text)
print(f"Found {len(chapters)} chapters.")

target_config = {"level": "char", "keep_punct": True, "order": 1}
k = target_config["order"]  # loop-invariant, so read it once

chapter_results = []
for i, chapter_text in enumerate(chapters):
    # Use chapter-local names here. Rebinding the notebook-wide `tokens` / `counts`
    # would leave them pointing at the last chapter (with a different config) and
    # silently change the result of re-running the whole-corpus cells.
    chapter_tokens = utils.clean_text(
        chapter_text,
        level=target_config["level"],
        keep_punctuation=target_config["keep_punct"],
    )
    if not chapter_tokens:
        # Nothing left after cleaning; skip (chapter numbering still follows the split order)
        continue

    chapter_counts = utils.build_ngram_counts(chapter_tokens, k)
    # Probabilities are not needed: the entropy is computed straight from the counts
    h_chapter = utils.calculate_entropy_from_counts(chapter_counts)

    chapter_results.append(
        {"chapter": i + 1, "entropy": h_chapter, "length": len(chapter_tokens)}
    )

# Explicit columns keep the CSV header stable even if every chapter was skipped
df_chap = pd.DataFrame(chapter_results, columns=["chapter", "entropy", "length"])

# Create the output directory on first run instead of failing inside to_csv
results_path = Path("results") / "chapter_entropy.csv"
results_path.parent.mkdir(parents=True, exist_ok=True)
df_chap.to_csv(results_path, index=False)
print("Chapter entropy saved to results/chapter_entropy.csv")

Running Chapter Analysis...
Found 12 chapters.
Chapter entropy saved to results/chapter_entropy.csv
