In [4]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import utils
from tqdm import tqdm
from pathlib import Path
from pprint import pprint
from importlib import reload

In [None]:
# --- Experiment configuration -------------------------------------------------
level = "char"  # "char" or "word"; passed to utils.clean_text as `level`
keep_punct = False  # passed to utils.clean_text as `keep_punctuation`
orders = list(range(1, 10))  # n-gram orders 1 through 9

# Corpus label -> path of the text file handed to utils.load_text.
sources = {
    "alice": "data/alice.txt",
    "en":    "data/all_en.txt",
    "de":    "data/all_de.txt",
    "fr":    "data/all_fr.txt",
}

# One record per (source, order); the DataFrame is built once after the loops
# instead of being grown row by row.
rows = []

for source, corpus_path in sources.items():
    print(f"Processing source: {source}")
    # Load and clean each corpus once; the resulting tokens are reused for
    # every n-gram order below.
    raw_text = utils.load_text(corpus_path)
    tokens = utils.clean_text(raw_text, level=level, keep_punctuation=keep_punct)

    # Label the bar with the corpus name so interleaved output stays readable.
    for order in tqdm(orders, desc=f"{source} orders"):
        counts = utils.build_ngram_counts(tokens, order, show_pb=False, print_stats=False)
        # The entropy helper is fed the raw counts directly, so no normalized
        # probability table is built inside this loop.
        h_model = utils.calculate_entropy_from_counts(counts)

        rows.append({
            "source": source,
            "order": order,
            "entropy": h_model,
        })

# Tidy result table: one row per (source, order), sorted for stable display.
df_entropy = (
    pd.DataFrame(rows)
    .sort_values(["source", "order"])
    .reset_index(drop=True)
)
df_entropy

Processing source: alice


100%|██████████| 9/9 [00:00<00:00, 15.41it/s]


Processing source: en


 67%|██████▋   | 6/9 [00:11<00:07,  2.36s/it]