# Build top-10k word -> probability + phonemes

Merge:

- `data/unigram_freq.csv` (columns: `word,count`)
- `data/cmudict-0.7b` (format: `WORD  PHONEMES`)

Output: a pickle with the top 10k words by frequency (case-insensitive English), their probability (count normalized over the filtered set), and CMU phonemes with stress digits stripped.

In [1]:
from pathlib import Path
import re
import pandas as pd
import pickle

ROOT = Path.cwd()
UNIGRAM_PATH = ROOT / "data" / "unigram_freq.csv"
CMU_PATH = ROOT / "data" / "cmudict-0.7b"
OUTPUT_PATH = ROOT / "data" / "top10k_words_phonemes.pkl"

UNIGRAM_PATH, CMU_PATH, OUTPUT_PATH

(WindowsPath('c:/Users/johnn/Desktop/EC ENGR C143A/c143a-project/notebooks/data/unigram_freq.csv'),
 WindowsPath('c:/Users/johnn/Desktop/EC ENGR C143A/c143a-project/notebooks/data/cmudict-0.7b'),
 WindowsPath('c:/Users/johnn/Desktop/EC ENGR C143A/c143a-project/notebooks/data/top10k_words_phonemes.pkl'))

In [2]:
# Load unigram frequencies
unigram_df = pd.read_csv(UNIGRAM_PATH)
unigram_df.columns = [c.strip().lower() for c in unigram_df.columns]
unigram_df.head()

Unnamed: 0,word,count
0,the,23135851162
1,of,13151942776
2,and,12997637966
3,to,12136980858
4,a,9081174698


In [3]:
# Load CMU dict into a mapping of lowercase word -> phoneme string (stress digits removed)
cmu_map = {}
stress_re = re.compile(r"\d")
# CMU file can contain non-UTF-8 bytes; fall back to utf-8 with errors ignored.
with CMU_PATH.open("r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith(";;;"):
            continue
        # CMU variants look like WORD(1)  PHONEMES
        if "  " not in line:
            continue
        word_raw, phones_raw = line.split("  ", 1)
        word = word_raw.split("(")[0].lower()
        # Normalize phonemes by removing stress digits
        phones = " ".join(stress_re.sub("", p) for p in phones_raw.split())
        # Keep the first occurrence
        if word not in cmu_map:
            cmu_map[word] = phones

len(cmu_map)

125764

In [4]:
# Select top 10k words by frequency that exist in CMU dict
top_df = unigram_df.copy()
top_df["word"] = top_df["word"].str.lower()
top_df = top_df.sort_values("count", ascending=False)
top_df = top_df[top_df["word"].isin(cmu_map)]
total_count = top_df["count"].sum()
top10k = top_df.head(10000).copy()
top10k["probability"] = top10k["count"] / total_count
len(top10k), top10k.head()

(10000,
   word        count  probability
 0  the  23135851162     0.041534
 1   of  13151942776     0.023611
 2  and  12997637966     0.023334
 3   to  12136980858     0.021789
 4    a   9081174698     0.016303)

In [5]:
# Build records and save
records = []
for _, row in top10k.iterrows():
    word = row["word"]
    records.append(
        {
            "word": word,
            "probability": float(row["probability"]),
            "phonemes": cmu_map[word],
        }
    )

with OUTPUT_PATH.open("wb") as f:
    pickle.dump(records, f)

OUTPUT_PATH, len(records), records[0]

(WindowsPath('c:/Users/johnn/Desktop/EC ENGR C143A/c143a-project/notebooks/data/top10k_words_phonemes.pkl'),
 10000,
 {'word': 'the', 'probability': 0.04153440748730958, 'phonemes': 'DH AH'})