# Most Frequent Words

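Build the dataset used downstream: the most frequent words in Wikipedia, Project Gutenberg, and peS2o, with each word's rank and share of the total word count in every corpus.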

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Root of the data tree (relative path); stop right away if it is not there.
data_dir = Path("..") / "data"
assert data_dir.exists()

In [3]:
# Load the per-corpus word-count tables (header-less CSVs with two columns:
# count, word) and tag every row with the corpus it came from.
dfs = []
for source, fname in zip(
    ["Wikipedia", "Project Gutenberg", "peS2o"],
    ["wiki_word_counts.csv", "books_word_counts.csv", "s2_word_counts.csv"],
):
    df = pd.read_csv(
        data_dir / "derived" / fname,
        header=None,
        names=["count", "word"],
        dtype={"count": int, "word": str},
        # Tokens such as "null", "NA", "nan" or "N/A" are real words in a
        # word-frequency table. With the default NA sentinels pandas would read
        # them as NaN (even with dtype=str), i.e. as missing words. Disabling the
        # defaults keeps them as text; empty fields are then read as "".
        keep_default_na=False,
    )
    df["source"] = source
    dfs.append(df)
len(dfs)

3

In [4]:
# Rank the words of every corpus by frequency and keep the first million rows.
top_dfs = []
for df in dfs:
    # Order by descending count, then discard rows whose word is missing or empty.
    named = df.sort_values(by="count", ascending=False).loc[
        lambda d: d["word"].notna() & (d["word"] != "")
    ]
    # Denominator for pct: every remaining occurrence, summed before truncation.
    total_words = named["count"].sum()
    df = (
        named.head(1000000)
        .reset_index(drop=True)
        # rank is 1-based position in the sorted table; pct is the share of total_words.
        .assign(rank=lambda d: d.index + 1, pct=lambda d: d["count"] / total_words)
    )
    top_dfs.append(df)
len(top_dfs)

3

In [5]:
# Peek at the first two ranked rows of the third table (peS2o).
top_dfs[2].iloc[:2]

Unnamed: 0,count,word,source,rank,pct
0,57425445,the,peS2o,1,0.059086
1,36987140,of,peS2o,2,0.038057


In [6]:
top_n = 1000
cols = ["word", "pct", "rank"]

# Give each corpus its own pct/rank column names up front so the joins need no suffixes.
wiki = top_dfs[0][cols].rename(columns={"pct": "pct_wiki", "rank": "rank_wiki"})
gutenberg = top_dfs[1][cols].rename(columns={"pct": "pct_gutenberg", "rank": "rank_gutenberg"})
s2 = top_dfs[2][cols].rename(columns={"pct": "pct_s2", "rank": "rank_s2"})

# Outer joins keep a word even when one corpus' table lacks it (NaN in that corpus' columns).
mdf = wiki.merge(gutenberg, how="outer", on="word").merge(s2, how="outer", on="word")

# Keep words ranked within top_n in at least one corpus; NaN ranks compare as False.
in_top_n = mdf[["rank_wiki", "rank_gutenberg", "rank_s2"]].le(top_n).any(axis=1)
mdf = mdf[in_top_n].copy()
mdf = mdf.sort_values(by=["rank_wiki", "rank_gutenberg", "rank_s2"], ascending=True).reset_index(drop=True)
mdf

Unnamed: 0,word,pct_wiki,rank_wiki,pct_gutenberg,rank_gutenberg,pct_s2,rank_s2
0,the,6.253678e-02,1.0,6.033402e-02,1.0,5.908579e-02,1.0
1,of,3.411199e-02,2.0,3.578621e-02,2.0,3.805655e-02,2.0
2,and,2.959591e-02,3.0,3.036963e-02,3.0,2.954267e-02,3.0
3,in,2.559779e-02,4.0,1.718741e-02,6.0,2.139867e-02,4.0
4,to,2.075563e-02,5.0,2.574570e-02,4.0,1.959035e-02,5.0
...,...,...,...,...,...,...,...
1997,µ,8.561810e-08,196832.0,2.249327e-07,79771.0,2.255655e-04,494.0
1998,CHAPTER,8.328306e-08,200637.0,1.975420e-04,473.0,1.954935e-07,96237.0
1999,LETTER,4.631161e-08,296014.0,1.339040e-04,665.0,4.630109e-08,254622.0
2000,SMALL,3.541476e-08,357269.0,1.193558e-04,738.0,2.654596e-07,78841.0


In [7]:
# Write the merged table next to the input tables. The path is built from
# data_dir so the data location is defined in one place only.
mdf.to_csv(data_dir / "derived" / "top_words_raw.csv", index=False)