# Word Counts


In [25]:
from glob import glob
from pathlib import Path

import pandas as pd

from dolma_count import count_utils

In [2]:
data_dir = Path("../data")
assert data_dir.exists()

In [3]:
!du -h {data_dir}/raw/wiki-en-simple/*

3.8G	../data/raw/wiki-en-simple/en_simple_wiki-0000.json.gz
2.0G	../data/raw/wiki-en-simple/en_simple_wiki-0001.json.gz


In [None]:
# this is a functional but bad approach to word counting
# I think it's mainly bad because of the sort over approximately 1.5B lines
# not sure of the exact time, but it took a few hours to execute on my mac
# inspired in part by this post: https://stackoverflow.com/questions/10552803/how-to-create-a-frequency-list-of-every-word-in-a-file
!gunzip -c en_simple_wiki-0001.json.gz \
| ggrep -Po '"text":.*?[^\]",' - \
| cut -c 9- \
| sed 's/",$//' \
| sed 's/
/
/g' \
| sed 's/\"/"/g' \
| tr -d '[[:punct:]]' \
| tr ' ' '
' \
| sort \
| uniq -c \
| sort -nr > wiki0001_word_counts.txt

In [17]:
!echo "a\nb\nc\nd\ne\nb" | python ../src/dolma_count/count_utils.py

a 1
b 2
c 1
d 1
e 1


In [8]:
# this approach counts words at approximately 4M words/second
# at 3.6B tokens in the Dolma wiki data, let's do some back-of-the-envelope calculation
# (3600000000 words / 2) / 4000000 words-per-second / 60s ~= 7.5 minutes
# that works in practice: it takes about 7 minutes!
!gunzip -c {data_dir}/raw/wiki-en-simple/en_simple_wiki-0000.json.gz \
| ggrep -Po '"text":.*?[^\]",' - \
| cut -c 9- \
| sed 's/",$//' \
| sed 's/\n/
/g' \
| sed 's/\"/"/g' \
| tr -d '[[:punct:]]' \
| tr ' ' '
' \
| python ../src/dolma_count/count_utils.py > {data_dir}/derived/wiki0000_word_counts.txt

Counting words: 1713382367it [07:11, 3970836.03it/s]


In [24]:
!gunzip -c {data_dir}/raw/wiki-en-simple/en_simple_wiki-0001.json.gz \
| ggrep -Po '"text":.*?[^\]",' - \
| cut -c 9- \
| sed 's/",$//' \
| sed 's/\n/
/g' \
| sed 's/\"/"/g' \
| tr -d '[[:punct:]]' \
| tr ' ' '
' \
| python ../src/dolma_count/count_utils.py > {data_dir}/derived/wiki0001_word_counts.txt

Counting words: 872821373it [03:40, 3962156.09it/s]


In [9]:
!wc -l {data_dir}/derived/wiki0000_word_counts.txt

 9716979 ../data/derived/wiki0000_word_counts.txt


In [11]:
# this doesn't work, but something like it should
!cat {data_dir}/derived/wiki0000_word_counts.txt | sort -k1 -nr -t, | head

^C


In [12]:
!head {data_dir}/derived/wiki0000_word_counts.txt

5725,"Organic"
2,"ChemistryCover"
10259635,""
8643,"Welcome"
36142078,"to"
107532442,"the"
81354,"worlds"
10783,"foremost"
281024,"open"
1,"contentltbrgtOrganic"


In [13]:
df = pd.read_csv(data_dir / "derived" / "wiki0000_word_counts.txt", header=None, names=["count", "word"])
len(df)

9716979

In [14]:
df.sample(n=3)

Unnamed: 0,count,word
6613815,1,Mosoru
2717138,7,UFAW
8547159,1,NoordZuidHollandsche


In [15]:
df = df.sort_values(by="count", ascending=False)
df.head(10)

Unnamed: 0,count,word
5,107532442,the
15,58669363,of
24,50567064,and
32,42568367,in
4,36142078,to
29,34080892,a
795,21064791,was
13,17664703,The
17,16259858,is
260,13981699,for


In [23]:
# total word count
f"{df['count'].sum():,}"

'1,713,382,367'

In [21]:
# hapax legomena
f"{(df['count'] == 1).sum()} / {len(df)} = {(df['count'] == 1).sum() / len(df):.2%}"

'5404986 / 9716979 = 55.62%'

In [29]:
glob(str(data_dir / "derived" / "wiki*_word_counts.txt"))

['../data/derived/wiki0001_word_counts.txt',
 '../data/derived/wiki0000_word_counts.txt']

In [30]:
dfs = []
for fname in glob(str(data_dir / "derived" / "wiki*_word_counts.txt")):
    df = pd.read_csv(fname, header=None, names=["count", "word"])
    dfs.append(df)
len(dfs)

2

In [37]:
wc_df = pd.concat(dfs).groupby("word")["count"].sum()
len(wc_df)

12931993

In [38]:
wc_df = pd.DataFrame(wc_df).sort_values(by="count", ascending=False)
wc_df.head()

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
the,160691399
of,87652471
and,76048179
in,65774812
to,53332643


In [40]:
wc_df.reset_index().to_csv(data_dir / "derived" / "all_wiki_word_counts.csv", index=False)

In [42]:
wc_df.reset_index().head(1000).to_csv(data_dir / "app_data" / "wiki_word_counts.csv", index=False)