# Create Wordlist

Create a vocabulary list for Bookworm, optimized to include top language-specific tokens.

In [1]:
import pandas as pd
import dask.dataframe as dd

In [2]:
import os
st = os.stat('/notebooks/data/final/final-sorted.h5')
st.st_size / 1024**3

0.009112454950809479

In [3]:
# Get all langs and their sizes
with pd.HDFStore('/notebooks/data/final/final-sorted.h5') as store:
    keys = store.keys()
    sizes = [store.get_storer(key).nrows for key in keys]
tablesizes = pd.Series(sizes, index=keys).sort_values(ascending=False)
tablesizes.head(5)

/eng    123771
/hin     44774
/jpn     19241
/slo      6335
/chi      3540
dtype: int64

In [4]:
with pd.HDFStore('/notebooks/data/final/final-sorted.h5') as store:
    df = store.select('/gre')
df[df.index.str.startswith('Ελλάδα')]

KeyError: 'No object named /gre in the file'

In [4]:
df[df.index.str.startswith('Ελλάδα')]

NameError: name 'df' is not defined

# Determine a trimming policy for each lang

Each language contributes top $N_{lang}$ tokens to the word list. $N_{lang}$ is selected according to the following rules:

    4% of the language's saved vocabulary, to a minimum of 25k, and hard-coded adjustments for the biggest languages where 4% is too high (eng=900m, ger=650k, {fre,lat,rus}=400k, {jpn,ita,spa}=250k). Any language with less than 100k tokens *total* is assumed to be a junk language, or one that BW is not useful for to begin with, so it's trimmed.

In [4]:
# Reference for how many top words to keep
top_words_ref = dict(eng=900000, ger=650000,
                     fre=400000, lat=360000, rus=280000,
                     jpn=300000, ita=220000, spa=220000)

def trim_topwords(row):
    if row[0][1:] in top_words_ref:
        return top_words_ref[row[0][1:]]
    elif row[1] < 100000:
        # Ignore langs with practically no words as likely duds, or at the very least
        # something BW wouldn't be useful for
        return 0
    else:
        # Other languages: keep greater of 25k or 5% of vocab
        mincount = 20000
        percentagetrim = int(row[1] * 0.035)
        return percentagetrim if percentagetrim > mincount else mincount

cutoff_list = tablesizes.reset_index().rename(columns={'index': 'lang', 0: 'count'})
cutoff_list['retain_count'] = cutoff_list.apply(trim_topwords, axis=1)
print("Total tokens (including possible dupes)", cutoff_list['retain_count'].sum())
cutoff_list.head(10)

Total tokens (including possible dupes) 1850000


Unnamed: 0,lang,count,retain_count
0,/eng,123771,900000
1,/hin,44774,0
2,/jpn,19241,300000
3,/slo,6335,0
4,/chi,3540,0
5,/ger,2075,650000
6,/und,389,0
7,/ind,287,0


In [5]:
dfs = []
problem_dfs = []
for i, row in cutoff_list.iterrows():
    if row['retain_count'] == 0:
        continue
    df = pd.read_hdf('/notebooks/data/final/final-sorted.h5', row['lang'], stop=1000000)
    
    count = df[(df.index.str.startswith('\u200b') | df.index.str.endswith("\u200b"))]
    if count.shape[0] != 0:
        print("%s has %d tokens withs \\u200b in the top 1m" % (row['lang'], count.shape[0]))

In [6]:
dfs = []
problem_dfs = []
for i, row in cutoff_list.iterrows():
    if row['retain_count'] == 0:
        continue
    df = pd.read_hdf('/notebooks/data/final/final-sorted.h5', row['lang'], stop=row['retain_count'])
    # Save Japanese and Chinese chars with \u200b char
    if row['lang'] in ['/jpn', '/chi', '/kor', '/arm', '/urd']:
        problems = df[(df.index.str.startswith('\u200b') | df.index.str.endswith("\u200b"))]
        problem_dfs.append(problems)
    dfs.append(df)
    
asn_probchars = pd.concat(problem_dfs).groupby(level='token').sum().sort_values('count', ascending=False)
print(asn_probchars)
wordlist = pd.concat(dfs).groupby(level='token').sum().sort_values('count', ascending=False)
print("Final wordlist using top N trim criteria: ", wordlist.shape)

Empty DataFrame
Columns: [count]
Index: []
Final wordlist using top N trim criteria:  (141924, 1)


## Testing trim policy

In [7]:
# Grab a 1000 word chunk starting at 'start'
start = 0#9*10**5
lang = '/eng'
test_tokens = pd.read_hdf('/notebooks/data/final/final-sorted.h5', lang,
                          start=start, stop=start+1000)

In [8]:
# Randomly sample, so that you're not biased just to the 
# same ten tokens at the start of the list. Try re-running this cell
test_tokens.sample(10) 

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
outside,23254
later,49994
who,298154
factor,21430
without,81568
schools,28730
associated,20396
aid,30880
n,47364
simply,22060


## Testing rules for removing likely junk

In [8]:
# The re module is broken for hindi and similar characters, need to use regex
!pip install regex
import regex
import numpy as np

print(wordlist.shape)
tokens = wordlist.index
hyphenated = tokens.str.contains(r"-")
#alpha = tokens.str.isalpha() # Faster, but bad for some languages
alphaadv = tokens.map(lambda x: not not regex.search("^\\w+$", x)).values.astype(np.bool_)
number = tokens.str.contains(r"^(£|$|€)?[\d.,]+(st|nd|rd|th|s|C|F|c|m|°|¥)?$")
singlequote = tokens.str.contains(r"[\'’]")
abbr = tokens.str.contains(r"^[^\W\d]([^\W\d]|\.)+$")
endwithperiod = tokens.str.endswith('.')
# This shows up for many asian characters, should be dealt with *before* wordlist is created
blankchar = (tokens.str.startswith('\u200b') | tokens.str.endswith("\u200b"))
tlen = tokens.str.len()
print(len(tokens))

(141924, 1)


  return func(self, *args, **kwargs)


141924


In [53]:
df = pd.read_hdf('/notebooks/data/final/final-sorted.h5', 'hin', stop=1000000)
df.head()

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
है,6772484
।,6255580
के,5910544
में,4718680
की,3886890


In [54]:
tokens = df.index
hyphenated = tokens.str.contains(r"-")
#alpha = tokens.str.isalpha() # Faster, but bad for some languages
alphaadv = tokens.map(lambda x: not not regex.search("^\\w+$", x)).values.astype(np.bool_)
number = tokens.str.contains(r"^(£|$|€)?[\d.,]+(st|nd|rd|th|s|C|F|c|m|°|¥)?$")
singlequote = tokens.str.contains(r"[\'’]")
abbr = tokens.str.contains(r"^[^\W\d]([^\W\d]|\.)+$")
endwithperiod = tokens.str.endswith('.')
# This shows up for many asian characters, should be dealt with *before* wordlist is created
blankchar = (tokens.str.startswith('\u200b') | tokens.str.endswith("\u200b"))
tlen = tokens.str.len()

  return func(self, *args, **kwargs)


In [55]:
print(len(tokens))
print(type(hyphenated))
print(type(hyphenated[0]))
print(~hyphenated)
print(type(alphaadv))
print(type(alphaadv[0]))
print(~alphaadv)
print(tlen >= 2)
print(~endwithperiod)
print(~singlequote)
print(~number)
print(~abbr)
print(~blankchar)
df[~hyphenated & ~alphaadv & (tlen >= 2) & ~endwithperiod & ~singlequote & ~number & ~abbr & ~blankchar].index.values[:100]

164570
<class 'numpy.ndarray'>
<class 'numpy.bool_'>
[ True  True  True ...  True  True False]
<class 'numpy.ndarray'>
<class 'numpy.bool_'>
[False  True False ... False False  True]
[ True False  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ... False False  True]
[ True  True  True ...  True  True  True]


array(['है,', 'हैं,', 'है।', 'हैं।', 'था,', 'हो,', 'नहीं,', '।"', '1:',
       '।।', 'थे,', 'में,', '``', 'थी,', 'है:', '),', 'है;', '",', 'से,',
       'कहा,', 'था।', 'है"', 'को,', '(,', 'की,', '**', 'गया,', 'थे।',
       ':,', 'का,', '।”', 'किया,', 'हूँ,', 'होगा,', '.:', 'के,', 'वही,',
       'है)', 'थी।', 'भी,', '(२)', 'जी,', 'ही,', '।१', '(:', 'दिया,',
       '."', '::', 'हों,', 'वि,', 'दु:ख', 'करना,', 'होता,', '(३)', 'गया।',
       'गो,', '""', 'दो,', 'पृ"', '.)', 'बी,', 'रहे,', '?"', '।२', 'हैं;',
       '?”', 'होगी,', '।:', '***', 'गये,', 'दे,', 'लगा,', 'देखा,', ':)',
       'जा,', '[.]', 'हैं:', 'नहीं।', 'रहा,', 'छो,', 'मैं,', 'किया।',
       '(४)', 'शर्मा,', 'हाँ,', 'पु]', 'ने,', 'ले,', '(१)', 'सं,',
       'बोला,', 'हो।', 'हैं"', 'थीं,', 'सकता,', '":', 'हु,', '()', 'हूं,',
       'तो,'], dtype=object)

### Expected junk

Top of the list:

In [9]:
print(len(wordlist))
print(len(~hyphenated))
print(len(~alphaadv))
print(len(tlen >= 2))
print(len(~endwithperiod))
print(len(~singlequote))
print(len(~number))
print(len(~abbr))
print(len(~blankchar))
junk = wordlist[~hyphenated & ~alphaadv & (tlen >= 2) & ~endwithperiod & ~singlequote & ~number & ~abbr & ~blankchar]
junk.head(10)

141924
141924
141924
141924
141924
141924
141924
141924
141924


Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
``,884644
**,18386
||,11450
0),10826
1),9064
"*,",8500
°C,6784
—|,5074
2),4144
***,4084


And a random selection:

In [10]:
junk.sample(10)

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
[20],24
=\,110
100$<,26
**『,50
c/o,132
|T,22
+9,130
』､,34
、㎞の,26
・八,76


### JPN and CHI fix

Any characters in the /jpn and /chi lists that have a non-breaking line space will be added, but with the id of the cleaned version. If there is no cleaned version, add one to the word list.

In [12]:
final_candidate = wordlist[hyphenated | alphaadv | (tlen < 2) | endwithperiod | singlequote | number | abbr | blankchar].reset_index()

In [13]:
prob_chars = asn_probchars.reset_index().query('token != "\u200b"')
print(prob_chars)
prob_chars['broken'] =prob_chars['token']
prob_chars['token'] = prob_chars['broken'].str.replace('\u200b', '')

Empty DataFrame
Columns: [token, count]
Index: []


In [14]:
# Problem characters that are not in the wordlist: add them (as fixed version)
to_add = np.setdiff1d(prob_chars['token'].values, final_candidate['token'].values)
new_lines = prob_chars[prob_chars['token'].isin(to_add)][['token', 'count']]
final = pd.concat([final_candidate, new_lines])\
        .groupby('token', as_index=False).sum()\
        .sort_values('count', ascending=False)\
        .reset_index(drop=True)\
        .reset_index()

In [15]:
# The indices to for the fixed characters. When we encounter the broken words in the dataset, we'll encode then
# with the id for the fixed token
print(len(final))
print(len(prob_chars[['token','broken']]))
problemchar_indices = pd.merge(final, prob_chars[['token', 'broken']], on='token')[['index','broken']]
problemchar_indices.sample(3)

210884
0


ValueError: a must be greater than 0 unless no samples are taken

## Save final list

Also, save /jpn and /chi fixes

In [16]:
# OVERWRITE MODE
with pd.HDFStore('/notebooks/data/final/wordlist.h5', complib='blosc', mode='w', complevel=9) as store:
    store.append('/final', final)
    store.append('/fixes', problemchar_indices)

# Test against dictionary

In [17]:
from htrc_features import FeatureReader, utils

# Two copies of the same dictionary, Laird and Lee's Webster's. They capitalize their words, so
# I'm looking for capital words that occur in both.
dicts = ['loc.ark:/13960/t84j1sb5j', 'loc.ark:/13960/t3xs70k06']
paths = ['/notebooks/features/' + utils.id_to_rsync(volid) for volid in dicts]
fr = FeatureReader(paths)
tokenlist = []
for vol in fr.volumes():
    tokenlist += vol.tokens()

tokens = pd.Series(tokenlist)
# Grab capitalized letters
dictionary_words = tokens[tokens.str.contains(r"^[A-Z][A-Z\-]*$")].value_counts()
shortlist = dictionary_words[dictionary_words > 1].index.str.lower().values

In [18]:
unique_final_lower = final['token'].str.lower().unique()

In [19]:
extradictwords = np.setdiff1d(shortlist, unique_final_lower)

In [22]:
pd.Series(extradictwords).sample(50)

12739          monetized
20213       variableness
10395          humidness
5548         crimination
16872    sensitive-plant
17197        singlestick
19037    through-lighted
6386           denizened
8891                fisc
15301          pythoness
7233        disregardful
14772             posset
2800             bordage
5481           craziness
19470         tree-nymph
18474         synecdoche
10256        hobby-horse
20995             yauped
17525        soliloquize
11875           legories
18342              surfy
11011            inglobe
4909         conjecturer
6798     dictatorialness
1027             apogean
10427            huskies
13706        overtrading
18569           tameless
18853          terranean
2502             blabbed
8555       expectorative
1069         appeasingly
9950      hair-splitting
14618               plic
16119             rifler
13545          osteology
5776         currishness
9975           hall-mark
20618           war-path
19185                tng


In [23]:
shortlist.shape[0], extradictwords.shape[0], 1-extradictwords.shape[0]/shortlist.shape[0]

(46591, 21104, 0.5470369813912559)

In [24]:
df = pd.read_hdf('/notebooks/data/final/wordlist.h5')
df.to_csv('/notebooks/data/final/wordlist.csv')

## Junk filter testing, here be dragons

Keeping this here as an example of how I tested various matching criteria.

In [27]:
eng_2m = dd.read_hdf('/notebooks/data/final/final-sorted.h5', '/eng', stop=2000000).compute()

In [28]:
tokens = eng_2m.index
alpha = tokens.str.isalpha()
digit = tokens.str.isdigit()
tlen = tokens.str.len()
endwithperiod = tokens.str.endswith('.')
quotes = (tokens.str.startswith('"') | tokens.str.endswith('"')) # | tokens.str.startswith('\'') | tokens.str.endswith('\''))
endash = (tokens.str.startswith('—') | tokens.str.endswith('—'))
punccount = tokens.str.count('[\W]')
repeating = tokens.str.contains(r"(([\w\W])\2{3,})")
repeatingdigit = tokens.str.contains(r"((\d)\2{3,})")
abbr = tokens.str.contains(r"^[^\W\d]([^\W\d]|\.)+$")
singlequote = tokens.str.contains(r"\'")
hyphenated = tokens.str.contains(r"-")
number = tokens.str.contains(r"^(£|$|€)?[\d.,]+(st|nd|rd|th|s|C|F|c|m|°|¥)?$")

  return func(self, *args, **kwargs)


In [29]:
eng_2m[~alpha & (tlen > 1) & ~endwithperiod & (punccount >= 1)].head(10)

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
-RRB-,1477950
-LRB-,1441984
'',916482
``,792328
's,600158
--,545330
-RSB-,285958
-LSB-,284386
n't,129732
'd,37548


In [30]:
a = eng_2m[~hyphenated & ~alpha & (tlen >= 2) & ~endwithperiod & ~singlequote & ~number & ~abbr]
print("\t".join(a[:300].index.values))

``	है	°C	**	2d	के	2b	2a	+1	तो	भी	and/or	3d	कि	में	इति	),	नहीं	»,	.)	।०	को	3D	ही	6d	.;	3b	1a	हैं	है,	***	की	T3	हो	."	RD&D	C02	2D	H2	का	से	9/11	+2	.:	मैं	लिए	1/2	````	:,	a2	1b	A2	था	R2	जो	x1	>>	0)	S3	1रि०	m/s	x2	9d	1n	P1	कोई	S2	a1	n2	3a	",	5»	P2	A1	US$	a>	कहा	भाव	लोग	i2	गया	अपने	उसे	AISI4140	R&D	हैं,	ता	.‘	T2	s2	N2	X2	N0	??	2»	0»	कुछ	काम	किया	V2	C2	वे	._	,"	हुए	v2	क्या	या	2B	t2	t0	?!	^^	बात	<<	£ηβ	दिया	°F	चाहिए	अपि	I1	m2	ये	रहा	मुझे	प्र	L2	राम	R1	I2	3B	फिर	1r	साथ	रूप	गये	थी	+00	J3	करते	बाबा	```	5a	ΟΟ*0	उसके	QR/E	मेरे	1п	I0	A3	==	I4	S1	जाता	B1	CO2	v1	i6	अपनी	थे	!!	<I>	""	J2	होता	हुआ	प्त	e2	M2	2n	मैंने	<f>	E2	B2	.—	k2	C0	r2	1:	<t>	t1	०त्	मेरी	‘_	C1	l2	1/3	हूँ	ST363	दो	जा	his/her	8vo	अति	H20	T1	23d	2A	रहे	N/A	2S	L1	कारण	l0	r1	m3	n/a	हुई	i8	f2	«44	F2	p2	U2	करने	C3	(1	पाते	••	उन्हें	+3	****	*»	ऐसा	पास	V0	Q1	मेरा	नाम	M3	करना	.),	MoS2	l1	6a	जैसे	E1	१रि०	M1	X1	2i	यहाँ	Tests/A	K2	s1	S0	7a	n0	,—	ने	हमारे	b2	12mo	i860	1l	I5	c2	ले	2003a	e1	V1	बोले	3i	//	०ई	F1	y1	y2	हमें	पहले	G2	);	1:1	4a	1/4	0O	h2	2

In [31]:
eng_2m[~alpha & abbr & ~endwithperiod & (tlen > 2)]

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
pr.p,23846
___,2550
____,1512
n.pl,1192
F.C,1098
...,...
H.S,20
J.A,20
Ipr.p,20
i.xix,20


In [32]:
abbr = tokens.str.contains(r"^[^\W\d]([^\W\d]|\.)+$")

  return func(self, *args, **kwargs)


In [33]:
eng_2m[~hyphenated & ~abbr & ~alpha & (tlen >= 2) & ~endwithperiod & ~singlequote & ~number].shape[0] /2000000

0.0028315

In [34]:
# Longest digit-only values
eng_2m[digit & (tlen > 10)]

Unnamed: 0_level_0,count
token,Unnamed: 1_level_1
0,54
0,52
0,22
0,22


In [35]:
# Long words
pd.set_option('display.float_format', lambda x: '%.0f' % x)
longwords = eng_2m[alpha].reset_index().copy()
longwords['chars'] = longwords['token'].str.len()
longwords.head(1)
longwords.groupby('chars').apply(lambda x: x.sort_values('count').iloc[-1] )[['token', 'count']][:-1]

Unnamed: 0_level_0,token,count
chars,Unnamed: 1_level_1,Unnamed: 2_level_1
1,a,2322422
2,of,4937172
3,the,7289634
4,that,1130682
5,which,439120
6,should,109122
7,between,108458
8,American,47712
9,different,55256
10,University,44640


In [36]:
eng_nonalpha = eng_2m[~eng_2m.index.str.isalpha()]
eng_nonalpha.shape

(21718, 1)

In [37]:
eng_nonalphanumeric = eng_nonalpha[~eng_nonalpha.index.str.isdigit()]
eng_nonalphanumeric.shape

(19192, 1)