In [1]:
import json
import urllib.request
import pandas as pd
import re
import random
import string
from spellchecker import SpellChecker
import numpy as np
from wordle_helpers import (
    words_with_commonly_used_letters,
    words_starting_with_commonly_used_letters,
    sorted_words_by_frequency,
    anagram_pairs,
    anagram_word_frequency,
    anagram_to_keep
)

import os
# Output location: the files this notebook saves go under ./data,
# i.e. a "data" folder relative to the current working directory.
ROOT_DIR = "."
FOLDER_NAME = "data"
PATH_TO_FOLDER = os.path.join(ROOT_DIR, FOLDER_NAME)
# exist_ok=True keeps this cell safe to re-run once the folder exists.
os.makedirs(PATH_TO_FOLDER, exist_ok=True)

# Load data  

I'll be using the Webster's Unabridged English Dictionary as my source of English words, helpfully curated [here](https://github.com/adambom/dictionary).

In [2]:
# Raw JSON export of Webster's Unabridged Dictionary (adambom/dictionary on GitHub).
DICTIONARY_URL = "https://raw.githubusercontent.com/adambom/dictionary/master/dictionary.json"

# A timeout makes a stalled connection raise instead of hanging the kernel indefinitely.
with urllib.request.urlopen(DICTIONARY_URL, timeout=30) as response:
    # Parsed payload; the next cell turns it into a (word, definition) table.
    data = json.load(response)

In [3]:
# Build the two-column frame straight from the mapping's (word, definition) pairs.
# This avoids creating a one-row frame with one column per dictionary entry,
# transposing it, and then fixing up the index and column names in place.
websters_english_dictionary = pd.DataFrame(
    list(data.items()), columns=["word", "definition"]
)

In [4]:
websters_english_dictionary.head()

Unnamed: 0,word,definition
0,DIPLOBLASTIC,Characterizing the ovum when it has two primar...
1,DEFIGURE,To delineate. [Obs.]These two stones as they a...
2,LOMBARD,"Of or pertaining to Lombardy, or the inhabitan..."
3,BAHAISM,The religious tenets or practices of the Bahais.
4,FUMERELL,See Femerell.


In [5]:
websters_english_dictionary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86036 entries, 0 to 86035
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   word        86036 non-null  object
 1   definition  86036 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [6]:
# Persist the raw dictionary into the folder configured by PATH_TO_FOLDER
# (rather than repeating the "./data" literal) so the location is defined once.
websters_english_dictionary.to_csv(
    os.path.join(PATH_TO_FOLDER, "websters_english_dictionary.csv"), index=False
)

# Create list of five letter words

## Five letter words

In [7]:
# For every dictionary headword take its first run of word characters
# (so only the leading token of a hyphenated or multi-word entry is considered).
first_tokens = (
    re.search(r"\w+", dict_word) for dict_word in websters_english_dictionary["word"]
)
lowercased_tokens = (m.group(0).lower() for m in first_tokens if m is not None)

# dict.fromkeys de-duplicates while preserving first-seen order, replacing the
# quadratic `word not in five_letter_words` list scan with a hash lookup.
five_letter_words = list(
    dict.fromkeys(word for word in lowercased_tokens if len(word) == 5)
)

In [8]:
len(five_letter_words)

5568

In [9]:
len(set(five_letter_words))

5568

In [10]:
# Length of every collected word; the next cell confirms they are all 5.
check = list(map(len, five_letter_words))

In [11]:
set(check)

{5}

In [12]:
five_letter_words[:10]

['water',
 'gnarl',
 'arles',
 'villa',
 'stagy',
 'betty',
 'aknow',
 'berbe',
 'icily',
 'yamma']

In [13]:
# Save the words as a text file, one word per line. Text file provided in data folder.
# The encoding is set explicitly so the output does not depend on the platform's
# default locale encoding.
with open(
    os.path.join(PATH_TO_FOLDER, "webster_dict_all_five_letter_words.txt"),
    mode="w",
    encoding="utf-8",
) as file:
    file.write("\n".join(five_letter_words))

## no repeat letter in 5 letter words  

To avoid wasting guesses on duplicate letters, we keep only words in which all five letters are distinct.

In [14]:
# A five-letter word has no repeated letter exactly when it has five distinct characters.
no_repeat_letter_five_letter_words = list(
    filter(lambda candidate: len(set(candidate)) == 5, five_letter_words)
)
no_repeat_letter_five_letter_words[:10]

['water',
 'gnarl',
 'arles',
 'stagy',
 'aknow',
 'tawer',
 'copra',
 'knout',
 'atole',
 'timal']

In [15]:
len(no_repeat_letter_five_letter_words)

3636

## words with most common starting letters  

Filter further by keeping only words that start with one of the most common starting letters: `most_common_starting_letters = "taisocmfpw"`.

In [16]:
# Narrow the candidates with the project helper imported from wordle_helpers.
# NOTE(review): presumably this keeps words whose first letter is one of the
# common starting letters named in the text above — confirm in the helper's source.
common_starting_letters = words_starting_with_commonly_used_letters(no_repeat_letter_five_letter_words)
# Preview the first ten survivors.
common_starting_letters[:10]

['water',
 'arles',
 'stagy',
 'aknow',
 'tawer',
 'copra',
 'atole',
 'timal',
 'trunk',
 'inkle']

In [17]:
len(common_starting_letters)

2181

## words frequency

In [18]:
# Attach a usage frequency to each remaining word via the wordle_helpers helper.
# The preview below shows (word, frequency) tuples with the most frequent first.
# NOTE(review): the frequency source and the exact effect of sorting=True are
# defined in wordle_helpers — confirm there.
words_sorted_by_frequency = sorted_words_by_frequency(common_starting_letters, sorting=True)
words_sorted_by_frequency[:10]

[('about', 0.004575716203465601),
 ('would', 0.0025520860237645514),
 ('could', 0.002105716897178485),
 ('their', 0.001190727852529245),
 ('maybe', 0.001072666290978991),
 ('after', 0.0010576258274517993),
 ('those', 0.0010403927566496672),
 ('other', 0.0009394233730457645),
 ('first', 0.0009136375616764069),
 ('thing', 0.0008622612539266068)]

In [19]:
len(words_sorted_by_frequency)

2181

# Determine anagrams to remove  

All anagrams in a group contain the same letters, so as guesses they reveal the same information about which letters are in the answer; they differ only in the positional information they give. Where we find a group of anagrams, we therefore keep only the word with the highest word frequency. (If our guess is an anagram of the challenge word, every tile comes back yellow or green.)

## Pairs of anagrams

In [20]:
# Seed Python's global RNG before calling the helper.
# NOTE(review): presumably anagram_pairs relies on `random` internally — confirm.
random.seed(44)

# Strip the frequencies: the helper receives only the words, still in frequency order.
frequency_ordered_words = [entry[0] for entry in words_sorted_by_frequency]
paired_anagrams = anagram_pairs(frequency_ordered_words)

In [21]:
paired_anagrams

array([['could', 'cloud'],
       ['those', 'shote'],
       ['those', 'sothe'],
       ...,
       ['muser', 'merus'],
       ['pelta', 'tepal'],
       ['oxfly', 'foxly']], dtype='<U5')

In [22]:
# Tabulate the anagram pairs, one pair per row.
# NOTE(review): in the previews word1 looks like the partner that comes first in
# the frequency-sorted input — confirm against anagram_pairs.
words_sharing_all_letters = pd.DataFrame(paired_anagrams, columns=["word1", "word2"])
words_sharing_all_letters

Unnamed: 0,word1,word2
0,could,cloud
1,those,shote
2,those,sothe
3,other,throe
4,first,frist
...,...,...
590,suant,astun
591,shote,sothe
592,muser,merus
593,pelta,tepal


In [23]:
words_sharing_all_letters.loc[words_sharing_all_letters["word1"] == "those"]

Unnamed: 0,word1,word2
1,those,shote
2,those,sothe


In [24]:
words_sharing_all_letters.loc[words_sharing_all_letters["word1"] == "stale"]

Unnamed: 0,word1,word2
228,stale,satle
229,stale,stela
230,stale,astel


In [25]:
words_sharing_all_letters["word1"].value_counts()

steal    6
trace    5
plate    5
taper    5
tales    5
        ..
widen    1
chute    1
flyer    1
crisp    1
oxfly    1
Name: word1, Length: 418, dtype: int64

In [26]:
# Collapse the pair table to one row per word1, joining all of its partners
# (in row order) into a single comma-separated string. A plain groupby states
# this directly; the earlier pivot_table(...).T round trip followed by an
# in-place reset_index produced the same frame less transparently.
anagrams = (
    words_sharing_all_letters
    .groupby("word1")["word2"]
    .agg(", ".join)
    .reset_index()
)
anagrams

Unnamed: 0,word1,word2
0,abode,adobe
1,abort,tabor
2,acred,cader
3,acrid,caird
4,adept,pated
...,...,...
413,worth,"wroth, whort"
414,wrath,thraw
415,wreak,waker
416,wrote,tower


In [27]:
# Full anagram group as one string: the key word followed by its partners.
anagrams["anagrams"] = anagrams["word1"].str.cat(anagrams["word2"], sep=", ")
anagrams

Unnamed: 0,word1,word2,anagrams
0,abode,adobe,"abode, adobe"
1,abort,tabor,"abort, tabor"
2,acred,cader,"acred, cader"
3,acrid,caird,"acrid, caird"
4,adept,pated,"adept, pated"
...,...,...,...
413,worth,"wroth, whort","worth, wroth, whort"
414,wrath,thraw,"wrath, thraw"
415,wreak,waker,"wreak, waker"
416,wrote,tower,"wrote, tower"


In [28]:
# Keep only the key word and its full group string.
# The chain returns a new frame instead of mutating in place, and
# errors="ignore" lets the cell be re-run after word2 has already been dropped
# (rename silently skips a missing word1, so the whole cell becomes idempotent).
anagrams = (
    anagrams
    .rename(columns={"word1": "word"})
    .drop(columns=["word2"], errors="ignore")
)

In [29]:
# Spot-check a few groups by key word; a word that is not a key simply yields no row.
anagrams.loc[anagrams["word"].isin(["spare", "taper", "teams"])]

Unnamed: 0,word,anagrams
301,spare,"spare, spear, parse, prase, asper"
352,taper,"taper, prate, petar, trape, apert, peart"


In [30]:
anagrams.loc[anagrams["word"] == "those"]

Unnamed: 0,word,anagrams
362,those,"those, shote, sothe"


In [31]:
anagrams.loc[anagrams["word"] == "stale"]

Unnamed: 0,word,anagrams
312,stale,"stale, satle, stela, astel"


In [32]:
anagrams["word"].nunique()

418

In [33]:
# Save the anagram groups into the folder configured by PATH_TO_FOLDER
# (rather than repeating the "./data" literal).
anagrams.to_csv(os.path.join(PATH_TO_FOLDER, "anagrams.csv"), index=False)

## Get word frequency of anagrams

In [34]:
# Seed Python's global RNG before the helper call.
# NOTE(review): presumably anagram_word_frequency draws on `random` — confirm.
random.seed(123)
# Input: the array of "word, partner, ..." strings, one per anagram group.
anagrams_word_freq = anagram_word_frequency(anagrams["anagrams"].to_numpy())

## Keep anagrams with the highest frequency

In [35]:
# Pick the words to keep from the anagram groups.
# NOTE(review): per the section heading this should be the highest-frequency
# member of each group; the selection rule itself lives in
# wordle_helpers.anagram_to_keep — confirm there.
anagrams_keep_words = anagram_to_keep(anagrams_word_freq)
anagrams_keep_words[:10]

array(['abode', 'abort', 'acred', 'acrid', 'adept', 'adore', 'afire',
       'aider', 'aitch', 'alert'], dtype=object)

In [36]:
anagrams_keep_words.size

418

## Remove these anagrams

In [37]:
# Split each "word, partner, ..." string back into its member words,
# giving one list per anagram group.
anagram_groups = [group.split(", ") for group in anagrams["anagrams"].to_numpy()]
arr_of_words = np.array(anagram_groups, dtype=object)
arr_of_words[:10]

array([list(['abode', 'adobe']), list(['abort', 'tabor']),
       list(['acred', 'cader']), list(['acrid', 'caird']),
       list(['adept', 'pated']), list(['adore', 'oread']),
       list(['afire', 'feria']), list(['aider', 'irade']),
       list(['aitch', 'chati']), list(['alert', 'alter'])], dtype=object)

In [38]:
# Flatten the per-group lists into one array, then reduce it to the sorted
# set of distinct words that appear in any anagram group.
all_grouped_words = np.concatenate(arr_of_words)
flat_arr_of_words = np.unique(all_grouped_words)
flat_arr_of_words[:10]

array(['abode', 'abort', 'acerb', 'acred', 'acrid', 'acton', 'adeps',
       'adept', 'adobe', 'adore'], dtype='<U5')

In [39]:
flat_arr_of_words.size

726

In [40]:
# Words that belong to some anagram group but were not selected to keep;
# these are the ones filtered out of the candidate list below.
anagrams_to_remove = np.setdiff1d(flat_arr_of_words, anagrams_keep_words)

In [41]:
anagrams_to_remove.size

308

In [42]:
anagrams_to_remove[:10]

array(['acerb', 'acton', 'adeps', 'adobe', 'aesir', 'aimer', 'alfet',
       'algin', 'almry', 'alose'], dtype='<U5')

# Final words

In [43]:
len(common_starting_letters)

2181

In [44]:
# Boolean mask over the candidate list: True where the word is NOT one of the
# anagrams marked for removal.
common_starting_letters_final_words_idx = ~np.isin(
    np.array(common_starting_letters), anagrams_to_remove
)

In [45]:
common_starting_letters_final_words_idx

array([ True,  True,  True, ...,  True,  True,  True])

In [46]:
# Apply the boolean mask to the candidate list to obtain the final words
# (original order is preserved by boolean indexing).
common_starting_letters_final_words = \
    np.array(common_starting_letters)[common_starting_letters_final_words_idx]

In [47]:
common_starting_letters_final_words.size

1873

In [48]:
# Write the final word list, one word per line, into the folder configured by
# PATH_TO_FOLDER. The encoding is passed explicitly because np.savetxt otherwise
# falls back to latin1, which cannot represent every possible character.
np.savetxt(
    os.path.join(PATH_TO_FOLDER, "webster_common_starting_letters_final_words.txt"),
    common_starting_letters_final_words,
    fmt="%s",
    encoding="utf-8",
)