In [1]:
import json
import urllib.request
import pandas as pd
import re
import random
import string
from spellchecker import SpellChecker
import numpy as np
from wordle_helpers import (
    words_with_commonly_used_letters,
    words_starting_with_commonly_used_letters,
    sorted_words_by_frequency,
    anagram_pairs,
    anagram_word_frequency,
    anagram_to_keep
)

import os
# where to save the data
ROOT_DIR = "."
FOLDER_NAME = "data"
PATH_TO_FOLDER = os.path.join(ROOT_DIR, FOLDER_NAME)
os.makedirs(PATH_TO_FOLDER, exist_ok=True)

# Load data  

I'll be using the Webster's Unabridged English Dictionary as my source of English words, helpfully curated [here](https://github.com/adambom/dictionary).

In [2]:
with urllib.request.urlopen("https://raw.githubusercontent.com/adambom/dictionary/master/dictionary.json") as url:
    data = json.load(url)

In [3]:
websters_english_dictionary = (pd.DataFrame([data])
                               .T
                               .reset_index()
                               .rename(columns={"index": "word", 0: "definition"}))

In [4]:
websters_english_dictionary.head()

Unnamed: 0,word,definition
0,DIPLOBLASTIC,Characterizing the ovum when it has two primar...
1,DEFIGURE,To delineate. [Obs.]These two stones as they a...
2,LOMBARD,"Of or pertaining to Lombardy, or the inhabitan..."
3,BAHAISM,The religious tenets or practices of the Bahais.
4,FUMERELL,See Femerell.


In [5]:
websters_english_dictionary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86036 entries, 0 to 86035
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   word        86036 non-null  object
 1   definition  86036 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [6]:
websters_english_dictionary.to_csv("./data/websters_english_dictionary.csv", index=False)

# Create list of five letter words

## Five letter words

In [7]:
five_letter_words = []

for dict_word in websters_english_dictionary["word"]:
    words = []
    m = re.search(r"\w+", dict_word)
    if m is not None:
        words.append(m.group(0).lower())
    for word in words:
        if len(word) == 5 and word not in five_letter_words:
            five_letter_words.append(word)

In [8]:
len(five_letter_words)

5568

In [9]:
len(set(five_letter_words))

5568

In [10]:
check = [len(word) for word in five_letter_words]

In [11]:
set(check)

{5}

In [12]:
five_letter_words[:10]

['water',
 'gnarl',
 'arles',
 'villa',
 'stagy',
 'betty',
 'aknow',
 'berbe',
 'icily',
 'yamma']

In [13]:
# Save words as txt file. Text file provided in data folder
with open("./data/webster_dict_all_five_letter_words.txt", mode="w") as file:
    file.write("\n".join(five_letter_words))

## no repeat letter in 5 letter words  

So as to not waste letter guesses, we'll only select words with no repeat letters

In [14]:
no_repeat_letter_five_letter_words = [word for word in five_letter_words if len(set(word)) == 5]
no_repeat_letter_five_letter_words[:10]

['water',
 'gnarl',
 'arles',
 'stagy',
 'aknow',
 'tawer',
 'copra',
 'knout',
 'atole',
 'timal']

In [15]:
len(no_repeat_letter_five_letter_words)

3636

## words with most common starting letters  

Further filtering by choosing words form `most_common_starting_letters = "taisocmfpw"`

In [16]:
common_starting_letters = words_starting_with_commonly_used_letters(no_repeat_letter_five_letter_words)
common_starting_letters[:10]

['water',
 'arles',
 'stagy',
 'aknow',
 'tawer',
 'copra',
 'atole',
 'timal',
 'trunk',
 'inkle']

In [17]:
len(common_starting_letters)

2181

# Determine anagrams to remove  

All anagrams in a group will share the same informational content, albeit different positional information. So where we find anagrams we select the one with the highest word frequency. If our guess word is an anagram of the challenge word we'll get yellows and greens.

## Get anagrams

In [18]:
# credit - https://twitter.com/mathsppblog/status/1547493884384710656?t=lWTpT5wZ3izeYRnNy9dc8g&s=03

signatures = []
for word in common_starting_letters:
    signatures.append(frozenset(word))
    
word_anagrams = {}
#  iterate over signatures
for signature in set(signatures):
    # get first word with this signature
    key_word = None
    for word in common_starting_letters:
        if frozenset(word) == signature:
            key_word = word
            break
            print(key_word)
    
    anagram_matches = []
    for sig, word in zip(signatures, common_starting_letters):
        if sig == signature:
            anagram_matches.append(word)
#     word_anagrams[key_word] = anagram_matches
    word_anagrams[key_word] = [", ".join(anagram_matches)]

In [19]:
# anagrams = np.concatenate([value for key, value in word_anagrams.items()])
anagrams = np.concatenate([value for value in word_anagrams.values()])

In [20]:
anagrams[:15]

array(['scout', 'atomy', 'sting', 'capri, picra', 'wakif', 'orbit',
       'acorn', 'woxen', 'amole, maleo', 'shave', 'peach, cheap', 'fixed',
       'indol', 'skein', 'auric, curia'], dtype='<U47')

In [21]:
anagrams.size

1763

In [22]:
anagrams_word_freq = anagram_word_frequency(anagrams)

In [23]:
anagrams_word_freq[:30]

array([list([('scout', 6.3583192211803815e-06)]), list([('atomy', 0.0)]),
       list([('sting', 6.473815081498541e-06)]),
       list([('capri', 1.0098329148237029e-07), ('picra', 0.0)]),
       list([('wakif', 0.0)]), list([('orbit', 6.022716066852743e-06)]),
       list([('acorn', 5.798980630634318e-07)]), list([('woxen', 0.0)]),
       list([('amole', 0.0), ('maleo', 0.0)]),
       list([('shave', 1.0080188437192293e-05)]),
       list([('peach', 3.5858138831763824e-06), ('cheap', 2.5594124522651528e-05)]),
       list([('fixed', 2.955545112445397e-05)]), list([('indol', 0.0)]),
       list([('skein', 3.6281422089474355e-08)]),
       list([('auric', 9.675045890526496e-09), ('curia', 1.7959303934289807e-07)]),
       list([('cagot', 0.0)]), list([('ihram', 1.3303188099473932e-08)]),
       list([('plack', 0.0)]), list([('trump', 1.5334947736484495e-06)]),
       list([('crumb', 7.933537630231726e-07)]), list([('tirma', 0.0)]),
       list([('shone', 1.3430173076787092e-06)]), list(

In [24]:
# save word_anagrams as csv
(pd.DataFrame(word_anagrams)
 .T
 .reset_index()
 .rename(columns={"index": "word", 0: "anagrams"})
).to_csv("./data/anagrams.csv", index=False)

## Keep anagrams with the highest frequency

In [25]:
keep_words = anagram_to_keep(anagrams_word_freq)
keep_words[:10]

array(['abhor', 'abide', 'abies', 'abler', 'abnet', 'abode', 'abord',
       'abort', 'about', 'above'], dtype=object)

In [26]:
keep_words.size

1763

In [27]:
np.where(keep_words == "suine")

(array([], dtype=int64),)

In [28]:
# np.savetxt("./data/webster_common_starting_letters_final_words.txt", keep_words, fmt="%s")

# OR
with open("./data/webster_common_starting_letters_final_words.txt", mode="w") as file:
    file.write("\n".join(keep_words))