In [1]:
import json
import urllib.request
import pandas as pd
import re
import random
import string
from tqdm import tqdm
from spellchecker import SpellChecker
import numpy as np
from wordle_helpers import (
    words_with_commonly_used_letters,
    words_starting_with_commonly_used_letters,
    sorted_words_by_frequency,
    anagram_pairs,
    anagram_word_frequency,
    anagram_to_keep
)

import os
# Output location: files written by this notebook live under
# ROOT_DIR/FOLDER_NAME, which is created up front if it does not exist yet.
ROOT_DIR, FOLDER_NAME = ".", "data"
PATH_TO_FOLDER = os.path.join(ROOT_DIR, FOLDER_NAME)
os.makedirs(PATH_TO_FOLDER, exist_ok=True)

# Load data  

I'll be using the Webster's Unabridged English Dictionary as my source of English words, helpfully curated [here](https://github.com/adambom/dictionary).

In [2]:
# Download the dictionary JSON. Iterating `data` yields its entries
# (presumably the headwords, i.e. the keys of the JSON object -- verify).
with urllib.request.urlopen("https://raw.githubusercontent.com/adambom/dictionary/master/dictionary.json") as url:
    data = json.load(url)

# Join the entries into one string, pull out every run of letters and
# lower-case each run; the set drops duplicates.
words = {
    token.lower()
    for token in re.findall(r"[a-z]+", " ".join(data), flags=re.IGNORECASE)
}

In [3]:
list(words)[:10]

['stalling',
 'postoblongata',
 'salebrous',
 'attic',
 'dermopteran',
 'orderless',
 'excalceation',
 'logicalness',
 'aluminated',
 'exuperant']

In [4]:
len(words)

82702

# Select words 

In [5]:
# Two-stage filter using the wordle_helpers functions (judging by their
# names: first on the letters a word uses, then on its starting letter --
# see wordle_helpers for the exact criteria).
words_common_letters = words_with_commonly_used_letters(words)
new_words_list = words_starting_with_commonly_used_letters(words_common_letters)
print(new_words_list[:10], end="")

['stalling', 'postoblongata', 'salebrous', 'attic', 'orderless', 'aluminated', 'signboard', 'produce', 'pacificable', 'perishableness']

In [6]:
def select_words(words_list, length_of_words=None):
    """Return the words whose length is one of the requested lengths.

    Parameters
    ----------
    words_list : iterable of str
        Candidate words.
    length_of_words : int or iterable of int, optional
        Accepted word length(s). Any falsy value (``None``, an empty
        collection, ``0``) disables filtering.

    Returns
    -------
    list of str
        The matching words in their original order. A word is emitted once
        per occurrence in ``words_list``, even when the same length is
        listed more than once in ``length_of_words``.
    """
    if not length_of_words:
        return list(words_list)

    # Accept a single length as well as a collection of lengths. Using a set
    # also collapses repeated lengths, which would otherwise duplicate words.
    try:
        accepted_lengths = set(length_of_words)
    except TypeError:
        accepted_lengths = {length_of_words}

    return [word for word in words_list if len(word) in accepted_lengths]

# Keep only the five-letter candidates.
five_letter_words = select_words(new_words_list, length_of_words=[5])

In [7]:
print(five_letter_words[:10], end="")

['attic', 'aural', 'scaup', 'sapid', 'sleep', 'sprue', 'tunny', 'weedy', 'froth', 'thane']

In [8]:
len(five_letter_words)

2809

## No repeated letters in five-letter words  

So as not to waste letter guesses, we'll only select words with no repeated letters.

In [9]:
# A five-letter word has no repeated letter exactly when its set of
# characters has five members.
no_repeat_letter_five_letter_words = [
    candidate
    for candidate in five_letter_words
    if len(set(candidate)) == 5
]
print(no_repeat_letter_five_letter_words[:10], end="")

['scaup', 'sapid', 'sprue', 'froth', 'thane', 'agush', 'train', 'coney', 'alogy', 'fumed']

In [10]:
len(no_repeat_letter_five_letter_words)

1823

## Anagrams

In [11]:
# # credit - https://twitter.com/mathsppblog/status/1547493884384710656?t=lWTpT5wZ3izeYRnNy9dc8g&s=03

# def get_anagrams(words_list):
#     """
#     Returns a dict of word and anagrams. Anagrams are only for words with
#     no repeat letters.
#     """
#     signatures = [sorted(word) for word in words_list]

#     word_anagrams = {}
#     #  iterate over signatures
#     for signature in signatures:
#         # get first word with this signature
#         key_word = None
#         for word in words_list:
#             if sorted(word) == signature:
#                 key_word = word
#                 break
                
#         anagram_matches = [word for sig, word in zip(signatures, words_list)
#                            if sig == signature]
#         word_anagrams[key_word] = [", ".join(anagram_matches)]
#     return word_anagrams


# credit: https://twitter.com/eboygarcia/status/1547833387788169216/photo/2

def check_anagrams(words_list):
    """Group words that are anagrams of one another.

    Two words belong to the same group when, ignoring case and any
    non-alphanumeric characters, they consist of exactly the same letters
    with the same multiplicities.

    Parameters
    ----------
    words_list : iterable of str
        Words to group.

    Returns
    -------
    dict
        Maps the first word of each group (in input order) to a one-element
        list holding all members of the group joined with ``", "``. A word
        without anagrams maps to a list containing just itself.
    """
    # Materialise once so any iterable works and tqdm can show a total.
    words_list = list(words_list)

    groups = {}
    for word in tqdm(words_list):
        # Sorted letters preserve repeats and length. A frozenset would not:
        # it treats "aab", "abb" and "ab" as the same signature.
        signature = "".join(sorted(ch for ch in word.lower() if ch.isalnum()))
        # Single pass: dict insertion order keeps the first-seen word first.
        groups.setdefault(signature, []).append(word)

    return {members[0]: [", ".join(members)] for members in groups.values()}
    
# Group the no-repeat five-letter words into anagram families
# (the progress bar is printed by tqdm inside check_anagrams).
anagrams_five_letter_words_no_repeat_letters = check_anagrams(no_repeat_letter_five_letter_words)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [00:02<00:00, 714.63it/s]


In [12]:
list(anagrams_five_letter_words_no_repeat_letters.items())[:10]

[('scaup', ['scaup']),
 ('sapid', ['sapid, spaid']),
 ('sprue', ['sprue, purse, super']),
 ('froth', ['froth, forth']),
 ('thane', ['thane']),
 ('agush', ['agush']),
 ('train', ['train, tarin, trina, intra, tairn']),
 ('coney', ['coney']),
 ('alogy', ['alogy']),
 ('fumed', ['fumed'])]

In [13]:
# Save all five-letter words, one per line, as a text file in the data folder.
# The path is built from PATH_TO_FOLDER so it follows the notebook's config
# instead of a second, hard-coded copy of the folder name.
five_letter_words_path = os.path.join(PATH_TO_FOLDER, "webster_dict_all_five_letter_words.txt")
with open(five_letter_words_path, mode="w", encoding="utf-8") as file:
    file.write("\n".join(five_letter_words))

# Determine anagrams to remove  

All anagrams in a group contain exactly the same letters, so they carry the same informational content and differ only in positional information. Wherever we find a group of anagrams, we therefore keep only the word with the highest word frequency. If our guess word is an anagram of the challenge word, every letter will come back yellow or green.

## Get anagrams

In [14]:
# Each dict value is a one-element list holding a comma-joined anagram group;
# concatenating them yields a flat array with one string per group.
anagrams = np.concatenate(list(anagrams_five_letter_words_no_repeat_letters.values()))

In [15]:
anagrams[:15]

array(['scaup', 'sapid, spaid', 'sprue, purse, super', 'froth, forth',
       'thane', 'agush', 'train, tarin, trina, intra, tairn', 'coney',
       'alogy', 'fumed', 'comet',
       'astel, steal, satle, tales, stela, slate, stale', 'singe',
       'caple, capel, clape, place', 'silty'], dtype='<U47')

In [16]:
anagrams.size

1418

In [17]:
# Attach a frequency to every word of every anagram group
# (helper from wordle_helpers; the frequency source is defined there).
anagrams_word_freq = anagram_word_frequency(anagrams)

In [18]:
anagrams_word_freq[:10]

array([list([('scaup', 0.0)]), list([('sapid', 0.0), ('spaid', 0.0)]),
       list([('sprue', 0.0), ('purse', 7.57435155154593e-06), ('super', 3.778589172545123e-05)]),
       list([('froth', 2.63645000516847e-07), ('forth', 1.4087471506974736e-05)]),
       list([('thane', 2.2736357842737265e-07)]), list([('agush', 0.0)]),
       list([('train', 8.49989062904842e-05), ('tarin', 1.9954782149210898e-08), ('trina', 1.1851931215894957e-07), ('intra', 2.539699546263205e-08), ('tairn', 0.0)]),
       list([('coney', 1.1368178921368632e-07)]), list([('alogy', 0.0)]),
       list([('fumed', 6.772532123368546e-08)])], dtype=object)

In [19]:
# Save the anagram groups as a two-column CSV ("word", "anagrams").
# Each dict key becomes a row label and its one-element list becomes the
# "anagrams" cell. The path is built from PATH_TO_FOLDER rather than a
# hard-coded "./data".
(pd.DataFrame
 .from_dict(anagrams_five_letter_words_no_repeat_letters, orient="index", columns=["anagrams"])
 .rename_axis("word")
 .reset_index()
).to_csv(os.path.join(PATH_TO_FOLDER, "anagrams.csv"), index=False)

## Keep anagrams with the highest frequency

In [20]:
# Reduce each anagram group to a single representative word. Per this
# section's heading that is the highest-frequency member; the selection
# itself is implemented in wordle_helpers.anagram_to_keep.
keep_words = anagram_to_keep(anagrams_word_freq)
keep_words[:10]

array(['abhor', 'abide', 'abies', 'abler', 'abnet', 'abode', 'abord',
       'abort', 'about', 'absey'], dtype=object)

In [21]:
keep_words.size

1418

In [22]:
np.where(keep_words == "suine")

(array([1150]),)

In [23]:
# Save the final word list, one word per line, in the data folder.
# The path is built from PATH_TO_FOLDER rather than a hard-coded "./data".
# (np.savetxt(final_words_path, keep_words, fmt="%s") would also work; it
# additionally writes a trailing newline.)
final_words_path = os.path.join(PATH_TO_FOLDER, "webster_common_starting_letters_final_words.txt")
with open(final_words_path, mode="w", encoding="utf-8") as file:
    file.write("\n".join(keep_words))