# WordleAssist

In [1]:
# imports
import re
from urllib import request
from dataclasses import dataclass, field
import requests
try:
  import wordfreq as wf
except:
  wf = None
finally:
  use_wordfreqs = wf is not None

In [17]:
# setup
url = "https://raw.githubusercontent.com/hasithvm/urbandictionary-termlist/master/low.txt"
s = ""
words = []
with requests.get(url, stream=True) as r:
    for line in r.iter_lines():
        word = line.decode("utf-8")
        if (re.match("^[a-z]{5}$", word)):
            words.append(word)

Test format string
- Underscores are unknown characters
- Use lower case chars for right letter + wrong position (yellow)
- Use upper case for right letter in the right position (green)
- Characters after the pipe (|) character are incorrect guesses (grey)

In [11]:
@dataclass
class CompositeMatch:
    CorrectMatch: str = None
    PartialMatches: set = field(default_factory=set)
    def __repr__(self):
        return f"CM: {self.CorrectMatch}, PM: {''.join([c for c in self.PartialMatches])}"
    
    
    
# build a regex
def build_regex_v2(inputs, exclude):
    partial_pattern_chars = []
    pattern = '^'
    for m in inputs:
        partial_pattern_chars.extend(m.PartialMatches)
    print(partial_pattern_chars)
    partial_pattern_chars = list(set(partial_pattern_chars))
    for c in partial_pattern_chars:
        pattern += (f"(?=.*[{c}])")
        
    for i in range(5):
        m = inputs[i]
        # this can't be an excluded character or a correct character
        if m.CorrectMatch is not None:
            pattern += m.CorrectMatch.lower()
            continue
        elif len(m.PartialMatches) > 0:
            pattern += f"[^{''.join(list(m.PartialMatches)) + ''.join(exclude)}]"
            continue
        else:
            pattern += f"[^{''.join(exclude)}]"
    return pattern


# inputs are a list of attempts in the same format as above
# "<CORRECT-LETTER><partial-match>____|<unused-letters>"
#
# test the pattern with the v2 gen
def test_v2(*args, limit = 25):
    
    compositematches = []
    excludechars = set()
    for arg in args:
        step_cm = []
        partial_pattern, exclude = arg.split('|')
        
        for c in partial_pattern:
            cm = CompositeMatch()
            if c >= 'A' and c<= 'Z':
                cm.CorrectMatch = c
            elif c >= 'a' and c <= 'z':
                cm.PartialMatches.add(c)
            step_cm.append(cm)
        compositematches.append(step_cm)
        
        for c in exclude:
            excludechars.add(c)
   
    final_matches = [CompositeMatch() for x in range(5)]
    for match in compositematches:
        for i in range(5):
            if match[i].CorrectMatch is not None:
                final_matches[i].CorrectMatch = match[i].CorrectMatch
            final_matches[i].PartialMatches.update(match[i].PartialMatches)
            
    regex = build_regex_v2(final_matches, excludechars)
    
    known_letters = set([f.CorrectMatch for f in final_matches if f.CorrectMatch is not None])
    matching_words = [w for w in words if re.match(regex, w, re.IGNORECASE)]
    
    print(f"Found {len(matching_words)} matches")
    letter_frequencies = {x: 0 for x in map(chr, range(97, 123))}
    for word in matching_words:
        s = set(word)
        for c in s:
            letter_frequencies[c] = letter_frequencies.get(c,0) + 1
    letter_frequencies = {k:v for k,v in letter_frequencies.items() if v > 0 and k not in known_letters}
    letter_freq_list = list(letter_frequencies.items())
    letter_freq_list.sort(key = lambda f: f[1], reverse = True)
       
    if (use_wordfreqs):
        l = []
        for w in matching_words:
            l.append((w,wf.word_frequency(w, 'en')))
        l.sort(key = lambda f : f[1], reverse = True)
        l = l[:limit]
        for i in l:
            print(f"{i[0]}|{i[1]}")
        
    else:     
        for w in matching_words:
            print(w)
    print("Most common letters:")
    for k in letter_freq_list[:10]:
        print(f"{k[0]} has hits in {k[1]} words")


### Testing partial guesses
Pass in an array of strings representing guesses. Format string is as above, and the guesser will combine these guesses to narrow down the word list as much as possible.


In [20]:
test_v2("____s|peni", "__Ash|fl")

['s', 's', 'h']
Found 112 matches
shark|1.26e-05
shady|5.13e-06
shaky|2.82e-06
shack|2.69e-06
shard|1e-06
shaka|4.27e-07
shara|1.51e-07
shabu|1.26e-07
shama|1.1e-07
shado|9.33e-08
shart|9.12e-08
shaar|7.24e-08
shada|7.08e-08
shaku|6.17e-08
shaba|5.89e-08
shaam|4.9e-08
shako|4.9e-08
shatt|4.79e-08
shaya|4.37e-08
shatz|3.72e-08
shaab|3.63e-08
shaco|3.39e-08
shawa|2.82e-08
shamy|2.69e-08
shaub|2.45e-08
Most common letters:
a has hits in 112 words
h has hits in 112 words
s has hits in 112 words
y has hits in 17 words
d has hits in 16 words
k has hits in 16 words
t has hits in 16 words
w has hits in 15 words
r has hits in 14 words
z has hits in 14 words


Analyze the word list for figuring out the most common letters

In [18]:
letter_frequencies = {x: 0 for x in map(chr, range(97, 123))}
for word in words:
    s = set(word)
    for c in s:
        letter_frequencies[c] = letter_frequencies.get(c, 0) + 1
letter_frequencies = {k:v for k,v in letter_frequencies.items() if v > 0}
letter_freq_list = list(letter_frequencies.items())
letter_freq_list.sort(key = lambda f: f[1], reverse = True)
print(letter_freq_list)

[('a', 57502), ('e', 44130), ('i', 40525), ('o', 38138), ('n', 30182), ('r', 30155), ('s', 27332), ('l', 26545), ('t', 24430), ('u', 23829), ('y', 23781), ('m', 21212), ('h', 21066), ('d', 19182), ('k', 17237), ('b', 17095), ('g', 14838), ('c', 14752), ('p', 13939), ('f', 13674), ('w', 11536), ('z', 10403), ('j', 8506), ('v', 6992), ('x', 4105), ('q', 2358)]
