In [1]:
import numpy as np
import pandas as pd
import itertools
import random
from tqdm import tqdm
import re
from wordle_helpers import anagram_scoring, all_anagram_scoring

import os
# where to save the data
ROOT_DIR = "."
FOLDER_NAME = "data"
PATH_TO_FOLDER = os.path.join(ROOT_DIR, FOLDER_NAME)
os.makedirs(PATH_TO_FOLDER, exist_ok=True)

## Load data

In [2]:
# Read the anagram table from the data folder configured in the first cell
# (PATH_TO_FOLDER) rather than repeating the literal path here.
anagrams = pd.read_csv(os.path.join(PATH_TO_FOLDER, "anagrams.csv"))

In [3]:
anagrams

Unnamed: 0,word,anagrams
0,abode,"abode, adobe"
1,abort,"abort, tabor"
2,acred,"acred, cader"
3,acrid,"acrid, caird"
4,adept,"adept, pated"
...,...,...
413,worth,"worth, wroth, whort"
414,wrath,"wrath, thraw"
415,wreak,"wreak, waker"
416,wrote,"wrote, tower"


In [4]:
anagrams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   word      418 non-null    object
 1   anagrams  418 non-null    object
dtypes: object(2)
memory usage: 6.7+ KB


In [5]:
anagrams["word"].nunique()

418

## Number of anagrams

In [6]:
# Add two derived columns using pandas' vectorised string accessors:
#   num_anagrams - how many words are listed in the ", "-separated `anagrams` string
#   word_length  - number of characters in `word`
anagrams = anagrams.assign(
    num_anagrams=lambda frame: frame["anagrams"].str.split(", ").str.len(),
    word_length=lambda frame: frame["word"].str.len(),
)

In [7]:
anagrams.loc[anagrams["num_anagrams"] == anagrams["num_anagrams"].max()]

Unnamed: 0,word,anagrams,num_anagrams,word_length
317,steal,"steal, tales, slate, stale, satle, stela, astel",7,5


In [8]:
anagrams["num_anagrams"].idxmax()

317

In [9]:
indx = np.argmax(anagrams["num_anagrams"])
indx

317

In [10]:
# Row (as a Series) for the word with the most anagrams.
# `idxmax()` returns an index *label*, so it is looked up with `.loc`.
# `DataFrame.query` is meant for boolean row filters; passing it an expression
# that evaluates to a single label only worked via its internal `.loc` fallback.
anagrams.loc[anagrams["num_anagrams"].idxmax()]

word                                                      steal
anagrams        steal, tales, slate, stale, satle, stela, astel
num_anagrams                                                  7
word_length                                                   5
Name: 317, dtype: object

In [11]:
words = anagrams["anagrams"].iloc[indx].split(", ")
words

['steal', 'tales', 'slate', 'stale', 'satle', 'stela', 'astel']

## Scoring anagrams

In [12]:
challenge = words[0]
challenge

'steal'

In [13]:
guess_anagrams = words[1:]
guess_anagrams

['tales', 'slate', 'stale', 'satle', 'stela', 'astel']

In [14]:
# Score every remaining anagram (the guess) against the challenge word,
# position by position:
#    1 -> guess letter equals the challenge letter at this position
#    0 -> guess letter occurs in the challenge word, but not at this position
#   -1 -> guess letter does not occur in the challenge word at all
data = []
for guess in guess_anagrams:
    # Size the score vector from the challenge word instead of assuming 5 letters.
    scores = np.zeros(len(challenge), dtype=np.int8)
    for x, (i, j) in enumerate(zip(challenge, guess)):
        if i == j:
            scores[x] = 1
        elif j in challenge:
            scores[x] = 0
        else:
            scores[x] = -1

    data.append({
        "challenge_word": challenge,
        "guess": guess,
        # str(ndarray) looks like "[1 0 0 0 0]"; strip the brackets for display.
        "positional_scores": re.sub(r"[\[\]]", "", str(scores)).replace("\n", ","),
        "overall_scores": np.sum(scores, dtype=np.int8),
    })

In [15]:
df = pd.DataFrame(data)

In [16]:
df

Unnamed: 0,challenge_word,guess,positional_scores,overall_scores
0,steal,tales,0 0 0 0 0,0
1,steal,slate,1 0 0 0 0,1
2,steal,stale,1 1 0 0 0,2
3,steal,satle,1 0 0 0 0,1
4,steal,stela,1 1 1 0 0,3
5,steal,astel,0 0 0 0 1,1


In [17]:
# 3 anagram words
anagrams.loc[anagrams["num_anagrams"] == 3].iloc[:5]

Unnamed: 0,word,anagrams,num_anagrams,word_length
11,alien,"alien, aline, anile",3,5
24,ample,"ample, maple, pelma",3,5
32,argol,"argol, algor, orgal",3,5
33,argon,"argon, orang, angor",3,5
34,aries,"aries, serai, aesir",3,5


### Scoring each anagram against every anagram in its group (including itself)

In [18]:
# Score every word in the group (as the guess) against every word in the group
# (as the challenge word). Results are appended guess-major: the first
# `num_guesses` entries belong to words[0] as the guess, the next `num_guesses`
# to words[1], and so on.
#    1 -> guessed letter is in the right position
#    0 -> guessed letter occurs in the challenge word, but elsewhere
#   -1 -> guessed letter does not occur in the challenge word
data2 = []
num_guesses = len(words)

for guess in words:
    for word in words:
        # Size the score vector from the word instead of assuming 5 letters.
        scores = np.zeros(len(word), dtype=np.int8)
        for x, (i, j) in enumerate(zip(guess, word)):
            if i == j:
                scores[x] = 1
            # Test the *guessed* letter against the challenge word. The previous
            # check (`j in word`) was always true because `j` is taken from
            # `word`, which made the -1 branch unreachable.
            elif i in word:
                scores[x] = 0
            else:
                scores[x] = -1
        # str(ndarray) looks like "[1 0 0 0 0]"; strip the brackets for display.
        data2.append(re.sub(r"[\[\]]", "", str(scores)).replace("\n", ","))

In [19]:
data2[:7]

['1 1 1 1 1',
 '0 0 0 0 0',
 '1 0 0 0 0',
 '1 1 0 0 0',
 '1 0 0 0 0',
 '1 1 1 0 0',
 '0 0 0 0 1']

In [20]:
len(data2), num_guesses

(49, 7)

In [21]:
# Every 7th value (num guesses) in data2 represents the next guess word scored against the challenge word
# so we can reshape data2 to be a 7x7 array
np.array(data2).reshape(num_guesses, num_guesses)

array([['1 1 1 1 1', '0 0 0 0 0', '1 0 0 0 0', '1 1 0 0 0', '1 0 0 0 0',
        '1 1 1 0 0', '0 0 0 0 1'],
       ['0 0 0 0 0', '1 1 1 1 1', '0 0 0 0 0', '0 0 0 0 0', '0 1 0 0 0',
        '0 0 0 0 0', '0 0 0 1 0'],
       ['1 0 0 0 0', '0 0 0 0 0', '1 1 1 1 1', '1 0 1 0 1', '1 0 0 0 1',
        '1 0 0 0 0', '0 0 0 0 0'],
       ['1 1 0 0 0', '0 0 0 0 0', '1 0 1 0 1', '1 1 1 1 1', '1 0 0 1 1',
        '1 1 0 1 0', '0 0 0 0 0'],
       ['1 0 0 0 0', '0 1 0 0 0', '1 0 0 0 1', '1 0 0 1 1', '1 1 1 1 1',
        '1 0 0 1 0', '0 0 1 0 0'],
       ['1 1 1 0 0', '0 0 0 0 0', '1 0 0 0 0', '1 1 0 1 0', '1 0 0 1 0',
        '1 1 1 1 1', '0 0 0 0 0'],
       ['0 0 0 0 1', '0 0 0 1 0', '0 0 0 0 0', '0 0 0 0 0', '0 0 1 0 0',
        '0 0 0 0 0', '1 1 1 1 1']], dtype='<U9')

In [22]:
# Square guess-by-challenge table; rows and columns are both labelled by `words`.
# Use `num_guesses` rather than a literal 7 so the cell works for any group size.
df2 = pd.DataFrame(
    np.array(data2).reshape(num_guesses, num_guesses),
    columns=words,
    index=words,
)

In [23]:
df2

Unnamed: 0,steal,tales,slate,stale,satle,stela,astel
steal,1 1 1 1 1,0 0 0 0 0,1 0 0 0 0,1 1 0 0 0,1 0 0 0 0,1 1 1 0 0,0 0 0 0 1
tales,0 0 0 0 0,1 1 1 1 1,0 0 0 0 0,0 0 0 0 0,0 1 0 0 0,0 0 0 0 0,0 0 0 1 0
slate,1 0 0 0 0,0 0 0 0 0,1 1 1 1 1,1 0 1 0 1,1 0 0 0 1,1 0 0 0 0,0 0 0 0 0
stale,1 1 0 0 0,0 0 0 0 0,1 0 1 0 1,1 1 1 1 1,1 0 0 1 1,1 1 0 1 0,0 0 0 0 0
satle,1 0 0 0 0,0 1 0 0 0,1 0 0 0 1,1 0 0 1 1,1 1 1 1 1,1 0 0 1 0,0 0 1 0 0
stela,1 1 1 0 0,0 0 0 0 0,1 0 0 0 0,1 1 0 1 0,1 0 0 1 0,1 1 1 1 1,0 0 0 0 0
astel,0 0 0 0 1,0 0 0 1 0,0 0 0 0 0,0 0 0 0 0,0 0 1 0 0,0 0 0 0 0,1 1 1 1 1


### Take the anagram dataframe and randomly select a word to display the scoring

In [24]:
# show score for random anagrams
anagram_scoring_df = anagram_scoring(anagrams)
anagram_scoring_df

Unnamed: 0,inust,suint
inust,1 1 1 1 1,0 0 0 0 1
suint,0 0 0 0 1,1 1 1 1 1


### Score all anagrams against each other. Pairs that are not anagrams of each other will contain `-1`s.  

In [25]:
all_anagram_scoring_df = all_anagram_scoring(anagrams)

100%|█████████████████████████████████████████| 726/726 [00:42<00:00, 17.15it/s]


In [26]:
all_anagram_scoring_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 726 entries, clape to thewy
Columns: 726 entries, clape to thewy
dtypes: string(726)
memory usage: 9.1 MB


In [27]:
all_anagram_scoring_df

Unnamed: 0,clape,scate,plane,meant,thrum,algin,slate,shoer,apert,mavis,...,piles,scary,stirp,satyr,whale,sward,smote,mythe,cates,thewy
clape,1 1 1 1 1,-1 0 1 -1 1,0 1 1 -1 1,-1 0 1 -1 -1,-1 -1 -1 -1 -1,0 1 -1 -1 -1,-1 1 1 -1 1,-1 -1 -1 0 -1,0 0 0 -1 -1,-1 0 -1 -1 -1,...,0 -1 0 0 -1,-1 0 1 -1 -1,-1 -1 -1 -1 0,-1 0 -1 -1 -1,-1 -1 1 0 1,-1 -1 1 -1 -1,-1 -1 -1 -1 1,-1 -1 -1 -1 1,1 0 -1 0 -1,-1 -1 0 -1 -1
scate,0 -1 1 -1 1,1 1 1 1 1,-1 -1 1 -1 1,-1 0 1 -1 0,0 -1 -1 -1 -1,0 -1 -1 -1 -1,1 -1 1 1 1,1 -1 -1 0 -1,0 -1 0 -1 0,-1 0 -1 -1 0,...,-1 -1 -1 0 0,1 1 1 -1 -1,1 0 -1 -1 -1,1 0 0 -1 -1,-1 -1 1 -1 1,1 -1 1 -1 -1,1 -1 -1 1 1,-1 -1 0 -1 1,0 0 0 0 0,0 -1 0 -1 -1
plane,-1 1 1 0 1,-1 -1 1 -1 1,1 1 1 1 1,-1 0 1 1 -1,-1 -1 -1 -1 -1,0 1 -1 -1 0,-1 1 1 -1 1,-1 -1 -1 0 -1,0 0 0 -1 -1,-1 0 -1 -1 -1,...,1 -1 0 0 -1,-1 -1 1 -1 -1,-1 -1 -1 -1 0,-1 0 -1 -1 -1,-1 -1 1 0 1,-1 -1 1 -1 -1,-1 -1 -1 -1 1,-1 -1 -1 -1 1,-1 0 -1 0 -1,-1 -1 0 -1 -1
meant,-1 -1 1 -1 0,-1 -1 1 0 0,-1 -1 1 1 0,1 1 1 1 1,0 -1 -1 -1 0,0 -1 -1 -1 0,-1 -1 1 0 0,-1 -1 -1 0 -1,0 -1 0 -1 1,1 0 -1 -1 -1,...,-1 -1 -1 0 -1,-1 -1 1 -1 -1,-1 0 -1 -1 -1,-1 0 0 -1 -1,-1 -1 1 -1 0,-1 -1 1 -1 -1,-1 0 -1 0 0,1 -1 0 -1 0,-1 0 0 0 -1,0 -1 0 -1 -1
thrum,-1 -1 -1 -1 -1,-1 -1 -1 0 -1,-1 -1 -1 -1 -1,0 -1 -1 -1 0,1 1 1 1 1,-1 -1 -1 -1 -1,-1 -1 -1 0 -1,-1 1 -1 -1 0,-1 -1 -1 0 0,0 -1 -1 -1 -1,...,-1 -1 -1 -1 -1,-1 -1 -1 0 -1,-1 0 -1 0 -1,-1 -1 0 -1 0,-1 1 -1 -1 -1,-1 -1 -1 0 -1,-1 0 -1 0 -1,0 -1 0 0 -1,-1 -1 0 -1 -1,1 1 -1 -1 -1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sward,-1 -1 1 -1 -1,1 -1 1 -1 -1,-1 -1 1 -1 -1,-1 -1 1 -1 -1,-1 -1 0 -1 -1,0 -1 -1 -1 -1,1 -1 1 -1 -1,1 -1 -1 -1 0,0 -1 -1 1 -1,-1 0 -1 -1 0,...,-1 -1 -1 -1 0,1 -1 1 1 -1,1 -1 -1 1 -1,1 0 -1 -1 0,0 -1 1 -1 -1,1 1 1 1 1,1 -1 -1 -1 -1,-1 -1 -1 -1 -1,-1 0 -1 -1 0,-1 -1 -1 0 -1
smote,-1 -1 -1 -1 1,1 -1 -1 1 1,-1 -1 -1 -1 1,0 0 -1 -1 0,0 -1 -1 -1 0,-1 -1 -1 -1 -1,1 -1 -1 1 1,1 -1 1 0 -1,-1 -1 0 -1 0,0 -1 -1 -1 0,...,-1 -1 -1 0 0,1 -1 -1 -1 -1,1 0 -1 -1 -1,1 -1 0 -1 -1,-1 -1 -1 -1 1,1 -1 -1 -1 -1,1 1 1 1 1,0 -1 0 -1 1,-1 -1 0 0 0,0 -1 0 -1 -1
mythe,-1 -1 -1 -1 1,-1 -1 -1 0 1,-1 -1 -1 -1 1,1 0 -1 -1 0,0 0 -1 -1 0,-1 -1 -1 -1 -1,-1 -1 -1 0 1,-1 0 -1 0 -1,-1 -1 0 -1 0,1 -1 -1 -1 -1,...,-1 -1 -1 0 -1,-1 -1 -1 -1 0,-1 0 -1 -1 -1,-1 -1 1 0 -1,-1 0 -1 -1 1,-1 -1 -1 -1 -1,-1 0 -1 0 1,1 1 1 1 1,-1 -1 1 0 -1,0 0 0 -1 0
cates,1 -1 0 -1 0,0 0 0 0 0,-1 -1 0 -1 0,-1 0 0 -1 0,0 -1 -1 -1 -1,0 -1 -1 -1 -1,0 -1 0 0 0,0 -1 -1 1 -1,0 -1 0 -1 0,-1 1 -1 -1 1,...,-1 -1 -1 1 1,0 0 0 -1 -1,0 0 -1 -1 -1,0 1 1 -1 -1,-1 -1 0 -1 0,0 -1 0 -1 -1,0 -1 -1 0 0,-1 -1 1 -1 0,1 1 1 1 1,0 -1 0 -1 -1


In [28]:
# Write the full scoring matrix into the data folder configured in the first cell.
# index=False drops the row labels; they are the same words, in the same order,
# as the column headers, so they can be restored from the header when reading back.
all_anagram_scoring_df.to_csv(
    os.path.join(PATH_TO_FOLDER, "all_anagram_scoring.csv"), index=False
)

## 5 bit binary  

Scoring anagrams against each other produces a 5-bit binary pattern ranging from `00000` to `11111` (`0` to `31`).  
- `00000`: all letters are in the wrong place
- `11111`: every letter is in the correct place

In [29]:
df2

Unnamed: 0,steal,tales,slate,stale,satle,stela,astel
steal,1 1 1 1 1,0 0 0 0 0,1 0 0 0 0,1 1 0 0 0,1 0 0 0 0,1 1 1 0 0,0 0 0 0 1
tales,0 0 0 0 0,1 1 1 1 1,0 0 0 0 0,0 0 0 0 0,0 1 0 0 0,0 0 0 0 0,0 0 0 1 0
slate,1 0 0 0 0,0 0 0 0 0,1 1 1 1 1,1 0 1 0 1,1 0 0 0 1,1 0 0 0 0,0 0 0 0 0
stale,1 1 0 0 0,0 0 0 0 0,1 0 1 0 1,1 1 1 1 1,1 0 0 1 1,1 1 0 1 0,0 0 0 0 0
satle,1 0 0 0 0,0 1 0 0 0,1 0 0 0 1,1 0 0 1 1,1 1 1 1 1,1 0 0 1 0,0 0 1 0 0
stela,1 1 1 0 0,0 0 0 0 0,1 0 0 0 0,1 1 0 1 0,1 0 0 1 0,1 1 1 1 1,0 0 0 0 0
astel,0 0 0 0 1,0 0 0 1 0,0 0 0 0 0,0 0 0 0 0,0 0 1 0 0,0 0 0 0 0,1 1 1 1 1


In [30]:
int("11111", 2)

31

In [31]:
int("00000", 2)

0

In [32]:
int("10000", 2)

16

In [33]:
int("10101", 2)

21

**Scoring `stale` against `steal` gives the 5-bit pattern `11000` (the first two letters are in the right place), which is 24 as an integer**

In [34]:
int("11000", 2)

24

In [35]:
int("00100", 2)

4

In [36]:
df2.loc[df2.index.isin(["stale"])]

Unnamed: 0,steal,tales,slate,stale,satle,stela,astel
stale,1 1 0 0 0,0 0 0 0 0,1 0 1 0 1,1 1 1 1 1,1 0 0 1 1,1 1 0 1 0,0 0 0 0 0


In [37]:
stale = df2.loc[df2.index.isin(["stale"])].T
stale

Unnamed: 0,stale
steal,1 1 0 0 0
tales,0 0 0 0 0
slate,1 0 1 0 1
stale,1 1 1 1 1
satle,1 0 0 1 1
stela,1 1 0 1 0
astel,0 0 0 0 0


In [38]:
df2.loc[df2.index.isin(["stale"])].squeeze()

steal    1 1 0 0 0
tales    0 0 0 0 0
slate    1 0 1 0 1
stale    1 1 1 1 1
satle    1 0 0 1 1
stela    1 1 0 1 0
astel    0 0 0 0 0
Name: stale, dtype: object

In [39]:
pd.DataFrame(df2.loc[df2.index.isin(["stale"])].squeeze())

Unnamed: 0,stale
steal,1 1 0 0 0
tales,0 0 0 0 0
slate,1 0 1 0 1
stale,1 1 1 1 1
satle,1 0 0 1 1
stela,1 1 0 1 0
astel,0 0 0 0 0


In [40]:
stale_binaries = df2.loc[df2.index.isin(["stale"])].squeeze().tolist()
stale_binaries

['1 1 0 0 0',
 '0 0 0 0 0',
 '1 0 1 0 1',
 '1 1 1 1 1',
 '1 0 0 1 1',
 '1 1 0 1 0',
 '0 0 0 0 0']

In [41]:
stale_binaries[0]

'1 1 0 0 0'

In [42]:
"".join(stale_binaries[0].split())

'11000'

In [43]:
int("".join(stale_binaries[0].split()), 2)

24

In [44]:
stale_binaries = ["".join(x.split()) for x in stale_binaries]
stale_binaries

['11000', '00000', '10101', '11111', '10011', '11010', '00000']

In [45]:
[int(binary, 2) for binary in stale_binaries]

[24, 0, 21, 31, 19, 26, 0]

In [46]:
scare = all_anagram_scoring_df.loc[all_anagram_scoring_df.index.isin(["scare"])].squeeze()
scare

clape     0 -1  1 -1  1
scate     1  1  1 -1  1
plane    -1 -1  1 -1  1
meant    -1  0  1 -1 -1
thrum    -1 -1  0 -1 -1
              ...      
sward     1 -1  1  1 -1
smote     1 -1 -1 -1  1
mythe    -1 -1 -1 -1  1
cates     0  0 -1  0  0
thewy    -1 -1  0 -1 -1
Name: scare, Length: 726, dtype: string

In [47]:
#  you can find the scores for scare by filtering out those with -1
cols = [(idx, binary) for idx, binary in enumerate(scare) if "-1" not in binary]
cols

[(77, '0 0 1 0 1'),
 (129, '1 1 1 1 1'),
 (144, '1 0 0 1 1'),
 (206, '1 0 0 0 0'),
 (441, '0 0 0 0 1')]

In [48]:
scare_cols = [col[0] for col in cols]
scare_cols

[77, 129, 144, 206, 441]

In [49]:
all_anagram_scoring_df.iloc[scare_cols, scare_cols]

Unnamed: 0,crase,scare,sacre,serac,carse
crase,1 1 1 1 1,0 0 1 0 1,0 0 0 0 1,0 0 0 0 0,1 0 0 1 1
scare,0 0 1 0 1,1 1 1 1 1,1 0 0 1 1,1 0 0 0 0,0 0 0 0 1
sacre,0 0 0 0 1,1 0 0 1 1,1 1 1 1 1,1 0 0 0 0,0 1 0 0 1
serac,0 0 0 0 0,1 0 0 0 0,1 0 0 0 0,1 1 1 1 1,0 0 1 0 0
carse,1 0 0 1 1,0 0 0 0 1,0 1 0 0 1,0 0 1 0 0,1 1 1 1 1


In [50]:
# remove spaces
all_anagram_scoring_df.iloc[scare_cols, scare_cols].applymap(lambda x: "".join(x.split()))

Unnamed: 0,crase,scare,sacre,serac,carse
crase,11111,101,1,0,10011
scare,101,11111,10011,10000,1
sacre,1,10011,11111,10000,1001
serac,0,10000,10000,11111,100
carse,10011,1,1001,100,11111


In [51]:
# convert to int
(all_anagram_scoring_df.iloc[scare_cols, scare_cols]
 .applymap(lambda x: "".join(x.split()))
 .applymap(lambda x: int(x, 2)))

Unnamed: 0,crase,scare,sacre,serac,carse
crase,31,5,1,0,19
scare,5,31,19,16,1
sacre,1,19,31,16,9
serac,0,16,16,31,4
carse,19,1,9,4,31


In [52]:
# Convert every "b b b b b" score string to its integer value and append a
# per-row total.
(all_anagram_scoring_df.iloc[scare_cols, scare_cols]
 .applymap(lambda x: "".join(x.split()))
 .applymap(lambda x: int(x, 2))
 # axis=1 sums across each row. The default (axis=0) sums down the columns and
 # only lined up with the rows because this matrix is square and symmetric.
 .assign(sum_across=lambda x: x.sum(axis=1))
 # int16 instead of int8: a row of five scores can sum to 5 * 31 = 155, which
 # would wrap around in int8 (max 127).
 .astype(np.int16))

Unnamed: 0,crase,scare,sacre,serac,carse,sum_across
crase,31,5,1,0,19,56
scare,5,31,19,16,1,72
sacre,1,19,31,16,9,76
serac,0,16,16,31,4,67
carse,19,1,9,4,31,64


## Anagram cipher lookup table  
With `itertools.zip_longest` you can create an encode/decode lookup table for 4, 5 and 6 letter anagrams.

In [53]:
list(itertools.zip_longest("taper", "slip", fillvalue="*"))

[('t', 's'), ('a', 'l'), ('p', 'i'), ('e', 'p'), ('r', '*')]

In [54]:
list(itertools.zip_longest("taper", "scare"))

[('t', 's'), ('a', 'c'), ('p', 'a'), ('e', 'r'), ('r', 'e')]