In [1]:
import numpy as np
import pandas as pd
import itertools
import random
from tqdm import tqdm
import re
from wordle_helpers import anagram_scoring, all_anagram_scoring

import os
# where to save the data
# PATH_TO_FOLDER resolves to "<ROOT_DIR>/<FOLDER_NAME>" and is created up front
# (exist_ok=True makes re-running this cell safe when the folder already exists).
ROOT_DIR = "."
FOLDER_NAME = "data"
PATH_TO_FOLDER = os.path.join(ROOT_DIR, FOLDER_NAME)
os.makedirs(PATH_TO_FOLDER, exist_ok=True)

## Load data

In [2]:
# Read the anagram table from the data folder configured at the top of the
# notebook rather than repeating the folder path as a literal.
anagrams = pd.read_csv(os.path.join(PATH_TO_FOLDER, "anagrams.csv"))

In [3]:
# Keep only words that have at least one other anagram, and record the size of
# each anagram group and the word length as compact int8 columns.
anagrams = (
    anagrams
    .assign(num_anagrams=lambda df_: df_["anagrams"].str.split(", ").apply(len))
    .loc[lambda df_: df_["num_anagrams"] > 1]
    .assign(word_length=lambda df_: df_["word"].apply(len))
    .astype({"num_anagrams": "int8", "word_length": "int8"})
    .reset_index(drop=True)
)
anagrams.head(10)

Unnamed: 0,word,anagrams,num_anagrams,word_length
0,capri,"capri, picra",2,5
1,amole,"amole, maleo",2,5
2,peach,"peach, cheap",2,5
3,auric,"auric, curia",2,5
4,frail,"frail, filar",2,5
5,wrote,"wrote, tower",2,5
6,carol,"carol, coral, claro",3,5
7,polka,"polka, pokal",2,5
8,forth,"forth, froth",2,5
9,first,"first, frist",2,5


In [4]:
anagrams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   word          308 non-null    object
 1   anagrams      308 non-null    object
 2   num_anagrams  308 non-null    int8  
 3   word_length   308 non-null    int8  
dtypes: int8(2), object(2)
memory usage: 5.5+ KB


In [5]:
# Row(s) whose anagram group is the largest in the table.
max_group_size = anagrams["num_anagrams"].max()
anagrams[anagrams["num_anagrams"] == max_group_size]

Unnamed: 0,word,anagrams,num_anagrams,word_length
275,slate,"slate, satle, stale, stela, steal, tales, astel",7,5


In [6]:
anagrams["num_anagrams"].idxmax()

275

In [7]:
# np.argmax gives the *position* of the largest group, not its index label.
# Both coincide here because the index was reset to 0..n-1 after filtering,
# which is why this matches the idxmax() result shown in the previous cell.
indx = np.argmax(anagrams["num_anagrams"])
indx

275

In [8]:
# Select the row of the largest anagram group by its index label.
# DataFrame.query is meant for boolean row filters; passing it the idxmax()
# label only worked because query hands the evaluated result to .loc, so the
# lookup is spelled out explicitly here. The displayed Series is the same.
anagrams.loc[anagrams["num_anagrams"].idxmax()]

word                                                      slate
anagrams        slate, satle, stale, stela, steal, tales, astel
num_anagrams                                                  7
word_length                                                   5
Name: 275, dtype: object

In [9]:
# Split the comma-separated anagram list of the largest group into its words.
largest_group = anagrams["anagrams"].iloc[indx]
words = largest_group.split(", ")
words

['slate', 'satle', 'stale', 'stela', 'steal', 'tales', 'astel']

## Scoring anagrams

In [10]:
challenge = words[0]
challenge

'slate'

In [11]:
guess_anagrams = words[1:]
guess_anagrams

['satle', 'stale', 'stela', 'steal', 'tales', 'astel']

In [12]:
# Score every other anagram against the challenge word, letter by letter:
#    1 -> the guess letter matches the challenge letter at this position
#    0 -> the guess letter occurs in the challenge, but not at this position
#   -1 -> the guess letter does not occur in the challenge at all
# The score vector is as long as the words being compared (no fixed length of 5),
# and it is rendered by joining the values with single spaces so the text stays
# "1 0 -1 ..." regardless of how numpy would pad or wrap its own array repr.
data = []
for guess in guess_anagrams:
    scores = np.array(
        [
            1 if guess_letter == challenge_letter
            else 0 if guess_letter in challenge
            else -1
            for challenge_letter, guess_letter in zip(challenge, guess)
        ],
        dtype=np.int8,
    )
    data.append(
        {
            "challenge_word": challenge,
            "guess": guess,
            "positional_scores": " ".join(map(str, scores.tolist())),
            "overall_scores": np.sum(scores, dtype=np.int8),
        }
    )

In [13]:
df = pd.DataFrame(data)

In [14]:
df

Unnamed: 0,challenge_word,guess,positional_scores,overall_scores
0,slate,satle,1 0 0 0 1,2
1,slate,stale,1 0 1 0 1,3
2,slate,stela,1 0 0 0 0,1
3,slate,steal,1 0 0 0 0,1
4,slate,tales,0 0 0 0 0,0
5,slate,astel,0 0 0 0 0,0


In [15]:
# First five groups that contain exactly three anagrams.
has_three_words = anagrams["num_anagrams"] == 3
anagrams[has_three_words].head(5)

Unnamed: 0,word,anagrams,num_anagrams,word_length
6,carol,"carol, coral, claro",3,5
13,slide,"slide, sleid, sidle",3,5
19,flare,"flare, flear, feral",3,5
21,terin,"terin, trine, inter",3,5
39,moria,"moria, moira, maori",3,5


### Scoring each anagram against every anagram in its group (including itself)

In [16]:
# Score every word of the group against every word of the group (itself included).
# data2 is filled row by row: all scores for the first guess, then all scores
# for the second guess, and so on.
#    1 -> the letters at this position match
#    0 -> the word's letter occurs in the guess, but not at this position
#   -1 -> the word's letter does not occur in the guess at all
# The membership test looks the letter up in the *other* word (the guess);
# testing it against the word it was taken from is always true and made the
# -1 branch unreachable. For true anagrams the result is the same either way.
data2 = []
num_guesses = len(words)

for guess in words:
    for word in words:
        scores = [
            1 if guess_letter == word_letter
            else 0 if word_letter in guess
            else -1
            for guess_letter, word_letter in zip(guess, word)
        ]
        data2.append(" ".join(map(str, scores)))

In [17]:
data2[:7]

['1 1 1 1 1',
 '1 0 0 0 1',
 '1 0 1 0 1',
 '1 0 0 0 0',
 '1 0 0 0 0',
 '0 0 0 0 0',
 '0 0 0 0 0']

In [18]:
len(data2), num_guesses

(49, 7)

In [19]:
# data2 is a flat list: each consecutive run of num_guesses entries holds one guess
# word scored against every word in the group, so it can be reshaped into a
# num_guesses x num_guesses grid (7 x 7 for this group)
np.array(data2).reshape(num_guesses, num_guesses)

array([['1 1 1 1 1', '1 0 0 0 1', '1 0 1 0 1', '1 0 0 0 0', '1 0 0 0 0',
        '0 0 0 0 0', '0 0 0 0 0'],
       ['1 0 0 0 1', '1 1 1 1 1', '1 0 0 1 1', '1 0 0 1 0', '1 0 0 0 0',
        '0 1 0 0 0', '0 0 1 0 0'],
       ['1 0 1 0 1', '1 0 0 1 1', '1 1 1 1 1', '1 1 0 1 0', '1 1 0 0 0',
        '0 0 0 0 0', '0 0 0 0 0'],
       ['1 0 0 0 0', '1 0 0 1 0', '1 1 0 1 0', '1 1 1 1 1', '1 1 1 0 0',
        '0 0 0 0 0', '0 0 0 0 0'],
       ['1 0 0 0 0', '1 0 0 0 0', '1 1 0 0 0', '1 1 1 0 0', '1 1 1 1 1',
        '0 0 0 0 0', '0 0 0 0 1'],
       ['0 0 0 0 0', '0 1 0 0 0', '0 0 0 0 0', '0 0 0 0 0', '0 0 0 0 0',
        '1 1 1 1 1', '0 0 0 1 0'],
       ['0 0 0 0 0', '0 0 1 0 0', '0 0 0 0 0', '0 0 0 0 0', '0 0 0 0 1',
        '0 0 0 1 0', '1 1 1 1 1']], dtype='<U9')

In [20]:
# Label the score grid with the words: rows are the guesses, columns are the
# words they were scored against. The grid is sized from num_guesses rather
# than a literal 7 so the cell also works for groups of a different size.
df2 = pd.DataFrame(
    np.array(data2).reshape(num_guesses, num_guesses),
    columns=words,
    index=words,
)

In [21]:
df2

Unnamed: 0,slate,satle,stale,stela,steal,tales,astel
slate,1 1 1 1 1,1 0 0 0 1,1 0 1 0 1,1 0 0 0 0,1 0 0 0 0,0 0 0 0 0,0 0 0 0 0
satle,1 0 0 0 1,1 1 1 1 1,1 0 0 1 1,1 0 0 1 0,1 0 0 0 0,0 1 0 0 0,0 0 1 0 0
stale,1 0 1 0 1,1 0 0 1 1,1 1 1 1 1,1 1 0 1 0,1 1 0 0 0,0 0 0 0 0,0 0 0 0 0
stela,1 0 0 0 0,1 0 0 1 0,1 1 0 1 0,1 1 1 1 1,1 1 1 0 0,0 0 0 0 0,0 0 0 0 0
steal,1 0 0 0 0,1 0 0 0 0,1 1 0 0 0,1 1 1 0 0,1 1 1 1 1,0 0 0 0 0,0 0 0 0 1
tales,0 0 0 0 0,0 1 0 0 0,0 0 0 0 0,0 0 0 0 0,0 0 0 0 0,1 1 1 1 1,0 0 0 1 0
astel,0 0 0 0 0,0 0 1 0 0,0 0 0 0 0,0 0 0 0 0,0 0 0 0 1,0 0 0 1 0,1 1 1 1 1


### Take the anagram dataframe and randomly select a word to display the scoring

In [22]:
# show score for random anagrams
# NOTE(review): no random seed is set in the cells above, so the group shown
# here presumably changes between runs — confirm how anagram_scoring picks it.
anagram_scoring_df = anagram_scoring(anagrams)
anagram_scoring_df

Unnamed: 0,close,scole
close,1 1 1 1 1,0 0 1 0 1
scole,0 0 1 0 1,1 1 1 1 1


### Score all anagrams against each other. Pairs of words that aren't anagrams of each other will contain `-1`s  

In [23]:
# Build the full word-by-word score table (726 x 726 according to the .info() output below).
# NOTE(review): this is the slow cell of the notebook — the recorded progress bar shows about 34 s.
all_anagram_scoring_df = all_anagram_scoring(anagrams)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 726/726 [00:34<00:00, 20.75it/s]


In [24]:
all_anagram_scoring_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 726 entries, cadre to talus
Columns: 726 entries, cadre to talus
dtypes: string(726)
memory usage: 9.1 MB


In [25]:
all_anagram_scoring_df

Unnamed: 0,cadre,freya,whity,predy,serac,coign,caper,pluma,styan,anoil,...,feral,asper,splay,cader,tared,micro,align,psalm,anime,talus
cadre,1 1 1 1 1,-1 0 0 -1 0,-1 -1 -1 -1 -1,-1 0 0 0 -1,-1 0 0 0 0,1 -1 -1 -1 -1,1 1 -1 0 0,-1 -1 -1 -1 0,-1 -1 -1 0 -1,0 -1 -1 -1 -1,...,-1 0 0 0 -1,0 -1 -1 0 0,-1 -1 -1 0 -1,1 1 1 0 0,-1 1 0 0 0,-1 -1 0 1 -1,0 -1 -1 -1 -1,-1 -1 0 -1 -1,0 -1 -1 -1 1,-1 1 -1 -1 -1
freya,-1 0 -1 0 0,1 1 1 1 1,-1 -1 -1 -1 0,-1 1 1 -1 0,-1 0 0 0 -1,-1 -1 -1 -1 -1,-1 0 -1 0 0,-1 -1 -1 -1 1,-1 -1 0 0 -1,0 -1 -1 -1 -1,...,1 0 0 0 -1,0 -1 -1 0 0,-1 -1 -1 0 0,-1 0 -1 0 0,-1 0 0 0 -1,-1 -1 -1 0 -1,0 -1 -1 -1 -1,-1 -1 0 -1 -1,0 -1 -1 -1 0,-1 0 -1 -1 -1
whity,-1 -1 -1 -1 -1,-1 -1 -1 0 -1,1 1 1 1 1,-1 -1 -1 -1 1,-1 -1 -1 -1 -1,-1 -1 1 -1 -1,-1 -1 -1 -1 -1,-1 -1 -1 -1 -1,-1 0 0 -1 -1,-1 -1 -1 0 -1,...,-1 -1 -1 -1 -1,-1 -1 -1 -1 -1,-1 -1 -1 -1 1,-1 -1 -1 -1 -1,0 -1 -1 -1 -1,-1 0 -1 -1 -1,-1 -1 1 -1 -1,-1 -1 -1 -1 -1,-1 -1 1 -1 -1,0 -1 -1 -1 -1
predy,-1 -1 0 0 0,-1 1 1 0 -1,-1 -1 -1 -1 1,1 1 1 1 1,-1 0 0 -1 -1,-1 -1 -1 -1 -1,-1 -1 0 0 0,1 -1 -1 -1 -1,-1 -1 0 -1 -1,-1 -1 -1 -1 -1,...,-1 0 0 -1 -1,-1 -1 0 0 0,-1 0 -1 -1 1,-1 -1 0 0 0,-1 -1 0 0 0,-1 -1 -1 0 -1,-1 -1 -1 -1 -1,1 -1 -1 -1 -1,-1 -1 -1 -1 0,-1 -1 -1 -1 -1
serac,0 0 -1 0 0,-1 0 0 -1 0,-1 -1 -1 -1 -1,-1 0 0 -1 -1,1 1 1 1 1,0 -1 -1 -1 -1,0 0 -1 0 0,-1 -1 -1 -1 0,1 -1 -1 1 -1,0 -1 -1 -1 -1,...,-1 1 1 1 -1,0 0 -1 0 0,1 -1 -1 1 -1,0 0 -1 0 0,-1 0 1 0 -1,-1 -1 0 0 -1,0 -1 -1 -1 -1,-1 0 0 -1 -1,0 -1 -1 -1 0,-1 0 -1 -1 0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
micro,0 -1 -1 1 -1,-1 0 -1 -1 -1,-1 -1 0 -1 -1,-1 0 -1 -1 -1,-1 -1 0 -1 0,0 0 0 -1 -1,0 -1 -1 -1 0,-1 -1 -1 0 -1,-1 -1 -1 -1 -1,-1 -1 0 0 -1,...,-1 -1 0 -1 -1,-1 -1 -1 -1 0,-1 -1 -1 -1 -1,0 -1 -1 -1 0,-1 -1 0 -1 -1,1 1 1 1 1,-1 -1 0 -1 -1,-1 -1 -1 -1 0,-1 -1 0 0 -1,-1 -1 -1 -1 -1
align,-1 0 -1 -1 -1,-1 -1 -1 -1 0,-1 -1 1 -1 -1,-1 -1 -1 -1 -1,-1 -1 -1 0 -1,-1 -1 1 1 1,-1 0 -1 -1 -1,-1 1 -1 -1 0,-1 -1 -1 0 1,1 0 -1 0 0,...,-1 -1 -1 0 0,1 -1 -1 -1 -1,-1 -1 0 0 -1,-1 0 -1 -1 -1,-1 0 -1 -1 -1,-1 0 -1 -1 -1,1 1 1 1 1,-1 -1 0 0 -1,1 0 1 -1 -1,-1 0 0 -1 -1
psalm,-1 0 -1 -1 -1,-1 -1 -1 -1 0,-1 -1 -1 -1 -1,1 -1 -1 -1 -1,0 -1 -1 0 -1,-1 -1 -1 -1 -1,-1 0 0 -1 -1,1 0 -1 0 0,0 -1 -1 0 -1,0 -1 -1 -1 0,...,-1 -1 -1 0 0,0 1 0 -1 -1,0 0 0 0 -1,-1 0 -1 -1 -1,-1 0 -1 -1 -1,0 -1 -1 -1 -1,0 0 -1 -1 -1,1 1 1 1 1,0 -1 -1 0 -1,-1 0 0 -1 0
anime,-1 0 -1 -1 1,-1 -1 0 -1 0,-1 -1 1 -1 -1,-1 -1 0 -1 -1,-1 0 -1 0 -1,-1 -1 1 -1 0,-1 0 -1 0 -1,-1 -1 -1 1 0,-1 -1 -1 0 0,1 1 -1 0 -1,...,-1 0 -1 0 -1,1 -1 -1 0 -1,-1 -1 -1 0 -1,-1 0 -1 0 -1,-1 0 -1 0 -1,0 0 -1 -1 -1,1 -1 1 -1 0,-1 -1 0 -1 0,1 1 1 1 1,-1 0 -1 -1 -1


In [26]:
# Persist the full score table in the data folder configured at the top of the
# notebook instead of repeating the folder path as a literal.
# NOTE(review): index=False drops the row labels from the file; the rows
# presumably follow the same word order as the columns, so the labels can be
# restored from the header on load — confirm before relying on it.
all_anagram_scoring_df.to_csv(
    os.path.join(PATH_TO_FOLDER, "all_anagram_scoring.csv"), index=False
)

## 5 bit binary  

Scoring a pair of 5-letter anagrams produces a 5-bit binary number ranging from `00000` to `11111` (`0` to `31`).  
- `00000`: every letter is in the wrong place
- `11111`: every letter is in the correct place

In [27]:
df2

Unnamed: 0,slate,satle,stale,stela,steal,tales,astel
slate,1 1 1 1 1,1 0 0 0 1,1 0 1 0 1,1 0 0 0 0,1 0 0 0 0,0 0 0 0 0,0 0 0 0 0
satle,1 0 0 0 1,1 1 1 1 1,1 0 0 1 1,1 0 0 1 0,1 0 0 0 0,0 1 0 0 0,0 0 1 0 0
stale,1 0 1 0 1,1 0 0 1 1,1 1 1 1 1,1 1 0 1 0,1 1 0 0 0,0 0 0 0 0,0 0 0 0 0
stela,1 0 0 0 0,1 0 0 1 0,1 1 0 1 0,1 1 1 1 1,1 1 1 0 0,0 0 0 0 0,0 0 0 0 0
steal,1 0 0 0 0,1 0 0 0 0,1 1 0 0 0,1 1 1 0 0,1 1 1 1 1,0 0 0 0 0,0 0 0 0 1
tales,0 0 0 0 0,0 1 0 0 0,0 0 0 0 0,0 0 0 0 0,0 0 0 0 0,1 1 1 1 1,0 0 0 1 0
astel,0 0 0 0 0,0 0 1 0 0,0 0 0 0 0,0 0 0 0 0,0 0 0 0 1,0 0 0 1 0,1 1 1 1 1


In [28]:
int("11111", 2)

31

In [29]:
int("00000", 2)

0

In [30]:
int("10000", 2)

16

In [31]:
int("10101", 2)

21

**Scoring `stale` against `steal` gives `1 1 0 0 0`, i.e. binary `11000` = 24: only the first two letters are in the right place**

In [32]:
int("11000", 2)

24

In [33]:
int("00100", 2)

4

In [34]:
(df2
 .loc[df2.index == "stale"]
)

Unnamed: 0,slate,satle,stale,stela,steal,tales,astel
stale,1 0 1 0 1,1 0 0 1 1,1 1 1 1 1,1 1 0 1 0,1 1 0 0 0,0 0 0 0 0,0 0 0 0 0


In [35]:
# Scores of "stale" against each word of its group, as a one-column frame.
is_stale_row = df2.index == "stale"
stale = df2[is_stale_row].transpose()
stale

Unnamed: 0,stale
slate,1 0 1 0 1
satle,1 0 0 1 1
stale,1 1 1 1 1
stela,1 1 0 1 0
steal,1 1 0 0 0
tales,0 0 0 0 0
astel,0 0 0 0 0


In [36]:
# df2.loc[df2.index.isin(["stale"])].squeeze()
(df2
 .loc[df2.index == "stale"]
 .squeeze()
)

slate    1 0 1 0 1
satle    1 0 0 1 1
stale    1 1 1 1 1
stela    1 1 0 1 0
steal    1 1 0 0 0
tales    0 0 0 0 0
astel    0 0 0 0 0
Name: stale, dtype: object

In [37]:
pd.DataFrame((df2
              .loc[df2.index == "stale"]
              .squeeze())
            )

Unnamed: 0,stale
slate,1 0 1 0 1
satle,1 0 0 1 1
stale,1 1 1 1 1
stela,1 1 0 1 0
steal,1 1 0 0 0
tales,0 0 0 0 0
astel,0 0 0 0 0


In [38]:
# The "stale" row again, this time as a plain list of score strings.
stale_scores = df2[df2.index == "stale"].squeeze()
stale_binaries = stale_scores.tolist()
stale_binaries

['1 0 1 0 1',
 '1 0 0 1 1',
 '1 1 1 1 1',
 '1 1 0 1 0',
 '1 1 0 0 0',
 '0 0 0 0 0',
 '0 0 0 0 0']

In [39]:
stale_binaries[0]

'1 0 1 0 1'

In [40]:
"".join(stale_binaries[0].split())

'10101'

In [41]:
int("".join(stale_binaries[0].split()), 2)

21

In [42]:
# Drop the separating spaces so each score becomes a single bit string.
# The name is rebound in place; re-running the cell is harmless because the
# strings then contain no whitespace left to remove.
stale_binaries = ["".join(x.split()) for x in stale_binaries]
stale_binaries

['10101', '10011', '11111', '11010', '11000', '00000', '00000']

In [43]:
[int(binary, 2) for binary in stale_binaries]

[21, 19, 31, 26, 24, 0, 0]

In [44]:
# Scores of "scare" against every word in the full table, as a Series.
is_scare_row = all_anagram_scoring_df.index == "scare"
scare = all_anagram_scoring_df[is_scare_row].squeeze()
scare

cadre     0  0 -1  1  1
freya    -1  0  0 -1  0
whity    -1 -1 -1 -1 -1
predy    -1  0  0 -1 -1
serac         1 0 0 0 0
              ...      
micro    -1 -1  0  1 -1
align     0 -1 -1 -1 -1
psalm    -1  0  1 -1 -1
anime     0 -1 -1 -1  1
talus    -1  0 -1 -1  0
Name: scare, Length: 726, dtype: string

In [45]:
# Keep only the entries whose score string contains no -1, paired with their
# position in the series.
cols = [
    (position, score)
    for position, score in enumerate(scare)
    if "-1" not in score
]
cols

[(4, '1 0 0 0 0'),
 (193, '1 1 1 1 1'),
 (248, '0 0 1 0 1'),
 (376, '1 0 0 1 1'),
 (507, '0 0 0 0 1')]

In [46]:
# Positions only, for use with .iloc on both axes.
scare_cols = [position for position, _score in cols]
scare_cols

[4, 193, 248, 376, 507]

In [47]:
(all_anagram_scoring_df
 .iloc[scare_cols, scare_cols])

Unnamed: 0,serac,scare,crase,sacre,carse
serac,1 1 1 1 1,1 0 0 0 0,0 0 0 0 0,1 0 0 0 0,0 0 1 0 0
scare,1 0 0 0 0,1 1 1 1 1,0 0 1 0 1,1 0 0 1 1,0 0 0 0 1
crase,0 0 0 0 0,0 0 1 0 1,1 1 1 1 1,0 0 0 0 1,1 0 0 1 1
sacre,1 0 0 0 0,1 0 0 1 1,0 0 0 0 1,1 1 1 1 1,0 1 0 0 1
carse,0 0 1 0 0,0 0 0 0 1,1 0 0 1 1,0 1 0 0 1,1 1 1 1 1


In [48]:
# remove spaces
scare_block = all_anagram_scoring_df.iloc[scare_cols, scare_cols]
scare_block.applymap(lambda cell: "".join(cell.split()))

Unnamed: 0,serac,scare,crase,sacre,carse
serac,11111,10000,0,10000,100
scare,10000,11111,101,10011,1
crase,0,101,11111,1,10011
sacre,10000,10011,1,11111,1001
carse,100,1,10011,1001,11111


In [49]:
# convert to int: strip the spaces and parse each score as a base-2 number
scare_block = all_anagram_scoring_df.iloc[scare_cols, scare_cols]
scare_block.applymap(lambda cell: int("".join(cell.split()), 2))

Unnamed: 0,serac,scare,crase,sacre,carse
serac,31,16,0,16,4
scare,16,31,5,19,1
crase,0,5,31,1,19
sacre,16,19,1,31,9
carse,4,1,19,9,31


In [50]:
# Integer-encoded scores for the "scare" group plus a per-row total.
# - sum(axis=1) adds up each row, which is what sum_across is meant to hold;
#   the default column-wise sum only gave the same numbers because this
#   matrix is symmetric.
# - int16 instead of int8: a row total is up to 31 per word in the group, so
#   it exceeds the int8 maximum of 127 for groups of five or more words and
#   would wrap around silently.
(all_anagram_scoring_df
 .iloc[scare_cols, scare_cols]
 .applymap(lambda x: "".join(x.split()))
 .applymap(lambda x: int(x, 2))
 .assign(sum_across=lambda df_: df_.sum(axis=1))
 .astype(np.int16)
)

Unnamed: 0,serac,scare,crase,sacre,carse,sum_across
serac,31,16,0,16,4,67
scare,16,31,5,19,1,72
crase,0,5,31,1,19,56
sacre,16,19,1,31,9,76
carse,4,1,19,9,31,64


## Anagram cipher lookup table  
With `itertools.zip_longest` you can create an encode/decode lookup table for 4, 5 and 6 letter anagrams.

In [51]:
list(itertools.zip_longest("taper", "slip", fillvalue="*"))

[('t', 's'), ('a', 'l'), ('p', 'i'), ('e', 'p'), ('r', '*')]

In [52]:
list(itertools.zip_longest("taper", "scare"))

[('t', 's'), ('a', 'c'), ('p', 'a'), ('e', 'r'), ('r', 'e')]