In [1]:
import numpy as np
import pandas as pd
import itertools
import random
from tqdm import tqdm
import re
from wordle_helpers import anagram_scoring, all_anagram_scoring

import os
# Folder that holds the data files; it is created if it does not already exist.
ROOT_DIR, FOLDER_NAME = ".", "data"
PATH_TO_FOLDER = os.path.join(ROOT_DIR, FOLDER_NAME)
os.makedirs(name=PATH_TO_FOLDER, exist_ok=True)

## Load data

In [2]:
# Read the anagram groups from the configured data folder (PATH_TO_FOLDER)
# instead of repeating a hard-coded "./data" path.
anagrams = pd.read_csv(os.path.join(PATH_TO_FOLDER, "anagrams.csv"))

In [3]:
anagrams

Unnamed: 0,word,anagrams
0,abode,"abode, adobe"
1,abort,"abort, tabor"
2,acred,"acred, cader"
3,acrid,"acrid, caird"
4,adept,"adept, pated"
...,...,...
382,worth,"worth, wroth, whort"
383,wrath,"wrath, thraw"
384,wreak,"wreak, waker"
385,wrote,"wrote, tower"


In [4]:
anagrams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387 entries, 0 to 386
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   word      387 non-null    object
 1   anagrams  387 non-null    object
dtypes: object(2)
memory usage: 6.2+ KB


In [5]:
anagrams["word"].nunique()

387

## Number of anagrams

In [6]:
# Turn each ", "-separated group string into a list of its words.
my_list_of_anagrams = list(map(lambda group: group.split(", "), anagrams["anagrams"]))

In [7]:
my_list_of_anagrams[:5]

[['abode', 'adobe'],
 ['abort', 'tabor'],
 ['acred', 'cader'],
 ['acrid', 'caird'],
 ['adept', 'pated']]

In [8]:
# num_anagrams_per_list = [len(lst) for lst in my_list_of_anagrams]

In [9]:
# Count the words in every group: split each group string on ", " and take
# the length of the resulting list, element by element.
vec = np.vectorize(len)
group_strings = np.array(anagrams["anagrams"], dtype=str)
num_anagrams_per_list = vec(np.char.split(group_strings, sep=", "))
num_anagrams_per_list[:5]

array([2, 2, 2, 2, 2])

In [10]:
anagrams["num_anagrams"] = num_anagrams_per_list

In [11]:
anagrams.head()

Unnamed: 0,word,anagrams,num_anagrams
0,abode,"abode, adobe",2
1,abort,"abort, tabor",2
2,acred,"acred, cader",2
3,acrid,"acrid, caird",2
4,adept,"adept, pated",2


In [12]:
anagrams_copy = anagrams.copy()
anagrams_copy.head()

Unnamed: 0,word,anagrams,num_anagrams
0,abode,"abode, adobe",2
1,abort,"abort, tabor",2
2,acred,"acred, cader",2
3,acrid,"acrid, caird",2
4,adept,"adept, pated",2


In [13]:
# using assign to create new column
# (anagrams.
#  assign(num_anagrams = [len(lst) for lst in [pair.split(", ") for pair in anagrams["anagrams"]]]))

# (anagrams.
#  assign(num_anagrams = vec(np.char.split(np.array(anagrams["anagrams"], dtype=str),sep=", "))))

In [14]:
anagrams.loc[anagrams["num_anagrams"] == anagrams["num_anagrams"].max()]

Unnamed: 0,word,anagrams,num_anagrams
292,steal,"steal, tales, slate, stale, satle, stela, astel",7


In [15]:
anagrams.loc[anagrams["num_anagrams"] == anagrams["num_anagrams"].max()].index[0]

292

In [16]:
indx = np.argmax(num_anagrams_per_list)
indx

292

In [17]:
np.argmax(anagrams["num_anagrams"])

292

In [18]:
words = my_list_of_anagrams[indx]
words

['steal', 'tales', 'slate', 'stale', 'satle', 'stela', 'astel']

## Scoring anagrams

In [19]:
challenge = words[0]
challenge

'steal'

In [20]:
guess_anagrams = words[1:]
guess_anagrams

['tales', 'slate', 'stale', 'satle', 'stela', 'astel']

In [21]:
# Score every other anagram in the group against the challenge word.
# Per position: 1 = same letter in the same place, 0 = the guess letter occurs
# in the challenge word but at a different position, -1 = the guess letter does
# not occur in the challenge word at all.
data = []
for guess in guess_anagrams:
    # Size the score vector from the challenge word rather than assuming 5 letters.
    scores = np.zeros(len(challenge), dtype=np.int8)
    for pos, (challenge_letter, guess_letter) in enumerate(zip(challenge, guess)):
        if guess_letter == challenge_letter:
            scores[pos] = 1
        elif guess_letter in challenge:
            scores[pos] = 0
        else:
            scores[pos] = -1

    data.append(
        {
            "challenge_word": challenge,
            "guess": guess,
            # str(ndarray) renders like "[1 0 0 0 0]"; strip the brackets.
            "positional_scores": re.sub(r"[\[\]]", "", str(scores)).replace("\n", ","),
            "overall_scores": np.sum(scores, dtype=np.int8),
        }
    )

In [22]:
df = pd.DataFrame(data)

In [23]:
df

Unnamed: 0,challenge_word,guess,positional_scores,overall_scores
0,steal,tales,0 0 0 0 0,0
1,steal,slate,1 0 0 0 0,1
2,steal,stale,1 1 0 0 0,2
3,steal,satle,1 0 0 0 0,1
4,steal,stela,1 1 1 0 0,3
5,steal,astel,0 0 0 0 1,1


In [24]:
# 3 anagram words
anagrams.loc[anagrams["num_anagrams"] == 3].iloc[:5]

Unnamed: 0,word,anagrams,num_anagrams
11,alien,"alien, aline, anile",3
22,ample,"ample, maple, pelma",3
30,argol,"argol, algor, orgal",3
31,argon,"argon, orang, angor",3
32,aries,"aries, serai, aesir",3


### Scoring every anagram in the group against every other (and itself)

In [25]:
# Score every word in the group against every word in the group (itself included).
# data2 is filled row by row: all scores with words[0] as the row word come first,
# then words[1], and so on, so it can be reshaped to (num_guesses, num_guesses).
data2 = []
num_guesses = len(words)

for guess in words:
    for word in words:
        # Size the score vector from the word rather than assuming 5 letters.
        scores = np.zeros(len(guess), dtype=np.int8)
        for pos, (guess_letter, word_letter) in enumerate(zip(guess, word)):
            if guess_letter == word_letter:
                scores[pos] = 1
            # Membership has to be tested against the *other* word: a letter is
            # always "in" the word it was taken from, which would make the -1
            # branch below unreachable.
            elif word_letter in guess:
                scores[pos] = 0
            else:
                scores[pos] = -1
        # str(ndarray) renders like "[1 0 0 0 0]"; strip the brackets.
        data2.append(re.sub(r"[\[\]]", "", str(scores)).replace("\n", ","))

In [26]:
data2[:7]

['1 1 1 1 1',
 '0 0 0 0 0',
 '1 0 0 0 0',
 '1 1 0 0 0',
 '1 0 0 0 0',
 '1 1 1 0 0',
 '0 0 0 0 1']

In [27]:
len(data2), num_guesses

(49, 7)

In [28]:
# data2 holds the scores row by row: num_guesses consecutive entries per row word.
# It therefore reshapes directly into a num_guesses x num_guesses grid (7x7 here).
np.reshape(np.array(data2), (num_guesses, num_guesses))

array([['1 1 1 1 1', '0 0 0 0 0', '1 0 0 0 0', '1 1 0 0 0', '1 0 0 0 0',
        '1 1 1 0 0', '0 0 0 0 1'],
       ['0 0 0 0 0', '1 1 1 1 1', '0 0 0 0 0', '0 0 0 0 0', '0 1 0 0 0',
        '0 0 0 0 0', '0 0 0 1 0'],
       ['1 0 0 0 0', '0 0 0 0 0', '1 1 1 1 1', '1 0 1 0 1', '1 0 0 0 1',
        '1 0 0 0 0', '0 0 0 0 0'],
       ['1 1 0 0 0', '0 0 0 0 0', '1 0 1 0 1', '1 1 1 1 1', '1 0 0 1 1',
        '1 1 0 1 0', '0 0 0 0 0'],
       ['1 0 0 0 0', '0 1 0 0 0', '1 0 0 0 1', '1 0 0 1 1', '1 1 1 1 1',
        '1 0 0 1 0', '0 0 1 0 0'],
       ['1 1 1 0 0', '0 0 0 0 0', '1 0 0 0 0', '1 1 0 1 0', '1 0 0 1 0',
        '1 1 1 1 1', '0 0 0 0 0'],
       ['0 0 0 0 1', '0 0 0 1 0', '0 0 0 0 0', '0 0 0 0 0', '0 0 1 0 0',
        '0 0 0 0 0', '1 1 1 1 1']], dtype='<U9')

In [29]:
# Use num_guesses rather than a literal 7 so the grid follows the size of `words`.
df2 = pd.DataFrame(
    np.array(data2).reshape(num_guesses, num_guesses),
    columns=words,
    index=words,
)

In [30]:
df2

Unnamed: 0,steal,tales,slate,stale,satle,stela,astel
steal,1 1 1 1 1,0 0 0 0 0,1 0 0 0 0,1 1 0 0 0,1 0 0 0 0,1 1 1 0 0,0 0 0 0 1
tales,0 0 0 0 0,1 1 1 1 1,0 0 0 0 0,0 0 0 0 0,0 1 0 0 0,0 0 0 0 0,0 0 0 1 0
slate,1 0 0 0 0,0 0 0 0 0,1 1 1 1 1,1 0 1 0 1,1 0 0 0 1,1 0 0 0 0,0 0 0 0 0
stale,1 1 0 0 0,0 0 0 0 0,1 0 1 0 1,1 1 1 1 1,1 0 0 1 1,1 1 0 1 0,0 0 0 0 0
satle,1 0 0 0 0,0 1 0 0 0,1 0 0 0 1,1 0 0 1 1,1 1 1 1 1,1 0 0 1 0,0 0 1 0 0
stela,1 1 1 0 0,0 0 0 0 0,1 0 0 0 0,1 1 0 1 0,1 0 0 1 0,1 1 1 1 1,0 0 0 0 0
astel,0 0 0 0 1,0 0 0 1 0,0 0 0 0 0,0 0 0 0 0,0 0 1 0 0,0 0 0 0 0,1 1 1 1 1


### Take the anagram dataframe and randomly select a word to display the scoring

In [31]:
# show score for random anagrams
# NOTE(review): presumably the helper picks the word at random on each call, so the
# table shown here will differ between runs unless a seed is fixed — confirm in
# wordle_helpers.
anagram_scoring_df = anagram_scoring(anagrams)
anagram_scoring_df

Unnamed: 0,shore,shoer
shore,1 1 1 1 1,1 1 1 0 0
shoer,1 1 1 0 0,1 1 1 1 1


### Score all anagrams against each other. Pairs that are not anagrams of each other will contain `-1`s  

In [32]:
all_anagram_scoring_df = all_anagram_scoring(anagrams)

100%|█████████████████████████████████████████████████████████████████████████████████| 672/672 [00:22<00:00, 30.30it/s]


In [33]:
all_anagram_scoring_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 672 entries, shale to misly
Columns: 672 entries, shale to misly
dtypes: string(672)
memory usage: 7.8 MB


In [34]:
all_anagram_scoring_df

Unnamed: 0,shale,stark,inset,cheap,ayont,argus,afire,couth,foxly,align,...,atony,melic,carom,somne,craps,ceint,feral,pacer,chief,misly
shale,1 1 1 1 1,1 -1 1 -1 -1,-1 -1 0 0 -1,-1 1 0 0 -1,0 -1 -1 -1 -1,0 -1 -1 -1 0,0 -1 -1 -1 1,-1 -1 -1 -1 0,-1 -1 -1 1 -1,0 0 -1 -1 -1,...,0 -1 -1 -1 -1,-1 0 0 -1 -1,-1 0 -1 -1 -1,1 -1 -1 -1 1,-1 -1 1 -1 0,-1 0 -1 -1 -1,-1 0 -1 0 0,-1 0 -1 0 -1,-1 1 -1 0 -1,-1 -1 0 1 -1
stark,1 -1 1 -1 -1,1 1 1 1 1,-1 -1 0 -1 0,-1 -1 -1 0 -1,0 -1 -1 -1 0,0 0 -1 -1 0,0 -1 -1 1 -1,-1 -1 -1 0 -1,-1 -1 -1 -1 -1,0 -1 -1 -1 -1,...,0 1 -1 -1 -1,-1 -1 -1 -1 -1,-1 0 0 -1 -1,1 -1 -1 -1 -1,-1 0 1 -1 0,-1 -1 -1 -1 0,-1 -1 0 0 -1,-1 0 -1 -1 0,-1 -1 -1 -1 -1,-1 -1 0 -1 -1
inset,0 -1 -1 -1 0,0 0 -1 -1 -1,1 1 1 1 1,-1 -1 0 -1 -1,-1 -1 -1 0 1,-1 -1 -1 -1 0,-1 -1 0 -1 0,-1 -1 -1 0 -1,-1 -1 -1 -1 -1,-1 -1 0 -1 0,...,-1 0 -1 0 -1,-1 0 -1 0 -1,-1 -1 -1 -1 -1,0 -1 -1 0 0,-1 -1 -1 -1 0,-1 0 0 0 1,-1 0 -1 -1 -1,-1 -1 -1 1 -1,-1 -1 0 1 -1,-1 0 1 -1 -1
cheap,-1 1 0 -1 0,-1 -1 0 -1 -1,-1 -1 -1 0 -1,1 1 1 1 1,0 -1 -1 -1 -1,0 -1 -1 -1 -1,0 -1 -1 -1 0,1 -1 -1 -1 0,-1 -1 -1 -1 -1,0 -1 -1 -1 -1,...,0 -1 -1 -1 -1,-1 0 -1 -1 0,1 0 -1 -1 -1,-1 -1 -1 -1 0,1 -1 0 0 -1,1 0 -1 -1 -1,-1 0 -1 1 -1,0 0 0 0 -1,1 1 -1 0 -1,-1 -1 -1 -1 -1
ayont,-1 -1 0 -1 -1,-1 0 0 -1 -1,-1 0 -1 -1 1,-1 -1 -1 0 -1,1 1 1 1 1,1 -1 -1 -1 -1,1 -1 -1 -1 -1,-1 0 -1 0 -1,-1 0 -1 -1 0,1 -1 -1 -1 0,...,1 0 1 1 0,-1 -1 -1 -1 -1,-1 0 -1 0 -1,-1 0 -1 1 -1,-1 -1 0 -1 -1,-1 -1 -1 1 1,-1 -1 -1 0 -1,-1 0 -1 -1 -1,-1 -1 -1 -1 -1,-1 -1 -1 -1 0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ceint,-1 -1 -1 -1 0,-1 0 -1 -1 -1,0 0 -1 0 1,1 -1 0 -1 -1,-1 -1 -1 1 1,-1 -1 -1 -1 -1,-1 -1 1 -1 0,1 -1 -1 0 -1,-1 -1 -1 -1 -1,-1 -1 1 -1 0,...,-1 0 -1 1 -1,-1 1 -1 0 0,1 -1 -1 -1 -1,-1 -1 -1 1 0,1 -1 -1 -1 -1,1 1 1 1 1,-1 1 -1 -1 -1,-1 -1 0 0 -1,1 -1 1 0 -1,-1 0 -1 -1 -1
feral,-1 -1 0 0 0,-1 -1 0 0 -1,-1 -1 -1 0 -1,-1 -1 0 1 -1,0 -1 -1 -1 -1,0 0 -1 -1 -1,0 0 -1 0 0,-1 -1 -1 -1 -1,1 -1 -1 0 -1,0 0 -1 -1 -1,...,0 -1 -1 -1 -1,-1 1 0 -1 -1,-1 0 1 -1 -1,-1 -1 -1 -1 0,-1 0 0 -1 -1,-1 1 -1 -1 -1,1 1 1 1 1,-1 0 -1 0 0,-1 -1 -1 0 0,-1 -1 -1 0 -1
pacer,-1 -1 0 -1 0,-1 -1 0 0 -1,-1 -1 -1 1 -1,0 -1 0 0 0,0 -1 -1 -1 -1,0 0 -1 -1 -1,0 -1 -1 0 0,0 -1 -1 -1 -1,-1 -1 -1 -1 -1,0 -1 -1 -1 -1,...,0 -1 -1 -1 -1,-1 0 -1 -1 0,0 1 0 -1 -1,-1 -1 -1 -1 0,0 0 0 0 -1,0 0 -1 -1 -1,-1 0 0 0 -1,1 1 1 1 1,0 -1 -1 1 -1,-1 -1 -1 -1 -1
chief,-1 1 -1 -1 0,-1 -1 -1 -1 -1,0 -1 -1 1 -1,1 1 0 -1 -1,-1 -1 -1 -1 -1,-1 -1 -1 -1 -1,-1 0 1 -1 0,1 -1 -1 -1 0,0 -1 -1 -1 -1,-1 -1 1 -1 -1,...,-1 -1 -1 -1 -1,-1 0 -1 0 0,1 -1 -1 -1 -1,-1 -1 -1 -1 0,1 -1 -1 -1 -1,1 0 1 -1 -1,0 0 -1 -1 -1,-1 -1 0 1 -1,1 1 1 1 1,-1 0 -1 -1 -1


In [35]:
# Write into the configured data folder (PATH_TO_FOLDER) instead of repeating a
# hard-coded "./data" path.
# NOTE(review): index=False drops the row labels; they appear to repeat the column
# labels in the same order — confirm before relying on that when reloading.
all_anagram_scoring_df.to_csv(
    os.path.join(PATH_TO_FOLDER, "all_anagram_scoring.csv"),
    index=False,
)

## 5 bit binary  

Scoring one anagram against another produces a 5-bit binary pattern ranging from `00000` to `11111` (`0` to `31`).  
- `00000`: every letter is in the wrong place
- `11111`: every letter is in the correct place

In [36]:
df2

Unnamed: 0,steal,tales,slate,stale,satle,stela,astel
steal,1 1 1 1 1,0 0 0 0 0,1 0 0 0 0,1 1 0 0 0,1 0 0 0 0,1 1 1 0 0,0 0 0 0 1
tales,0 0 0 0 0,1 1 1 1 1,0 0 0 0 0,0 0 0 0 0,0 1 0 0 0,0 0 0 0 0,0 0 0 1 0
slate,1 0 0 0 0,0 0 0 0 0,1 1 1 1 1,1 0 1 0 1,1 0 0 0 1,1 0 0 0 0,0 0 0 0 0
stale,1 1 0 0 0,0 0 0 0 0,1 0 1 0 1,1 1 1 1 1,1 0 0 1 1,1 1 0 1 0,0 0 0 0 0
satle,1 0 0 0 0,0 1 0 0 0,1 0 0 0 1,1 0 0 1 1,1 1 1 1 1,1 0 0 1 0,0 0 1 0 0
stela,1 1 1 0 0,0 0 0 0 0,1 0 0 0 0,1 1 0 1 0,1 0 0 1 0,1 1 1 1 1,0 0 0 0 0
astel,0 0 0 0 1,0 0 0 1 0,0 0 0 0 0,0 0 0 0 0,0 0 1 0 0,0 0 0 0 0,1 1 1 1 1


In [37]:
int("11111", 2)

31

In [38]:
int("00000", 2)

0

In [39]:
int("10000", 2)

16

In [40]:
int("10101", 2)

21

**Scoring `stale` against `steal` gives the pattern `11000`, i.e. the integer 24 on the 0–31 scale (a 5-bit value, not 24 bits of information)**

In [41]:
int("11000", 2)

24

In [42]:
int("00100", 2)

4

In [43]:
df2.loc[df2.index.isin(["stale"])]

Unnamed: 0,steal,tales,slate,stale,satle,stela,astel
stale,1 1 0 0 0,0 0 0 0 0,1 0 1 0 1,1 1 1 1 1,1 0 0 1 1,1 1 0 1 0,0 0 0 0 0


In [44]:
stale = df2.loc[df2.index.isin(["stale"])].T
stale

Unnamed: 0,stale
steal,1 1 0 0 0
tales,0 0 0 0 0
slate,1 0 1 0 1
stale,1 1 1 1 1
satle,1 0 0 1 1
stela,1 1 0 1 0
astel,0 0 0 0 0


In [45]:
df2.loc[df2.index.isin(["stale"])].squeeze()

steal    1 1 0 0 0
tales    0 0 0 0 0
slate    1 0 1 0 1
stale    1 1 1 1 1
satle    1 0 0 1 1
stela    1 1 0 1 0
astel    0 0 0 0 0
Name: stale, dtype: object

In [46]:
pd.DataFrame(df2.loc[df2.index.isin(["stale"])].squeeze())

Unnamed: 0,stale
steal,1 1 0 0 0
tales,0 0 0 0 0
slate,1 0 1 0 1
stale,1 1 1 1 1
satle,1 0 0 1 1
stela,1 1 0 1 0
astel,0 0 0 0 0


In [47]:
# Select the "stale" row of the score grid and flatten it to a plain list of strings.
stale_row = df2.loc[df2.index.isin(["stale"])]
stale_binaries = stale_row.squeeze().tolist()
stale_binaries

['1 1 0 0 0',
 '0 0 0 0 0',
 '1 0 1 0 1',
 '1 1 1 1 1',
 '1 0 0 1 1',
 '1 1 0 1 0',
 '0 0 0 0 0']

In [48]:
stale_binaries[0]

'1 1 0 0 0'

In [49]:
"".join(stale_binaries[0].split())

'11000'

In [50]:
int("".join(stale_binaries[0].split()), 2)

24

In [51]:
# Drop the whitespace between the bits: "1 1 0 0 0" -> "11000".
stale_binaries = list(map(lambda bits: "".join(bits.split()), stale_binaries))
stale_binaries

['11000', '00000', '10101', '11111', '10011', '11010', '00000']

In [52]:
[int(binary, 2) for binary in stale_binaries]

[24, 0, 21, 31, 19, 26, 0]

In [53]:
scare = all_anagram_scoring_df.loc[all_anagram_scoring_df.index.isin(["scare"])].squeeze()
scare

shale     1 -1  1 -1  1
stark     1 -1  1  1 -1
inset    -1 -1  0  0 -1
cheap     0 -1  0  0 -1
ayont     0 -1 -1 -1 -1
              ...      
ceint     0  0 -1 -1 -1
feral    -1  0  0  0 -1
pacer    -1  0  0  0  0
chief     0 -1 -1  0 -1
misly    -1 -1  0 -1 -1
Name: scare, Length: 672, dtype: string

In [54]:
# Keep only the (position, score) pairs whose score string contains no "-1".
cols = list(filter(lambda pair: "-1" not in pair[1], enumerate(scare)))
cols

[(23, '0 0 0 0 1'),
 (29, '0 0 1 0 1'),
 (114, '1 0 0 1 1'),
 (317, '1 0 0 0 0'),
 (657, '1 1 1 1 1')]

In [55]:
# Positional indices of the entries kept in `cols`.
scare_cols = [position for position, _ in cols]
scare_cols

[23, 29, 114, 317, 657]

In [56]:
all_anagram_scoring_df.iloc[scare_cols, scare_cols]

Unnamed: 0,carse,crase,sacre,serac,scare
carse,1 1 1 1 1,1 0 0 1 1,0 1 0 0 1,0 0 1 0 0,0 0 0 0 1
crase,1 0 0 1 1,1 1 1 1 1,0 0 0 0 1,0 0 0 0 0,0 0 1 0 1
sacre,0 1 0 0 1,0 0 0 0 1,1 1 1 1 1,1 0 0 0 0,1 0 0 1 1
serac,0 0 1 0 0,0 0 0 0 0,1 0 0 0 0,1 1 1 1 1,1 0 0 0 0
scare,0 0 0 0 1,0 0 1 0 1,1 0 0 1 1,1 0 0 0 0,1 1 1 1 1


In [57]:
# remove spaces
all_anagram_scoring_df.iloc[scare_cols, scare_cols].applymap(lambda x: "".join(x.split()))

Unnamed: 0,carse,crase,sacre,serac,scare
carse,11111,10011,1001,100,1
crase,10011,11111,1,0,101
sacre,1001,1,11111,10000,10011
serac,100,0,10000,11111,10000
scare,1,101,10011,10000,11111


In [58]:
# convert to int
(all_anagram_scoring_df.iloc[scare_cols, scare_cols]
 .applymap(lambda x: "".join(x.split()))
 .applymap(lambda x: int(x, 2)))

Unnamed: 0,carse,crase,sacre,serac,scare
carse,31,19,9,4,1
crase,19,31,1,0,5
sacre,9,1,31,16,19
serac,4,0,16,31,16
scare,1,5,19,16,31


In [59]:
# Convert every score string in the selected sub-grid to its integer value
# (e.g. "1 0 0 1 1" -> "10011" -> 19) and append a per-row total.
(all_anagram_scoring_df.iloc[scare_cols, scare_cols]
 .applymap(lambda x: int("".join(x.split()), 2))
 # axis=1 sums across each row. The default (axis=0) sums down each column and
 # only lines up with the row labels when the grid is symmetric.
 .assign(sum_across=lambda x: x.sum(axis=1))
 # int16 rather than int8: five values of up to 31 can total 155, which is
 # beyond int8's maximum of 127 and would silently wrap.
 .astype(np.int16))

Unnamed: 0,carse,crase,sacre,serac,scare,sum_across
carse,31,19,9,4,1,64
crase,19,31,1,0,5,56
sacre,9,1,31,16,19,76
serac,4,0,16,31,16,67
scare,1,5,19,16,31,72


## Anagram cipher lookup table  
With `itertools.zip_longest` you can create an encode/decode lookup table for 4, 5 and 6 letter anagrams.

In [60]:
list(itertools.zip_longest("taper", "slip"))

[('t', 's'), ('a', 'l'), ('p', 'i'), ('e', 'p'), ('r', None)]

In [61]:
list(itertools.zip_longest("taper", "scare"))

[('t', 's'), ('a', 'c'), ('p', 'a'), ('e', 'r'), ('r', 'e')]