In [1]:
import numpy as np
import pandas as pd
import itertools
import random
from tqdm import tqdm
import re

import os
# where to save the data
ROOT_DIR = "."
FOLDER_NAME = "data"
PATH_TO_FOLDER = os.path.join(ROOT_DIR, FOLDER_NAME)
os.makedirs(PATH_TO_FOLDER, exist_ok=True)

## Load data

In [2]:
anagrams = pd.read_csv("./data/anagrams.csv")

In [3]:
anagrams

Unnamed: 0,word,anagrams
0,abode,"abode, adobe"
1,abort,"abort, tabor"
2,acred,"acred, cader"
3,acrid,"acrid, caird"
4,adept,"adept, pated"
...,...,...
382,worth,"worth, wroth, whort"
383,wrath,"wrath, thraw"
384,wreak,"wreak, waker"
385,wrote,"wrote, tower"


In [4]:
anagrams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387 entries, 0 to 386
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   word      387 non-null    object
 1   anagrams  387 non-null    object
dtypes: object(2)
memory usage: 6.2+ KB


In [5]:
anagrams["word"].nunique()

387

## Number of anagrams

In [6]:
my_list_of_anagrams = [x.split(", ") for x in anagrams["anagrams"]]

In [7]:
my_list_of_anagrams

[['abode', 'adobe'],
 ['abort', 'tabor'],
 ['acred', 'cader'],
 ['acrid', 'caird'],
 ['adept', 'pated'],
 ['adore', 'oread'],
 ['afire', 'feria'],
 ['aider', 'irade'],
 ['aitch', 'chati'],
 ['alert', 'alter'],
 ['algor', 'orgal'],
 ['alien', 'aline', 'anile'],
 ['align', 'algin'],
 ['aline', 'anile'],
 ['alish', 'shail'],
 ['aloft', 'flota'],
 ['aloin', 'anoil'],
 ['amend', 'maned'],
 ['amigo', 'imago'],
 ['amine', 'manie'],
 ['amity', 'atimy'],
 ['amole', 'maleo'],
 ['ample', 'maple', 'pelma'],
 ['ampul', 'pluma'],
 ['angel', 'angle'],
 ['anime', 'maine', 'amine', 'manie'],
 ['apert', 'peart'],
 ['apish', 'aphis'],
 ['aptly', 'typal', 'platy', 'patly'],
 ['archy', 'chary'],
 ['argol', 'algor', 'orgal'],
 ['argon', 'orang', 'angor'],
 ['aries', 'serai', 'aesir'],
 ['arise', 'aries', 'serai', 'aesir'],
 ['arist', 'trias'],
 ['armet', 'terma'],
 ['arose', 'seora'],
 ['aspen', 'snape', 'sneap', 'spane'],
 ['aspic', 'spica'],
 ['aster', 'tarse', 'arest'],
 ['astir', 'tisar', 'tarsi', 'aris

In [8]:
vec = np.vectorize(len)

In [9]:
num_anagrams_per_list = vec(np.array(my_list_of_anagrams, dtype=object))
num_anagrams_per_list

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       3, 2, 2, 4, 2, 2, 4, 2, 3, 3, 3, 4, 2, 2, 2, 4, 2, 3, 5, 2, 2, 2,
       3, 2, 2, 2, 2, 4, 3, 2, 2, 2, 2, 3, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 4, 2, 2,
       2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 3, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 4, 2, 2, 2, 2, 3, 2, 3, 2,
       3, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 3, 2, 2, 2, 2,
       3, 2, 2, 3, 4, 2, 2, 2, 2, 2, 3, 2, 2, 2, 4, 2, 2, 2, 2, 2, 3, 2,
       3, 2, 2, 3, 3, 2, 2, 2, 2, 2, 3, 2, 5, 4, 2, 2, 2, 4, 4, 6, 2, 2,
       4, 2, 2, 2, 2, 2, 2, 2, 3, 2, 5, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2,
       2, 4, 4, 2, 2, 2, 2, 3, 2, 3, 2, 5, 2, 2, 2, 3, 2, 3, 2, 2, 3, 2,
       3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 5, 2, 2, 3, 2, 2, 2,
       3, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 4, 5, 4, 2, 2, 3, 2, 2, 2, 3,
       3, 4, 2, 4, 2, 2, 7, 2, 4, 2, 2, 2, 2, 2, 2,

In [10]:
anagrams["num_anagrams"] = num_anagrams_per_list

In [11]:
anagrams

Unnamed: 0,word,anagrams,num_anagrams
0,abode,"abode, adobe",2
1,abort,"abort, tabor",2
2,acred,"acred, cader",2
3,acrid,"acrid, caird",2
4,adept,"adept, pated",2
...,...,...,...
382,worth,"worth, wroth, whort",3
383,wrath,"wrath, thraw",2
384,wreak,"wreak, waker",2
385,wrote,"wrote, tower",2


In [12]:
anagrams.loc[anagrams["num_anagrams"] == anagrams["num_anagrams"].max()]

Unnamed: 0,word,anagrams,num_anagrams
292,steal,"steal, tales, slate, stale, satle, stela, astel",7


In [13]:
index_loc = np.where(num_anagrams_per_list == np.max(num_anagrams_per_list))
index_loc

(array([292]),)

In [14]:
index_loc[0][0]

292

In [15]:
my_list_of_anagrams[index_loc[0][0]]

['steal', 'tales', 'slate', 'stale', 'satle', 'stela', 'astel']

In [16]:
combo_words_1 = [combo for combo in itertools.combinations(my_list_of_anagrams[index_loc[0][0]], 2)]
combo_words_1

[('steal', 'tales'),
 ('steal', 'slate'),
 ('steal', 'stale'),
 ('steal', 'satle'),
 ('steal', 'stela'),
 ('steal', 'astel'),
 ('tales', 'slate'),
 ('tales', 'stale'),
 ('tales', 'satle'),
 ('tales', 'stela'),
 ('tales', 'astel'),
 ('slate', 'stale'),
 ('slate', 'satle'),
 ('slate', 'stela'),
 ('slate', 'astel'),
 ('stale', 'satle'),
 ('stale', 'stela'),
 ('stale', 'astel'),
 ('satle', 'stela'),
 ('satle', 'astel'),
 ('stela', 'astel')]

In [17]:
len(anagrams["word"])

387

In [18]:
challenge = my_list_of_anagrams[index_loc[0][0]][0]
challenge

'steal'

## Scoring anagrams

In [19]:
guess_anagrams = my_list_of_anagrams[index_loc[0][0]][1:]
guess_anagrams

['tales', 'slate', 'stale', 'satle', 'stela', 'astel']

In [20]:
data = []
for guess in guess_anagrams:
    paired = zip(challenge, guess)
    scores = np.zeros(5, dtype=np.int8)
    for x, (i, j) in enumerate(paired):
#         print((x, (i, j)))
        if i == j:
            scores[x] = 1
        elif i != j and j in challenge:
            scores[x] = 0
        else:
            scores[x] = -1
            
    temp = {
        "challenge_word": challenge,
        "guess": guess,
        "positional_scores": re.sub(r"[\[\]]", "", str(scores)).replace("\n", ","),
        "overall_scores": np.sum(scores, dtype=np.int8),
    }
    data.append(temp)

In [21]:
df = pd.DataFrame(data)

In [22]:
df

Unnamed: 0,challenge_word,guess,positional_scores,overall_scores
0,steal,tales,0 0 0 0 0,0
1,steal,slate,1 0 0 0 0,1
2,steal,stale,1 1 0 0 0,2
3,steal,satle,1 0 0 0 0,1
4,steal,stela,1 1 1 0 0,3
5,steal,astel,0 0 0 0 1,1


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   challenge_word     6 non-null      object
 1   guess              6 non-null      object
 2   positional_scores  6 non-null      object
 3   overall_scores     6 non-null      int8  
dtypes: int8(1), object(3)
memory usage: 278.0+ bytes


In [24]:
anagrams.loc[anagrams["num_anagrams"] == 3]

Unnamed: 0,word,anagrams,num_anagrams
11,alien,"alien, aline, anile",3
22,ample,"ample, maple, pelma",3
30,argol,"argol, algor, orgal",3
31,argon,"argon, orang, angor",3
32,aries,"aries, serai, aesir",3
39,aster,"aster, tarse, arest",3
44,cadre,"cadre, acred, cader",3
50,caple,"caple, capel, clape",3
55,cater,"cater, creat, caret",3
103,cupel,"cupel, pucel, pecul",3


In [25]:
words = my_list_of_anagrams[index_loc[0][0]]
words

['steal', 'tales', 'slate', 'stale', 'satle', 'stela', 'astel']

In [26]:
guesses = my_list_of_anagrams[index_loc[0][0]]
guesses

['steal', 'tales', 'slate', 'stale', 'satle', 'stela', 'astel']

In [27]:
data2 = []
for guess in guesses:
    for word in words:
        paired_guess_word = zip(word, guess)
        score = np.zeros(5, dtype=np.int8)
        for x, (i, j) in enumerate(paired_guess_word):
            if i == j:
                score[x] = 1
            elif i != j and j in word:
                score[x] = 0
            else:
                score[x] = -1
            
        temp2 = {
            "challenge_word": word,
            "guess": guess,
            "positional_scores": re.sub(r"[\[\]]", "", str(score)).replace("\n", ","),
        }
        data2.append(temp2)
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,challenge_word,guess,positional_scores
0,steal,steal,1 1 1 1 1
1,tales,steal,0 0 0 0 0
2,slate,steal,1 0 0 0 0
3,stale,steal,1 1 0 0 0
4,satle,steal,1 0 0 0 0
5,stela,steal,1 1 1 0 0
6,astel,steal,0 0 0 0 1
7,steal,tales,0 0 0 0 0
8,tales,tales,1 1 1 1 1
9,slate,tales,0 0 0 0 0


### Scoring each anagram against itself

In [28]:
print(guesses)
print(words)

['steal', 'tales', 'slate', 'stale', 'satle', 'stela', 'astel']
['steal', 'tales', 'slate', 'stale', 'satle', 'stela', 'astel']


In [29]:
len(guesses)

7

In [30]:
data3 = []
num_guesses = len(words)
counter = 0

while counter < num_guesses:
    for word in words:
        guess = words[counter]
        combo_guess_word = zip(guess, word)
        scores = np.zeros(5, dtype=np.int8)
        for x, (i, j) in enumerate(combo_guess_word):
            if i == j:
                scores[x] = 1
            elif i != j and j in word:
                scores[x] = 0
            else:
                scores[x] = -1
        data3.append(re.sub(r"[\[\]]", "", str(scores)).replace("\n", ","))
    counter += 1   


In [31]:
data3[:7]

['1 1 1 1 1',
 '0 0 0 0 0',
 '1 0 0 0 0',
 '1 1 0 0 0',
 '1 0 0 0 0',
 '1 1 1 0 0',
 '0 0 0 0 1']

In [32]:
len(data3), num_guesses

(49, 7)

In [33]:
# Every 7th value (num guesses) in data3 represents the next guess word scored against the challenge word
# so we can reshape data3 to be a 7x7 array
np.array(data3).reshape(num_guesses, num_guesses)

array([['1 1 1 1 1', '0 0 0 0 0', '1 0 0 0 0', '1 1 0 0 0', '1 0 0 0 0',
        '1 1 1 0 0', '0 0 0 0 1'],
       ['0 0 0 0 0', '1 1 1 1 1', '0 0 0 0 0', '0 0 0 0 0', '0 1 0 0 0',
        '0 0 0 0 0', '0 0 0 1 0'],
       ['1 0 0 0 0', '0 0 0 0 0', '1 1 1 1 1', '1 0 1 0 1', '1 0 0 0 1',
        '1 0 0 0 0', '0 0 0 0 0'],
       ['1 1 0 0 0', '0 0 0 0 0', '1 0 1 0 1', '1 1 1 1 1', '1 0 0 1 1',
        '1 1 0 1 0', '0 0 0 0 0'],
       ['1 0 0 0 0', '0 1 0 0 0', '1 0 0 0 1', '1 0 0 1 1', '1 1 1 1 1',
        '1 0 0 1 0', '0 0 1 0 0'],
       ['1 1 1 0 0', '0 0 0 0 0', '1 0 0 0 0', '1 1 0 1 0', '1 0 0 1 0',
        '1 1 1 1 1', '0 0 0 0 0'],
       ['0 0 0 0 1', '0 0 0 1 0', '0 0 0 0 0', '0 0 0 0 0', '0 0 1 0 0',
        '0 0 0 0 0', '1 1 1 1 1']], dtype='<U9')

In [34]:
df3 = pd.DataFrame(np.array(data3).reshape(7, 7), columns=words, index=guesses)

In [35]:
df3

Unnamed: 0,steal,tales,slate,stale,satle,stela,astel
steal,1 1 1 1 1,0 0 0 0 0,1 0 0 0 0,1 1 0 0 0,1 0 0 0 0,1 1 1 0 0,0 0 0 0 1
tales,0 0 0 0 0,1 1 1 1 1,0 0 0 0 0,0 0 0 0 0,0 1 0 0 0,0 0 0 0 0,0 0 0 1 0
slate,1 0 0 0 0,0 0 0 0 0,1 1 1 1 1,1 0 1 0 1,1 0 0 0 1,1 0 0 0 0,0 0 0 0 0
stale,1 1 0 0 0,0 0 0 0 0,1 0 1 0 1,1 1 1 1 1,1 0 0 1 1,1 1 0 1 0,0 0 0 0 0
satle,1 0 0 0 0,0 1 0 0 0,1 0 0 0 1,1 0 0 1 1,1 1 1 1 1,1 0 0 1 0,0 0 1 0 0
stela,1 1 1 0 0,0 0 0 0 0,1 0 0 0 0,1 1 0 1 0,1 0 0 1 0,1 1 1 1 1,0 0 0 0 0
astel,0 0 0 0 1,0 0 0 1 0,0 0 0 0 0,0 0 0 0 0,0 0 1 0 0,0 0 0 0 0,1 1 1 1 1


### Take the anagram dataframe and randomly select a word to display the scoring

In [157]:
def anagram_entropy(dataf):
    rnd_indx = random.sample(population=dataf.index.tolist(), k=1)
    anagrams_words = dataf["anagrams"].iloc[rnd_indx[0]].split(", ")
    num_guesses = len(anagrams_words)
    data = []
    counter = 0

    while counter < num_guesses:
        for word in anagrams_words:
            guess = anagrams_words[counter]
            combo_guess_word = zip(guess, word)
            scores = np.zeros(5, dtype=np.int8)
            for x, (i, j) in enumerate(combo_guess_word):
                if i == j:
                    scores[x] = 1
                elif i != j and j in word:
                    scores[x] = 0
                else:
                    scores[x] = -1
            data.append(re.sub(r"[\[\]]", "", str(scores)).replace("\n", ","))
        counter += 1
        
    # Every num_guesses in data represents the next guess word scored against the challenge word
    # so we can reshape data to be a num_guesses x num_guesses array
    return pd.DataFrame(np.array(data).reshape(num_guesses, num_guesses),
                        columns=anagrams_words,
                        index=anagrams_words)

anagram_scoring_df = anagram_entropy(anagrams)
anagram_scoring_df

Unnamed: 0,polka,pokal
polka,1 1 1 1 1,1 1 0 0 0
pokal,1 1 0 0 0,1 1 1 1 1


### Score all anagrams against each other. The scoring partners which aren't anagrams will contain `-1s`

In [37]:
anagrams["anagrams"].iloc[:5]

0    abode, adobe
1    abort, tabor
2    acred, cader
3    acrid, caird
4    adept, pated
Name: anagrams, dtype: object

In [38]:
some_anagrams = [grams.split(", ") for grams in anagrams["anagrams"]]

In [39]:
# grammys = []
# for subgram in some_anagrams:
#     for gram in subgram:
#         grammys.append(gram)

In [40]:
grammys = [gram for subgram in some_anagrams for gram in subgram]
grammys[:5]

['abode', 'adobe', 'abort', 'tabor', 'acred']

In [41]:
len(grammys)

938

In [42]:
# ensure unique words
len(set(grammys))

672

In [43]:
def all_anagram_entropy(dataf):
    datax = []
    all_anagrams = [grams.split(", ") for grams in anagrams["anagrams"].tolist()]
    all_anagrams = set([gram for subgram in some_anagrams for gram in subgram]) # ensure unique words
    num_anagrams = len(all_anagrams)
    for word in tqdm(all_anagrams):
        for guess in all_anagrams:
            scores = np.zeros(5, dtype=np.int8)
            for idx, (x, y) in enumerate(zip(word, guess)):
                if x == y:
                    scores[idx] = 1
                elif x != y and y in word:
                    scores[idx] = 0
                else:
                    scores[idx] = -1
            datax.append(re.sub(r"[\[\]]", "", str(scores)).replace("\n", ","))
            
    
    # len(datax) is num_anagrams squared, so we reshape data to be a num_anagrams x num_anagrams
    # array before putting it into a dataframe
    return pd.DataFrame(np.array(datax).reshape(num_anagrams, num_anagrams),
                        columns=all_anagrams,
                        index=all_anagrams)

In [44]:
all_anagram_scoring_df = all_anagram_entropy(anagrams)

100%|███████████████████████████████████████████████████████████████████████| 672/672 [00:22<00:00, 29.88it/s]


In [45]:
all_anagram_scoring_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 672 entries, spate to prude
Columns: 672 entries, spate to prude
dtypes: object(672)
memory usage: 3.5+ MB


In [46]:
all_anagram_scoring_df

Unnamed: 0,spate,plane,petal,saleb,steik,clary,sarpo,close,swale,polka,...,stour,algor,sutor,torus,shail,flame,skute,owher,scate,prude
spate,1 1 1 1 1,0 -1 1 -1 1,0 0 0 0 -1,1 0 -1 0 -1,1 0 0 -1 -1,-1 -1 1 -1 -1,1 0 -1 0 -1,-1 -1 -1 0 1,1 -1 1 -1 1,0 -1 -1 -1 0,...,1 0 -1 -1 -1,0 -1 -1 -1 -1,1 -1 0 -1 -1,0 -1 -1 -1 0,1 -1 1 -1 -1,-1 -1 1 -1 1,1 -1 -1 1 1,-1 -1 -1 0 -1,1 -1 1 1 1,0 -1 -1 -1 1
plane,-1 0 1 -1 1,1 1 1 1 1,1 0 -1 0 0,-1 0 0 0 -1,-1 -1 0 -1 -1,-1 1 1 -1 -1,-1 0 -1 0 -1,-1 1 -1 -1 1,-1 -1 1 0 1,1 -1 0 -1 0,...,-1 -1 -1 -1 -1,0 1 -1 -1 -1,-1 -1 -1 -1 -1,-1 -1 -1 -1 -1,-1 -1 1 -1 0,-1 1 1 -1 1,-1 -1 -1 -1 1,-1 -1 -1 0 -1,-1 -1 1 -1 1,1 -1 -1 -1 1
petal,-1 0 0 0 0,1 0 0 -1 0,1 1 1 1 1,-1 0 0 0 -1,-1 0 0 -1 -1,-1 0 0 -1 -1,-1 0 -1 0 -1,-1 0 -1 -1 0,-1 -1 0 0 0,1 -1 0 -1 0,...,-1 0 -1 -1 -1,0 0 -1 -1 -1,-1 -1 1 -1 -1,0 -1 -1 -1 -1,-1 -1 0 -1 1,-1 0 0 -1 0,-1 -1 -1 0 0,-1 -1 -1 0 -1,-1 -1 0 0 0,1 -1 -1 -1 0
saleb,1 -1 0 -1 0,-1 0 0 -1 0,-1 0 -1 0 0,1 1 1 1 1,1 -1 0 -1 -1,-1 0 0 -1 -1,1 1 -1 -1 -1,-1 0 -1 0 0,1 -1 0 0 0,-1 -1 1 -1 0,...,1 -1 -1 -1 -1,0 0 -1 -1 -1,1 -1 -1 -1 -1,-1 -1 -1 -1 0,1 -1 0 -1 0,-1 0 0 -1 0,1 -1 -1 -1 0,-1 -1 -1 1 -1,1 -1 0 -1 0,-1 -1 -1 -1 0
steik,1 -1 -1 0 0,-1 -1 -1 -1 0,-1 0 0 -1 -1,1 -1 -1 0 -1,1 1 1 1 1,-1 -1 -1 -1 -1,1 -1 -1 -1 -1,-1 -1 -1 0 0,1 -1 -1 -1 0,-1 -1 -1 0 -1,...,1 1 -1 -1 -1,-1 -1 -1 -1 -1,1 -1 0 -1 -1,0 -1 -1 -1 0,1 -1 -1 1 -1,-1 -1 -1 -1 0,1 0 -1 0 0,-1 -1 -1 0 -1,1 -1 -1 0 0,-1 -1 -1 -1 0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
flame,-1 -1 1 -1 1,-1 1 1 -1 1,-1 0 -1 0 0,-1 0 0 0 -1,-1 -1 0 -1 -1,-1 1 1 -1 -1,-1 0 -1 -1 -1,-1 1 -1 -1 1,-1 -1 1 0 1,-1 -1 0 -1 0,...,-1 -1 -1 -1 -1,0 1 -1 -1 -1,-1 -1 -1 -1 -1,-1 -1 -1 -1 -1,-1 -1 1 -1 0,1 1 1 1 1,-1 -1 -1 -1 1,-1 -1 -1 0 -1,-1 -1 1 -1 1,-1 -1 -1 -1 1
skute,1 -1 -1 1 1,-1 -1 -1 -1 1,-1 0 0 -1 -1,1 -1 -1 0 -1,1 0 0 -1 0,-1 -1 -1 -1 -1,1 -1 -1 -1 -1,-1 -1 -1 0 1,1 -1 -1 -1 1,-1 -1 -1 0 -1,...,1 0 -1 0 -1,-1 -1 -1 -1 -1,1 0 0 -1 -1,0 -1 -1 0 0,1 -1 -1 -1 -1,-1 -1 -1 -1 1,1 1 1 1 1,-1 -1 -1 0 -1,1 -1 -1 1 1,-1 -1 1 -1 1
owher,-1 -1 -1 -1 0,-1 -1 -1 -1 0,-1 0 -1 -1 -1,-1 -1 -1 1 -1,-1 -1 0 -1 -1,-1 -1 -1 0 -1,-1 -1 0 -1 0,-1 -1 0 -1 0,-1 1 -1 -1 0,-1 0 -1 -1 -1,...,-1 -1 0 -1 1,-1 -1 -1 0 1,-1 -1 -1 0 1,-1 0 0 -1 -1,-1 0 -1 -1 -1,-1 -1 -1 -1 0,-1 -1 -1 -1 0,1 1 1 1 1,-1 -1 -1 -1 0,-1 0 -1 -1 0
scate,1 -1 1 1 1,-1 -1 1 -1 1,-1 0 0 0 -1,1 0 -1 0 -1,1 0 0 -1 -1,0 -1 1 -1 -1,1 0 -1 -1 -1,0 -1 -1 0 1,1 -1 1 -1 1,-1 -1 -1 -1 0,...,1 0 -1 -1 -1,0 -1 -1 -1 -1,1 -1 0 -1 -1,0 -1 -1 -1 0,1 -1 1 -1 -1,-1 -1 1 -1 1,1 -1 -1 1 1,-1 -1 -1 0 -1,1 1 1 1 1,-1 -1 -1 -1 1


In [47]:
all_anagram_scoring_df.to_csv("./data/all_anagram_scoring.csv", index=False)

## 5 bit binary  

Scoring of anagrams produces 5 bit binary with range from `00000` to `11111` `(0 to 31)`.  
- `00000`, all letter are in wrong place
- `11111`, letters matched correctly

In [48]:
df3

Unnamed: 0,steal,tales,slate,stale,satle,stela,astel
steal,1 1 1 1 1,0 0 0 0 0,1 0 0 0 0,1 1 0 0 0,1 0 0 0 0,1 1 1 0 0,0 0 0 0 1
tales,0 0 0 0 0,1 1 1 1 1,0 0 0 0 0,0 0 0 0 0,0 1 0 0 0,0 0 0 0 0,0 0 0 1 0
slate,1 0 0 0 0,0 0 0 0 0,1 1 1 1 1,1 0 1 0 1,1 0 0 0 1,1 0 0 0 0,0 0 0 0 0
stale,1 1 0 0 0,0 0 0 0 0,1 0 1 0 1,1 1 1 1 1,1 0 0 1 1,1 1 0 1 0,0 0 0 0 0
satle,1 0 0 0 0,0 1 0 0 0,1 0 0 0 1,1 0 0 1 1,1 1 1 1 1,1 0 0 1 0,0 0 1 0 0
stela,1 1 1 0 0,0 0 0 0 0,1 0 0 0 0,1 1 0 1 0,1 0 0 1 0,1 1 1 1 1,0 0 0 0 0
astel,0 0 0 0 1,0 0 0 1 0,0 0 0 0 0,0 0 0 0 0,0 0 1 0 0,0 0 0 0 0,1 1 1 1 1


In [49]:
int("11111", 2)

31

In [50]:
int("00000", 2)

0

In [51]:
int("10000", 2)

16

In [52]:
int("10101", 2)

21

**`stale` contains 24 bits of information necessary to describe `steal`**

In [53]:
int("11000", 2)

24

In [153]:
int("00100", 2)

4

In [55]:
df3.loc[df3.index.isin(["stale"])]

Unnamed: 0,steal,tales,slate,stale,satle,stela,astel
stale,1 1 0 0 0,0 0 0 0 0,1 0 1 0 1,1 1 1 1 1,1 0 0 1 1,1 1 0 1 0,0 0 0 0 0


In [56]:
stale = df3.loc[df3.index.isin(["stale"])].T
stale

Unnamed: 0,stale
steal,1 1 0 0 0
tales,0 0 0 0 0
slate,1 0 1 0 1
stale,1 1 1 1 1
satle,1 0 0 1 1
stela,1 1 0 1 0
astel,0 0 0 0 0


In [57]:
df3.loc[df3.index.isin(["stale"])].squeeze()

steal    1 1 0 0 0
tales    0 0 0 0 0
slate    1 0 1 0 1
stale    1 1 1 1 1
satle    1 0 0 1 1
stela    1 1 0 1 0
astel    0 0 0 0 0
Name: stale, dtype: object

In [58]:
pd.DataFrame(df3.loc[df3.index.isin(["stale"])].squeeze())

Unnamed: 0,stale
steal,1 1 0 0 0
tales,0 0 0 0 0
slate,1 0 1 0 1
stale,1 1 1 1 1
satle,1 0 0 1 1
stela,1 1 0 1 0
astel,0 0 0 0 0


In [59]:
stale_binaries = df3.loc[df3.index.isin(["stale"])].squeeze().tolist()
stale_binaries

['1 1 0 0 0',
 '0 0 0 0 0',
 '1 0 1 0 1',
 '1 1 1 1 1',
 '1 0 0 1 1',
 '1 1 0 1 0',
 '0 0 0 0 0']

In [60]:
stale_binaries[0]

'1 1 0 0 0'

In [61]:
stale_binaries[0].split()

['1', '1', '0', '0', '0']

In [62]:
stale_binaries[0].strip()

'1 1 0 0 0'

In [63]:
"".join(stale_binaries[0].split())

'11000'

In [64]:
int("".join(stale_binaries[0].split()), 2)

24

In [65]:
stale_binaries = ["".join(x.split()) for x in stale_binaries]
stale_binaries

['11000', '00000', '10101', '11111', '10011', '11010', '00000']

In [66]:
[int(binary, 2) for binary in stale_binaries]

[24, 0, 21, 31, 19, 26, 0]

In [67]:
all_anagram_scoring_df.loc[all_anagram_scoring_df.index.isin(["stale"])].squeeze()

spate     1 -1  1  0  1
plane    -1  0  1 -1  1
petal    -1  0  0  0  0
saleb     1  0  0  0 -1
steik     1  1  0 -1 -1
              ...      
flame    -1  0  1 -1  1
skute     1 -1 -1  0  1
owher    -1 -1 -1  0 -1
scate     1 -1  1  0  1
prude    -1 -1 -1 -1  1
Name: stale, Length: 672, dtype: object

In [68]:
scare = all_anagram_scoring_df.loc[all_anagram_scoring_df.index.isin(["scare"])].squeeze()
scare

spate     1 -1  1 -1  1
plane    -1 -1  1 -1  1
petal    -1  0 -1  0 -1
saleb     1  0 -1  0 -1
steik     1 -1  0 -1 -1
              ...      
flame    -1 -1  1 -1  1
skute     1 -1 -1 -1  1
owher    -1 -1 -1  0  0
scate     1  1  1 -1  1
prude    -1  0 -1 -1  1
Name: scare, Length: 672, dtype: object

In [69]:
cols = [(idx, binary) for idx, binary in enumerate(scare) if "-1" not in binary]
cols

[(66, '0 0 0 0 1'),
 (106, '1 0 0 0 0'),
 (316, '1 0 0 1 1'),
 (347, '1 1 1 1 1'),
 (367, '0 0 1 0 1')]

In [70]:
scare_cols = [col[0] for col in cols]
scare_cols

[66, 106, 316, 347, 367]

In [71]:
all_anagram_scoring_df.iloc[scare_cols, scare_cols]

Unnamed: 0,carse,serac,sacre,scare,crase
carse,1 1 1 1 1,0 0 1 0 0,0 1 0 0 1,0 0 0 0 1,1 0 0 1 1
serac,0 0 1 0 0,1 1 1 1 1,1 0 0 0 0,1 0 0 0 0,0 0 0 0 0
sacre,0 1 0 0 1,1 0 0 0 0,1 1 1 1 1,1 0 0 1 1,0 0 0 0 1
scare,0 0 0 0 1,1 0 0 0 0,1 0 0 1 1,1 1 1 1 1,0 0 1 0 1
crase,1 0 0 1 1,0 0 0 0 0,0 0 0 0 1,0 0 1 0 1,1 1 1 1 1


In [72]:
all_anagram_scoring_df.iloc[scare_cols, scare_cols].applymap(lambda x: "".join(x.split()))

Unnamed: 0,carse,serac,sacre,scare,crase
carse,11111,100,1001,1,10011
serac,100,11111,10000,10000,0
sacre,1001,10000,11111,10011,1
scare,1,10000,10011,11111,101
crase,10011,0,1,101,11111


In [73]:
(all_anagram_scoring_df.iloc[scare_cols, scare_cols]
 .applymap(lambda x: "".join(x.split()))
 .applymap(lambda x: int(x, 2)))

Unnamed: 0,carse,serac,sacre,scare,crase
carse,31,4,9,1,19
serac,4,31,16,16,0
sacre,9,16,31,19,1
scare,1,16,19,31,5
crase,19,0,1,5,31


In [98]:
(all_anagram_scoring_df.iloc[scare_cols, scare_cols]
 .applymap(lambda x: "".join(x.split()))
 .applymap(lambda x: int(x, 2))
 .assign(sum_across=lambda x: x.sum())
 .astype(np.int8))

Unnamed: 0,carse,serac,sacre,scare,crase,sum_across
carse,31,4,9,1,19,64
serac,4,31,16,16,0,67
sacre,9,16,31,19,1,76
scare,1,16,19,31,5,72
crase,19,0,1,5,31,56


## Anagram cipher lookup table  
With `itertools.zip_longest` you can create an encode/decode lookup table for 4, 5 and 6 letter anagrams.

In [159]:
list(itertools.zip_longest("taper", "slip"))

[('t', 's'), ('a', 'l'), ('p', 'i'), ('e', 'p'), ('r', None)]

In [160]:
list(itertools.zip_longest("taper", "scare"))

[('t', 's'), ('a', 'c'), ('p', 'a'), ('e', 'r'), ('r', 'e')]

In [168]:
if all(("r", None)):
    print("Hurrah!!")
else:
    print("we are not the same")

we are not the same


In [169]:
type(None)

NoneType

In [166]:
"r" == None

False