In [1]:
import pandas as pd
import numpy as np
import sys
import os
import random
from tqdm import tqdm
import re
import itertools
from wordle_helpers import wordle_scoring

In [2]:
# Load the pre-computed guess-pair scoring table from the gzipped CSV.
# Text columns use pandas' pyarrow-backed string dtype; the three per-row
# score sums are read as int8.
scoring = pd.read_csv(
    "./data/top_100_freq_used_5_letter_words_vs_webster_dict_all_five_letter_words.csv.gz",
    compression="gzip",
    dtype={
        **dict.fromkeys(
            [
                "challenge_word",
                "guess_pair",
                "first_guess",
                "second_guess",
                "first_pos_score",
                "second_pos_score",
                "correct_letter_pos_score",
            ],
            "string[pyarrow]",
        ),
        **dict.fromkeys(
            [
                "sum_first_pos_score",
                "sum_second_pos_score",
                "sum_correct_letter_pos_score",
            ],
            np.int8,
        ),
    },
)

In [35]:
# Preview the first five rows to check that the columns loaded as expected.
scoring.head(5)

Unnamed: 0,challenge_word,guess_pair,first_guess,second_guess,first_pos_score,second_pos_score,sum_first_pos_score,sum_second_pos_score,correct_letter_pos_score,sum_correct_letter_pos_score
0,zebra,"tried, along",tried,along,-1 0 -1 0 -1,0 -1 -1 -1 -1,-3,-4,0 0 -1 0 -1,-2
1,zebra,"thank, comes",thank,comes,-1 -1 0 -1 -1,-1 -1 -1 0 -1,-4,-4,-1 -1 0 0 -1,-3
2,zebra,"wrong, field",wrong,field,-1 0 -1 -1 -1,-1 -1 0 -1 -1,-4,-4,-1 0 0 -1 -1,-3
3,zebra,"front, music",front,music,-1 0 -1 -1 -1,-1 -1 -1 -1 -1,-4,-5,-1 0 -1 -1 -1,-4
4,zebra,"place, wrong",place,wrong,-1 -1 0 -1 0,-1 0 -1 -1 -1,-3,-4,-1 0 0 -1 0,-2


In [4]:
# memory_usage="deep" makes pandas inspect the actual column contents, so the
# reported footprint includes the string data rather than a shallow estimate.
scoring.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2785079 entries, 0 to 2785078
Data columns (total 10 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   challenge_word                string
 1   guess_pair                    string
 2   first_guess                   string
 3   second_guess                  string
 4   first_pos_score               string
 5   second_pos_score              string
 6   sum_first_pos_score           int8  
 7   sum_second_pos_score          int8  
 8   correct_letter_pos_score      string
 9   sum_correct_letter_pos_score  int8  
dtypes: int8(3), string(7)
memory usage: 263.6 MB


# best word pair

**The best `word pair` is the one with the `highest total score`, i.e. the highest `sum_correct_letter_pos_score` summed over every challenge word.**

In [5]:
# Total each guess pair's combined score over all of its rows, label the total
# column, and rank the pairs from the highest total to the lowest.
guess_pair_sum_scores = (
    scoring
    .groupby(by="guess_pair")[["sum_correct_letter_pos_score"]]
    .sum()
    .rename(columns={"sum_correct_letter_pos_score": "overall_sum_correct_letter_pos_score"})
    .sort_values(by="overall_sum_correct_letter_pos_score", ascending=False)
)

# Show the ten highest-ranked pairs.
guess_pair_sum_scores.head(10)

Unnamed: 0_level_0,overall_sum_correct_letter_pos_score
guess_pair,Unnamed: 1_level_1
"close, train",-7427.0
"story, plane",-7584.0
"point, share",-7647.0
"since, party",-7791.0
"point, scare",-7802.0
"share, count",-7819.0
"short, plane",-7938.0
"count, spare",-8024.0
"place, story",-8071.0
"train, chose",-8148.0


In [6]:
# idxmax() returns a Series keyed by column name; take its single entry by
# position with .iloc, because plain [0] on a label-indexed Series is
# deprecated positional access.
best_guess_pair = guess_pair_sum_scores.idxmax().iloc[0]
best_guess_pair

'close, train'

In [7]:
# idxmin() returns a Series keyed by column name; take its single entry by
# position with .iloc, because plain [0] on a label-indexed Series is
# deprecated positional access.
worst_guess_pair = guess_pair_sum_scores.idxmin().iloc[0]
worst_guess_pair

'might, found'

In [33]:
# Keep only the scoring rows that belong to the top-ranked guess pair.
# .iloc[0] reads the single idxmax() entry by position; plain [0] on that
# label-indexed Series is deprecated positional access.
best_guess_pair_df = scoring.loc[
    scoring["guess_pair"] == guess_pair_sum_scores.idxmax().iloc[0]
]
best_guess_pair_df.head()

Unnamed: 0,challenge_word,guess_pair,first_guess,second_guess,first_pos_score,second_pos_score,sum_first_pos_score,sum_second_pos_score,correct_letter_pos_score,sum_correct_letter_pos_score
55,zebra,"close, train",close,train,-1 -1 -1 -1 0,-1 0 0 -1 -1,-4,-3,-1 0 0 -1 0,-2
572,ficus,"close, train",close,train,0 -1 -1 0 -1,-1 -1 -1 0 -1,-3,-4,0 -1 -1 0 -1,-3
1089,bourd,"close, train",close,train,-1 -1 0 -1 -1,-1 0 -1 -1 -1,-4,-4,-1 0 0 -1 -1,-3
1606,sited,"close, train",close,train,-1 -1 -1 0 0,0 -1 -1 0 -1,-3,-3,0 -1 -1 0 0,-2
2123,oxfly,"close, train",close,train,-1 0 0 -1 -1,-1 -1 -1 -1 -1,-3,-5,-1 0 0 -1 -1,-3


# conclusion  

My best two-word opener is `"close, train"`, which coincidentally is an anagram of the pair identified by [3blue1brown](https://www.youtube.com/watch?v=fRed0Xmc2Wg) (timestamp 10:52).

## anagrams  

anagrams of `close, train`:  
- `soare, clint`
- `slane, troic` (slane - a spade for cutting turf, troic - pertaining to Troy)
- `lanes, troic`
- `salet, orcin` (salet - a combat helmet, orcin - a colorless substance obtained from certain lichens)
- `tales, orcin`
- `lates, orcin`
- `slate, orcin`
- `trace, loins`
- `trace, lions`
- `crate, lions`
- `crate, loins`

`arose, clint` may be more appropriate, as `soare` (a young hawk) is rarely used.  

additional anagrams:  
- `arise, clton`
- `raise, clton`

The second "word" in each of the additional anagrams above is simply the leftover letters, not a real word.

In [20]:
# Candidate opener words, all spelled from the letters of "close, train".
# The order is preserved because it fixes the order of the pairs generated
# from this list.
new_guess_words = [
    "soare", "clint", "slane", "troic", "lanes", "salet",
    "lates", "tales", "slate", "orcin", "arise", "raise",
    "crate", "trace", "loins", "lions", "arose",
]

In [21]:
# Every unordered pairing of two distinct candidate words, in list order.
guess_guess_pairs = list(itertools.combinations(new_guess_words, 2))
guess_guess_pairs[:10]

[('soare', 'clint'),
 ('soare', 'slane'),
 ('soare', 'troic'),
 ('soare', 'lanes'),
 ('soare', 'salet'),
 ('soare', 'lates'),
 ('soare', 'tales'),
 ('soare', 'slate'),
 ('soare', 'orcin'),
 ('soare', 'arise')]

In [22]:
# 17 candidate words taken two at a time gives C(17, 2) = 136 pairs.
len(guess_guess_pairs)

136

In [23]:
# Read the challenge-word list, one word per line. The encoding is pinned so
# the file is decoded the same way regardless of the platform's locale default.
with open("./data/webster_dict_all_five_letter_words.txt", mode="r", encoding="utf-8") as file:
    challenge_words = file.read().splitlines()

In [24]:
# Peek at the first five challenge words to confirm the file was split per line.
challenge_words[:5]

['gnarl', 'arles', 'villa', 'stagy', 'betty']

In [25]:
# Seed Python's RNG before scoring so that reruns are repeatable.
# NOTE(review): presumably wordle_scoring uses `random` internally (the guess
# pairs in its output appear in shuffled order) — confirm in wordle_helpers.
random.seed(43)
# Score the candidate pairs against the challenge words. The recorded run
# took just under two minutes, so expect this cell to be slow.
anagram_scores = wordle_scoring(guess_guess_pairs, challenge_words)
anagram_scores

100%|█████████████████████████████████████████████████████████████████████| 5387/5387 [01:48<00:00, 49.59it/s]


Unnamed: 0,challenge_word,guess_pair,first_guess,second_guess,first_pos_score,second_pos_score,sum_first_pos_score,sum_second_pos_score,correct_letter_pos_score,sum_correct_letter_pos_score
0,arson,"lanes, lions",lanes,lions,-1 0 0 -1 0,-1 -1 0 0 0,-2,-2,-1 0 0 0 0,-1
1,arson,"arise, trace",arise,trace,1 1 -1 0 -1,-1 1 0 -1 -1,0,-2,1 1 0 0 -1,1
2,arson,"crate, arose",crate,arose,-1 1 0 -1 -1,1 1 0 0 -1,-2,1,1 1 0 0 -1,1
3,arson,"soare, salet",soare,salet,0 0 0 0 -1,0 0 -1 -1 -1,-1,-3,0 0 0 0 -1,-1
4,arson,"soare, clint",soare,clint,0 0 0 0 -1,-1 -1 -1 0 -1,-1,-4,0 0 0 0 -1,-1
...,...,...,...,...,...,...,...,...,...,...
732627,pesky,"slate, raise",slate,raise,0 -1 -1 -1 0,-1 -1 -1 0 0,-3,-3,0 -1 -1 0 0,-2
732628,pesky,"troic, salet",troic,salet,-1 -1 -1 -1 -1,0 -1 -1 0 -1,-5,-3,0 -1 -1 0 -1,-3
732629,pesky,"lanes, crate",lanes,crate,-1 -1 -1 0 0,-1 -1 -1 -1 0,-3,-4,-1 -1 -1 0 0,-3
732630,pesky,"troic, raise",troic,raise,-1 -1 -1 -1 -1,-1 -1 -1 0 0,-5,-3,-1 -1 -1 0 0,-3


In [26]:
# Total each anagram pair's combined score over all of its rows, label the
# total column, and rank the pairs from the highest total to the lowest.
best_anagram_words = (
    anagram_scores
    .groupby(by="guess_pair")[["sum_correct_letter_pos_score"]]
    .sum()
    .rename(columns={"sum_correct_letter_pos_score": "overall_sum_correct_letter_pos_score"})
    .sort_values(by="overall_sum_correct_letter_pos_score", ascending=False)
)

# Show the ten highest-ranked pairs.
best_anagram_words.head(10)

Unnamed: 0_level_0,overall_sum_correct_letter_pos_score
guess_pair,Unnamed: 1_level_1
"soare, lates",-4720.0
"soare, lanes",-4752.0
"soare, tales",-4827.0
"salet, arose",-4983.0
"salet, arise",-5022.0
"salet, crate",-5475.0
"salet, trace",-5578.0
"tales, arose",-5589.0
"tales, arise",-5628.0
"lates, slate",-5632.0
