In [1]:
import pandas as pd
import numpy as np
import sys
import os
import random
from tqdm import tqdm
import re
import itertools
from wordle_helpers import wordle_scoring

In [2]:
col_names =\
    pd.read_csv("./data/top_100_freq_used_5_letter_words_vs_webster_dict_all_five_letter_words.csv.gz", nrows=0).columns
types_dict = {"sum_first_pos_score": np.int8, "sum_second_pos_score": np.int8}
types_dict.update({col: "string[pyarrow]" for col in col_names if col not in types_dict})
scoring = pd.read_csv("./data/top_100_freq_used_5_letter_words_vs_webster_dict_all_five_letter_words.csv.gz",
                      dtype=types_dict,
                      engine="pyarrow",
                      compression="gzip")

In [3]:
scoring.head(5)

Unnamed: 0,challenge_word,guess_pair,first_guess,second_guess,first_pos_score,second_pos_score,sum_first_pos_score,sum_second_pos_score,correct_letter_pos_score,sum_correct_letter_pos_score
0,refar,"power, child",power,child,-1 -1 -1 0 1,-1 -1 -1 -1 -1,-2,-5,-1 -1 -1 0 1,-2
1,refar,"while, sound",while,sound,-1 -1 -1 -1 0,-1 -1 -1 -1 -1,-4,-5,-1 -1 -1 -1 0,-4
2,refar,"thing, scare",thing,scare,-1 -1 -1 -1 -1,-1 -1 0 0 0,-5,-2,-1 -1 0 0 0,-2
3,refar,"ought, plane",ought,plane,-1 -1 -1 -1 -1,-1 -1 0 -1 0,-5,-3,-1 -1 0 -1 0,-3
4,refar,"thing, comes",thing,comes,-1 -1 -1 -1 -1,-1 -1 -1 0 -1,-5,-4,-1 -1 -1 0 -1,-4


In [4]:
scoring.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2785079 entries, 0 to 2785078
Data columns (total 10 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   challenge_word                string
 1   guess_pair                    string
 2   first_guess                   string
 3   second_guess                  string
 4   first_pos_score               string
 5   second_pos_score              string
 6   sum_first_pos_score           int8  
 7   sum_second_pos_score          int8  
 8   correct_letter_pos_score      string
 9   sum_correct_letter_pos_score  string
dtypes: int8(2), string(8)
memory usage: 276.5 MB


# best word pair

**The best `word pair` is the one that has the `highest score`**

In [5]:
guess_pair_sum_scores = (scoring
                        .groupby(by="guess_pair")[["sum_correct_letter_pos_score"]].sum()
                        .sort_values(by="sum_correct_letter_pos_score", ascending=False))

guess_pair_sum_scores.rename(columns={
    "sum_correct_letter_pos_score": "overall_sum_correct_letter_pos_score"}, inplace=True)
guess_pair_sum_scores[:10]

Unnamed: 0_level_0,overall_sum_correct_letter_pos_score
guess_pair,Unnamed: 1_level_1
"close, train",-7427.0
"story, plane",-7584.0
"point, share",-7647.0
"since, party",-7791.0
"point, scare",-7802.0
"share, count",-7819.0
"short, plane",-7938.0
"count, spare",-8024.0
"place, story",-8071.0
"train, chose",-8148.0


In [6]:
best_guess_pair = guess_pair_sum_scores.idxmax()[0]
best_guess_pair

'close, train'

In [7]:
worst_guess_pair = guess_pair_sum_scores.idxmin()[0]
worst_guess_pair

'might, found'

In [8]:
best_guess_pair_df = \
    (scoring
     .loc[scoring["guess_pair"] == guess_pair_sum_scores.idxmax()[0]])
best_guess_pair_df.head()

Unnamed: 0,challenge_word,guess_pair,first_guess,second_guess,first_pos_score,second_pos_score,sum_first_pos_score,sum_second_pos_score,correct_letter_pos_score,sum_correct_letter_pos_score
27,refar,"close, train",close,train,-1 -1 -1 -1 0,-1 0 0 -1 -1,-4,-3,-1 0 0 -1 0,-2
544,horal,"close, train",close,train,-1 0 0 -1 -1,-1 0 0 -1 -1,-3,-3,-1 0 0 -1 -1,-3
1061,stoma,"close, train",close,train,-1 -1 1 0 -1,0 -1 0 -1 -1,-2,-3,0 -1 1 0 -1,-1
1578,beard,"close, train",close,train,-1 -1 -1 -1 0,-1 0 1 -1 -1,-4,-2,-1 0 1 -1 0,-1
2095,vends,"close, train",close,train,-1 -1 -1 0 0,-1 -1 -1 -1 0,-3,-4,-1 -1 -1 0 0,-3


# conclusion  

My best 2 word opener is `"close, train"`, which coincidentally are anagrams of the ones identified by [3blue1brown](https://www.youtube.com/watch?v=fRed0Xmc2Wg) - timestamp 10:52

## anagrams  

anagrams of `close, train`:  
- `soare, clint`
- `slane, troic` (slane - a spade for cutting turf, troic - pertaining to Troy)
- `lanes, troic`
- `salet, orcin` (salet - a combat helmet, orcin - a colorless substance obtained from certain lichens)
- `tales, orcin`
- `lates, orcin`
- `slate, orcin`
- `trace, loins`
- `trace, lions`
- `crate, lions`
- `crate, loins`

`arose, clint` maybe more appropriate as `soare` (a young hawk) is rarely used.  

additional anagrams:  
- `arise, clton`
- `raise, clton`

the second words in additional anagrams above are the left over letters

In [9]:
new_guess_words  = ["soare", "clint", "slane", "troic",
                    "lanes", "salet", "lates", "tales",
                    "slate", "orcin", "arise", "raise",
                    "crate", "trace", "loins", "lions", "arose"]

In [10]:
guess_guess_pairs = [combo for combo in itertools.combinations(new_guess_words, 2)]
guess_guess_pairs[:10]

[('soare', 'clint'),
 ('soare', 'slane'),
 ('soare', 'troic'),
 ('soare', 'lanes'),
 ('soare', 'salet'),
 ('soare', 'lates'),
 ('soare', 'tales'),
 ('soare', 'slate'),
 ('soare', 'orcin'),
 ('soare', 'arise')]

In [11]:
len(guess_guess_pairs)

136

In [12]:
# with open("./data/webster_dict_all_five_letter_words.txt", mode="r") as file:
#     challenge_words = file.read().splitlines()

In [13]:
# challenge_words[:5]

In [14]:
# random.seed(43)
# anagram_scores = wordle_scoring(guess_guess_pairs, challenge_words)
# anagram_scores

In [15]:
# best_anagram_words = (anagram_scores
#                       .groupby(by="guess_pair")[["sum_correct_letter_pos_score"]].sum()
#                       .sort_values(by="sum_correct_letter_pos_score", ascending=False))

# best_anagram_words.rename(columns={"sum_correct_letter_pos_score": "overall_sum_correct_letter_pos_score"}, inplace=True)
# best_anagram_words[:10]