In [1]:
import pandas as pd
import numpy as np
import sys
import os
import random
from tqdm import tqdm
import re
import itertools
from wordle_helpers import wordle_scoring

In [2]:
# Single definition of the scoring file path so the header probe and the full
# read below always target the same file.
SCORING_CSV_PATH = "./data/top_100_freq_used_5_letter_words_vs_webster_dict_all_five_letter_words.csv.gz"

# Read only the header (nrows=0) to obtain the column names.
col_names = pd.read_csv(SCORING_CSV_PATH, nrows=0).columns

# The three summed-score columns are loaded as int8; every remaining column
# is loaded as a pyarrow-backed string.
types_dict = {"sum_first_pos_score": np.int8,
              "sum_second_pos_score": np.int8,
              "sum_correct_letter_pos_score": np.int8,}
types_dict.update({col: "string[pyarrow]" for col in col_names if col not in types_dict})

scoring = pd.read_csv(SCORING_CSV_PATH,
                      dtype=types_dict,
                      engine="pyarrow",
                      compression="gzip")

In [3]:
# Preview the first five rows of the loaded scoring table (head() defaults to 5).
scoring.head()

Unnamed: 0,challenge_word,guess_pair,first_guess,second_guess,first_pos_score,second_pos_score,sum_first_pos_score,sum_second_pos_score,correct_letter_pos_score,sum_correct_letter_pos_score
0,proof,"money, faith",money,faith,-1 0 -1 -1 -1,0 -1 -1 -1 -1,-4,-4,0 0 -1 -1 -1,-3
1,proof,"found, steal",found,steal,0 0 -1 -1 -1,-1 -1 -1 -1 -1,-3,-5,0 0 -1 -1 -1,-3
2,proof,"first, place",first,place,0 -1 0 -1 -1,1 -1 -1 -1 -1,-3,-3,1 -1 0 -1 -1,-2
3,proof,"music, often",music,often,-1 -1 -1 -1 -1,0 0 -1 -1 -1,-5,-3,0 0 -1 -1 -1,-3
4,proof,"women, study",women,study,-1 0 -1 -1 -1,-1 -1 -1 -1 -1,-4,-5,-1 0 -1 -1 -1,-4


In [4]:
# Report column dtypes, non-null counts, and deep memory usage of the table.
scoring.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1679782 entries, 0 to 1679781
Data columns (total 10 columns):
 #   Column                        Non-Null Count    Dtype 
---  ------                        --------------    ----- 
 0   challenge_word                1679782 non-null  string
 1   guess_pair                    1679782 non-null  string
 2   first_guess                   1679782 non-null  string
 3   second_guess                  1679782 non-null  string
 4   first_pos_score               1679782 non-null  string
 5   second_pos_score              1679782 non-null  string
 6   sum_first_pos_score           1679782 non-null  int8  
 7   sum_second_pos_score          1679782 non-null  int8  
 8   correct_letter_pos_score      1679782 non-null  string
 9   sum_correct_letter_pos_score  1679782 non-null  int8  
dtypes: int8(3), string(7)
memory usage: 158.7 MB


# best word pair

**The best `word pair` is the one with the `highest score`, i.e. the highest total `sum_correct_letter_pos_score` summed over every row for that pair.**

In [5]:
# Sum `sum_correct_letter_pos_score` over all rows of each guess pair, sort
# descending so the highest-scoring pair comes first, and rename the column to
# make clear it is an overall total. The rename is part of the chain (no
# inplace mutation) so the frame is built in a single expression.
guess_pair_sum_scores = (
    scoring
    .groupby(by="guess_pair")[["sum_correct_letter_pos_score"]]
    .sum()
    .sort_values(by="sum_correct_letter_pos_score", ascending=False)
    .rename(columns={
        "sum_correct_letter_pos_score": "overall_sum_correct_letter_pos_score"})
)
guess_pair_sum_scores.head(10)

Unnamed: 0_level_0,overall_sum_correct_letter_pos_score
guess_pair,Unnamed: 1_level_1
"store, claim",-2964.0
"point, share",-2972.0
"point, scare",-3000.0
"close, train",-3025.0
"story, plane",-3036.0
"share, count",-3117.0
"short, plane",-3117.0
"since, party",-3132.0
"count, spare",-3157.0
"place, story",-3187.0


In [6]:
# Call idxmax() on the score column (a Series) to get the guess-pair label as a
# scalar. DataFrame.idxmax() would return a Series keyed by column name, and
# reading it with an integer `[0]` is positional indexing on a label index,
# which recent pandas deprecates.
best_guess_pair = guess_pair_sum_scores["overall_sum_correct_letter_pos_score"].idxmax()
best_guess_pair

'store, claim'

In [7]:
# Call idxmin() on the score column (a Series) to get the guess-pair label as a
# scalar. DataFrame.idxmin() would return a Series keyed by column name, and
# reading it with an integer `[0]` is positional indexing on a label index,
# which recent pandas deprecates.
worst_guess_pair = guess_pair_sum_scores["overall_sum_correct_letter_pos_score"].idxmin()
worst_guess_pair

'would, fight'

In [8]:
# Keep only the rows scored with the best guess pair. `best_guess_pair` already
# holds that label, so it is reused rather than recomputing idxmax() and
# indexing its result positionally (deprecated in recent pandas).
best_guess_pair_df = scoring.loc[scoring["guess_pair"] == best_guess_pair]
best_guess_pair_df.head()

Unnamed: 0,challenge_word,guess_pair,first_guess,second_guess,first_pos_score,second_pos_score,sum_first_pos_score,sum_second_pos_score,correct_letter_pos_score,sum_correct_letter_pos_score
213,proof,"store, claim",store,claim,-1 -1 1 0 -1,-1 -1 -1 -1 -1,-2,-5,-1 -1 1 0 -1,-2
811,world,"store, claim",store,claim,-1 -1 0 0 -1,-1 0 -1 -1 -1,-3,-4,-1 0 0 0 -1,-2
1409,sloop,"store, claim",store,claim,1 -1 1 -1 -1,-1 1 -1 -1 -1,-1,-3,1 1 1 -1 -1,1
2007,meter,"store, claim",store,claim,-1 0 -1 0 0,-1 -1 -1 -1 0,-2,-4,-1 0 -1 0 0,-2
2605,fogie,"store, claim",store,claim,-1 -1 0 -1 1,-1 -1 -1 1 -1,-2,-3,-1 -1 0 1 1,0


# conclusion  

My best two-word opener is `"store, claim"`.