In [1]:
import os
from spellchecker import SpellChecker
import random
import numpy as np
import itertools
import pandas as pd
from collections import Counter
from tqdm import tqdm
import re
from wordle_helpers import sorted_words_by_frequency, wordle_scoring

# Output location: data files for this notebook live in <ROOT_DIR>/<FOLDER_NAME>.
ROOT_DIR, FOLDER_NAME = ".", "data"
PATH_TO_FOLDER = os.path.join(ROOT_DIR, FOLDER_NAME)

# exist_ok=True keeps this cell safe to re-run once the folder already exists.
os.makedirs(PATH_TO_FOLDER, exist_ok=True)

# challenge words

In [2]:
# Load the challenge words: splitlines() yields one list entry per line of the file.
# The path is built from PATH_TO_FOLDER so it follows the configuration cell
# instead of repeating the "./data" literal, and the encoding is explicit so
# decoding does not depend on the operating system's locale default.
challenge_words_path = os.path.join(
    PATH_TO_FOLDER, "webster_dict_all_five_letter_words.txt"
)
with open(challenge_words_path, mode="r", encoding="utf-8") as file:
    challenge_words = file.read().splitlines()

In [3]:
# Sanity check: how many challenge words were loaded.
len(challenge_words)

5568

In [4]:
# Peek at the first five entries to confirm the file was split into words.
challenge_words[:5]

['water', 'gnarl', 'arles', 'villa', 'stagy']

In [5]:
# NOTE(review): same size check as the earlier cell; challenge_words has not
# been modified in between, so this cell looks redundant.
len(challenge_words)

5568

## Choose the 100 most frequently used 5-letter words as guess words  

We only have a few dozen 5-letter words at our fingertips before reaching for a thesaurus or dictionary, so it makes sense to reduce our search space to reflect this. Doing this also minimizes the computational expense of a brute-force search.


In [6]:
# Load the candidate guess words: splitlines() yields one list entry per line.
# The path is built from PATH_TO_FOLDER so it follows the configuration cell
# instead of repeating the "./data" literal, and the encoding is explicit so
# decoding does not depend on the operating system's locale default.
guess_words_path = os.path.join(
    PATH_TO_FOLDER, "webster_common_starting_letters_final_words.txt"
)
with open(guess_words_path, mode="r", encoding="utf-8") as file:
    guess_words_list = file.read().splitlines()

In [7]:
# Size of the candidate guess-word pool before it is cut down to 100 words.
len(guess_words_list)

1873

In [8]:
# Seed the global RNG before ranking.
# NOTE(review): presumably sorted_words_by_frequency draws on `random` — confirm in wordle_helpers.
random.seed(67)
words_by_frequency = sorted_words_by_frequency(guess_words_list, sorting=True)
# Each ranked entry carries the word at index 0; keep the first 100 words of the ranking.
guess_words_top_100 = [entry[0] for entry in words_by_frequency][:100]

In [9]:
# Preview the first ten selected guess words.
guess_words_top_100[:10]

['about',
 'would',
 'could',
 'their',
 'maybe',
 'after',
 'those',
 'other',
 'first',
 'thing']

# Create a list of guess word pairs from words with similar letters and score them against each other

In [10]:
# NOTE(review): itertools.combinations is deterministic, so this seed only
# resets the global RNG state for whatever runs afterwards — confirm it is needed.
random.seed(97)
# Every unordered pair of distinct positions in the top-100 list, as 2-tuples.
combo_word_pairs = list(itertools.combinations(guess_words_top_100, 2))

In [11]:
# Report how many unordered word pairs were generated.
print(f"Possible combinations: {len(combo_word_pairs)}")

Possible combinations: 4950


## create guess pairs  

Let's create pairs of words in which no letter is repeated, either within a word or across the two words, so each pair covers ten distinct letters. Each guess word therefore contains only letters that its partner lacks.

In [12]:
# Keep only the pairs whose concatenated letters form exactly 10 distinct
# characters, i.e. no letter occurs twice anywhere in the pair.
guess_word_pairs = [
    word_pair
    for word_pair in combo_word_pairs
    if len(set("".join(word_pair))) == 10
]

In [13]:
# Number of pairs that passed the distinct-letter filter.
len(guess_word_pairs)

531

In [14]:
# Preview the first five pairs that passed the filter.
guess_word_pairs[:5]

[('about', 'since'),
 ('about', 'while'),
 ('about', 'spend'),
 ('about', 'child'),
 ('about', 'weird')]

## score challenge word against guess word based on position

**scoring:**
- 0 if letter from guess word in challenge word but in wrong position
- 1 if letter from guess word in challenge word and in correct position
- -1 if letter from guess word not in challenge word  

In [15]:
# Score the guess pairs against the challenge words with the helper imported
# from wordle_helpers. This is the expensive step of the notebook: the recorded
# run took about 12 minutes over 5,568 challenge words.
# The recorded result is a DataFrame with 2,956,608 rows, which equals
# 5,568 challenge words x 531 guess pairs.
# NOTE(review): presumably one row per (challenge word, guess pair) — confirm in wordle_helpers.
scoring = wordle_scoring(guess_word_pairs, challenge_words)

100%|███████████████████████████████████████████████████████████████████████████| 5568/5568 [12:18<00:00,  7.53it/s]


In [16]:
# Inspect the row count, column dtypes and memory footprint of the scoring table.
scoring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2956608 entries, 0 to 2956607
Data columns (total 10 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   challenge_word                string
 1   guess_pair                    string
 2   first_guess                   string
 3   second_guess                  string
 4   first_pos_score               string
 5   second_pos_score              string
 6   sum_first_pos_score           int8  
 7   sum_second_pos_score          int8  
 8   correct_letter_pos_score      string
 9   sum_correct_letter_pos_score  int8  
dtypes: int8(3), string(7)
memory usage: 279.8 MB


In [17]:
# Persist the scoring table as a gzip-compressed CSV inside PATH_TO_FOLDER,
# the folder the configuration cell designates for saved data, rather than
# repeating the "./data" literal here.
scoring_csv_path = os.path.join(
    PATH_TO_FOLDER,
    "top_100_freq_used_5_letter_words_vs_webster_dict_all_five_letter_words.csv.gz",
)
# index=False leaves the row index out of the file.
scoring.to_csv(scoring_csv_path, index=False, compression="gzip")