## Imports

In [1]:
import urllib.request # for downloading wordle words from source
import numpy as np # for stats
import random # for randomly generating target and start words
import operator # for sorting letter frequency distribution
import time # for #dramaticeffect
import pandas as pd
from nltk.corpus import movie_reviews, treebank, brown, gutenberg, switchboard

## Importing datasets

### official words
- Official Wordle word list

In [2]:
### Official list
# Read the official Wordle word list, one entry per line of the file.
# NOTE: nothing is filtered here, so any empty string produced by the split
# (e.g. from a blank line) stays in the list; a later cell reloads this file
# with empty entries removed.
with open("data/official_words_processed.txt", "r", encoding = "utf-8") as f:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    official_words = f.read().split("\n")

print(len(official_words))
official_words[:5]

2310


['foist', 'dowdy', 'bleat', 'basis', 'tango']

### alternative list 1
- an alternate list of 5-letter words found on the web

In [3]:
### Alternative list 1
# Read the alternative 5-letter word list, one entry per line of the file.
# No filtering is applied: every piece produced by splitting on "\n" is kept.
with open("data/alt_words_1.txt", "r", encoding = "utf-8") as f:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    alt_words_1 = f.read().split("\n")

print(len(alt_words_1))
alt_words_1[:5]

14856


['rossa', 'jetty', 'wizzo', 'cuppa', 'cohoe']

### nltk grand corpus
- Amalgamation of all words from several NLTK corpora, combined to make the dataset as large as possible
- Developed manually

In [4]:
### grand corpus tokens
# Read the NLTK grand-corpus tokens, one token per line of the file.
# No filtering is applied: every piece produced by splitting on "\n" is kept.
with open("data/nltk_grand_corpus_tokens.txt", "r", encoding = "utf-8") as f:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    nltk_tokens = f.read().split("\n")

print(len(nltk_tokens))
nltk_tokens[:5]

535189


['years', 'board', 'dutch', 'group', 'agnew']

### nltk grand corpus types and counts

In [5]:
### grand corpus types and counts
# Build a {word: count} mapping from a tab-separated file.
# Only lines with exactly two tab-separated fields are used; any other line
# (blank or malformed) is skipped.
# NOTE: counts are stored exactly as read, i.e. as strings (e.g. '15760');
# convert with int() where arithmetic is needed.
nltk_counts = {}

with open("data/nltk_grand_corpus_types_and_counts.txt", "r", encoding = "utf-8") as f:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    for line in f.read().split("\n"):
        fields = line.split("\t")  # split once and reuse the result
        if len(fields) == 2:
            word, count = fields
            nltk_counts[word] = count

print(len(nltk_counts))
nltk_counts['which']

8043


'15760'

In [6]:
### Official list
# Reload the official word list, this time dropping empty strings.
# The unfiltered load of this file yields one more entry than this one, so the
# split produces exactly one empty string. The first entries are real words,
# so it is presumably caused by a trailing newline -- TODO confirm.
with open("data/official_words_processed.txt", "r", encoding = "utf-8") as f:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    official_words = [word for word in f.read().split("\n") if word]

print(len(official_words))
official_words[:10]

2309


['foist',
 'dowdy',
 'bleat',
 'basis',
 'tango',
 'eking',
 'knead',
 'power',
 'dwell',
 'bleep']

## ML

In [7]:
# Load the simulation results and discard the "Unnamed: 0" column
# (presumably the index written out when the CSV was saved -- TODO confirm).
sims_df = (
    pd.read_csv("data/sims_df_complete.csv")
    .drop(columns = ["Unnamed: 0"])
)
sims_df.head()

Unnamed: 0,first_guess,target_word,first_guess_vowels,first_guess_consonants,target_vowels,target_consonants,target_entropy,first_guess_entropy,target_guessed,mid_guesses_avg_vows,mid_guesses_avg_cons,avg_perf_letters,avg_wrong_pos_letters,avg_wrong_letters,avg_remaining,avg_intermediate_guess_entropy,luck,bias,valid_success,num_guesses
0,detox,glide,2.0,3.0,2.0,3.0,63.75,58.53,True,2.25,2.75,4.0,12.0,14.0,27.0,89.63,0.69,entropy,True,4.0
1,whiff,glide,1.0,4.0,2.0,3.0,63.75,9.47,True,2.4,2.6,8.0,2.0,27.0,51.25,81.89,0.65,entropy,True,5.0
2,soggy,glide,2.0,3.0,2.0,3.0,63.75,29.4,True,2.25,2.75,4.0,9.0,16.0,27.33,82.35,0.71,entropy,True,4.0
3,front,glide,1.0,4.0,2.0,3.0,63.75,59.66,True,2.0,3.0,1.0,2.0,12.0,227.5,86.55,0.78,entropy,True,3.0
4,stein,glide,2.0,3.0,2.0,3.0,63.75,80.0,True,2.33,2.67,0.0,7.0,8.0,41.0,93.33,0.76,entropy,True,3.0


In [8]:
# Mean of the num_guesses column across all rows of sims_df.
print(f"Average guesses: {sims_df['num_guesses'].mean()}\n")

# Normalised value counts (shares summing to 1) for each summary column,
# printed one table after another with a blank spacer in between.
summary_columns = ("target_guessed", "valid_success", "num_guesses")
for position, column in enumerate(summary_columns):
    if position > 0:
        print("\n")  # spacer between consecutive tables
    print(sims_df.value_counts(column, normalize = True))
# To inspect the rows with the most guesses first:
# sims_df.sort_values(by = "num_guesses", ascending = False)

Average guesses: 3.7764298093587523

target_guessed
True    1.0
dtype: float64


valid_success
True    1.0
dtype: float64


num_guesses
4.0    0.569757
3.0    0.300260
5.0    0.101386
2.0    0.019931
6.0    0.008232
1.0    0.000433
dtype: float64
