In [1]:
import urllib.request # for downloading if necessary
import operator # for sorting letter frequency distribution
from nltk.corpus import movie_reviews, treebank, brown, gutenberg, switchboard
from wordle_functions import *

## Building Datasets
- Get all possible words that the target word could be
- For each word in the target words list, get counts of each letter to create letter distribution across entire vocabulary

### `alt_words_1` dataset

In [2]:
### If getting words from local file -- should be 14855 words in total

file_path = "data/alt_words_1.txt" # taken from "https://raw.githubusercontent.com/tabatkins/wordle-list/main/words"

# the `with` block closes the file when it exits, so no explicit close() is needed
with open(file_path, "r", encoding = "utf-8") as f:
    # building a set collapses duplicate entries; empty strings (blank lines) are skipped
    alt_words_1 = {word for word in f.read().split("\n") if len(word) > 0}

print(len(alt_words_1))
alt_words_1 = list(alt_words_1) # list order follows set iteration order, i.e. it is arbitrary
alt_words_1[:10]

14855


['thyme',
 'allin',
 'judgy',
 'serac',
 'raggs',
 'hauls',
 'oliva',
 'skail',
 'buyin',
 'imbos']

### `official_words` list dataset

In [3]:
### Build the official word list from the unprocessed file.
### The candidate word is the last space-separated token on each line; it is lower-cased and
### kept only if it is exactly five alphabetic characters.

file_path = "data/official_words_unprocessed.txt"

official_words = set() # set of all words

# the `with` block closes the file when it exits, so no explicit close() is needed
with open(file_path, "r", encoding = "utf-8") as f:
    for line in f.read().split("\n"):
        # lower-case before measuring, same order of operations as the corpus cells below
        word = line.split(" ")[-1].lower()
        if len(word) == 5 and word.isalpha():
            official_words.add(word)

# sanity check: nothing other than five-letter words made it through the filter
assert all(len(word) == 5 for word in official_words)

official_words = list(official_words) # already unique; list order is arbitrary
print(len(official_words))
official_words[:10]

2309


['wince',
 'thyme',
 'mower',
 'horde',
 'heard',
 'tenor',
 'zonal',
 'parry',
 'shied',
 'fizzy']

In [4]:
### writing clean words list to .txt
# - the `with` block closes `fout` on exit; no other file handle is involved in this cell
# - encoding is explicit so the file matches the utf-8 used when reading the word lists
# - words are written in sorted order so the file is identical from run to run
#   (official_words comes from a set, whose order is arbitrary)
with open("data/official_words_processed.txt", "w", encoding = "utf-8") as fout:
    for word in sorted(official_words):
        fout.write(word + "\n")

## Grand Corpus Development
- 2132 words in common with official wordle list

### brown

In [5]:
# Lower-case every Brown token first, then keep the purely alphabetic five-letter ones
brown_words_tokens = [
    token
    for token in (raw.lower() for raw in brown.words())
    if len(token) == 5 and token.isalpha()
]

print(len(brown_words_tokens))

107314


In [6]:
# Distinct five-letter words (types) found in Brown
brown_words_types = set(brown_words_tokens)
print(len(brown_words_types))
print(list(brown_words_types)[:10])

# Official words that never occur among the Brown types
missing_words_brown = {
    candidate for candidate in official_words if candidate not in brown_words_types
}

len(missing_words_brown)

4072
['hoots', 'hauls', 'mises', 'bayed', 'ahmet', 'sings', 'horde', 'aides', 'aided', 'howry']


577

### treebank

In [7]:
# Lower-case every Treebank token first, then keep the purely alphabetic five-letter ones
treebank_words_tokens = [
    token
    for token in (raw.lower() for raw in treebank.words())
    if len(token) == 5 and token.isalpha()
]

print(len(treebank_words_tokens))

8586


In [8]:
# Distinct five-letter words (types) found in Treebank
treebank_words_types = set(treebank_words_tokens)
print(len(treebank_words_types))
print(list(treebank_words_types)[:10])

# Official words that never occur among the Treebank types
missing_words_treebank = {
    candidate for candidate in official_words if candidate not in treebank_words_types
}

len(missing_words_treebank)

1096
['gangs', 'bronx', 'sites', 'cosby', 'binge', 'aides', 'jumps', 'heard', 'lizhi', 'foods']


1706

### switchboard

In [9]:
# Lower-case every Switchboard token first, then keep the purely alphabetic five-letter ones
switchboard_words_tokens = [
    token
    for token in (raw.lower() for raw in switchboard.words())
    if len(token) == 5 and token.isalpha()
]

print(len(switchboard_words_tokens))

6393


In [10]:
# Distinct five-letter words (types) found in Switchboard
switchboard_words_types = set(switchboard_words_tokens)
print(len(switchboard_words_types))
print(list(switchboard_words_types)[:10])

# Official words that never occur among the Switchboard types
missing_words_switchboard = {
    candidate for candidate in official_words if candidate not in switchboard_words_types
}

len(missing_words_switchboard)

656
['gangs', 'choke', 'cosby', 'greek', 'heard', 'worry', 'rexes', 'coils', 'first', 'songs']


1864

### gutenberg

In [11]:
# Lower-case every Gutenberg token first, then keep the purely alphabetic five-letter ones
gutenberg_words_tokens = [
    token
    for token in (raw.lower() for raw in gutenberg.words())
    if len(token) == 5 and token.isalpha()
]

print(len(gutenberg_words_tokens))

249307


In [12]:
# Distinct five-letter words (types) found in Gutenberg
gutenberg_words_types = set(gutenberg_words_tokens)
print(len(gutenberg_words_types))
print(list(gutenberg_words_types)[:10])

# Official words that never occur among the Gutenberg types
missing_words_gutenberg = {
    candidate for candidate in official_words if candidate not in gutenberg_words_types
}

len(missing_words_gutenberg)

4684
['keziz', 'anted', 'wince', 'becam', 'bayed', 'mower', 'sings', 'thara', 'horde', 'emims']


636

### movie_reviews

In [13]:
# Lower-case every movie_reviews token first, then keep the purely alphabetic five-letter ones
movie_reviews_words_tokens = [
    token
    for token in (raw.lower() for raw in movie_reviews.words())
    if len(token) == 5 and token.isalpha()
]

print(len(movie_reviews_words_tokens))

163588


In [14]:
# Distinct five-letter words (types) found in movie_reviews
movie_reviews_words_types = set(movie_reviews_words_tokens)
print(len(movie_reviews_words_types))
print(list(movie_reviews_words_types)[:10])

# Official words that never occur among the movie_reviews types
missing_words_movie_reviews = {
    candidate for candidate in official_words if candidate not in movie_reviews_words_types
}

len(missing_words_movie_reviews)

4508
['wince', 'thyme', 'motss', 'ravel', 'hauls', 'ahmet', 'mower', 'sings', 'nears', 'horde']


596

### Grand Corpus - 5 letters
- 8043 types
- only 177 missing from official wordle list (2132 / 2309, 92% mutual)
- written to text file and put in working directory (TODO: move this into a separate notebook, e.g. "data processing")

In [15]:
# Per-corpus five-letter token lists, in the order they are concatenated
all_corpora = [
    treebank_words_tokens,
    brown_words_tokens,
    gutenberg_words_tokens,
    switchboard_words_tokens,
    movie_reviews_words_tokens,
]

# Flatten into one list, preserving corpus order and token order within each corpus
grand_corpus_tokens = [token for token_list in all_corpora for token in token_list]

print(len(grand_corpus_tokens))
print(grand_corpus_tokens[:20])

535188
['years', 'board', 'dutch', 'group', 'agnew', 'years', 'named', 'among', 'group', 'years', 'fiber', 'lungs', 'brief', 'later', 'loews', 'makes', 'using', 'today', 'forum', 'bring']


In [16]:
### writing just words to .txt file
# encoding is explicit: str.isalpha() also accepts non-ASCII letters, and the platform default
# encoding is not guaranteed to be able to write them (the word lists are read as utf-8 too)
with open("data/nltk_grand_corpus_tokens_5.txt", "w", encoding = "utf-8") as fout:
    for word in grand_corpus_tokens:
        fout.write(word + "\n")

# (word, count) pairs, as unpacked by the loop below
grand_corpus_word_freq = get_word_distribution(grand_corpus_tokens, sort = "descending")

### writing words and counts to .txt file (one tab-separated "word<TAB>count" pair per line)
with open("data/nltk_grand_corpus_types_and_counts_5.txt", "w", encoding = "utf-8") as fout:
    for word, count in grand_corpus_word_freq:
        fout.write(word + "\t" + str(count) + "\n")

In [17]:
# Distinct five-letter words (types) across all corpora
grand_corpus_types = set(grand_corpus_tokens)
print(len(grand_corpus_types))
print(list(grand_corpus_types)[:20])

# Official words with no occurrence in any corpus, kept in official_words order
grand_corpus_missing = [
    candidate for candidate in official_words if candidate not in grand_corpus_types
]

print(len(grand_corpus_missing))
print(grand_corpus_missing[:20])

8043
['keziz', 'thyme', 'hauls', 'ahmet', 'thara', 'nears', 'sakes', 'weepy', 'maron', 'libby', 'flush', 'sayle', 'skunk', 'fucks', 'latit', 'digge', 'cocoa', 'balak', 'ornan', 'stacy']
177
['zonal', 'fizzy', 'wooer', 'torus', 'girly', 'tuber', 'krill', 'toddy', 'creme', 'slosh', 'pleat', 'vegan', 'duchy', 'rayon', 'decal', 'aping', 'frond', 'bleep', 'rearm', 'enema']


In [18]:
# Frequency table of the grand corpus in both sort directions; the first 20 rows of each
# give the most and the least frequent words respectively
freqs_descending = get_word_distribution(grand_corpus_tokens, sort = "descending")
freqs_ascending = get_word_distribution(grand_corpus_tokens, sort = "ascending")

most_freq_20 = freqs_descending[:20]
least_freq_20 = freqs_ascending[:20]

print(f"20 most frequent words in grand corpus:\n\n{most_freq_20}\n")
print(f"20 least frequent words in grand corpus:\n\n{least_freq_20}\n")

20 most frequent words in grand corpus:

[('which', 15760), ('their', 13925), ('there', 13447), ('shall', 11997), ('would', 9361), ('about', 8388), ('could', 6851), ('other', 6340), ('these', 5859), ('movie', 5826), ('after', 5686), ('first', 5294), ('great', 4738), ('where', 4403), ('every', 4398), ('never', 4030), ('house', 3960), ('being', 3789), ('those', 3657), ('while', 3534)]

20 least frequent words in grand corpus:

[('agnew', 1), ('borge', 1), ('menem', 1), ('imsai', 1), ('gingl', 1), ('harpo', 1), ('chary', 1), ('kuala', 1), ('shrum', 1), ('kelli', 1), ('nelms', 1), ('desai', 1), ('kuhns', 1), ('erode', 1), ('kuvin', 1), ('soups', 1), ('coors', 1), ('spiro', 1), ('milne', 1), ('rotie', 1)]



In [19]:
# Frequency table for the five-letter grand corpus, sorted descending; preview the first ten rows
grand_freqs = get_word_distribution(
    grand_corpus_tokens,
    sort = "descending",
)
grand_freqs[:10]

[('which', 15760),
 ('their', 13925),
 ('there', 13447),
 ('shall', 11997),
 ('would', 9361),
 ('about', 8388),
 ('could', 6851),
 ('other', 6340),
 ('these', 5859),
 ('movie', 5826)]

In [20]:
# Index the frequency table by word once, so each official word is a single dictionary lookup
# instead of a scan over every entry of grand_freqs.
# NOTE(review): assumes each word appears at most once in grand_freqs -- confirm against
# get_word_distribution
freq_entry_by_word = {tup[0]: tup for tup in grand_freqs}

# entries for the official words that have a corpus frequency, in official_words order
wordle_freq_ratings = [
    freq_entry_by_word[word] for word in official_words if word in freq_entry_by_word
]

found_words_sorted = sorted(wordle_freq_ratings, key = operator.itemgetter(1), reverse = True) # sorted descending
print(len(found_words_sorted)) # 2132 of 2309 wordle words will have a frequency

2132


### Grand Corpus - Other Word Lengths

In [21]:
### Token lists, type sets and frequency files for word lengths other than 5
### (the 5-letter data is built from the per-corpus lists in the cells above).
### Each corpus is read once and every token is routed to the list for its length;
### the earlier version re-read every corpus once per length and took about 50s.

other_word_lengths = (3, 4, 6, 7, 8, 9, 10)

# one token list per word length, filled in the loop below
tokens_by_length = {word_len: [] for word_len in other_word_lengths}

corpora = [brown, treebank, switchboard, gutenberg, movie_reviews]

for corpus in corpora:
    for word in corpus.words():
        word = word.lower()
        tokens_list = tokens_by_length.get(len(word)) # None when this length is not collected
        if tokens_list is not None and word.isalpha():
            tokens_list.append(word)

# individual names kept so that other cells can keep referring to them
grand_corpus_tokens_3 = tokens_by_length[3]
grand_corpus_tokens_4 = tokens_by_length[4]
grand_corpus_tokens_6 = tokens_by_length[6]
grand_corpus_tokens_7 = tokens_by_length[7]
grand_corpus_tokens_8 = tokens_by_length[8]
grand_corpus_tokens_9 = tokens_by_length[9]
grand_corpus_tokens_10 = tokens_by_length[10]

# (token list, word length) pairs, in ascending word-length order
tokens_lists = [(tokens_by_length[word_len], word_len) for word_len in other_word_lengths]

grand_corpus_types_3 = set(grand_corpus_tokens_3)
grand_corpus_types_4 = set(grand_corpus_tokens_4)
grand_corpus_types_6 = set(grand_corpus_tokens_6)
grand_corpus_types_7 = set(grand_corpus_tokens_7)
grand_corpus_types_8 = set(grand_corpus_tokens_8)
grand_corpus_types_9 = set(grand_corpus_tokens_9)
grand_corpus_types_10 = set(grand_corpus_tokens_10)

# number of types and a small sample for each word length
for types in (
    grand_corpus_types_3, grand_corpus_types_4, grand_corpus_types_6,
    grand_corpus_types_7, grand_corpus_types_8, grand_corpus_types_9,
    grand_corpus_types_10,
):
    print(len(types))
    print(list(types)[:7])

for token_list, word_len in tokens_lists:

    tokens_list_word_freq = get_word_distribution(token_list, sort = "descending")

    ### writing types and counts to .txt file (one tab-separated "word<TAB>count" pair per line)
    # encoding is explicit because str.isalpha() also accepts non-ASCII letters
    with open(f"data/nltk_grand_corpus_types_and_counts_{word_len}.txt", "w", encoding = "utf-8") as fout:
        for word, count in tokens_list_word_freq:
            fout.write(word + "\t" + str(count) + "\n")