# scraping a less impossible wordle target word list

In [1]:
import numpy as np
from bs4 import BeautifulSoup as bsoup
import requests
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
# Launch a Selenium-controlled Chrome session and open the Wordle word-list page.
#
# this works for me (without having driver binaries in a system path)
# NOTE(review): ChromeDriverManager is imported here but never called in this
# cell, and Service() below is built without an explicit driver path -- confirm
# whether ChromeDriverManager().install() was meant to be passed to Service().
from webdriver_manager.chrome import ChromeDriverManager

# we're not a robot, right?
# Pass Chrome the switch that disables the "AutomationControlled" Blink feature
# (presumably so the site is less likely to flag the session as automated).
options = webdriver.ChromeOptions()
options.add_argument('--disable-blink-features=AutomationControlled')

# start service (constructed with no arguments, i.e. no explicit chromedriver path)
service = Service()

# driver with options
driver = webdriver.Chrome(service=service, options=options)

# get the webpage
# NOTE(review): in the part of the notebook visible here, `driver` is never read
# from again and never quit -- call driver.quit() once the browser is no longer needed.
driver.get('https://www.wordunscrambler.net/word-list/wordle-word-list')

In [3]:
# Download the Wordle word-list page with requests and parse it into a
# BeautifulSoup tree (`soup`) using the lxml parser.
#
# requests applies no timeout unless one is given, so without it this cell
# could block forever on an unresponsive server.
response = requests.get(
    'https://www.wordunscrambler.net/word-list/wordle-word-list',
    timeout=30,
)
print(response.status_code)

# Stop here on a 4xx/5xx reply instead of silently parsing an error page.
response.raise_for_status()

soup = bsoup(response.text, 'lxml')

# dump the whole parsed document so its structure can be inspected
print(soup.prettify())

200
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <title>
   Wordle Words - All 2309 Words (Not in Order) No Spoilers!
  </title>
  <link href="/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link as="style" href="/css/bs-main.min.css" rel="preload"/>
  <link href="/css/bs-main.min.css" rel="stylesheet"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <link as="font" href="/fonts/glyphicons-halflings-regular.woff2" rel="preload"/>
  <style>
   @font-face {
            font-family: 'Glyphicons Halflings';
            src: url('/fonts/glyphicons-halflings-regular.woff2') format('woff2');
            font-display: swap;
        }
  </style>
  <script type="application/ld+json">
   {
          "@context": "https://schema.org/", 
          "@type": "BreadcrumbList", 
          "itemListElement": [{
            "@type": "ListItem", 
            "position": 1, 
            "name": "Word Unscrambler",
            "item": "https://www

In [4]:
# Pull the Wordle target words out of the parsed page, sanity-check the count,
# upper-case them and save them to 'five_letter_words_normal.npy'.
a_href = soup.find_all('a', href=True)


def _leading_text(tag):
    """Return the first child of *tag* if it is a text node, otherwise None.

    Anchors with no children, or whose first child is a nested tag (e.g. an
    image link), yield None instead of raising IndexError / being measured
    by their number of children.
    """
    children = tag.contents
    if children and isinstance(children[0], str):
        return children[0]
    return None


# note that all target words we are after are exactly 5 letters long and contain no upper case letters
link_texts = (_leading_text(tag) for tag in a_href)
words = [
    text for text in link_texts
    if text is not None and len(text) == 5 and text.islower()
]

# from the text header of the website, there are exactly 2309 words in the wordle list
# check that we found exactly that many
assert len(words) == 2309, f'expected 2309 target words, found {len(words)}'

# for consistency with the hard mode list, set to upper case
words = [w.upper() for w in words]

# print the first 25 words
for word in words[:25]:
    print(word)

np.save('five_letter_words_normal.npy', words, allow_pickle=True)

ABACK
ABASE
ABATE
ABBEY
ABBOT
ABHOR
ABIDE
ABLED
ABODE
ABORT
ABOUT
ABOVE
ABUSE
ABYSS
ACORN
ACRID
ACTOR
ACUTE
ADAGE
ADAPT
ADEPT
ADMIN
ADMIT
ADOBE
ADOPT


# get list of all valid wordle guesses
Source: https://github.com/tabatkins/wordle-list/blob/main/words (presumably saved locally as `wordle_possibles.txt`, which the next cell reads)

In [13]:
# Read the list of valid guesses (one word per line) from
# 'wordle_possibles.txt', upper-case it, preview it and save it to
# 'valid_guesses.npy'.
# NOTE: this rebinds `words`, replacing the target-word list built earlier.
#
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('wordle_possibles.txt', 'r', encoding='utf-8') as f:
    # strip the line terminator (and any stray padding) and skip blank lines,
    # so an empty string never ends up in the guess list
    stripped_lines = (line.strip() for line in f)
    words = [entry.upper() for entry in stripped_lines if entry]

# print the first 25 words (a slice also copes with a file shorter than 25 lines)
for word in words[:25]:
    print(word)

np.save('valid_guesses.npy', words, allow_pickle=True)

ROSSA
JETTY
WIZZO
CUPPA
COHOE
GURKS
SQUAD
BEISA
SHRUG
FOSSA
FLUYT
CAMUS
SPEED
MAMIL
ARRAY
POLIO
BARNS
PANES
SOUTS
LIMAS
FETCH
QUECK
TWINK
GRAZE
CROCK
