Background reading (Wired, Wordle tips): https://www.wired.com/story/best-wordle-tips/

In [None]:
# Load the Kedro IPython extension (presumably what provides the `%reload_kedro`
# magic used in the next cell — confirm against the Kedro docs).
%load_ext kedro.ipython

In [None]:
# Reload the Kedro project. `catalog` is used further down without being defined
# in this notebook, so it is presumably (re)bound here — TODO confirm.
%reload_kedro

In [3]:
import sys

# Put the project source directory on the import path (presumably so that
# `from utils import ...` in the imports cell resolves).
# Guarded so that re-running this cell does not append the same entry again.
# NOTE(review): the path is relative, so it depends on the notebook's working directory.
if "../src/projectwordle" not in sys.path:
    sys.path.append("../src/projectwordle")

In [4]:
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List
from IPython.display import HTML
from more_itertools import sliding_window, pairwise
from utils import (
    color_pattern_matching,
    plot_guess_stats,
    plot_challenge_stats,
    difficulty_distribution,
    plot_most_difficult_words,
    plot_guess_stats_highlighted_words
)

# Let polars print up to 48 rows of a DataFrame. The trailing `;` stops the
# return value from being echoed as cell output.
pl.Config.set_tbl_rows(48);

[1m<[0m[1;95mpolars.config.Config[0m[39m object at [0m[1;36m0x316cafe20[0m[1m>[0m

# Load Data

In [None]:
# Load the two input datasets by name from `catalog` (not defined in this
# notebook; presumably injected by the Kedro IPython extension — confirm).
five_letter_word_anagrams = catalog.load("five_letter_words_anagrams")
simulating_top_words = catalog.load("simulating_top_words")

In [6]:
# Left-join the anagram lookup onto the simulation results, keyed on `next_guess`,
# then keep an explicit list of columns for the analysis below.
# NOTE(review): `anagrams` is pulled from the lookup but is not in the final column
# list, and `guess_word_anagrams` must already exist on `simulating_top_words`.
# In the preview output `anagram_num` appears to follow `guess` (e.g. "maise" -> 1)
# rather than `next_guess` — confirm the intended join key and whether this join
# is still needed.
simulating_top_words_tries_difficulty = (
    simulating_top_words
    .join(
        five_letter_word_anagrams.select("words", "anagrams", "anagram_num"),
        left_on="next_guess",
        right_on="words",
        how="left",
        coalesce=True
    )
    .select(
        "challenge", "guess", "letter_differences", "common_letters",
        "num_diff_letters", "num_common_letters", "num_matching_index",
        "match_pattern", "num_choices_after_guess", "possible_guesses",
        "challenge_in_possible_guesses", "next_guess", "group", "tries", "difficulty",
        "guess_word_anagrams", "anagram_num"
    )
)

In [7]:
# Preview the first rows of the joined frame.
simulating_top_words_tries_difficulty.head()

challenge,guess,letter_differences,common_letters,num_diff_letters,num_common_letters,num_matching_index,match_pattern,num_choices_after_guess,possible_guesses,challenge_in_possible_guesses,next_guess,group,tries,difficulty,guess_word_anagrams,anagram_num
str,str,str,str,u8,u8,u8,str,u16,str,bool,str,u32,u8,str,str,u8
"""bagie""","""maise""","""ms""","""aie""",2,3,2,"""BGYBG""",6,"""bagie, jaxie, tawie, cadie, ca…",True,"""bagie""",0,2,"""easy""","""maise""",1
"""bagie""","""bagie""","""""","""aiebg""",0,5,5,"""GGGGG""",0,"""""",False,"""bagie""",0,2,"""easy""","""bagie, bigae""",2
"""bagie""","""bagie""","""""","""aiebg""",0,5,5,"""GGGGG""",0,"""""",False,"""bagie""",0,2,"""easy""","""bagie, bigae""",2
"""bagie""","""bagie""","""""","""aiebg""",0,5,5,"""GGGGG""",0,"""""",False,"""bagie""",0,2,"""easy""","""bagie, bigae""",2
"""bagie""","""bagie""","""""","""aiebg""",0,5,5,"""GGGGG""",0,"""""",False,"""bagie""",0,2,"""easy""","""bagie, bigae""",2


In [8]:
# Total row count. A single `group` spans several rows (see the preview),
# so this is larger than the number of games.
simulating_top_words_tries_difficulty.height

[1;36m3816960[0m

In [9]:
# Count difficulty labels with one row kept per game (`group`), most common first.
(
    simulating_top_words_tries_difficulty
    .unique(subset="group", keep="first")
    .get_column("difficulty")
    .value_counts()
    .sort(by="count", descending=True)
)

difficulty,count
str,u32
"""moderate""",300001
"""hard""",264193
"""very hard""",60412
"""easy""",11554


# Create dataframe of first guesses

In [10]:
# Keep the first row of every `group` (the opening guess of each game),
# preserving the order in which the groups appear.
first_guess = (
    simulating_top_words_tries_difficulty
    .group_by("group", maintain_order=True)
    .agg(pl.all().first())
)

In [11]:
# Preview the first 18 opening-guess rows.
first_guess.head(18)

group,challenge,guess,letter_differences,common_letters,num_diff_letters,num_common_letters,num_matching_index,match_pattern,num_choices_after_guess,possible_guesses,challenge_in_possible_guesses,next_guess,tries,difficulty,guess_word_anagrams,anagram_num
u32,str,str,str,str,u8,u8,u8,str,u16,str,bool,str,u8,str,str,u8
0,"""bagie""","""maise""","""ms""","""aie""",2,3,2,"""BGYBG""",6,"""bagie, jaxie, tawie, cadie, ca…",True,"""bagie""",2.0,"""easy""","""maise""",1.0
1,"""bagie""","""least""","""lts""","""ea""",3,2,0,"""BYYBB""",349,"""maybe, carve, cream, dance, ra…",False,"""maybe""",4.0,"""moderate""","""least, steal, slate, stale, st…",8.0
2,"""bagie""","""ariel""","""lr""","""aie""",2,3,0,"""YBYYB""",20,"""image, vitae, mamie, sepia, in…",False,"""image""",3.0,"""moderate""","""ariel, raile""",2.0
3,"""bagie""","""caste""","""cts""","""ae""",3,2,2,"""BGBBG""",108,"""large, maize, range, value, ma…",False,"""large""",4.0,"""moderate""","""caste, cesta, sceat""",3.0
4,"""bagie""","""slate""","""lts""","""ae""",3,2,1,"""BBYBG""",159,"""maybe, waive, argue, dance, ra…",False,"""maybe""",4.0,"""moderate""",,
5,"""bagie""","""aisle""","""ls""","""aie""",2,3,1,"""YYBBG""",26,"""image, zaire, maize, naive, ba…",False,"""image""",3.0,"""moderate""","""aisle""",1.0
6,"""bagie""","""paseo""","""sop""","""ae""",3,2,1,"""BGBYB""",132,"""early, eagle, earth, table, la…",False,"""early""",6.0,"""hard""","""paseo, psoae""",2.0
7,"""bagie""","""morae""","""rmo""","""ae""",3,2,1,"""BBBYG""",256,"""value, awake, shape, stage, st…",False,"""value""",,"""very hard""","""morae""",1.0
8,"""bagie""","""serai""","""rs""","""eai""",2,3,0,"""BYBYY""",48,"""image, adieu, alike, alive, na…",False,"""image""",4.0,"""moderate""",,
9,"""bagie""","""react""","""ctr""","""ea""",3,2,0,"""BYYBB""",309,"""james, badge, panel, ahead, va…",False,"""james""",5.0,"""hard""",,


# EDA

In [12]:
# Plot difficulty distribution
# NOTE(review): this passes the full frame (several rows per `group`), whereas
# other cells first reduce to one row per group — confirm that
# `difficulty_distribution` handles that as intended.
difficulty_distribution(
    simulating_top_words_tries_difficulty
)

## Most frequent number of guesses

In [13]:
# How often each number of tries occurs across games (null = never solved),
# most frequent first.
(
    first_guess
    .get_column("tries")
    .value_counts()
    .sort("count", descending=True)
)

tries,count
u8,u32
4.0,203927
5.0,174169
3.0,96074
6.0,90024
,60412
2.0,11486
1.0,68


## Top most popular next guess

In [14]:
# The ten words most often chosen as the follow-up to an opening guess.
top_ten_next_guess = (
    first_guess
    .get_column("next_guess")
    .value_counts()
    .sort("count", descending=True)
    .head(10)
)

In [15]:
# Display the ten most popular follow-up guesses with their counts.
top_ten_next_guess

next_guess,count
str,u32
"""sound""",12701
"""final""",9442
"""model""",9365
"""night""",9285
"""basic""",8349
"""shall""",7957
"""bring""",7805
"""china""",7246
"""south""",7136
"""judge""",6866


In [16]:
# Difficulty distribution restricted to rows whose next guess is one of the ten
# most popular follow-up words.
# NOTE(review): the top ten was derived from opening guesses (`first_guess`) but
# the filter runs on the full frame, so later turns can match too — confirm.
difficulty_distribution(
    simulating_top_words_tries_difficulty.filter(
        pl.col("next_guess").is_in(top_ten_next_guess["next_guess"].to_list())
    )
)

## Anagrams  

Only the most frequently occurring anagrams were considered for root words, after the five-letter words were processed to remove plurals, names and places. As such, there won't be any anagrams for `leads`, as its root word is `lead`, which is a 4-letter word.  

However, if our challenge word is `leads`, or even its anagram `deals`, we can use their anagram `slade` to make a correct guess.

In [17]:
# Rows where the guess is an anagram of the challenge word: no differing letters,
# all five letters in common, but not the challenge word itself. Excluding exact
# hits lets us see how quickly an anagram guess converges on the answer.
anagrams = simulating_top_words_tries_difficulty.filter(
    pl.col("letter_differences") == "",
    pl.col("num_common_letters") == 5,
    pl.col("challenge") != pl.col("guess"),
)

anagrams.head()

challenge,guess,letter_differences,common_letters,num_diff_letters,num_common_letters,num_matching_index,match_pattern,num_choices_after_guess,possible_guesses,challenge_in_possible_guesses,next_guess,group,tries,difficulty,guess_word_anagrams,anagram_num
str,str,str,str,u8,u8,u8,str,u16,str,bool,str,u32,u8,str,str,u8
"""spaer""","""spare""","""""","""spare""",0,5,3,"""GGGYY""",1,"""spaer""",True,"""spaer""",2512,2,"""easy""","""spare, spear, parse, asper, pr…",7.0
"""stela""","""least""","""""","""least""",0,5,0,"""YYYYY""",2,"""steal, stela""",True,"""steal""",5321,3,"""moderate""","""least, steal, slate, stale, st…",8.0
"""stela""","""steal""","""""","""least""",0,5,3,"""GGGYY""",1,"""stela""",True,"""stela""",5321,3,"""moderate""",,
"""stela""","""slate""","""""","""slate""",0,5,1,"""GYYYY""",4,"""steal, stela, salet, setal""",True,"""steal""",5324,3,"""moderate""",,
"""stela""","""steal""","""""","""slate""",0,5,3,"""GGGYY""",1,"""stela""",True,"""stela""",5324,3,"""moderate""",,


In [18]:
# Seeded generator so the randomly picked example games below are the same on
# every Restart & Run All.
rng = np.random.default_rng(seed=42)

In [19]:
# Pick one game at random among those containing an anagram guess, then render
# its rows as an HTML table with the coloured match pattern.

anagrams_select_group = rng.choice(anagrams["group"], size=1, replace=False)[0]

HTML(
    simulating_top_words_tries_difficulty
    .filter(pl.col("group") == anagrams_select_group)
    .to_pandas()
    .assign(
        match_pattern=lambda game_df: color_pattern_matching(
            dataf=game_df,
            challenge_col="challenge",
            guess_col="guess",
        )
    )
    .loc[
        :,
        [
            "challenge", "guess", "match_pattern", "next_guess",
            "num_choices_after_guess", "possible_guesses", "difficulty",
        ],
    ]
    # escape=False: the match_pattern cells are meant to be rendered as markup.
    .to_html(escape=False)
)


Unnamed: 0,challenge,guess,match_pattern,next_guess,num_choices_after_guess,possible_guesses,difficulty
0,steal,slate,slate,steal,4,"steal, stela, salet, setal",easy
1,steal,steal,steal,steal,0,,easy
2,steal,steal,steal,steal,0,,easy
3,steal,steal,steal,steal,0,,easy
4,steal,steal,steal,steal,0,,easy
5,steal,steal,steal,steal,0,,easy


### Challenge words without `r` , `s` or `t`
These are some of the most frequently occurring letters, so leaving them out lets us see how our naive algorithm arrives at the correct guess without them.

In [20]:
# Games whose challenge word contains none of 'r', 's' or 't' and whose opening
# guess was not already the answer.
challenge_words_without_r_s_t = first_guess.filter(
    ~(pl.col("challenge").str.contains("r|s|t"))
    & (pl.col("challenge") != pl.col("guess"))
)

# Show a few random rows. Sampling exactly the rows displayed (instead of 50 and
# then taking the head) with a fixed seed keeps the output reproducible.
challenge_words_without_r_s_t.sample(n=5, with_replacement=False, seed=42)

group,challenge,guess,letter_differences,common_letters,num_diff_letters,num_common_letters,num_matching_index,match_pattern,num_choices_after_guess,possible_guesses,challenge_in_possible_guesses,next_guess,tries,difficulty,guess_word_anagrams,anagram_num
u32,str,str,str,str,u8,u8,u8,str,u16,str,bool,str,u8,str,str,u8
20522,"""alley""","""realo""","""or""","""eal""",2,3,0,"""BYYYB""",101,"""equal, plead, alive, ideal, va…",True,"""equal""",4,"""moderate""","""realo""",1.0
442245,"""almah""","""teary""","""ryet""","""a""",4,1,0,"""BBYBB""",958,"""basis, lilac, naval, allow, lo…",False,"""basis""",5,"""hard""","""teary""",1.0
450935,"""bocci""","""later""","""aetlr""","""""",5,0,0,"""BBBBB""",904,"""music, nicky, jimmy, shock, yo…",False,"""music""",5,"""hard""","""later, alter, alert, artel, ra…",6.0
572707,"""glike""","""caret""","""ctra""","""e""",4,1,0,"""BBBYB""",743,"""jesus, homme, slope, smile, se…",False,"""jesus""",5,"""hard""",,
383590,"""blawn""","""sayer""","""ryes""","""a""",4,1,0,"""BYBBB""",600,"""black, comma, plain, china, to…",False,"""black""",5,"""hard""","""sayer, arsey, eyras, resay""",4.0


In [21]:
# Plot the difficulty distribution of challenge words that don't contain 'r', 's' or 't'
difficulty_distribution(challenge_words_without_r_s_t)

## Null values (Incomplete games)

In [22]:
# Rows belonging to games with no recorded number of tries; per the section
# heading these null values mark incomplete games.
incomplete_games = (
    simulating_top_words_tries_difficulty
    .filter(pl.col("tries").is_null())
)

In [23]:
# Preview the first rows of the incomplete games.
incomplete_games.head()

challenge,guess,letter_differences,common_letters,num_diff_letters,num_common_letters,num_matching_index,match_pattern,num_choices_after_guess,possible_guesses,challenge_in_possible_guesses,next_guess,group,tries,difficulty,guess_word_anagrams,anagram_num
str,str,str,str,u8,u8,u8,str,u16,str,bool,str,u32,u8,str,str,u8
"""bagie""","""morae""","""rmo""","""ae""",3,2,1,"""BBBYG""",256,"""value, awake, shape, stage, st…",False,"""value""",7,,"""very hard""","""morae""",1
"""bagie""","""value""","""vuolmr""","""ae""",6,2,2,"""BGBBG""",56,"""waste, bathe, dance, taste, ha…",False,"""waste""",7,,"""very hard""","""value, uveal""",2
"""bagie""","""waste""","""vuwsotlmr""","""ae""",9,2,2,"""BGBBG""",24,"""dance, cache, badge, payee, na…",False,"""dance""",7,,"""very hard""","""waste, sweat, tawse, twaes""",4
"""bagie""","""dance""","""nvuwcdsotlmr""","""ae""",12,2,2,"""BGBBG""",5,"""payee, baize, gaffe, bagie, ja…",True,"""payee""",7,,"""very hard""","""dance""",1
"""bagie""","""payee""","""nvuwycdspotlmr""","""ae""",14,2,2,"""BGBYG""",4,"""baize, gaffe, bagie, jaxie""",True,"""baize""",7,,"""very hard""","""payee""",1


In [24]:
# Pick one incomplete game at random and render its guesses as an HTML table
# with the coloured match pattern.
null_select_group = rng.choice(incomplete_games["group"], size=1, replace=False)[0]

HTML(
    incomplete_games
    .filter(pl.col("group") == null_select_group)
    .to_pandas()
    .assign(
        match_pattern=lambda game_df: color_pattern_matching(
            dataf=game_df,
            challenge_col="challenge",
            guess_col="guess",
        )
    )
    .loc[
        :,
        [
            "challenge", "guess", "match_pattern", "next_guess",
            "num_choices_after_guess", "possible_guesses", "difficulty",
        ],
    ]
    # escape=False: the match_pattern cells are meant to be rendered as markup.
    .to_html(escape=False)
)

Unnamed: 0,challenge,guess,match_pattern,next_guess,num_choices_after_guess,possible_guesses,difficulty
0,racer,aurei,aurei,water,106,"water, laser, paper, later, layer, eager, ranee, eater, baker, maker",very hard
1,racer,water,water,paper,63,"paper, maker, layer, laser, eager, sager, saber, baker, saver, laver",very hard
2,racer,paper,paper,layer,53,"layer, saber, laser, baker, maker, eager, haver, lager, baser, racer",very hard
3,racer,layer,layer,baker,37,"baker, baser, maker, eager, saber, gazer, sager, racer, jager, saver",very hard
4,racer,baker,baker,eager,26,"eager, maser, saver, racer, gazer, haver, jager, sager, fader, gamer",very hard
5,racer,eager,eager,saver,18,"saver, racer, maser, fader, haver, saser, raver, caver, mazer, namer",very hard


### Plot incomplete games stats

In [25]:
# Per challenge word: number of incomplete games and that number as a percentage
# of the distinct opening guess words.
# `incomplete_games` has several rows per game, so keep one row per `group`
# before counting. `value_counts()` already yields each challenge exactly once,
# so no extra join against the distinct challenge words is needed.
incomplete_games_stats = (
    incomplete_games
    .unique(subset="group", keep="first")
    .get_column("challenge")
    .value_counts()
    .rename({"count": "incomplete_games_count"})
    .with_columns(
        # NOTE(review): UInt8 tops out at 255; widen this if more than 255
        # opening guesses are ever simulated per challenge.
        pl.col("incomplete_games_count").cast(pl.UInt8),
        incomplete_games_pct=(
            (pl.col("incomplete_games_count") / first_guess["guess"].n_unique() * 100)
            .round(2)
            .cast(pl.Float32)
        ),
    )
    .sort(by="incomplete_games_pct", descending=True)
)

incomplete_games_stats.head()

challenge,incomplete_games_count,incomplete_games_pct
str,u8,f32
"""laxer""",70,100.0
"""kakis""",70,100.0
"""koker""",70,100.0
"""zappy""",70,100.0
"""waker""",70,100.0


### Using `first_guess` dataframe

In [26]:
# Same count derived from `first_guess` (one row per game): number of games
# labelled "very hard" per challenge word, largest first.
incomplete_games_count = (
    first_guess
    .filter(pl.col("difficulty") == "very hard")
    .group_by("challenge")
    .agg(incomplete_games_count=pl.len())
    .sort(by="incomplete_games_count", descending=True)
)

incomplete_games_count.head()

challenge,incomplete_games_count
str,u32
"""gager""",70
"""raker""",70
"""laxer""",70
"""tater""",70
"""kakis""",70


In [27]:
# Plot the per-challenge incomplete-game statistics (presumably highlighting the
# hardest words — see `plot_most_difficult_words` in `utils`).
plot_most_difficult_words(dataf=incomplete_games_stats)

## Words never guessed

In [28]:
# Challenge words whose games were incomplete for every opening guess word
# (incomplete percentage of exactly 100).
(
    incomplete_games_stats
    .filter(pl.col("incomplete_games_pct") == 100)
)

challenge,incomplete_games_count,incomplete_games_pct
str,u8,f32
"""laxer""",70,100.0
"""kakis""",70,100.0
"""koker""",70,100.0
"""zappy""",70,100.0
"""waker""",70,100.0
"""gager""",70,100.0
"""kerry""",70,100.0
"""tater""",70,100.0
"""razer""",70,100.0
"""boozy""",70,100.0


# Guess word statistics

In [29]:
# Per opening guess word, aggregated over all games (one row per game in
# `first_guess`): typical and average number of tries, share of unsolved games,
# and the average number of letters identified / matched in position.
guess_stats = (
    first_guess
    .group_by("guess")
    .agg(
        # `mode()` returns every tied value; taking the smallest guarantees exactly
        # one row per guess word (the previous `explode` duplicated the row on ties).
        tries_mode=pl.col("tries").mode().min().cast(pl.UInt8),
        tries_mean=pl.col("tries").mean().round(3).cast(pl.Float32),
        tries_null_pct=(pl.col("tries").is_null().mean() * 100).round(3).cast(pl.Float32),
        avg_letters_identified=pl.col("num_common_letters").mean().round(3).cast(pl.Float32),
        avg_letters_matched=pl.col("num_matching_index").mean().round(3).cast(pl.Float32),
        # First value seen for the guess word; replaces the separate
        # unique-and-join step that fetched the same value.
        guess_word_anagrams=pl.col("guess_word_anagrams").first(),
    )
)

guess_stats.head()

guess,tries_mode,tries_mean,tries_null_pct,avg_letters_identified,avg_letters_matched,guess_word_anagrams
str,u8,f32,f32,f32,f32,str
"""caret""",4,4.387,8.583,1.585,0.531,
"""raise""",4,4.379,10.409,1.789,0.509,"""raise, arise, serai, aesir"""
"""morae""",4,4.426,9.727,1.643,0.559,"""morae"""
"""aiery""",4,4.422,12.082,1.661,0.425,"""aiery, ayrie"""
"""slate""",4,4.368,8.198,1.665,0.498,


### Scatter Plot of Avg. letters identified vs Avg. tries

In [30]:
# Scatter of average letters identified (x) against average tries (y), one point
# per guess word; `words` names the column holding the guess word.
plot_guess_stats(
    dataf=guess_stats,
    x_axis_data="avg_letters_identified",
    y_axis_data="tries_mean",
    words="guess",
    title="Avg. letters identified vs Avg. tries",
    xaxis_title="Avg. letters identified",
    yaxis_title="Avg. tries"
)

In [None]:
# Load four anagram word groups from the project parameters (`params:` entries
# in the catalog).
raise_anagrams = catalog.load("params:raise_anagrams")
trace_anagrams = catalog.load("params:trace_anagrams")
soare_anagrams = catalog.load("params:soare_anagrams")
salet_anagrams = catalog.load("params:salet_anagrams")

# Concatenate the groups into one collection of words to highlight in the plots
# (assumes each parameter is a list of words — confirm in the parameters file).
words_to_highlight = raise_anagrams + trace_anagrams + soare_anagrams + salet_anagrams

In [32]:
# Same axes as the previous scatter, with the selected anagram words highlighted.
# Here `words` receives the collection of words themselves rather than a column name.
plot_guess_stats_highlighted_words(
    dataf=guess_stats,
    x_axis_data="avg_letters_identified",
    y_axis_data="tries_mean",
    words=words_to_highlight,
    title="Highlighted Avg. letters identified vs Avg. tries",
    xaxis_title="Avg. letters identified",
    yaxis_title="Avg. tries"
)

In [33]:
# Stats for the guess words in `soare_anagrams` only: most letters identified
# first, ties broken by fewer average tries.
(
    guess_stats
    .filter(pl.col("guess").is_in(soare_anagrams))
    .drop("guess_word_anagrams")
    .sort(by=["avg_letters_identified", "tries_mean"], descending=[True, False])
)

guess,tries_mode,tries_mean,tries_null_pct,avg_letters_identified,avg_letters_matched
str,u8,f32,f32,f32,f32
"""soare""",4,4.38,10.53,1.809,0.598
"""aeros""",4,4.405,9.98,1.809,0.461
"""arose""",4,4.412,10.42,1.809,0.432


In [34]:
# Top ten guess words ranked by average letters identified (ties broken by fewer
# average tries), with a 1-based rank column.
(
    guess_stats
    .drop("guess_word_anagrams")
    .sort(by=["avg_letters_identified", "tries_mean"], descending=[True, False])
    .with_row_index(offset=1)
    .head(10)
)

index,guess,tries_mode,tries_mean,tries_null_pct,avg_letters_identified,avg_letters_matched
u32,str,u8,f32,f32,f32,f32
1,"""soare""",4,4.38,10.53,1.809,0.598
2,"""aeros""",4,4.405,9.98,1.809,0.461
3,"""arose""",4,4.412,10.42,1.809,0.432
4,"""arise""",4,4.377,9.43,1.789,0.434
5,"""raise""",4,4.379,10.409,1.789,0.509
6,"""aesir""",4,4.394,9.221,1.789,0.384
7,"""serai""",4,4.397,9.859,1.789,0.472
8,"""laser""",4,4.414,9.397,1.746,0.44
9,"""realo""",4,4.447,10.123,1.743,0.369
10,"""stare""",4,4.363,9.76,1.739,0.466


### Scatter Plot of Avg. number of letters matched vs Avg. tries

In [35]:
# Scatter of average letters matched in position (x) against average tries (y),
# one point per guess word.
plot_guess_stats(
    dataf=guess_stats,
    x_axis_data="avg_letters_matched",
    y_axis_data="tries_mean",
    words="guess",
    title="Avg. number of letters matched vs Avg. tries",
    xaxis_title="Avg. number of letters matched",
    yaxis_title="Avg. tries"
)

In [36]:
# Same axes as the previous scatter, with the selected anagram words highlighted.
plot_guess_stats_highlighted_words(
    dataf=guess_stats,
    x_axis_data="avg_letters_matched",
    y_axis_data="tries_mean",
    words=words_to_highlight,
    title="Highlighted Avg. number of letters matched vs Avg. tries",
    xaxis_title="Avg. number of letters matched",
    yaxis_title="Avg. tries"
)

In [37]:
# Top twenty guess words ranked by average letters matched in position (ties
# broken by fewer average tries), with a 1-based rank column.
(
    guess_stats
    .select("guess", "avg_letters_matched", "tries_mean")
    .sort(by=["avg_letters_matched", "tries_mean"], descending=[True, False])
    .with_row_index(offset=1)
    .head(20)
)

index,guess,avg_letters_matched,tries_mean
u32,str,f32,f32
1,"""soare""",0.598,4.38
2,"""sarge""",0.581,4.439
3,"""saice""",0.576,4.38
4,"""saute""",0.568,4.423
5,"""salue""",0.56,4.413
6,"""morae""",0.559,4.426
7,"""salet""",0.556,4.358
8,"""soave""",0.554,4.521
9,"""carte""",0.551,4.392
10,"""sabre""",0.547,4.396


# Challenge word statistics

In [38]:
# Per challenge word: typical and average number of tries and the share of
# games that were never solved, hardest words first.
# Aggregates over `first_guess` (one row per game) so every game counts exactly
# once regardless of how many rows it has in the full frame; this also matches
# how the overall incompletion rate is computed.
challenge_stats = (
    first_guess
    .group_by("challenge")
    .agg(
        # `mode()` returns every tied value; taking the smallest guarantees one
        # row per challenge (the previous `explode` duplicated the row on ties).
        tries_mode=pl.col("tries").mode().min().cast(pl.UInt8),
        tries_mean=pl.col("tries").mean().cast(pl.Float32).round(3),
        tries_null_pct=(pl.col("tries").is_null().mean().cast(pl.Float32) * 100).round(3),
    )
    .sort("tries_null_pct", descending=True)
)

In [39]:
# Share of games (one row per game in `first_guess`) with no recorded number of tries.
print(f"Overall incompletion rate: {first_guess['tries'].null_count() / first_guess.height * 100:.2f}%")

Overall incompletion rate: 9.50%


In [40]:
# Scatter of average tries (x) against the percentage of unsolved games (y),
# one point per challenge word.
plot_challenge_stats(
    dataf=challenge_stats,
    x_axis_data="tries_mean",
    y_axis_data="tries_null_pct",
    words="challenge",
    title="Scatter Plot of Avg. tries for Challenge words vs Failure rate",
    xaxis_title="Avg. tries",
    yaxis_title="Failure rate"
)