### **Performing Exploratory Data Analysis of [Term.ooo](https://term.ooo) game** *(Brazilian Portuguese variant of [Wordle](https://www.nytimes.com/games/wordle/index.html))*

Only valid answers are being considered for now.

In [1]:
import plotly.graph_objects as go
import pandas as pd

#### Defining constants

In [2]:
WORDS_LENGTH = 5

#### Loading the data

In [3]:
filepath = f"data\\valid_answers.txt"

with open(filepath, "r", encoding="utf-8") as f:
    df_raw = pd.DataFrame(f.read().splitlines(), columns=["word"])

df_raw

Unnamed: 0,word
0,abano
1,abono
2,abril
3,abrir
4,abuso
...,...
1437,zerar
1438,zinco
1439,ziper
1440,zonas


#### Spliting the words into chars

In [4]:
df_chars = pd.DataFrame()

for i in range(WORDS_LENGTH):
    df_chars[i] = df_raw["word"].str[i].astype("category")

df_chars["unique"] = df_raw["word"].apply(lambda row: len(set(row)))
df_chars.index = df_raw["word"]

df_chars

Unnamed: 0_level_0,0,1,2,3,4,unique
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abano,a,b,a,n,o,4
abono,a,b,o,n,o,4
abril,a,b,r,i,l,5
abrir,a,b,r,i,r,4
abuso,a,b,u,s,o,5
...,...,...,...,...,...,...
zerar,z,e,r,a,r,4
zinco,z,i,n,c,o,5
ziper,z,i,p,e,r,5
zonas,z,o,n,a,s,5


#### Counting the letter frequency for each position

In [5]:
df_counts = pd.DataFrame()

for i in range(WORDS_LENGTH):
        df_counts = pd.concat([df_counts, df_chars[i].value_counts()], axis=1)

df_counts = df_counts.convert_dtypes()
df_counts = df_counts.sort_index()
df_counts = df_counts.fillna(0)

df_counts

Unnamed: 0,0,1,2,3,4
a,113,302,81,283,374
b,88,10,42,25,0
c,154,13,74,80,0
d,53,12,53,81,0
e,25,235,76,93,127
f,107,3,13,6,0
g,57,5,47,35,0
h,16,12,5,40,0
i,19,163,117,122,8
j,21,1,8,9,0


In [6]:
fig = go.Figure()

fig.add_trace(
    go.Heatmap(
        x=df_counts.columns,
        y=df_counts.index,
        z=df_counts / len(df_chars),
    )
)

fig.update_layout(
    title="Letter Frequency Count per Position",
    height=700,
    width=500,
)

fig.show()

#### Compute the score of each word based on letter frequency per position

In [7]:
df_freq_scores = pd.DataFrame()

for i in range(WORDS_LENGTH):
        df_freq_scores[i] = df_chars[i].map(df_counts[i])

df_freq_scores["total"] = df_freq_scores.sum(axis=1)
df_freq_scores = df_freq_scores.convert_dtypes()

df_freq_scores

Unnamed: 0_level_0,0,1,2,3,4,total
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abano,113,10,81,42,419,665
abono,113,10,43,42,419,627
abril,113,10,200,122,61,506
abrir,113,10,200,122,214,659
abuso,113,10,67,64,419,673
...,...,...,...,...,...,...
zerar,7,235,200,283,214,939
zinco,7,163,147,80,419,816
ziper,7,163,29,93,214,506
zonas,7,259,147,283,184,880


#### Build 'final' DataFrame

In [8]:
df = pd.concat([df_chars, df_freq_scores], axis=1, keys=['chars', 'freq_scores'])

df.info()
df

<class 'pandas.core.frame.DataFrame'>
Index: 1442 entries, abano to zonzo
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   (chars, 0)            1442 non-null   category
 1   (chars, 1)            1442 non-null   category
 2   (chars, 2)            1442 non-null   category
 3   (chars, 3)            1442 non-null   category
 4   (chars, 4)            1442 non-null   category
 5   (chars, unique)       1442 non-null   int64   
 6   (freq_scores, 0)      1442 non-null   Int64   
 7   (freq_scores, 1)      1442 non-null   Int64   
 8   (freq_scores, 2)      1442 non-null   Int64   
 9   (freq_scores, 3)      1442 non-null   Int64   
 10  (freq_scores, 4)      1442 non-null   category
 11  (freq_scores, total)  1442 non-null   Int64   
dtypes: Int64(5), category(6), int64(1)
memory usage: 98.0+ KB


Unnamed: 0_level_0,chars,chars,chars,chars,chars,chars,freq_scores,freq_scores,freq_scores,freq_scores,freq_scores,freq_scores
Unnamed: 0_level_1,0,1,2,3,4,unique,0,1,2,3,4,total
word,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
abano,a,b,a,n,o,4,113,10,81,42,419,665
abono,a,b,o,n,o,4,113,10,43,42,419,627
abril,a,b,r,i,l,5,113,10,200,122,61,506
abrir,a,b,r,i,r,4,113,10,200,122,214,659
abuso,a,b,u,s,o,5,113,10,67,64,419,673
...,...,...,...,...,...,...,...,...,...,...,...,...
zerar,z,e,r,a,r,4,7,235,200,283,214,939
zinco,z,i,n,c,o,5,7,163,147,80,419,816
ziper,z,i,p,e,r,5,7,163,29,93,214,506
zonas,z,o,n,a,s,5,7,259,147,283,184,880


#### Sort by number of unique letters and the total frquency score

In [9]:
df.sort_values([("chars", "unique"), ("freq_scores", "total")], ascending=False)

Unnamed: 0_level_0,chars,chars,chars,chars,chars,chars,freq_scores,freq_scores,freq_scores,freq_scores,freq_scores,freq_scores
Unnamed: 0_level_1,0,1,2,3,4,unique,0,1,2,3,4,total
word,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
parto,p,a,r,t,o,5,141,302,200,147,419,1209
verao,v,e,r,a,o,5,70,235,200,283,419,1207
pirao,p,i,r,a,o,5,141,163,200,283,419,1206
senao,s,e,n,a,o,5,96,235,147,283,419,1180
farto,f,a,r,t,o,5,107,302,200,147,419,1175
...,...,...,...,...,...,...,...,...,...,...,...,...
desde,d,e,s,d,e,3,53,235,115,81,127,611
nenem,n,e,n,e,m,3,39,235,147,93,27,541
estes,e,s,t,e,s,3,25,15,87,93,184,404
urubu,u,r,u,b,u,3,16,131,67,25,7,246


#### Computing the correlations across positions

In [10]:
fig = go.Figure()

fig.add_trace(
    go.Heatmap(
        x=df_counts.columns,
        y=df_counts.columns,
        z=df_counts.corr(),
    )
)

fig.update_layout(
    title="Positions Correlations",
    height=600,
    width=600,
)

fig.show()

# TODO:

+ Compute letters correlations across positions
+ Compute letter frequency by position scores discarding repeated letters
+ Compute which words eliminate the most words for 2nd and 3rd guess
+ Compute letters that are likely to be repeated in a word