[Wordle](https://www.kaggle.com/datasets/cprosser3/wordle-5-letter-words)

In [244]:
import os
import re

import pandas as pd

import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

import time

import numpy as np

# Data actions

In [158]:
file = 'data/texts.csv'
ds = pd.read_csv(file)

ds.head()

Unnamed: 0,1,2,3,4,5
0,a,b,a,c,k
1,a,b,a,s,e
2,a,b,a,t,e
3,a,b,a,y,a
4,a,b,b,e,y


In [159]:
ds.shape

(2499, 5)

In [230]:
N_LETTERS = 26

In [169]:
words = [''.join(v) for v in ds.values]
words[:10]

['aback',
 'abase',
 'abate',
 'abaya',
 'abbey',
 'abbot',
 'abets',
 'abhor',
 'abide',
 'abode']

## For lstm

In [242]:
ORD_A = ord('a')

def word_one_hot(word, use_torch=False):
    letters = [letter_one_hot(letter, use_torch) for letter in word]
    return torch.stack(letters) if use_torch else letters

def letter_one_hot(letter, use_torch=False):
    encoding = [0. for _ in range(N_LETTERS)]
    encoding[ord(letter) - ORD_A] = 1.
    return torch.tensor(encoding) if use_torch else encoding

In [243]:
letter_one_hot('b', True)

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])

In [241]:
word_one_hot('aboba', True)

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]])

In [246]:
rnn_inputs = [word_one_hot(word, True) for word in words]
rnn_inputs[0]

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]])

# Markov chain

In [221]:
class Node:
    def __init__(self, string):
        self.current = string
        self.transitions = {}


class MarkovChain:
    def __init__(self):
        self.nodes = {'(eps)':  Node('')}
    
    def __build(self, ds, current, i):
        value_counts = ds[str(i)].value_counts()
        total = ds.shape[0]
        for (value, count) in zip(value_counts.keys(), value_counts.values):
            current.transitions[current.current + value] = count / total
            self.nodes[current.current + value] = Node(current.current + value)
            if i < 5:
                self.__build(ds.loc[ds[str(i)] == value], self.nodes[current.current + value], i + 1)
        
    def build(self, ds):
        self.__build(ds, self.nodes['(eps)'], 1)
        
    def __generate(self, current, letters):
        if len(letters) != 0:
            cur_letter = letters[0]
            current = []
        return cur_letter + self.__generate()
    
    def generate(self, letters=None):
        if letters is None:
            letters = '(eps)'
        while True:
            if letters not in self.nodes:
                break
            node = self.nodes[letters]
            if len(node.transitions) == 0:
                break
            letters = max(node.transitions)
        return letters

In [222]:
chain = MarkovChain()

start = time.time()
chain.build(ds)
f'Build finished in {time.time() - start} seconds'

'Build finished in 1.4896173477172852 seconds'

In [224]:
chain.generate('aba')

'abaya'

# LSTM

In [None]:
class RNN(nn.Module):
    def __init__(self, lstm_size=128, embedding_dim=26, n_layers=3):
        self.lstm_size = lstm_size
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        
        self.lstm = nn.LSTM(self.lstm_size, self.lstm_size, num_layers=self.n_layers)
        self.fc = nn.Linear(self.lstm_size, embedding_dim)
        
    def forward(self, x, prev):
        output, state = self.lstm(x, prev)
        logits

In [252]:
lstm = nn.LSTM(26, 26)

In [258]:
hidden = (torch.randn(1, 1, 26),
          torch.randn(1, 1, 26))
for i in rnn_inputs[0]:
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    print(out)

tensor([[[ 0.1563,  0.1768,  0.4268,  0.2701, -0.2688, -0.3182, -0.4097,
           0.4143,  0.3269,  0.0976, -0.2728, -0.1277, -0.1209,  0.1134,
           0.1069, -0.1285, -0.0218, -0.2246, -0.3846,  0.1948,  0.2221,
           0.0960,  0.1530, -0.2188,  0.0286,  0.2667]]],
       grad_fn=<StackBackward0>)
tensor([[[ 0.0111,  0.2258,  0.2012,  0.0552, -0.3005, -0.2774, -0.3198,
           0.3216,  0.1920, -0.0105, -0.1704, -0.0527, -0.0853,  0.0904,
           0.0875, -0.1263, -0.0269, -0.2040, -0.0494,  0.1493,  0.2976,
           0.1250,  0.1206, -0.1196, -0.0774,  0.2243]]],
       grad_fn=<StackBackward0>)
tensor([[[-0.0633,  0.0893,  0.1419,  0.0226, -0.2268, -0.1534, -0.2687,
           0.2861,  0.0948, -0.0747, -0.1178, -0.0119, -0.0400,  0.0416,
           0.1147, -0.1138, -0.0630, -0.1614,  0.0326,  0.1539,  0.2307,
           0.0904,  0.0656, -0.0411, -0.1076,  0.0732]]],
       grad_fn=<StackBackward0>)
tensor([[[-0.0338,  0.0079,  0.0985, -0.0327, -0.2009, -0.0976, -0.159