In [1]:
import numpy as np
import torch
from src.utilities import vocabulary, CONTEXT_LEN

In [151]:
def get_words(file_path: str='data/names.txt') -> list[str]:
    """Read a text file and return its lines as a list of words.

    Args:
        file_path: Path to a text file holding one word per line.

    Returns:
        The file's lines, in order, with line terminators removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even if read() raises.
    with open(file_path, 'r') as source:
        return source.read().splitlines()

def word2vec(word: str, vocabulary: list=vocabulary) -> list:
    """Encode a word as the list of positions of its characters in the vocabulary.

    Args:
        word: The string to encode, character by character.
        vocabulary: Sequence whose ``index`` method maps a character to its code.

    Returns:
        One integer per character of ``word``, in order.

    Raises:
        ValueError: If a character of ``word`` is not present in ``vocabulary``.
    """
    lookup = vocabulary.index
    return list(map(lookup, word))

def sample_train(words: list):
    """Pick one random word and split it into next-character training pairs.

    The chosen word gets a terminating '.' appended and is encoded with
    ``word2vec``. Every encoded character except the last is an input, and the
    character that follows it is the corresponding target.

    Args:
        words: List of words to sample from.

    Returns:
        A tuple ``(X, y)`` of 1-D tensors of equal length: ``X`` holds the input
        character indices and ``y`` holds the following character indices cast
        to float.
    """
    picked = np.random.choice(range(len(words)))
    encoded = word2vec(words[picked] + '.')
    # Inputs are all characters but the last; targets are the same sequence shifted by one.
    inputs = encoded[:-1]
    targets = encoded[1:]
    return torch.tensor(inputs), torch.tensor(targets).float()

In [152]:
# Load the word list from the default path and draw one random (input, target) pair.
words = get_words()
# NOTE(review): `input` shadows Python's built-in input() for the rest of the session;
# later cells reference this name, so a rename has to be applied to all of them together.
input, target = sample_train(words)

In [153]:
# Display the sampled input character indices.
input

tensor([12,  9,  4,  9,  1])

In [154]:
# Display the targets: the input sequence shifted by one character, as floats.
target

tensor([9., 4., 9., 1., 0.])

In [155]:
class RNN(torch.nn.Module):
    """Character-level recurrent model: embedding -> LSTM cell -> linear projection.

    Args:
        vocab_len: Number of distinct characters; sizes both the embedding table
            and the output layer.
        embedding_dim: Size of each character embedding vector.
        hidden_size: Size of the LSTM hidden and cell states.
    """

    def __init__(self, vocab_len: int, embedding_dim: int, hidden_size: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.emb = torch.nn.Embedding(num_embeddings=vocab_len, embedding_dim=embedding_dim)
        self.lstm = torch.nn.LSTMCell(embedding_dim, self.hidden_size)
        self.lin = torch.nn.Linear(hidden_size, vocab_len)

    def forward(self, char: torch.Tensor, hidden_state: torch.Tensor, cell_state: torch.Tensor
                ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run one recurrent step on a character encoded as an index.

        Args:
            char: Integer tensor holding the character index to embed.
            hidden_state: LSTM hidden state from the previous step.
            cell_state: LSTM cell state from the previous step.

        Returns:
            A tuple ``(output, hidden_state, cell_state)`` where ``output`` is the
            linear projection of the new hidden state onto ``vocab_len`` scores,
            and the two states are the updated LSTM states.
        """
        embedding = self.emb(char)
        hidden_state, cell_state = self.lstm(embedding, (hidden_state, cell_state))
        output = self.lin(hidden_state)
        return output, hidden_state, cell_state

    def init_zero_state(self) -> tuple[torch.Tensor, torch.Tensor]:
        """Create all-zero hidden and cell states for the LSTM cell.

        The states are allocated with the dtype and device of the module's own
        parameters, so they remain usable after ``.to(device)`` or ``.double()``.

        Returns:
            A tuple ``(hidden_state, cell_state)`` of 1-D zero tensors of length
            ``hidden_size``.
        """
        # Use an existing parameter as the reference for dtype/device placement.
        reference = self.lin.weight
        zero_hidden_state = torch.zeros(self.hidden_size, dtype=reference.dtype, device=reference.device)
        zero_cell_state = torch.zeros(self.hidden_size, dtype=reference.dtype, device=reference.device)
        return zero_hidden_state, zero_cell_state

In [156]:
# Instantiate the model: the vocabulary size sets the embedding table and output width,
# with 20-dimensional character embeddings and a 128-unit LSTM state.
rnn = RNN(vocab_len=len(vocabulary),
          embedding_dim=20,
          hidden_size=128)

In [157]:
# Smoke-test a single step: feed the first character of the sample with all-zero states.
zero_hidden_state, zero_cell_state = rnn.init_zero_state()
# Call the module itself rather than .forward() so that any registered hooks run.
rnn(char=input[0],
    hidden_state=zero_hidden_state,
    cell_state=zero_cell_state)

(tensor([ 0.0453, -0.0190,  0.0697, -0.0252, -0.0370,  0.0077, -0.1055, -0.0464,
         -0.0520, -0.0313, -0.1233, -0.0230,  0.1307,  0.0703, -0.0355,  0.0156,
         -0.0300, -0.1042, -0.0973,  0.1073, -0.0235, -0.0679, -0.0452,  0.1162,
         -0.0536,  0.0659,  0.1008], grad_fn=<AddBackward0>),
 tensor([-0.0027,  0.0092, -0.0104,  0.0678, -0.0068,  0.1505, -0.0362, -0.0111,
         -0.0511,  0.1982, -0.0137,  0.0637,  0.0907,  0.0112, -0.0136, -0.0872,
         -0.0179, -0.0207, -0.0629, -0.0434,  0.0057,  0.0419,  0.0539,  0.0109,
          0.0208, -0.0333,  0.0100,  0.0956, -0.0188,  0.1072, -0.0160, -0.0899,
          0.0275, -0.0763,  0.0459,  0.0358,  0.0462,  0.0246,  0.0239, -0.0227,
         -0.0721,  0.0820,  0.0992, -0.0416,  0.0157,  0.0309,  0.0597, -0.0478,
          0.0129,  0.0224, -0.0457, -0.0412, -0.0260,  0.0240, -0.0971,  0.0139,
         -0.1770, -0.0113, -0.0146,  0.0628,  0.0128, -0.0851,  0.0672, -0.0397,
         -0.0373, -0.0523,  0.0299, -0.0178, -0

In [None]:
# def build_train(words: list):
#     """Function breaks down every word from passed list into train and target samples"""
#     X = []; y=[]
#     for word in words:
#         context = '.' * CONTEXT_LEN
#         for ch in word + '.':
#             X.append(word2vec(context)); y.append(word2vec(ch))
#             context = context[1:] + ch
#     return torch.tensor(X), torch.tensor(y).float()