In [70]:
import sys
import os

sys.path.append(os.path.abspath("../")) 

from data import Tokenizer, Vocabulary, TextLoader
from models import RNN
from utils import get_device
import torch
from torch import nn
import torch.nn.functional as F

In [71]:
# Load the Sherlock Holmes text (path is relative to the notebook's working
# directory) through the project's TextLoader. batch_size=16 and num_steps=5
# line up with the (5, 16, ...) tensor shapes displayed later in the notebook.
filename = "../books/sherlock_holmes.txt"
data = TextLoader(filename, batch_size=16, num_steps=5)

In [72]:
# Peek at the first 20 characters of the raw text (all whitespace in this file).
data.raw_text[:20]

'\n\n\n\n                '

In [73]:
# First 20 tokens produced by the loader.
# NOTE(review): the displayed list starts with an empty-string token —
# presumably an artifact of the leading whitespace in the raw text; confirm
# whether the tokenizer should drop empty tokens.
data.tokens[:20]

['',
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 'arthur',
 'conan',
 'doyle',
 'table',
 'of',
 'contents',
 'a',
 'scandal',
 'in',
 'bohemia',
 'the',
 'red',
 'headed',
 'league']

In [74]:
# Vocabulary size reported by the loader; reused further down as
# num_embeddings for the embedding table.
data.vocab_size

7805

In [75]:
# Integer lookup on the vocabulary: index -> token.
data.vocab[2]

'adventures'

In [76]:
# String lookup on the vocabulary: token -> index (round-trips with data.vocab[2]).
data.vocab["adventures"]

2

In [77]:
# Starting the slice at 1 skips the empty-string token sitting at position 0.
data.tokens[1:5]

['the', 'adventures', 'of', 'sherlock']

In [78]:
# Instantiate the project's RNN layer and move it to the active device.
# input_size=300 presumably has to match embedding_dim of the nn.Embedding
# created later in this notebook (both are hard-coded to 300 — keep in sync);
# hidden_size=128 shows up as the last dimension of `outputs` below.
rnn_layer = RNN(input_size=300, hidden_size=128).to(device=get_device())
# NOTE(review): this displays every weight tensor in full; showing only the
# parameter names and shapes would keep the cell output readable.
rnn_layer.state_dict()

OrderedDict([('W_xh',
              tensor([[ 0.0225,  0.0002,  0.0070,  ..., -0.0090, -0.0030, -0.0140],
                      [ 0.0078, -0.0010, -0.0052,  ..., -0.0141, -0.0085,  0.0005],
                      [ 0.0104,  0.0026,  0.0132,  ..., -0.0099,  0.0004, -0.0036],
                      ...,
                      [ 0.0028, -0.0142,  0.0091,  ..., -0.0030, -0.0168,  0.0026],
                      [ 0.0044,  0.0077, -0.0091,  ..., -0.0027,  0.0076, -0.0047],
                      [ 0.0054, -0.0054,  0.0136,  ..., -0.0104,  0.0190, -0.0112]],
                     device='mps:0')),
             ('W_hh',
              tensor([[-0.0011,  0.0096, -0.0157,  ..., -0.0111, -0.0048, -0.0156],
                      [-0.0004, -0.0049,  0.0176,  ..., -0.0118,  0.0072, -0.0003],
                      [-0.0033, -0.0175, -0.0020,  ..., -0.0021, -0.0079,  0.0071],
                      ...,
                      [ 0.0088, -0.0045, -0.0026,  ..., -0.0118,  0.0161, -0.0178],
                      

In [79]:
def vectorize_batch(X, y, emb):
    """Embed a batch of token-index sequences and return them time-major.

    Args:
        X: Input token indices laid out as (batch, step) — a nested list,
            array, or tensor of integers.
        y: Target token indices with the same layout as ``X``.
        emb: Embedding lookup (e.g. ``nn.Embedding``) applied to both batches.

    Returns:
        Tuple ``(X_emb, y_emb)``: the embedded inputs and targets with their
        first two axes swapped, i.e. shaped (step, batch, embedding_dim).
    """
    # Build the index tensors on the embedding's own device so the lookup can
    # never hit a device mismatch; callables without a tensor `weight` keep
    # the previous behaviour of using the notebook-wide default device.
    weight = getattr(emb, "weight", None)
    device = weight.device if isinstance(weight, torch.Tensor) else get_device()

    def embed(indices):
        # as_tensor accepts lists, arrays and tensors alike, and skips the
        # extra copy (and UserWarning) torch.tensor() incurs on tensor input.
        return emb(torch.as_tensor(indices, device=device)).transpose(0, 1)

    # NOTE(review): the targets are embedded as well; a cross-entropy loss
    # would need the raw indices instead — confirm this is intended.
    return embed(X), embed(y)

In [80]:
# Embedding table mapping each vocabulary index to a 300-dimensional vector,
# created directly on the active device. The weights are freshly initialised
# (untrained), and no random seed is set in the cells shown here, so the
# values differ from run to run.
emb = nn.Embedding(num_embeddings=data.vocab_size, embedding_dim=300, device=get_device())

In [81]:
# Take the first mini-batch of token indices from the loader and keep it under
# its own names, so X_train / y_train only ever refer to embedded tensors.
X_idx, y_idx = next(iter(data.get_batches()))

# Embed inputs and targets; the result is time-major (step, batch, embedding).
X_train, y_train = vectorize_batch(X_idx, y_idx, emb)
X_train.shape

torch.Size([5, 16, 300])

In [82]:
# Run the embedded batch through the RNN layer. `state` is presumably the
# final hidden state — confirm against models.RNN.
outputs, state = rnn_layer(X_train)

In [83]:
# Sanity check: expect (num_steps, batch_size, hidden_size) = (5, 16, 128).
outputs.shape

torch.Size([5, 16, 128])