In [1]:
import numpy as np, tensorflow as tf, string, re
from tensorflow import keras
from tensorflow.keras import layers

### DATA

In [4]:
# Read the corpus as a list of raw lines (each keeps its trailing newline).
# The context manager closes the file handle as soon as the lines are read,
# instead of leaving it open for the lifetime of the kernel.
with open('/content/hamlet.txt', 'r') as corpus_file:
  f = corpus_file.readlines()

### FUNCTIONS

In [2]:
class Vocabulary:
  """
  Word-level vocabulary built from a corpus of raw text lines.

  Attributes:
    word2index --> Dict[str, int], word to integer id (ids are assigned in
                   order of first appearance, starting at 0)
    word2count --> Dict[str, int], word to number of occurrences
    index2word --> Dict[int, str], inverse of word2index
    num_words  --> int, number of distinct words seen so far
    num_sentences --> int, number of corpus entries processed
    length_of_longest_sentence --> int, largest word count of any entry
    size --> int, vocabulary size as of the last compile_vocab call
  """

  def __init__(self) -> None:
    self.word2index = {}
    self.word2count = {}
    self.index2word = {}
    self.num_words = 0
    self.num_sentences = 0
    self.length_of_longest_sentence = 0
    # Defined up front so reading `size` before compile_vocab does not
    # raise AttributeError.
    self.size = 0

  def _add_word(self, word):
    """Registers one occurrence of `word`, assigning a new id if unseen."""
    if word not in self.word2index:
      self.word2count[word] = 1
      self.word2index[word] = self.num_words
      self.index2word[self.num_words] = word
      self.num_words += 1
    else:
      self.word2count[word] += 1

  def _add_sentence(self, sentence):
    """Lower-cases, strips punctuation from, and tokenizes one corpus entry."""
    cleaned = self._clean_sentence(sentence=sentence.lower())
    # split() with no argument splits on any run of whitespace and drops
    # empty strings, so blank lines, repeated spaces, tabs and trailing
    # newlines never register '' as a vocabulary word.
    words = cleaned.split()
    for word in words:
      self._add_word(word)

    self.num_sentences += 1
    self.length_of_longest_sentence = max(
        self.length_of_longest_sentence, len(words))

  def compile_vocab(self, corpus):
    """
    Creates vocabulary

    Params:
      Corpus --> List[str]
    
    Returns:
      self
    """
    for s in corpus:
      self._add_sentence(s)

    # Sanity check: the three lookup tables must describe the same word set.
    assert len(self.word2count) == len(self.word2index) == len(self.index2word)
    self.size = len(self.word2count)
    return self

  def _clean_sentence(self, sentence):
    """Removes every character that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', sentence)

  def to_word(self, index):
    """Returns the word stored under `index`; raises KeyError if unknown."""
    return self.index2word[index]

  def to_index(self, word):
    """Returns the id of `word`; raises KeyError if the word is not in the vocabulary."""
    return self.word2index[word]

In [3]:
class EmbeddingLayer:
  """
  Lookup table mapping integer token ids to dense vectors.

  PARAMS:
    vocab_size: number of rows in the table (valid ids are 0..vocab_size-1)
    hidden_dim: length of each word vector
    seed: optional seed for reproducible weights; when None (the default)
      the weights are drawn from NumPy's global random state, as before
  """

  def __init__(self, vocab_size, hidden_dim, seed=None):
    self.vocab_size = vocab_size
    self.hidden_dim = hidden_dim
    if seed is None:
      # Legacy path: honours any np.random.seed(...) set by the caller.
      self.weights = np.random.randn(vocab_size, hidden_dim) ## (vocab_size, hidden_dim)
    else:
      self.weights = np.random.default_rng(seed).standard_normal(
          (vocab_size, hidden_dim))

  def predict(self, array):
    """
    PARAMS:
      array: 
       -- integer matrix of batch_size x seq_length

    RETURNS:
      array:
       -- float matrix of batch_size x seq_length x hidden_dim
       -- the word vectors for each word in the tokenized input

    RAISES:
      IndexError: if any id is negative or >= vocab_size
    """
    indices = np.asarray(array)

    # np.min/np.max fail on empty input; an empty batch simply maps to an
    # empty result with the trailing hidden_dim axis.
    if indices.size == 0:
      return np.empty(indices.shape + (self.hidden_dim,), dtype=self.weights.dtype)

    # Valid ids are 0..vocab_size-1. Negative ids are rejected explicitly
    # because NumPy would otherwise wrap them around to the wrong row.
    if indices.min() < 0 or indices.max() >= self.vocab_size:
      raise IndexError(
          'token ids must be in [0, %d); got min=%s, max=%s'
          % (self.vocab_size, indices.min(), indices.max()))

    # Integer-array indexing gathers all rows in one vectorised lookup and
    # returns a copy, so callers cannot mutate the weights through it.
    return self.weights[indices]

### USAGE

In [5]:
# Build the vocabulary from the corpus lines loaded in the DATA cell.
v = Vocabulary()
v.compile_vocab(f)

# Encode a sample sentence as token ids, one id per space-separated word.
sentence_str = 'this has been a quiet hour'
token_ids = [v.to_index(token) for token in sentence_str.split(' ')]

# Wrapping the id list in an outer list yields a single-row batch.
inp = np.array([token_ids]) ## batch_size x input_length

In [6]:
# One embedding row per vocabulary word; hidden_dim is a hyper-param.
embedding_config = {'vocab_size': v.size, 'hidden_dim': 10}
e = EmbeddingLayer(**embedding_config)

# Look up the word vectors for the encoded sentence batch.
pred = e.predict(inp)
pred.shape # shape == batch_size x seq_length x hidden_dim

(1, 6, 10)