In [1]:
import numpy as np, tensorflow as tf, string, re
from tensorflow import keras
from tensorflow.keras import layers

### DATA

In [7]:
f = open(r"C:\Users\12482\Desktop\Archives\udacity-ai-for-trading\ann-learning-datasets\txt_data\hamlet.txt", 'r').readlines()

### FUNCTIONS

In [21]:
class Vocabulary:
    def __init__(self) -> None:
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.num_words = 0
        self.num_sentences = 0
        self.length_of_longest_sentence = 0

    def _add_word(self, word):
        if word not in self.word2index:
            self.word2count[word] = 1
            self.word2index[word] = self.num_words
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def _add_sentence(self, sentence):
        sentence = sentence.lower()
        new = self._clean_sentence(sentence=sentence)
        new = new.replace('\n', '')
        for word in new.split(' '):
            self._add_word(word)
            
        if len(new.split(' ')) > self.length_of_longest_sentence:
            self.length_of_longest_sentence = len(new.split(' '))
      
        self.num_sentences += 1
        
    def pad_sequences(self, sequence):
        """
        Params:
        sequence --> numpy.array, Integer vector of tokenized words
        
        Returns:
        padded_sequence --> Integer vector of tokenized words with padding
        """
        return_arr = []
        
        for s in sequence:
            new = list(s)
            missing = self.length_of_longest_sentence - len(new)
            new.extend([0]*missing)
            return_arr.append(new)
            
        return np.vstack(return_arr)
    
    def compile_vocab(self, corpus):
        """
        Creates vocabulary

        Params:
        Corpus --> List[str]
        
        Returns:
        self
        """
        for s in corpus:
            self._add_sentence(s)

        assert len(self.word2count) == len(self.word2index) == len(self.index2word)
        self.size = len(self.word2count)

    def _clean_sentence(self, sentence):
        new_string = re.sub(r'[^\w\s]', '', sentence)
        return new_string

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, word):
        return self.word2index[word]

In [22]:
class EmbeddingLayer:
    def __init__(self, vocab_size, hidden_dim):
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.weights = np.random.randn(vocab_size, hidden_dim) ## (vocab_size, hidden_dim)

    def predict(self, array):
        """
        PARAMS:
          array: 
           -- integer matrix of batch_size x seq_length

        RETURNS:
          array:
           -- integer matrix of batch_size x seq_length x hidden_dim
           -- the word vectors for each word in the tokenized input
        """
        assert np.max(array) <= self.vocab_size

        return np.array([self.weights[i] for i in array])    

### USAGE

In [23]:
v = Vocabulary()
v.compile_vocab(f)

In [31]:
inp1 = np.array([v.to_index(w) for w in 'this is a different one'.split(' ')]).reshape(1, -1) ## batch_size x input_length
inp2 = np.array([v.to_index(w) for w in 'this has been a quiet hour'.split(' ')]).reshape(1, -1)

pad_1 = v.pad_sequences(inp1)
pad_2 = v.pad_sequences(inp2)

fin = np.vstack([pad_1, pad_2])
fin

array([[  33,   74,   49, 2863,  168,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [  33,   95,  331,   49,   46,   23,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0]])

In [32]:
e = EmbeddingLayer(vocab_size=v.size, hidden_dim=20) ## hidden_dim is a hyper-param

pred = e.predict(fin)
pred.shape # shape == batch_size x seq_length x hidden_dim

(2, 16, 20)

In [40]:
pred[0][0]

array([-2.18448979, -0.38512027,  0.07287051,  0.74902259,  0.07994221,
        0.06186369, -0.27173579, -0.10391711,  0.27328872,  2.15346355,
       -0.07848089,  0.38148867,  0.5619906 , -2.24101433,  0.85738746,
       -1.11064957, -0.10103468,  0.42632561, -0.39540956, -1.189433  ])

In [39]:
pred[0][0] == e.weights[v.to_index('this')]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])