In [1]:
import numpy as np, string, re, random, tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### DATA

In [2]:
f = open(r"C:\Users\12482\Desktop\alice_wonderland.txt", 'r', encoding='utf-8').readlines()

### FUNCTIONS

In [3]:
class Vocabulary:
    def __init__(self) -> None:
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.sentences = []
        self.tokens = []
        self.num_words = 0
        self.num_sentences = 0

    def _add_word(self, word):
        if word not in self.word2index:
            self.tokens.append(word)
            self.word2count[word] = 1
            self.word2index[word] = self.num_words
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def _add_sentence(self, sentence):
        sentence = sentence.lower()
        new = self._clean_sentence(sentence=sentence)
        new = new.replace('\n', '')
        self.sentences.append(new)
        
        for word in new.split(' '):
            if word != '':
                self._add_word(word)
            else:
                continue
      
        self.num_sentences += 1
        
    def pad_sequences(self, sequence, length=None):
        """
        Default: Pad an input sequence to be the same as self.seq_length
        
        Alternative: Pad an input sequence to the 'length' param
        
        Keras: Pads input sequences with length of longest sequence
        
        Params:
        sequence --> np.array[numpy.array], integer matrix of tokenized words
        
        Returns:
        padded_sequence --> np.array[numpy.array], integer matrix of tokenized words with padding
        """
        return_arr = []
        
        for s in sequence:
            new = list(s)
            
            if not length:
                missing = self.seq_length - len(new)
            else:
                missing = length - len(new)
                
            new.extend([0]*missing)
            return_arr.append(new)
            
        return np.vstack(return_arr)
    
    def _sort_by_frequency(self):
        sorted_count = dict(sorted(self.word2count.items(), key=lambda x:x[1], reverse=True))

        self.word2index = {}
        
        count = 1 ## 0 is reserved for padding (this is how keras does it)
        for k,v in sorted_count.items():
            self.word2index[k] = count
            count += 1
        
        self.index2word = {v:k for k,v in self.word2index.items()}
        
        return self
    
    def _compile_vocab(self, corpus):
        """
        Creates vocabulary

        Params:
        Corpus --> List[str]
        
        Returns:
        self
        """
        for s in corpus:
            self._add_sentence(s)

        assert len(self.word2count) == len(self.word2index) == len(self.index2word)
        self.size = len(self.word2count)
        
        self._sort_by_frequency()
        
    def tokenize(self, corpus, seq_length):
        """
        Creates sequences of tokens

        Params:
        Corpus --> List[str]
        
        Returns:
        Token Sequences --> List[str]
        """
        self._compile_vocab(corpus)
        self.seq_length = seq_length
        self.token_sequences = []
        
        for i in range(seq_length, self.size):
            seq = self.tokens[i-seq_length:i]
            seq = [self.word2index[i] for i in seq]
            self.token_sequences.append(seq)
            
        return np.array(self.token_sequences)

    def _clean_sentence(self, sentence):
        new_string = re.sub(r'[^\w\s]', '', sentence)
        return new_string

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, word):
        return self.word2index[word]

In [4]:
class EmbeddingLayer:
    def __init__(self, vocab_size, hidden_dim):
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.weights = np.random.randn(vocab_size, hidden_dim) ## (vocab_size, hidden_dim)

    def predict(self, array):
        """
        PARAMS:
          array: 
           -- integer matrix of batch_size x seq_length

        RETURNS:
          array:
           -- integer matrix of batch_size x seq_length x hidden_dim
           -- the word vectors for each word in the tokenized input
        """
        assert np.max(array) <= self.vocab_size

        return np.array([self.weights[i] for i in array])    

### USAGE

In [5]:
v = Vocabulary()
token_sequences = v.tokenize(f, 26)

In [7]:
inp1 = np.array([v.to_index(w) for w in 'this is a different one'.split(' ')]).reshape(1, -1) ## batch_size x input_length
inp2 = np.array([v.to_index(w) for w in 'this has been a quiet hour'.split(' ')]).reshape(1, -1)

pad_1 = v.pad_sequences(inp1)
pad_2 = v.pad_sequences(inp2)

fin = np.vstack([pad_1, pad_2])
fin

array([[  27,   39,    4,  397,   37,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [  27,  504,  122,    4, 1167, 1203,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]])

In [8]:
fin.shape # batch_size x seq_length

(2, 26)

In [12]:
e = EmbeddingLayer(vocab_size=v.size, hidden_dim=20) ## hidden_dim is a hyper-param

pred = e.predict(fin)
pred.shape # shape == batch_size x seq_length x hidden_dim

In [10]:
pred[0][0]

array([ 1.16911839e+00,  3.70881440e-01, -3.99547253e-01, -2.02766582e+00,
       -6.53244521e-01, -8.19700961e-01,  3.85087104e-01, -9.01565549e-02,
        6.26177399e-02,  2.41180800e-01, -2.20184133e+00, -2.13868675e-03,
        4.10149836e-01,  7.92416765e-01,  3.35208059e-01, -1.66527578e+00,
        3.76977316e-01,  1.86865876e+00,  2.83007044e-01, -6.36067399e-02])

In [11]:
pred[0][0] == e.weights[v.to_index('this')]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

### TESTING

In [12]:
from keras.preprocessing.text import Tokenizer

text='check check fail'

tokenizer = Tokenizer()
tokenizer.fit_on_texts(f)
tokenizer.word_index

{'the': 1,
 '”': 2,
 'and': 3,
 'to': 4,
 'a': 5,
 'she': 6,
 'of': 7,
 'it': 8,
 'said': 9,
 'alice': 10,
 'in': 11,
 'was': 12,
 'you': 13,
 'i': 14,
 'that': 15,
 'as': 16,
 'her': 17,
 'at': 18,
 'on': 19,
 'with': 20,
 'had': 21,
 'all': 22,
 'be': 23,
 'for': 24,
 'so': 25,
 'very': 26,
 'not': 27,
 'this': 28,
 'little': 29,
 'but': 30,
 '“i': 31,
 'they': 32,
 'out': 33,
 'he': 34,
 'down': 35,
 'what': 36,
 'up': 37,
 'is': 38,
 'one': 39,
 'his': 40,
 'about': 41,
 'were': 42,
 'like': 43,
 'went': 44,
 'herself': 45,
 'them': 46,
 'again': 47,
 'know': 48,
 'then': 49,
 'would': 50,
 'could': 51,
 'have': 52,
 'no': 53,
 'thought': 54,
 'when': 55,
 'if': 56,
 'do': 57,
 'time': 58,
 'or': 59,
 'there': 60,
 'queen': 61,
 'into': 62,
 'me': 63,
 'see': 64,
 'off': 65,
 'king': 66,
 'your': 67,
 'did': 68,
 'began': 69,
 'its': 70,
 'by': 71,
 'an': 72,
 'my': 73,
 'mock': 74,
 'turtle': 75,
 '“and': 76,
 'quite': 77,
 'hatter': 78,
 'gryphon': 79,
 'way': 80,
 'who': 81,
 'd

In [13]:
v.word2index

{'the': 1,
 'and': 2,
 'to': 3,
 'a': 4,
 'she': 5,
 'it': 6,
 'of': 7,
 'said': 8,
 'alice': 9,
 'i': 10,
 'in': 11,
 'was': 12,
 'you': 13,
 'that': 14,
 'as': 15,
 'her': 16,
 'at': 17,
 'on': 18,
 'with': 19,
 'had': 20,
 'all': 21,
 'but': 22,
 'for': 23,
 'so': 24,
 'be': 25,
 'not': 26,
 'this': 27,
 'very': 28,
 'what': 29,
 'they': 30,
 'little': 31,
 'he': 32,
 'out': 33,
 'its': 34,
 'down': 35,
 'up': 36,
 'one': 37,
 'his': 38,
 'is': 39,
 'about': 40,
 'if': 41,
 'then': 42,
 'were': 43,
 'no': 44,
 'like': 45,
 'know': 46,
 'them': 47,
 'went': 48,
 'herself': 49,
 'again': 50,
 'do': 51,
 'when': 52,
 'would': 53,
 'or': 54,
 'have': 55,
 'thought': 56,
 'could': 57,
 'there': 58,
 'off': 59,
 'time': 60,
 'queen': 61,
 'into': 62,
 'see': 63,
 'me': 64,
 'how': 65,
 'did': 66,
 'who': 67,
 'king': 68,
 'well': 69,
 'dont': 70,
 'my': 71,
 'an': 72,
 'began': 73,
 'now': 74,
 'by': 75,
 'mock': 76,
 'your': 77,
 'hatter': 78,
 'turtle': 79,
 'gryphon': 80,
 'im': 81,
 '

In [14]:
len(tokenizer.word_index)

3053

In [15]:
len(v.word2index)

2855

In [16]:
token_sequences

array([[   9,   12,  274, ...,   20,  838,   62],
       [  12,  274,    3, ...,  838,   62,  466],
       [ 274,    3,   99, ...,   62,  466,  839],
       ...,
       [2827, 2828, 2829, ..., 2850, 2851, 2852],
       [2828, 2829, 2830, ..., 2851, 2852, 2853],
       [2829, 2830, 2831, ..., 2852, 2853, 2854]])

In [17]:
print([v.to_word(i) for i in token_sequences[0]])
print()
print(v.to_word(9))
print(v.to_word(12))
print(v.to_word(274))

['alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', 'and', 'having', 'nothing', 'do', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into']

alice
was
beginning


In [18]:
print([v.to_word(i) for i in token_sequences[-3]])
print()
print(v.to_word(2827))
print(v.to_word(2828))
print(v.to_word(2829))

['realitythe', 'rustling', 'rippling', 'reedsthe', 'tinkling', 'sheepbells', 'cries', 'shepherd', 'boyand', 'noises', 'clamour', 'farmyardwhile', 'lowing', 'cattle', 'lastly', 'pictured', 'aftertime', 'woman', 'riper', 'years', 'loving', 'childhood', 'gather', '_their_', 'sorrows', 'joys']

realitythe
rustling
rippling


### PART 2 - THE NETWORK

In [6]:
X = token_sequences[:,:-1]
y = token_sequences[:,-1]
y

array([  62,  466,  839, ..., 2852, 2853, 2854])

In [7]:
token_sequences[0]

array([   9,   12,  274,    3,   99,   28,  465,    7,  342,   75,   16,
        417,   18,    1, 1071,    2,  343,  128,   51,  133,   54,  589,
          5,   20,  838,   62])

In [8]:
X[0]

array([   9,   12,  274,    3,   99,   28,  465,    7,  342,   75,   16,
        417,   18,    1, 1071,    2,  343,  128,   51,  133,   54,  589,
          5,   20,  838])

In [9]:
y[0]

62

In [10]:
X.shape

(2829, 25)

In [13]:
lstm_inputs = e.predict(X)
lstm_inputs.shape ## batch_size x seq_length x dimensionality

(2829, 25, 20)

In [14]:
lstm_inputs[0].shape ## seq_length x dimensionaliy

(25, 20)

In [17]:
hidden_size = 50
vocab_size = v.size

# Size of concatenated hidden + input vector
z_size = hidden_size + vocab_size

In [18]:
# Weight matrix (forget gate)
# YOUR CODE HERE!
W_f = np.random.randn(hidden_size, z_size)

# Bias for forget gate
b_f = np.zeros((hidden_size, 1))

# Weight matrix (input gate)
# YOUR CODE HERE!
W_i = np.random.randn(hidden_size, z_size)

# Bias for input gate
b_i = np.zeros((hidden_size, 1))

# Weight matrix (candidate)
# YOUR CODE HERE!
W_g = np.random.randn(hidden_size, z_size)

# Bias for candidate
b_g = np.zeros((hidden_size, 1))

# Weight matrix of the output gate
# YOUR CODE HERE!
W_o = np.random.randn(hidden_size, z_size)
b_o = np.zeros((hidden_size, 1))

# Weight matrix relating the hidden-state to the output
# YOUR CODE HERE!
W_v = np.random.randn(vocab_size, hidden_size)
b_v = np.zeros((vocab_size, 1))

In [68]:
# Initialize hidden state as zeros
h = np.zeros((hidden_size, e.hidden_dim))
c = np.zeros((hidden_size, e.hidden_dim))

In [69]:
lstm_inputs.shape

(2829, 25, 20)

In [89]:
n_x, m = lstm_inputs[0].shape

In [90]:
n_a, m = h.shape

In [107]:
W_f = np.random.randn(n_a, v.seq_length)

In [108]:
z = np.row_stack((lstm_inputs[0], h[0]))

In [109]:
def init_orthogonal(param):
    """
    Initializes weight parameters orthogonally.
    
    Refer to this paper for an explanation of this initialization:
    https://arxiv.org/abs/1312.6120
    """
    if param.ndim < 2:
        raise ValueError("Only parameters with 2 or more dimensions are supported.")

    rows, cols = param.shape
    
    new_param = np.random.randn(rows, cols)
    
    if rows < cols:
        new_param = new_param.T
    
    # Compute QR factorization
    q, r = np.linalg.qr(new_param)
    
    # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
    d = np.diag(r, 0)
    ph = np.sign(d)
    q *= ph

    if rows < cols:
        q = q.T
    
    new_param = q
    
    return new_param

W_f = init_orthogonal(W_f)

In [110]:
W_f.shape, z.shape

((50, 26), (26, 20))

In [114]:
f = np.dot(W_f, z) + b_f

In [115]:
f.shape

(50, 20)

In [111]:
# Bias for forget gate
b_f = np.zeros((n_a, 1))

In [113]:
b_f = init_orthogonal(b_f)
b_f.shape

(50, 1)

In [118]:
W_i = init_orthogonal(np.random.randn(n_a, v.seq_length))
b_i = init_orthogonal(np.zeros((n_a, 1)))

In [120]:
W_i.shape, b_i.shape

((50, 26), (50, 1))

In [121]:
z.shape

(26, 20)

In [122]:
i = np.dot(W_i, z) + b_i

In [123]:
i.shape

(50, 20)

In [125]:
W_g = init_orthogonal(np.random.randn(n_a, v.seq_length))
b_g = init_orthogonal(np.zeros((n_a, 1)))

In [126]:
g = np.dot(W_g, z) + b_g

In [127]:
C_prev = f * c[0] + i * g

In [128]:
C_prev.shape

(50, 20)