In [1]:
import numpy as np
from tokenizer import Vocabulary
from embedding import EmbeddingLayer
from lstm import LSTM

In [2]:
# Load the raw corpus as a list of lines.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run
# elsewhere until it is pointed at a local copy of the text file.
with open(r"C:\Users\12482\Desktop\alice_wonderland.txt", 'r', encoding='utf-8') as corpus_file:
    # Read everything inside the `with` so the handle is closed immediately
    # instead of staying open for the lifetime of the kernel.
    f = corpus_file.readlines()

In [3]:
## create vocabulary + tokenize
vocab = Vocabulary()
# Each row of token_sequences ends up 26 tokens wide (25 context tokens + 1 target, see the
# slices below); the literal 26 is presumably that window length -- TODO confirm against
# Vocabulary.tokenize.
token_sequences = vocab.tokenize(f, 26)

## create embedding layer
embedding = EmbeddingLayer(vocab_size=vocab.size, hidden_dim=50) ## hidden_dim is a hyper-param

## create X & Y datasets
# X: every token of a row except the last one; y: the last token of the row (the one to predict)
X = token_sequences[:,:-1]
y = token_sequences[:,-1]

# Map the token ids in X to their embedding vectors
lstm_inputs = embedding.predict(X)
lstm_inputs.shape ## batch_size x seq_length x dimensionality

(2829, 25, 50)

In [4]:
X.shape

(2829, 25)

In [5]:
# Build the LSTM for the embedded corpus. batch_size is the number of rows in X,
# i.e. the whole dataset is treated as a single batch.
lstm = LSTM(units=100, seq_length=vocab.seq_length, 
            batch_size=X.shape[0], vocab_size=vocab.size, features=embedding.hidden_dim)
# Parameters are created by an explicit call to the underscore-prefixed initializer.
# NOTE(review): presumably this fills W_*/b_* and the initial h/c states used below -- confirm in lstm.py
lstm._init_params()

In [6]:
def forward(inputs, h_prev, C_prev, LSTM):
    """
    Runs the LSTM cell over `inputs` and returns the softmax output of every step.

    Args:
     `inputs`: an iterable; its items are fed to the cell one after another.
               Each item must be stackable below the hidden state with `np.vstack`.
     `h_prev`: the initial hidden state
     `C_prev`: the initial cell (memory) state
     `LSTM`: an object exposing the weights `W_f, W_i, W_g, W_o, W_v`, the biases
             `b_f, b_i, b_g, b_o, b_v` and the functions `sigmoid`, `tanh`, `softmax`

    Returns a list with one softmax output per item of `inputs`.
    """
    # First we unpack our parameters
    W_f, W_i, W_g, W_o, W_v = LSTM.W_f, LSTM.W_i, LSTM.W_g, LSTM.W_o, LSTM.W_v
    b_f, b_i, b_g, b_o, b_v = LSTM.b_f, LSTM.b_i, LSTM.b_g, LSTM.b_o, LSTM.b_v

    # Per-step values of every component of the cell. Only `output_s` is returned;
    # the other lists are collected but not used yet (presumably for a later backward pass).
    z_s, f_s, i_s = [], [], []
    g_s, C_s, o_s, h_s = [], [], [], []
    v_s, output_s = [], []

    # Append the initial cell and hidden state to their respective lists
    h_s.append(h_prev)
    C_s.append(C_prev)

    for x in inputs:

        # Concatenate hidden state and input (hidden state on top).
        # np.vstack is used because np.row_stack is a deprecated alias of it.
        z = np.vstack((h_prev, x))
        z_s.append(z)

        # Calculate forget gate
        f = LSTM.sigmoid(np.dot(W_f, z) + b_f)
        f_s.append(f)

        # Calculate input gate
        i = LSTM.sigmoid(np.dot(W_i, z) + b_i)
        i_s.append(i)

        # Calculate candidate
        g = LSTM.tanh(np.dot(W_g, z) + b_g)
        g_s.append(g)

        # Calculate memory state: keep part of the old memory, add part of the candidate
        C_prev = f * C_prev + i * g
        C_s.append(C_prev)

        # Calculate output gate
        o = LSTM.sigmoid(np.dot(W_o, z) + b_o)
        o_s.append(o)

        # Calculate hidden state
        h_prev = o * LSTM.tanh(C_prev)
        h_s.append(h_prev)

        # Calculate logits
        v = np.dot(W_v, h_prev) + b_v
        v_s.append(v)

        # Calculate softmax
        output = LSTM.softmax(v)
        output_s.append(output)

    return output_s

In [7]:
lstm.h.shape

(100, 50)

In [8]:
inp = lstm_inputs[0].reshape(1, 25, 50)
inp[0].shape

(25, 50)

In [9]:
# Stack the hidden state on top of one embedded sequence -- the same concatenation
# `forward` performs -- to inspect the resulting shape.
# np.vstack is used because np.row_stack is a deprecated alias of it.
z = np.vstack((lstm.h, inp[0]))
z.shape

(125, 50)

In [10]:
# Run the forward pass on a single sequence and on the whole embedded dataset.
# NOTE(review): `forward` loops over the FIRST axis of its input, so with these
# batch-first arrays each "step" is one entire sequence, and the hidden/cell state
# is carried over from one sequence to the next -- confirm this is intended.
one_batch_out = forward(inp, lstm.h, lstm.c, lstm)
full_batch_out = forward(lstm_inputs, lstm.h, lstm.c, lstm)

In [11]:
# `forward` returns a Python list of per-step outputs; turn each list into one array
# so the shapes can be inspected. Re-binds the same names (list -> ndarray).
one_batch_out = np.array(one_batch_out)
full_batch_out = np.array(full_batch_out)

In [12]:
print('ONE B SHAPE: ', one_batch_out.shape) ## batch_size x seq_length x dimensionality
print('FULL B SHAPE: ', full_batch_out.shape)

ONE B SHAPE:  (1, 25, 50)
FULL B SHAPE:  (2829, 25, 50)


## TUTORIAL DATASET

In [49]:
# Set seed such that we always get the same dataset
np.random.seed(42)

def generate_dataset(num_sequences=100):
    """
    Builds a toy dataset of sequences made of n 'a' tokens, n 'b' tokens and 'EOS'.

    Args:
     `num_sequences`: how many sequences to generate.

    Returns a list of token lists; n is drawn per sequence with `np.random.randint(1, 10)`.
    """
    def build_sample(repeats):
        # Same number of 'a' and 'b' tokens, closed by the end-of-sequence marker
        return ['a'] * repeats + ['b'] * repeats + ['EOS']

    # Exactly one random draw per sequence, in order, from the global NumPy RNG
    return [build_sample(np.random.randint(1, 10)) for _ in range(num_sequences)]


sequences = generate_dataset()

print('A single sample from the generated dataset:')
print(sequences[0])

A single sample from the generated dataset:
['a', 'a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'EOS']


In [50]:
from collections import defaultdict

def sequences_to_dicts(sequences):
    """
    Creates word_to_idx and idx_to_word dictionaries for a list of sequences.

    Args:
     `sequences`: a list of sequences, each one a list of (hashable) words.

    Returns a tuple `(word_to_idx, idx_to_word, num_sentences, vocab_size)`:
     `word_to_idx`: defaultdict mapping a word to its index; unknown words map
                    to the index of 'UNK'
     `idx_to_word`: defaultdict mapping an index to its word; unknown indices
                    map to 'UNK'
     `num_sentences`: the number of sequences
     `vocab_size`: the number of unique words, including 'UNK'
    """
    # Count number of word occurrences over all sequences
    word_count = defaultdict(int)
    for sequence in sequences:
        for word in sequence:
            word_count[word] += 1

    # Sort by frequency, most frequent first (stable: ties keep first-seen order)
    word_count = sorted(word_count.items(), key=lambda item: -item[1])

    # Create a list of all unique words
    unique_words = [word for word, _ in word_count]

    # Add UNK token to list of words; it always gets the last index
    unique_words.append('UNK')

    # Count number of sequences and number of unique words
    num_sentences, vocab_size = len(sequences), len(unique_words)

    # Create dictionaries so that we can go from word to index and back.
    # If a word is not in our vocabulary, we assign it to token 'UNK'.
    # (The previous default factory referenced an undefined name and raised
    # NameError on the first unknown word.)
    unk_idx = vocab_size - 1
    word_to_idx = defaultdict(lambda: unk_idx)
    idx_to_word = defaultdict(lambda: 'UNK')

    # Fill dictionaries
    for idx, word in enumerate(unique_words):
        word_to_idx[word] = idx
        idx_to_word[idx] = word

    return word_to_idx, idx_to_word, num_sentences, vocab_size


word_to_idx, idx_to_word, num_sequences, vocab_size = sequences_to_dicts(sequences)

In [51]:
from torch.utils import data

class Dataset(data.Dataset):
    """Pairs every input sequence with the target sequence stored at the same index."""

    def __init__(self, inputs, targets):
        # Keep references to the two parallel collections; nothing is copied or validated
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        """Size of the dataset, measured on the targets collection."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return the `(inputs, targets)` pair stored at `index`."""
        return self.inputs[index], self.targets[index]

    
def create_datasets(sequences, dataset_class, p_train=0.8, p_val=0.1, p_test=0.1):
    """
    Splits `sequences` into training, validation and test datasets.

    Args:
     `sequences`: the list of sequences to split (order is kept, nothing is shuffled)
     `dataset_class`: callable invoked as `dataset_class(inputs, targets)` for each partition
     `p_train`: fraction of the sequences taken from the start for training
     `p_val`: fraction of the sequences taken right after the training partition
     `p_test`: fraction of the sequences taken from the end for testing

    Returns a tuple `(training_set, validation_set, test_set)`.
    """
    # Define partition sizes (fractions are truncated towards zero)
    total = len(sequences)
    num_train = int(total*p_train)
    num_val = int(total*p_val)
    num_test = int(total*p_test)

    # Split sequences into partitions
    sequences_train = sequences[:num_train]
    sequences_val = sequences[num_train:num_train+num_val]
    # The test partition is the last `num_test` sequences. An explicit start index is
    # used because `sequences[-num_test:]` returns EVERY sequence when num_test == 0.
    test_start = max(total - num_test, 0)
    sequences_test = sequences[test_start:]

    def get_inputs_targets_from_sequences(sequences):
        # Define empty lists
        inputs, targets = [], []

        # Both lists contain L-1 words of a sentence of length L: inputs drop the last
        # word, targets drop the first one, so targets[t] is the word following inputs[t]
        for sequence in sequences:
            inputs.append(sequence[:-1])
            targets.append(sequence[1:])

        return inputs, targets

    # Get inputs and targets for each partition
    inputs_train, targets_train = get_inputs_targets_from_sequences(sequences_train)
    inputs_val, targets_val = get_inputs_targets_from_sequences(sequences_val)
    inputs_test, targets_test = get_inputs_targets_from_sequences(sequences_test)

    # Create datasets
    training_set = dataset_class(inputs_train, targets_train)
    validation_set = dataset_class(inputs_val, targets_val)
    test_set = dataset_class(inputs_test, targets_test)

    return training_set, validation_set, test_set
    

training_set, validation_set, test_set = create_datasets(sequences, Dataset)

In [52]:
def one_hot_encode(idx, vocab_size):
    """
    Builds the one-hot vector of a single word from its vocabulary index.

    Args:
     `idx`: position to switch on (the index of the word)
     `vocab_size`: length of the returned vector (the size of the vocabulary)

    Returns a 1-D numpy array of length `vocab_size`, zero everywhere except at `idx`.
    """
    # Start from an all-zero float vector and turn on the single requested position
    encoded = np.zeros(vocab_size)
    encoded[idx] = 1.0
    return encoded


def one_hot_encode_sequence(sequence, vocab_size, use_word_index=False, mapping=None):
    """
    One-hot encodes a sequence of words given a fixed vocabulary size.

    Args:
     `sequence`: the items to encode -- integer indices by default, or words
                 when `use_word_index` is True
     `vocab_size`: the size of the vocabulary
     `use_word_index`: if True, every item is first translated to an index
                       through a word -> index mapping
     `mapping`: optional word -> index mapping used when `use_word_index` is
                True; when omitted, the module-level `word_to_idx` is used

    Returns a 3-D numpy array of shape (num words, vocab size, 1). An empty
    sequence yields an array of shape (0, vocab size, 1).
    """
    # Resolve every item of the sequence to its vocabulary index
    if use_word_index:
        lookup = word_to_idx if mapping is None else mapping
        indices = [lookup[word] for word in sequence]
    else:
        indices = list(sequence)

    # Allocate the final shape up front so an empty sequence is handled too,
    # then switch on one position per word
    encoding = np.zeros((len(indices), vocab_size, 1))
    for position, idx in enumerate(indices):
        encoding[position, idx, 0] = 1.0

    return encoding

In [53]:
# Get the sentence at index 1 (i.e. the second one) of the test set
inputs, targets = test_set[1]

# One-hot encode input and target sequence; use_word_index=True because the
# sequences hold words, which are translated to indices through word_to_idx
inputs_one_hot = one_hot_encode_sequence(inputs, vocab_size, use_word_index=True)
targets_one_hot = one_hot_encode_sequence(targets, vocab_size, use_word_index=True)

In [54]:
test = one_hot_encode_sequence(X[0], vocab.size)

In [55]:
test_set[1][0]

['a', 'a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b', 'b']

In [56]:
inputs_one_hot.shape ## seq_len x vocab_size x dimensionality

(14, 4, 1)

In [57]:
inputs_one_hot[0] ## represents 'a'

array([[1.],
       [0.],
       [0.],
       [0.]])

In [59]:
inp = lstm_inputs[0].reshape(1, 25, 50)
inp.shape

(1, 25, 50)

In [60]:
inp[0][0] ## represents 'alice'

array([-0.22668827,  0.83462938,  0.63341763, -0.90669285, -0.42460463,
       -0.35994581, -0.07068088, -0.84804716, -1.23411598, -1.28469931,
       -0.50304828,  1.05471968, -0.76056803,  0.2310673 ,  1.08457435,
       -0.20969323, -0.65914316, -0.04540014,  1.06168693, -0.00364711,
        0.13013741,  0.45547039, -1.06291661,  0.59959742,  1.17565123,
        0.09771961,  0.10988988,  1.16266037, -0.44750473,  0.71309439,
       -0.59454228,  0.75510584, -0.60661873, -1.27421088, -1.34321654,
       -1.78399725,  1.33703898,  0.35895103, -0.3759417 ,  1.58775586,
        0.42807892, -0.08440358,  1.38647533, -0.36448774, -0.92166033,
        0.77313459, -0.89655876,  0.10031769,  1.6403288 , -0.74532344])

In [61]:
print(inputs_one_hot.shape) ## seq_len x vocab_size x dimensionality
print(inp.shape) ## batch_size x seq_len x dimensionality

(14, 4, 1)
(1, 25, 50)


In [62]:
test.shape ## seq_len x vocab_size x dimensionality

(25, 2855, 1)

In [71]:
# Run the same `forward` function on three differently shaped inputs.
# NOTE(review): LSTM is called positionally with four arguments plus `batch_first` here,
# while In [5] passes units/seq_length/batch_size/vocab_size/features by keyword --
# confirm the argument order against the current lstm.py.

# 1) one embedded sequence from the corpus
lstm1 = LSTM(150, vocab.seq_length, vocab.size, embedding.hidden_dim, batch_first=True)
lstm1._init_params()
embedding_model_out = forward(inp, lstm1.h, lstm1.c, lstm1)

# 2) the same corpus sequence, one-hot encoded instead of embedded (`test`)
lstm2 = LSTM(150, 25, vocab.size, 1, batch_first=False)
lstm2._init_params()
encoding_model_out = forward(test, lstm2.h, lstm2.c, lstm2)

# 3) the one-hot encoded sample of the tutorial dataset
lstm3 = LSTM(150, 14, 4, 1, batch_first=False)
lstm3._init_params()
tutorial_data = forward(inputs_one_hot, lstm3.h, lstm3.c, lstm3)

In [72]:
print(np.array(embedding_model_out).shape)
print(np.array(encoding_model_out).shape)
print(np.array(tutorial_data).shape)

(1, 25, 50)
(25, 2855, 1)
(14, 4, 1)


In [73]:
" ".join([idx_to_word[np.argmax(output)] for output in tutorial_data])

'UNK UNK UNK UNK UNK UNK UNK a a EOS EOS EOS EOS EOS'

In [74]:
" ".join([vocab.to_word(np.argmax(output)) for output in encoding_model_out])

'paint pleasant executioner arguments truth marched took oldest couples anxiously minded became dinahs stole treacle retire mallets practice shining stretching eleventh currants blew honour hit'

In [75]:
" ".join([vocab.to_word(np.argmax(o)) for o in embedding_model_out[0]])

'like and said if but a like up if at like like i if up again know out of not what about know alice she'

In [82]:
" ".join([vocab.to_word(np.argmax(o)) for o in full_batch_out[0]])

'out if that if its if i that know again so went in a its it its this out for is little up as on'