In [1]:
# RNNs

# Family of NN's
# Used for sequential data
# Sequential data -> the order of the data matters.
# Input is not IID, that is the value of one input/row, affects the value of a subsequent input/row
# Example: Predicting stock market -> if I want to predict the stock price of a stock, I need to consider the
# most recent stock prize. That is, i can not randomize data

# Time Series, NLP, Genetic -> example of sequential data
# In MLP's, when training, the model does not remember the past values, it just optimizes on the the next set of
# values

In [2]:
#Types of RNN

# Many-to-one: Input data is a sequence -> need to the output to be a fixed vector. Sentiment analysis.
# One-to-many: Input data is a standard form -> the output is a sequence. Image captioning
# Many-to-Many: Both are sequences. Further divided into synchronized and non-synchronized

In [3]:
# In RNN, the hiden layer recieves input from the current step, as well as the previous step 

In [4]:
#Creating a single-layer RNN. Input sequence is 3. 

import torch
import torch.nn as nn

rnn_layer = nn.RNN(input_size = 5, hidden_size=2, num_layers=1, batch_first=True)

w_xh = rnn_layer.weight_ih_l0

w_hh = rnn_layer.weight_hh_l0

b_xh = rnn_layer.bias_ih_l0

b_hh = rnn_layer.bias_hh_l0

In [5]:
# Input shape of the layer is (batch_size, sequence_length, 5)
# Batch_size, aka batch dimension
# second is the sequence
# fifth is the number of features. each input as 5 features

In [6]:
x_seq = torch.tensor([[1.0]*5, [2.0]*5, [3.0]*5]).float()

output, hnn = rnn_layer(torch.reshape(x_seq, (1, 3, 5)))

out_man = []

for t in range(3):
    xt = torch.reshape(x_seq[t], (1, 5))
    print(f'Time step {t} =>')
    print('Input: ', xt.numpy())
    
    ht = torch.matmul(xt, torch.transpose(w_xh, 0, 1)) + b_xh
    print('Hidden: ', ht.detach().numpy())
    
    if t > 0:
        prev_h = out_man[t-1]
    else:
        prev_h = torch.zeros((ht.shape))
    
    ot = ht + torch.matmul(prev_h, torch.transpose(w_hh, 0, 1)) + b_hh
    
    ot = torch.tanh(ot) #Hyperbolic tanh function as the activation 
    out_man.append(ot)
    
    print('Output (manual): ', ot.detach().numpy())
    print('RNN Output: ', output[:, t].detach().numpy())
    print()

Time step 0 =>
Input:  [[1. 1. 1. 1. 1.]]
Hidden:  [[-0.01172273 -0.32822037]]
Output (manual):  [[-0.27979535 -0.6171595 ]]
RNN Output:  [[-0.2797954 -0.6171595]]

Time step 1 =>
Input:  [[2. 2. 2. 2. 2.]]
Hidden:  [[ 0.19572254 -0.7462684 ]]
Output (manual):  [[ 0.01401133 -0.80719066]]
RNN Output:  [[ 0.01401126 -0.80719066]]

Time step 2 =>
Input:  [[3. 3. 3. 3. 3.]]
Hidden:  [[ 0.40316743 -1.1643164 ]]
Output (manual):  [[ 0.42996258 -0.9186761 ]]
RNN Output:  [[ 0.42996272 -0.9186761 ]]



In [7]:
# Exploding/vanishing problem
# Because of repetative weight calculation, the end effect of the weight is exponential in the form of 
# w^n, where n is sequence length, ie the number of times the weight is multiplied
# Therefore, when calculating the gradient descent, the weight gets too big, and does not match with 
# learning rate



In [8]:
#LSTM
# To solve exploding/vanishing problem

In [3]:
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split

# Step 1: load and create the datasets

train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')


In [4]:
#Splitting the dataset into training and validation
import torch
import torch.nn as nn
from torch.utils.data.dataset import random_split

torch.manual_seed(1)
train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])

In [5]:
# To prepare the data for NN, we need to convert string values to numeric
# Do this via tokenization
# First find unique tokens. 
# We will use counter class from collections package

In [113]:
# Counter object -> collects all unique word frequencies
# 

import re
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)#Removes the HTML tags
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower()) #Finds all the emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '') #Replaces the alphanumeric characters with a space, and assigns emoticons at the back
    tokenized = text.split()
    return tokenized

token_counts = Counter()

for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)




In [114]:
# Assigning unique words to numbers, by using Vocab package from torchtext
from torchtext.vocab import vocab
sorted_by_freq_couples = sorted(token_counts.items(), key = lambda x: x[1], reverse = True) 
#token_counts.items() returns a dict_items,  a 
# list of keyt value pairs. the sorted function turns the dict_items into a list of tuples. the key function, x:x[1] gets the number, and 
# uses it as a key to sort the list of tuples

ordered_Dict = OrderedDict(sorted_by_freq_couples) #Turns the list of tuples into a OrderedDict datatype
vocab = vocab(ordered_Dict) # converts 
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

In [142]:
# Functions for transformation

textPipeline = lambda x: [vocab[token] for token in tokenizer(x)]


labelPipeline = lambda x: 1. if x == 2 else 0.

In [143]:
for i, k, l in train_dl:
    print(k)

tensor([0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1.,
        0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1.])
tensor([1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.])
tensor([0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1.,
        0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1.])
tensor([0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0.,
        1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])
tensor([1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1.,
        1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0.])
tensor([0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0.,
        0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1.])
tensor([0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1.,
        1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.

In [144]:
# Generate batches using Data Loader, and passing it into the processing pipelines
# The function takes in dataloader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    labelList = []
    textList = []    
    lengths = []

    for label, text in batch:
        labelList.append(labelPipeline(label))
        processedText = torch.tensor(textPipeline(text), dtype = torch.int64)
        textList.append(processedText)

        lengths.append(processedText.size(0))

    labelList = torch.tensor(labelList)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(textList, batch_first=True)
    #pad_sequence: takes a list of tensors with different length, combines them together pads the tensors that are shorter than the 
    # longest tensor
    # The element of these sequences are integer number that correspond to indices of unique words
    
    return padded_text_list.to(device), labelList.to(device), lengths.to(device)

In [145]:
from torch.utils.data import DataLoader

batch_size = 32  

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

In [146]:
# Embedding Layers for sentence encodin
# A form of feature extraction
# In this process, we map each word to a vector of a fixed size. We can select the fixed size to be smaller than the number of 
# unique words present in the dataset

# Usually, when we map the words to a number, the number fail to capture the relationship behind the words. For example, in our case, positive
# adjectives and adverbs should be closer together

# One way to counter this one-hot encoding, that is we convert very word into a vector -> the vector length would equal to the number of uniqu
# words -> and each vector would have one 1, and the rest would be zero, ie each word would have unique vector. 
# This would be too long, as now every input would have the m vectors (m: the longest length review long) of n dimension (n: number of 
# unique words plus some to account for words in training but not in testing)

# Feature embedding comes in: we determine the number of features, and assign each word a rank/number based on those features
# The assgning can be done through: Supervised Learnning and self-supervised learning(word2vec

In [154]:
# Building an RNN model for sentiment analysis

# Embedding layer -> 20 features for each word to be judged on
# Since we have long sequences, LSTM layer would also be added
# Then, a hidden layer
# and then an output layer

class RNN(nn.Module):

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = 0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first = True)
        self.fc1 = nn.Linear(rnn_hidden_size, 1)
        #self.relu = nn.ReLU()
        #self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        #out = self.relu(out)
        #out = self.fc2(out)
        out = self.sigmoid(out)
        return out

vocabSize = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
torch.manual_seed(1)

model = RNN(vocab_size=vocabSize, embed_dim = 20, rnn_hidden_size = 64, fc_hidden_size = 64)
model

RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [155]:
#training the model

def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [156]:
def evaluate(dataloader):
    model.eval()

    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:,0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
        return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [157]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)



In [158]:
#Training the model for 10 epochs
num_epochs = 10

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(epoch, acc_train, acc_valid)

KeyboardInterrupt: 

In [160]:
#Project two: Character-level language modeling

# Input is a text document, and the goal is to develop a model that can generate new text similar to style of input text

# The input is broken down into a sequence of characters, which are fed one at a time


In [167]:
# Reading text data and preprocessing

import numpy as np

with open('1268-0.txt', 'r', encoding='utf8') as fp:
    text = fp.read()

start_idx = text.find("THE MYSTERIOUS ISLAND")
end_idx = text.find("End of the Project Gutenberg")

text = text[start_idx:end_idx]

In [170]:
char_set = set(text)


In [197]:
# Building a dictionary to map characters to integers,
# Reverse mapping via indexing numpy array

import torch
from torch.utils.data import Dataset


chars_sorted = sorted(char_set)  #Sorted list of all characters 
char2int = {ch:i for i,ch in enumerate(chars_sorted)} #Mapped every character to number, specifically its index in chars_sorted
char_array = np.array(chars_sorted) #Turning the unique characters into a list

text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32)
#Turned every character in the text to a number, using the char2int index, which is their index in the sorted characted array



In [198]:
# In our case, clipping the character length to 40
# Longer sequences: better, because they capture the context of the text better, but will struggle computationaly
# Create text chunks of 41 characters: 40 for x, 1 for y

seq_length = 40
chunk_size = seq_length + 1

text_chunks = [text_encoded[i:i+chunk_size] 
               for i in range(len(text_encoded)-chunk_size+1)] 

## inspection:
for seq in text_chunks[:1]:
    input_seq = seq[:seq_length]
    target = seq[seq_length] 
    print(input_seq, ' -> ', target)
    print(repr(''.join(char_array[input_seq])), 
          ' -> ', repr(''.join(char_array[target])))


[44 32 29  1 37 48 43 44 29 42 33 39 45 43  1 33 43 36 25 38 28  1  6  6
  6  0  0  0  0  0 40 67 64 53 70 52 54 53  1 51]  ->  74
'THE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced b'  ->  'y'


In [199]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()
    
seq_dataset = TextDataset(torch.tensor(text_chunks))



In [200]:
from torch.utils.data import DataLoader
 
batch_size = 64

torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)



In [201]:


import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim) 
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)
    
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
model = model.to(device)
model



RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

In [202]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 100 

torch.manual_seed(1)

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell) 
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 10 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')
 



Epoch 0 loss: 4.3722
Epoch 10 loss: 2.5391
Epoch 20 loss: 2.3329
Epoch 30 loss: 2.0799
Epoch 40 loss: 2.0276
Epoch 50 loss: 1.9109
Epoch 60 loss: 1.8438
Epoch 70 loss: 1.8143
Epoch 80 loss: 1.7463
Epoch 90 loss: 1.6834


In [195]:
batch_size

64

In [208]:
#Evaluation
 
# The model trained spits a 80 vector long of probability. 
# Instead of always picking the character with the largest probability, which might always predict the same text, we randomly sample from 
# the outputs
# Kind of liked weighed output. 



torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 3.0]])

print(nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))
 
print(samples.numpy())



[0.10650698 0.10650698 0.78698605]
[[0]
 [2]
 [2]
 [1]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


In [212]:
# sample function
# feed a starting_str, and spits out a generated text of 500 characters
# Initially, the generated text is initially set to starting_str.
# the starting_str is encoded to a sequence of integers, which is the encoded_input
# The encoded_input is put in the RNN one character at a time to update the hidden state
# New text is generated when the last character is entered in the model
# We logits to select the next character
# The next character is appended at the end of generated string
# The process os

def sample(model, starting_str, len_generated_text=100, scale_factor = 1.0):
    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    hidden = hidden.to('cpu')
    cell = cell.to('cpu')
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell) 
    
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell) 
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])
        
    return generated_str



In [213]:
torch.manual_seed(1)
model.to('cpu')
print(sample(model, starting_str='The island'))

The island spotid
bount on which wut a rillut. “Lourred If Cyruft of and
with cane stopigred did nevers, and b


In [None]:
# The more the alpha, the more predictable, yet closer to the original text.