In [1]:
# First run this cell
!pip install terminaltables rouge
from datetime import datetime
import argparse
import random
import pickle
import codecs
import json
import os
import nltk
import torch
import numpy as np
from pprint import pprint
import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from terminaltables import AsciiTable
from tqdm import tqdm
import json
from rouge import Rouge


import pandas as pd



#### Preprocess data

In [3]:
data = pd.read_csv("./data/Reviews.csv")

In [5]:
data = data.dropna()
data = data.reset_index(drop=True)
data = data.loc[:, ['Summary', 'Text']]
data.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [49]:
len(data)

data = data.sample(frac=1, random_state=42).reset_index(drop=True)

train_split = 0.8
test_split = 1-train_split

train_data = data[:int(len(data)*train_split)].reset_index(drop=True)
test_data = data[int(len(data)*train_split):].reset_index(drop=True)

In [52]:
source_w2i = {}
source_i2w = []
target_w2i = {}
target_i2w = []

# The padding symbol will be used to ensure that all tensors in a batch
# have equal length.
PADDING_SYMBOL = ' '
source_w2i[PADDING_SYMBOL] = 0
source_i2w.append(PADDING_SYMBOL)
target_w2i[PADDING_SYMBOL] = 0
target_i2w.append(PADDING_SYMBOL)

START_SYMBOL = '<START>'
END_SYMBOL = '<END>'
UNK_SYMBOL = '<UNK>'
source_w2i[START_SYMBOL] = 1
source_i2w.append(START_SYMBOL)
target_w2i[START_SYMBOL] = 1
target_i2w.append(START_SYMBOL)
source_w2i[END_SYMBOL] = 2
source_i2w.append(END_SYMBOL)
target_w2i[END_SYMBOL] = 2
target_i2w.append(END_SYMBOL)
source_w2i[UNK_SYMBOL] = 3
source_i2w.append(UNK_SYMBOL)
target_w2i[UNK_SYMBOL] = 3
target_i2w.append(UNK_SYMBOL)

# Max number of words to be predicted if <END> symbol is not reached
MAX_PREDICTIONS = 20

In [53]:
class AmazonDataset(Dataset) :
    """
    A dataset with source sentences and their respective translations
    into the target language.

    Each sentence is represented as a list of word IDs. 
    """
    def __init__(self, data, record_symbols=True):
        try:
            nltk.word_tokenize("hi there.")
        except LookupError:
            nltk.download('punkt')
        self.source_list = []
        self.target_list = []
        # Read the datafile
        
        for i in tqdm(range(len(data))):
            s = data.Text[i]
            t = data.Summary[i]
            source_sentence = []
            for w in nltk.word_tokenize(s):
                w = w.lower()
                if w not in source_i2w and record_symbols:
                    source_w2i[w] = len(source_i2w)
                    source_i2w.append(w)
                source_sentence.append(source_w2i.get(w, source_w2i[UNK_SYMBOL]))
            source_sentence.append(source_w2i[END_SYMBOL])
            self.source_list.append(source_sentence)
            target_sentence = []
            for w in nltk.word_tokenize(t):
                w = w.lower()
                if w not in target_i2w and record_symbols:
                    target_w2i[w] = len(target_i2w)
                    target_i2w.append(w)
                target_sentence.append(target_w2i.get(w, target_w2i[UNK_SYMBOL]))
            target_sentence.append(target_w2i[END_SYMBOL])
            self.target_list.append(target_sentence)

    def __len__(self):
        return len(self.source_list)

    def __getitem__(self, idx):
        return self.source_list[idx], self.target_list[idx]

In [54]:
train_dataset = AmazonDataset(train_data)
test_dataset = AmazonDataset(test_data, record_symbols=False)

100%|██████████| 454728/454728 [19:22<00:00, 391.31it/s]
100%|██████████| 113683/113683 [05:30<00:00, 344.18it/s]


In [55]:
torch.save(train_dataset, 'data/amazon_train_dataset_py')
torch.save(test_dataset, 'data/amazon_test_dataset_py')

with open('source_w2i.json', 'w') as f:
    f.write(json.dumps(source_w2i))

with open('source_i2w.json', 'w') as f:
    f.write(json.dumps(source_i2w))

with open('target_w2i.json', 'w') as f:
    f.write(json.dumps(target_w2i))

with open('target_i2w.json', 'w') as f:
    f.write(json.dumps(target_i2w))

In [10]:
train_dataset = torch.load('data/amazon_train_dataset_py')
test_dataset = torch.load('data/amazon_test_dataset_py')

source_w2i = {}
source_i2w = []
target_w2i = {}
target_i2w = []

with open('source_w2i.json', 'r') as f:
    source_w2i = json.load(f)

with open('source_i2w.json', 'r') as f:
    source_i2w = json.load(f)

with open('target_w2i.json', 'r') as f:
    target_w2i = json.load(f)

with open('target_i2w.json', 'r') as f:
    target_i2w = json.load(f)

In [56]:
def pad_sequence(batch, pad_source=source_w2i[PADDING_SYMBOL], pad_target=target_w2i[PADDING_SYMBOL]):
    source, target = zip(*batch)
    max_source_len = max(map(len, source))
    max_target_len = max(map(len, target))
    padded_source = [[b[i] if i < len(b) else pad_source for i in range(max_source_len)] for b in source]
    padded_target = [[l[i] if i < len(l) else pad_target for i in range(max_target_len)] for l in target]
    return padded_source, padded_target

In [57]:
def load_glove_embeddings(embedding_file) :
    """
    Reads pre-made embeddings from a file
    """
    N = len(source_w2i)
    embeddings = [0]*N
    with codecs.open(embedding_file, 'r', 'utf-8') as f:
        for line in f:
            data = line.split()
            word = data[0].lower()
            if word not in source_w2i:
                source_w2i[word] = N
                source_i2w.append(word)
                N += 1
                embeddings.append(0)
            vec = [float(x) for x in data[1:]]
            D = len(vec)
            embeddings[source_w2i[word]] = vec
    # Add a '0' embedding for the padding symbol
    embeddings[0] = [0]*D
    # Check if there are words that did not have a ready-made Glove embedding
    # For these words, add a random vector
    for word in source_w2i :
        index = source_w2i[word]
        if embeddings[index] == 0 :
            embeddings[index] = (np.random.random(D)-0.5).tolist()
    return D, embeddings

In [70]:
# ==================== Encoder ==================== #

class EncoderRNN(nn.Module) :
    """
    Encodes a batch of source sentences. 
    """
    
    def __init__(self, no_of_input_symbols, embeddings=None, embedding_size=16, hidden_size=25,
        encoder_bidirectional=False, device='cpu', use_gru=False, tune_embeddings=False) :
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.is_bidirectional = encoder_bidirectional
        self.embedding = nn.Embedding(no_of_input_symbols,embedding_size)
        if embeddings !=  None:
            self.embedding.weight = nn.Parameter(torch.tensor(embeddings, dtype=torch.float), requires_grad=tune_embeddings)
        if use_gru:
            self.rnn = nn.GRU(embedding_size, hidden_size, batch_first=True, bidirectional=self.is_bidirectional)
        else:
            self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True, bidirectional=self.is_bidirectional)
        self.device = device
        self.to(device)

    def set_embeddings(self, embeddings):
        self.embedding.weight = torch.tensor(embeddings, dtype=torch.float)

    def forward(self, x):
        """
        x is a list of lists of size (batch_size,max_seq_length)
        Each inner list contains word IDs and represents one sentence.
        The whole list-of-lists represents a batch of sentences.
       
        Returns:
        the output from the encoder RNN: a pair of two tensors, one containing all hidden states, and one 
        containing the last hidden state (see https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)
        """
        x_tensor = torch.tensor(x).to(self.device)
        embedded_words = self.embedding(x_tensor)
        all_hidden, last_hidden = self.rnn(embedded_words)
        return all_hidden, last_hidden

In [71]:
# ==================== Decoder ==================== #

class DecoderRNN(nn.Module):
    def __init__(self, no_of_output_symbols, embedding_size=16, hidden_size=25, use_attention=True,
        display_attention=False, device='cpu', use_gru=False) :
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(no_of_output_symbols,embedding_size)
        self.no_of_output_symbols = no_of_output_symbols
        self.W = nn.Parameter(torch.rand(hidden_size, hidden_size)-0.5) # shouldn't W be 2*hidden_size
        self.U = nn.Parameter(torch.rand(hidden_size, hidden_size)-0.5)
        self.v = nn.Parameter(torch.rand(hidden_size, 1)-0.5)
        self.use_attention = use_attention
        self.display_attention = display_attention
        if use_gru:
            self.rnn = nn.GRU(embedding_size, hidden_size, batch_first=True)
        else:
            self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.output = nn.Linear(hidden_size, no_of_output_symbols)
        self.device = device
        self.to(device)

    def forward(self, inp, hidden, encoder_outputs):
        """
        'input' is a list of length batch_size, containing the current word
        of each sentence in the batch

        'hidden' is a tensor containing the last hidden state of the decoder, 
        for each sequence in the batch
        hidden.shape = (1, batch_size, hidden_size)

        'encoder_outputs' is a tensor containing all hidden states from the
        encoder (used in problem c)
        encoder_outputs.shape = (batch_size, max_seq_length, hidden_size)

        Note that 'max_seq_length' above refers to the max_seq_length
        of the encoded sequence (not the decoded sequence).

        Returns:
        If use_attention and display_attention are both True (task (c)), return a triple
        (logits for the predicted next word, hidden state, attention weights alpha)

        Otherwise (task (b)), return a pair
        (logits for the predicted next word, hidden state).
        """
        
        inp_tensor = torch.tensor(inp).to(self.device)
        word_embs = self.embedding(inp_tensor).unsqueeze(1)

        if not self.use_attention:
            rnn_output, hidden = self.rnn(word_embs, hidden)
            logits = self.output(rnn_output.squeeze(1))
            return logits, hidden
        
        context, alpha_ij = self.get_context(word_embs, encoder_outputs)
        rnn_output, hidden = self.rnn(word_embs, context)
        logits = self.output(rnn_output.squeeze(1))

        if self.display_attention:
            return logits, hidden, alpha_ij
        return logits, hidden

    def get_context(self, prev_word_embs, encoder_states):
        summed = (torch.matmul(prev_word_embs, self.U) + torch.matmul(encoder_states, self.W))
        summed = torch.tanh(summed)
        e_ij = torch.matmul(summed, self.v)
        alpha_ij = torch.softmax(e_ij, dim=1)
        context = alpha_ij * encoder_states
        context = context.sum(dim=1).unsqueeze(0)
        return context, alpha_ij

In [72]:
def evaluate(ds, encoder, decoder):
    encoder.eval()
    decoder.eval()

    num_correct_words = 0
    num_correct_sentences = 0
    
    tot_words = 0
    tot_sentances = 0

    predicted_sentences = []
    correct_sentences = []
    
    #for x, y in tqdm(ds):
    for x, y in ds:
        predicted_sentence = []
        with torch.no_grad():
            outputs, hidden = encoder([x])
        if encoder.is_bidirectional:
            hidden = hidden.permute((1,0,2)).reshape(1,-1).unsqueeze(0)
        
        predicted_symbol = target_w2i[START_SYMBOL]
        predicted_sentence = []
        num_attempts = 0
        while num_attempts < MAX_PREDICTIONS:
            with torch.no_grad():
                predictions, hidden = decoder([predicted_symbol], hidden, outputs)    
            _, predicted_tensor = predictions.topk(1)
            predicted_symbol = predicted_tensor.detach().item()
    
            num_attempts += 1
    
            if predicted_symbol == target_w2i[END_SYMBOL]:
                break
                
            predicted_sentence.append(predicted_symbol)

        # [:-1] such we dont consider the end symbol
        y = y[:-1]
        
        if predicted_sentence == y:
            num_correct_sentences += 1

        for w_p, w_y in zip(predicted_sentence, y):
            if w_p == w_y:
                num_correct_words += 1

        tot_words += len(y)
        tot_sentances += 1

        predicted_sentence_str = " ".join([target_i2w[i] for i in predicted_sentence])
        correct_sentence_str = " ".join([source_i2w[i] for i in y])

        #if len(predicted_sentence_str) > 1 and len(correct_sentence_str) > 1:
        predicted_sentences.append(predicted_sentence_str)
        correct_sentences.append(correct_sentence_str)

    print(predicted_sentences[-1])
    print(correct_sentences[-1])

    #rouge = Rouge()
    #print(rouge.get_scores(predicted_sentences, correct_sentences, avg=True))  

    word_acc = num_correct_words / tot_words
    sent_acc = num_correct_sentences / tot_sentances

    print(f"Word acc: {word_acc*100:.2f}%")
    print(f"Sent acc: {sent_acc*100:.2f}%")

In [73]:
# ================ Hyper-parameters ================ #
use_attention = True     
use_gru = True         # Use Gated Recurrent Units (rather than plain RNNs)
bidirectional = True   # Use a bidirectional encoder
use_embeddings = True  # Use pre-loaded Glove embeddings
tune_embeddings = True # Fine-tune the Glove embeddings
batch_size = 64
hidden_size = 25       # Number of dimensions in the hidden state
learning_rate = 0.001
epochs = 50            # We will train for this many epochs
save = False           # Do not save the model

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [74]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_sequence)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_sequence)

In [75]:
embedding_size, embeddings = load_glove_embeddings('/datasets/dd2417/glove.6B.50d.txt')

criterion = nn.CrossEntropyLoss()

encoder = EncoderRNN(
    len(source_i2w),
    embeddings=embeddings,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    encoder_bidirectional=bidirectional,
    tune_embeddings=tune_embeddings,
    use_gru=use_gru,
    device=device
)
decoder = DecoderRNN(
    len(target_i2w),
    embedding_size=embedding_size,
    hidden_size=hidden_size*(bidirectional+1),
    use_attention=use_attention,
    use_gru=use_gru,
    device=device
)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

In [None]:
print(datetime.now().strftime("%H:%M:%S"), "Starting training.")

for epoch in range(epochs):
    total_loss = 0
    encoder.train()
    decoder.train()
    for source, target in tqdm(train_loader):
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        loss = 0
        # hidden is (D * num_layers, B, H)
        outputs, hidden = encoder(source)
        if bidirectional:
            # (2, B, H) -> (B, 2 * H) -> (1, B, 2 * H)
            hidden = torch.cat([hidden[0,:, :], hidden[1,:,:]], dim=1).unsqueeze(0)
                    
        # The probability of doing teacher forcing will decrease
        # from 1 to 0 over the range of epochs. This could be implemented
        # like this:
        # teacher_forcing_ratio = 1- epoch/args.epochs
        # But, for now we will always use teacher forcing
        teacher_forcing_ratio = 1

        # The input to the decoder in the first time step will be
        # the boundary symbol, regardless if we are using teacher
        # forcing or not.
        idx = [target_w2i[START_SYMBOL] for sublist in target]
        predicted_symbol = [target_w2i[START_SYMBOL] for sublist in target]

        target_length = len(target[0])
        for i in range(target_length) :
            use_teacher_forcing = (random.random() < teacher_forcing_ratio)
            if use_teacher_forcing :
                predictions, hidden = decoder(idx, hidden, outputs)
            else:
                # Here we input the previous prediction rather than the
                # correct symbol.
                predictions, hidden = decoder(predicted_symbol, hidden, outputs)
            _, predicted_tensor = predictions.topk(1)
            predicted_symbol = predicted_tensor.squeeze().tolist()

            # The targets will be the ith symbol of all the target
            # strings. They will also be used as inputs for the next
            # time step if we use teacher forcing.
            idx = [sublist[i] for sublist in target]
            loss += criterion(predictions.squeeze(), torch.tensor(idx).to(device))
        loss /= (target_length * batch_size)
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        total_loss += loss
    
    print(datetime.now().strftime("%H:%M:%S"), "Epoch", epoch, "loss:", total_loss.detach().item())
    total_loss = 0

    if i % 1 == 0:
        print("Evaluating on the test data...")
        evaluate(test_dataset, encoder, decoder)

22:16:45 Starting training.


100%|██████████| 7106/7106 [03:40<00:00, 32.22it/s]


22:20:25 Epoch 0 loss: 204.9815216064453
Evaluating on the test data...


 28%|██▊       | 32040/113683 [01:03<02:36, 522.46it/s]

In [66]:
encoder.eval()
decoder.eval()

num_examples = 5

count = 0
for x, y in test_dataset:
    predicted_sentence = []
    with torch.no_grad():
        outputs, hidden = encoder([x])
    if encoder.is_bidirectional:
        hidden = hidden.permute((1,0,2)).reshape(1,-1).unsqueeze(0)
    predicted_symbol = target_w2i[START_SYMBOL]
    predicted_sentence = []
    num_attempts = 0
    while num_attempts < MAX_PREDICTIONS:
        with torch.no_grad():
            predictions, hidden = decoder([predicted_symbol], hidden, outputs)    
        _, predicted_tensor = predictions.topk(1)
        predicted_symbol = predicted_tensor.detach().item()
        predicted_sentence.append(predicted_symbol)

        num_attempts += 1

        if predicted_symbol == target_w2i[END_SYMBOL]:
            break
    
    in_sent = ' '.join([source_i2w[i] for i in x])
    y_sent = ' '.join([target_i2w[i] for i in y])
    pred_sent = ' '.join([target_i2w[i] for i in predicted_sentence])

    print(in_sent)
    print()
    print(f"Y-TRUE: {y_sent}")
    print(f"Y-PRED: {pred_sent}")
    print()
    print()
    print()
    print()
    
    count += 1
    if count > num_examples:
        break


  0%|          | 5/113683 [00:00<03:20, 566.22it/s]

my 3-year old akita is very picky when it comes to food , and usually refuses most of the standard dog treats . this is the only thing he visibly appreciates ; btw , when offered chicken happy hips of the same brand he 's not as interested. < br / > < br / > the only downside is the price ; do n't ever buy it in retail stores - they charge 2-3 times more than online retailers . even then it 's rather expensive. < br / > < br / > hope this helps . <END>

Y-TRUE: the only thing my dog will eat <END>
Y-PRED: my dog loves these treats <END>
----------------------------------------------------------------------------------------------------
to put is simply , it ruins you for other <UNK> < br / > < br / > we tried some of this on a whim from our local costco once . it is amazing . even though there is great care needed to pop it just the right amount of time ( the sugary stuff burns quickly ) , it is so very , very worth it . as others have said , it 's like the fresh kind made in a kettle 


